In [63]:
import copy,numpy as np
np.random.seed(28)

In [64]:
def sigmoid(x):
    output = 1/(1+np.exp(-x))
    return output

def sigmoid_output_to_derivative(output):
    return output*(1-output)


In [65]:
int2binary = {}
binary_dim = 8

In [66]:
largest_number = pow(2,binary_dim);largest_number
binary = np.unpackbits(
    np.array([range(largest_number)],dtype=np.uint8).T,axis=1)
for i in range(largest_number):
    int2binary[i] = binary[i]

In [67]:
int2binary[10],len(int2binary)

(array([0, 0, 0, 0, 1, 0, 1, 0], dtype=uint8), 256)

In [68]:
# input variables
alpha = 0.1
input_dim = 2
hidden_dim = 16
output_dim = 1

In [69]:
# initialize the neural network weights,否则不能启动
synapse_0 = 2*np.random.random((input_dim,hidden_dim)) -1 # 输入，（0，1）变换到（-1，1）
synapse_h = 2*np.random.random((hidden_dim,hidden_dim)) -1 # 隐层
synapse_1 = 2*np.random.random((hidden_dim,output_dim)) -1 # 输出

# initialize the weights derivative，
synapse_0_update = np.zeros_like(synapse_0)
synapse_h_update = np.zeros_like(synapse_h)
synapse_1_update = np.zeros_like(synapse_1)

In [70]:
synapse_h.shape

(16, 16)

In [71]:
synapse_0_update,synapse_0_update.shape

(array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 (2, 16))

In [99]:
# training logic
for i in range(10000):
    a_int = np.random.randint(largest_number/2) # 最大数不超过128
    a     = int2binary[a_int]
    b_int = np.random.randint(largest_number/2)
    b     = int2binary[b_int]
    
    # true answer
    c_int = a_int+b_int
    c = int2binary[c_int]
    
    # where store the best guess(binary encoded)
    d = np.zeros_like(c)
    
    overallError = 0
    layer_2_deltas = list() # 输出错误对本次输入的导数
    layer_1_values = list() # 第一层输出的值，同时会作为隐层加入到下一个值的输入中
    layer_1_values.append(np.zeros(hidden_dim)) # 初始时隐层输入为0
    
    
    # moving along the postions in the binary encoding from lower to higher
    for position in range(binary_dim): # 循环一个序列的每个元素
        
        # 生成一个输入样本和正确输出，从低位到高位
        # 为什么要二维？为了和二维得到权重正确的矩阵相乘
        # 输入层
        x = np.array([[a[binary_dim-position-1],b[binary_dim-position-1]]]) #1*2的二维数组,
        y = np.array([[c[binary_dim-position-1]]]).T # 1×1 的二维数组，还要转置？
        # 隐层 (input + prev_hidden)
        layer_1 = sigmoid(np.dot(x,synapse_0) + np.dot(layer_1_values[-1],synapse_h)) # 上次隐层的输入也加入中
        # 输出层
        layer_2 = sigmoid(np.dot(layer_1,synapse_1))
                      
        # cost function
        layer_2_error = y-layer_2 # 矩阵相减结果还是矩阵（1x1）, 假如用均方损失的话
        layer_2_deltas.append(layer_2_error * sigmoid_output_to_derivative(layer_2)) # 对layer2的输入求导
        overallError += np.abs(layer_2_error[0]) # 一维数组
        
        # decode 预计输出用于后续打印
        d[binary_dim-position-1] = np.round(layer_2[0][0]) # 四舍五入取数
        
        # 储存本次隐藏输出用于下一个timestep
        layer_1_values.append(copy.deepcopy(layer_1)) #注意使用深拷贝
    
    future_layer_1_delta = np.zeros(hidden_dim) # (1x16)最后一个隐层输出的导数，可以设置为0，因为对系统没有影响了
    
                      
    for position in range(binary_dim): # 反向传播算法，从最后的误差BP
        x = np.array([[a[position],b[position]]]) # 当前输入的x(1x2)
        layer_1 = layer_1_values[-position-1] # layer_1的输出
        prev_layer_1 = layer_1_values[-position-2] # 前一个layer_1的输出(16*16)，-9怎么办？
        
        # 逐层反向误差偏导
        # error at output layer
        layer_2_delta = layer_2_deltas[-position-1] # 当前输出的导数(1*1)
        # error at hiddenlayer 前一层的导数
        # 这层输出同时也是下个timestep的隐层输入，所以还要计算这次L1对下一次的影响，最tricy的地方
        layer_1_delta = sigmoid_output_to_derivative(layer_1) * (np.dot(layer_2_delta, synapse_1.T) + 
                        np.dot(future_layer_1_delta, synapse_h)) 
        
        #layer_1_delta = (future_layer_1_delta.dot(synapse_h.T) + 
        #                 layer_2_delta.dot(synapse_1.T)) * sigmoid_output_to_derivative(layer_1)  
        
        # 据此，根据求导链式法则，求三个weights（synapse_0，synapse_h，synapse_1）,参看上面正向传播的公式
        # 这里每次都要累加，np.atleast_2d确保layer_1,pre_layer_1的输出必须是2维的
        #synapse_1_update += np.dot(layer_1.T,layer_2_delta) # (16*1)
        synapse_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta) # (16*1)，使用np.atleast2d？
        # 隐层有些特殊，它的对当前输出的传播误差，还对下个timestep的输出传播误差，
        #synapse_h_update += np.dot(layer_1_delta.T,prev_layer_1.reshape(1,16)) 
        synapse_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta) # 使用np.atleast2d
        synapse_0_update += np.dot(x.T,layer_1_delta) # 
        
        future_layer_1_delta = layer_1_delta # ()
    
    # update weights,为什么要+，不是梯度的负方向吗？
    synapse_0 += alpha * synapse_0_update  # 为什是+
    synapse_h += alpha * synapse_h_update
    synapse_1 += alpha * synapse_1_update
    
    # 前面梯度一直在+=，所以每个新样本时，权重需要重置为0
    synapse_0_update *= 0
    synapse_h_update *= 0
    synapse_1_update *= 0
    
    
    if i % 1000 == 0:
        print('a is:', a)
        print('b is:', b)
        print('guess is:',d)
        
        def bin2dec(bin):
            dec = 0
            length = len(bin)
            for i in range(length):
                dec += bin[i] * pow(2,(length-i-1))
            return dec
        
        print('{} + {} ={}'.format(a_int, b_int, bin2dec(d)))
        print('the error is:',overallError)
        print('--------------------------------------------')
                      
        

a is: [0 0 1 0 0 0 0 0]
b is: [0 0 1 0 1 0 1 1]
guess is: [0 0 0 0 0 0 0 0]
32 + 43 =0
the error is: [3.77378019]
--------------------------------------------
a is: [0 0 0 0 1 0 1 1]
b is: [0 0 0 0 0 1 1 1]
guess is: [0 0 0 1 0 0 1 0]
11 + 7 =18
the error is: [1.00946714]
--------------------------------------------
a is: [0 1 0 1 1 1 1 1]
b is: [0 0 0 0 0 1 1 1]
guess is: [0 1 1 0 0 0 1 0]
95 + 7 =98
the error is: [1.69663157]
--------------------------------------------
a is: [0 1 1 0 0 0 1 0]
b is: [0 0 1 0 0 0 1 1]
guess is: [1 0 0 0 0 1 0 1]
98 + 35 =133
the error is: [1.01098954]
--------------------------------------------
a is: [0 0 0 1 0 1 0 0]
b is: [0 1 1 1 0 1 0 1]
guess is: [1 0 0 0 1 0 0 1]
20 + 117 =137
the error is: [2.12981935]
--------------------------------------------
a is: [0 0 0 1 1 0 1 0]
b is: [0 1 0 0 0 0 1 0]
guess is: [0 1 0 1 1 1 0 0]
26 + 66 =92
the error is: [0.27142224]
--------------------------------------------
a is: [0 1 0 1 1 0 1 1]
b is: [0 1 1 0 1