In [1]:
"""
请大家使用numpy库完成relu, derivation_relu, sigmoid三个函数的填空，以及forward、backward和train中部分功能的实现
"""

'\n请大家使用numpy库完成relu, derivation_relu, sigmoid三个函数的填空，以及forward、backward和train中部分功能的实现\n'

In [1]:
import numpy as np

In [2]:
def relu(z):
    """
    Args:
        z: (batch_size, hidden_size)
    return:
        a: (batch_size, hidden_size)激活值
    """
    a = np.maximum(0,z) #X和Y逐位进行比较,选择最大值
    # pass
    return a

def derivation_relu(z):
    """
    Args:
        z: (batch_size, hidden_size)
    return:
        dz: (batch_size, hidden_size)导数值
    """
    
    dz = np.ones(z.shape)#1矩阵方法
    dz[z<=0] = 0
    # pass
    return dz

def sigmoid(z):
    """
    Args:
        z: (batch_size, hidden_size)
    return:
        a: (batch_size, hidden_size)激活值
    """
    a = 1/(1+np.exp(-z))
    # pass
    return a

def cross_entropy(y, y_hat):
    """
    Args:
        y: (batch_size, ) 每个样本的真实label
        y_hat: (batch_size, output_size)， 网络的输出预测得分，已经过sigmoid概率化。output_size即分类类别数
    return:
        loss: scalar
    """
    n_batch = y_hat.shape[0] # 样本数量
    loss = -np.sum(np.log(y_hat)) / n_batch # loss = -np.sum(-(y*np.log(y_hat)+(1-y)*np.log(1-y_hat))
    return loss

def derivation_sigmoid_cross_entropy(y, y_hat):
    """
    Args:
        logits: (batch_size, output_size)， 网络的输出预测得分, 还没有进行 softmax概率化
        y: (batch_size, ) 每个样本的真实label
    
    Return:
        \frac {\partial C}{\partial z^L}
        (batch_size, output_size)
    """
    y_hat -= 1  #dz
    return y_hat

In [3]:
class Network(object):
    """
    fully-connected neural network
    Attributions:
        sizes: list, 输入层、隐藏层、输出层尺寸
        num_layers: 神经网络的层数
        weights: list, 每个元素是一层神经网络的权重
        bias: list, 每个元素是一层神经网络的偏置
        dws: list，存储权重梯度
        dbs: list，存储偏置梯度
        zs: list，存储前向传播临时变量
        _as：list，存储前向传播临时变量
    """
    def __init__(self, sizes):
        #sizes=[2,3,2]
        self.sizes = sizes
        self.num_layers = len(sizes) #3
        #随机产生每条连线的权重
        self.weights = [np.random.randn(i, j) for i, j in zip(self.sizes[:-1], self.sizes[1:])]
        #随机产生隐层与输出层中每个神经元的偏置（0-1）
        self.bias = [np.random.randn(1, j) for j in self.sizes[1:]]
        self.dws = None
        self.dbs = None
        self.zs = [] 
        self._as = []


    def forward(self, x):
        """
        前向传播
        x: (batch_size, input_size)
        """
        a = x
        self._as.append(a)
        for weight, bias in zip(self.weights[:-1], self.bias[:-1]):
            # 计算临时变量z和a并存入self.zs和self._as
            z = np.dot(a,weight) + bias
            a = relu(z)
            self.zs.append(z)
            self._as.append(a)
            
            #########################################
        logits = np.dot(a, self.weights[-1]) + self.bias[-1]
        y_hat = sigmoid(logits)
        self.zs.append(logits)
        self._as.append(y_hat)
        return y_hat 

    def backward(self, x, y):
        """
        反向传播
        Args:
            x: (batch_size, input_size)
            y: (batch_size, )
        """

        y_hat = self.forward(x)
        
        loss = cross_entropy(y, y_hat) 

        ################# 反向传播梯度计算 ##############################
        # 输出层误差
        dl = derivation_sigmoid_cross_entropy(y, y_hat)
        print(dl)
        n = len(x)
        # 最后一层的梯度
        # 每个样本得的梯度求和、求平均
        self.dws[-1] = np.dot(self._as[-2].T, dl) / n  
        self.dbs[-1] = np.sum(dl, axis=0, keepdims=True) / n 
        # 计算梯度
        for i in range(2, self.num_layers):
            # 计算梯度并存入self.dws和self.dbs，注意矩阵乘法和逐元素乘法
            dl = np.dot(dl, self.weights[-i+1].T) * derivation_relu(self.zs[-i])
            self.dws[-i] = np.dot(self._as[-i-1].T, dl) / n
            self.dbs[-i] = np.sum(dl, axis=0, keepdims=True) / n
            ############################################################
            
        self.zs = [] 
        self._as = []
    
    def zero_grad(self):
        """清空梯度"""
        self.dws = [np.zeros((i, j)) for i, j in zip(self.sizes[:-1], self.sizes[1:])]
        self.dbs = [np.zeros((1, j)) for j in self.sizes[1:]]
        
    def optimize(self, learning_rate):
        """更新梯度"""
        self.weights = [weight - learning_rate * dw for weight, dw in zip(self.weights, self.dws)]
        self.bias = [bias - learning_rate * db for bias, db in zip(self.bias, self.dbs)]

        
def train():
    
    n_batch = 5
    n_input_layer = 2
    n_hidden_layer = 3
    n_output_layer = 2
    n_class = 2
    x = np.random.rand(n_batch, n_input_layer) # 5行 2列
    y = np.random.randint(0, n_class, size=n_batch) #返回一个 0 1 的5个值的array (5,1)
    net = Network((n_input_layer, n_hidden_layer, n_output_layer))
    print('initial weights:', net.weights)
    print('initial bias:', net.bias)
    # 执行梯度计算

    net.forward(x)
    net.zero_grad()
    net.backward(x,y)
    net.optimize(0.01)
    #net.zero_grad()
    
    ##############
    print('updated weights:', net.weights)
    print('updated bias:', net.bias)

In [4]:
train()

initial weights: [array([[ 0.11065484, -1.70818053, -0.75418618],
       [ 0.32770487, -1.68940839,  0.03553788]]), array([[-1.51670536, -1.33758874],
       [ 2.09150799,  0.4039313 ],
       [ 1.10866779,  1.86360581]])]
initial bias: [array([[ 0.1136647 , -0.09211859,  0.6123297 ]]), array([[0.31869108, 1.08240231]])]
[[-0.39810227 -0.15356417]
 [-0.51646386 -0.32229427]
 [-0.48134031 -0.26237657]
 [-0.35210271 -0.13816738]
 [-0.42282006 -0.19049196]]
updated weights: [array([[ 0.10662929, -1.70818053, -0.75236322],
       [ 0.32395669, -1.68940839,  0.03866206]]), array([[-1.51543716, -1.33697334],
       [ 2.09150799,  0.4039313 ],
       [ 1.11007977,  1.86420484]])]
updated bias: [array([[ 0.10422556, -0.09211859,  0.61877326]]), array([[0.32303274, 1.0845361 ]])]
