In [1]:
import numpy as np

In [2]:
def relu(z):
    """
    Args:
        z: (batch_size, hidden_size)
    """
    flag = (z <= 0) # 需要修改为0的部分
    z[flag] = 0
    return z

def derivation_relu(z):
    flag = (z <= 0)
    z[flag] = 0
    z[~flag] = 1
    return z

def sigmoid(z):
    """
    Args:
        z: (batch_size, hidden_size)
    """
    return 1 / (1 + np.exp(-z))

def cross_entropy(y, y_hat):
    """
    Args:
        y: (batch_size, ) 每个样本的真实label
        y_hat: (batch_size, output_size)， 网络的输出预测得分，经过sigmoid概率化。output_size即分类类别数
    return:
        a: (batch_size, output_size)
        loss: scalar
    """
    n_batch = y_hat.shape[0]
#     scores = y_hat[range(n_batch), y]
    loss = -np.sum(np.log(y_hat)) / n_batch
    return loss
def derivation_sigmoid_cross_entropy(y, y_hat):
    """
    Args:
        logits: (batch_size, output_size)， 网络的输出预测得分, 还没有进行 softmax概率化
        y: (batch_size, ) 每个样本的真实label
    
    Return:
        \frac {\partial C}{\partial z^L}
        (batch_size, output_size)
    """
    y_hat -= 1
    return y_hat

In [8]:
class Network(object):
    """
    fully-connected neural network
    Attributions:
        sizes: list, 每个元素是每层的神经元的个数, 包括输入输出层
        num_layers: 神经网络的层数
        weights: list, 每个元素是一层神经网络的权重
        bias: list, 每个元素是一层神经网络的偏置
    """
    def __init__(self, sizes):
        self.sizes = sizes
        self.num_layers = len(sizes)
        self.weights = [np.random.randn(i, j) for i, j in zip(self.sizes[:-1], self.sizes[1:])]
        self.bias = [np.random.randn(1, j) for j in self.sizes[1:]]
        self.dws = None
        self.dbs = None
        self.zs = [] 
        self._as = []


    def forward(self, x):
        """
        用于推理，前向传播时不进行softmax概率化
        x: (batch_size, input_size)
        """
        a = x
        self._as.append(a)
        for weight, bias in zip(self.weights[:-1], self.bias[:-1]):
            z = np.dot(a, weight) + bias
            a = relu(z) 
            self.zs.append(z)
            self._as.append(a)
        # 在前向传播时不需要进行softmax概率化， 反向传播时才会用到
        logits = np.dot(a, self.weights[-1]) + self.bias[-1]
        y_hat = sigmoid(logits)
        self.zs.append(logits)
        self._as.append(y_hat)
        
        return y_hat

    def backward(self, x, y):
        """
        Args:
            x: (batch_size, input_size)
            y: (batch_size, )
        returns:
            dws: list， 每个元素是每一层权重的梯度
            dbs: list, 每个元素是每一层偏置的梯度
        """
        # 存储每一层的损失函数对参数的梯度


        ################# 前向传播 ##############################
        # zs, _as存储前向传播过程中的中间变量z和a，供反向传播时使用

#         self.zero_grad()
        y_hat = self.forward(x)
        loss = cross_entropy(y, y_hat)

        ################# 反向传播 ##############################
        # 输出层误差
        dl = derivation_sigmoid_cross_entropy(y, y_hat)
        # batch的大小
        n = len(x)
        # 最后一层的梯度
        # 每个样本得的梯度求和、求平均
        self.dws[-1] = np.dot(self._as[-2].T, dl) / n
        self.dbs[-1] = np.sum(dl, axis=0, keepdims=True) / n
        # 误差反向传播
        for i in range(2, self.num_layers):
            dl = np.dot(dl, self.weights[-i+1].T) * derivation_relu(self.zs[-i])
            self.dws[-i] = np.dot(self._as[-i-1].T, dl) / n
            self.dbs[-i] = np.sum(dl, axis=0, keepdims=True) / n
            
        self.zs = [] 
        self._as = []

#         return loss, dws, dbs
    
    def zero_grad(self):
        self.dws = [np.zeros((i, j)) for i, j in zip(self.sizes[:-1], self.sizes[1:])]
        self.dbs = [np.zeros((1, j)) for j in self.sizes[1:]]
        
    def optimize(self, learning_rate):

        self.weights = [weight - learning_rate * dw for weight, dw in zip(self.weights, self.dws)]
        self.bias = [bias - learning_rate * db for bias, db in zip(self.bias, self.dbs)]

        
def train():
    
    n_batch = 5
    n_input_layer = 2
    n_hidden_layer = 3
    n_output_layer = 1
    n_class = 2
    x = np.random.rand(n_batch, n_input_layer)
    y = np.random.randint(0, n_class, size=n_batch)
    net = Network((n_input_layer, n_hidden_layer, n_output_layer))
    print('initial weights:', net.weights)
    print('initial bias:', net.bias)
    net.zero_grad()
    net.backward(x, y)
    net.optimize(0.1)
    print('updated weights:', net.weights)
    print('updated bias:', net.bias)

In [9]:
train()

initial weights: [array([[-1.1045154 , -3.0637398 ,  0.58647799],
       [ 0.18172784, -0.46666983,  0.63525025]]), array([[-1.50858713],
       [-0.19424424],
       [ 0.17025739]])]
initial bias: [array([[2.53635155, 1.15107515, 0.21394453]]), array([[-0.54529983]])]
updated weights: [array([[-1.15680535, -3.06451547,  0.59237937],
       [ 0.10586316, -0.471385  ,  0.64381225]]), array([[-1.29011776],
       [-0.17234133],
       [ 0.24341806]])]
updated bias: [array([[2.38907369, 1.14340284, 0.23056614]]), array([[-0.44767348]])]
