数值微分虽然简单，也容易实现，但缺点是计算上比较费时间。
本章我们将学习一个能够高效计算权重参数的梯度的方法——误差反向传播法。
本章采用计算图的方法来理解误差反向传播法。
对于 ReLU 层：如果正向传播时的输入值小于等于0，则反向传播的值为0。因此，反向传播中会使用正向传播时保存的 mask，将从上游传来的 dout 中与 mask 为 True 的元素对应的位置设为0。

In [7]:
import numpy as np


class MulLayer:
    """Multiplication node of a computational graph.

    ``forward`` remembers both operands because the gradient with respect
    to one operand is the upstream gradient times the *other* operand.
    """

    def __init__(self):
        # Operands of the most recent forward() call; None until then.
        self.x, self.y = None, None

    def forward(self, x, y):
        """Cache ``x`` and ``y`` and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``.

        Each gradient is ``dout`` multiplied by the opposite operand that
        was stored during ``forward``.
        """
        grad_x = dout * self.y
        grad_y = dout * self.x
        return grad_x, grad_y
    
class AddLayer:
    """Addition node of a computational graph (keeps no state)."""

    def __init__(self):
        """Nothing to initialise: addition needs no cached inputs."""

    def forward(self, x, y):
        """Return the sum ``x + y``."""
        return x + y

    def backward(self, dout):
        """Pass the upstream gradient through unchanged to both inputs.

        Returns ``(dx, dy)``. The product ``dout * 1`` is evaluated once
        per input, so the two results come from separate multiplications.
        """
        return dout * 1, dout * 1
class Relu:
    """ReLU activation layer: ``max(x, 0)`` applied element-wise.

    ``forward`` stores a boolean mask of the positions where the input was
    less than or equal to zero; ``backward`` zeroes the gradient there.
    """

    def __init__(self):
        # Boolean array set by forward(): True where the input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of array ``x`` with non-positive entries set to 0.

        ``x`` itself is left untouched; the mask is saved for ``backward``.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        """Return the gradient w.r.t. the input for upstream ``dout``.

        Positions flagged in ``self.mask`` receive a gradient of 0, every
        other position passes ``dout`` through unchanged.
        """
        # Work on a copy: writing zeros straight into ``dout`` would
        # silently corrupt the caller's upstream gradient array.
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

In [5]:
# Inputs: unit price, quantity and the tax factor.
apple, apple_num, tax = 100, 2, 1.1

# One multiplication layer per multiplication node in the graph.
mul_apple_layer, mul_tax_layer = MulLayer(), MulLayer()

# Forward pass: (apple * apple_num) * tax.
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)
print(price)  # 220 (up to floating-point rounding)

# Backward pass: walk the layers in reverse order, starting from d(price) = 1.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
print(dapple, dapple_num, dtax)  # 2.2 110 200

220.00000000000003
2.2 110.00000000000001 200


In [8]:
# Inputs: unit prices and quantities of both fruits, plus the tax factor.
apple, apple_num = 100, 2
orange, orange_num = 150, 3
tax = 1.1

# Layers, one per node of the computational graph.
mul_apple_layer, mul_orange_layer = MulLayer(), MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# Forward pass; the step numbers are mirrored in the backward pass below.
apple_price = mul_apple_layer.forward(apple, apple_num)  # (1)
orange_price = mul_orange_layer.forward(orange, orange_num)  # (2)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)  # (3)
price = mul_tax_layer.forward(all_price, tax)  # (4)

# Backward pass: visit the layers in the reverse order (4) -> (1).
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)  # (4)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)  # (3)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)  # (2)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)  # (1)

print(price)  # 715 (up to floating-point rounding)
print(dapple_num, dapple, dorange, dorange_num, dtax)  # 110 2.2 3.3 165 650

715.0000000000001
110.00000000000001 2.2 3.3000000000000003 165.0 650


将 sigmoid 分解为 $\times(-1)$、$\exp$、$+1$、取倒数四个步骤，分别计算每个步骤的反向传播。
从后往前看：
取倒数节点 $y = \frac{1}{x}$ 的导数：$\frac{\partial y}{\partial x} = -\frac{1}{x^2} = -y^2$
....
$\begin{aligned}
\frac{\partial L}{\partial y}y^2\exp(-x) &= \frac{\partial L}{\partial y}\frac{1}{(1+\exp(-x))^2}\exp(-x) \\
&= \frac{\partial L}{\partial y}\frac{1}{1+\exp(-x)}\frac{\exp(-x)}{1+\exp(-x)} \\
&= \frac{\partial L}{\partial y}y(1-y)
\end{aligned}$
图示见notion


In [None]:
class sigmoid:
    """Sigmoid activation layer, ``y = 1 / (1 + exp(-x))``.

    The forward output is cached because the backward pass is expressed
    purely in terms of it: ``dx = dout * (1 - y) * y``.
    """

    def __init__(self):
        # Output of the most recent forward() call; None until then.
        self.out = None

    def forward(self, x):
        """Compute the sigmoid of ``x``, cache it and return it."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return the gradient w.r.t. the input for upstream ``dout``."""
        y = self.out
        return dout * (1.0 - y) * y

5.6 Affine
Affine层
形状为 (2,) 的一维数组可以当作 1 行 2 列的矩阵来看：
$X(2,) \cdot W(2,3) = O(3,)$ 

$\begin{aligned}\frac{\partial L}{\partial\boldsymbol{X}}&=\frac{\partial L}{\partial\boldsymbol{Y}}\cdot\boldsymbol{W}^\mathrm{T}\\\frac{\partial L}{\partial\boldsymbol{W}}&=\boldsymbol{X}^\mathrm{T}\cdot\frac{\partial L}{\partial\boldsymbol{Y}}\end{aligned}$

5.6.2 批版本的Affine层
$\begin{gathered}
\frac{\partial L}{\partial X}=\frac{\partial L}{\partial Y}\cdot W^\mathrm{T} \\
(N,2)\quad(N,3)\quad(3,2) \\
\frac{\partial L}{\partial W}=X^{\mathrm{T}}\cdot\frac{\partial L}{\partial Y} \\
(2,3)\quad(2,N)\quad(N,3)
\end{gathered}$

In [None]:
class Affine:
    """Affine (fully connected) layer computing ``x . W + b``.

    Inputs with more than two axes are flattened to ``(x.shape[0], -1)``
    for the matrix product; the gradient returned by ``backward`` is
    reshaped back to the original input shape.
    """

    def __init__(self, W, b):
        # Parameters of the layer (weight matrix and bias).
        self.W, self.b = W, b

        # Flattened input and its original shape, recorded by forward().
        self.x = None
        self.original_x_shape = None

        # Parameter gradients, filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Return ``x . W + b`` after flattening ``x`` to two dimensions.

        The first axis of ``x`` is kept and all remaining axes are
        collapsed into one so the input lines up with the weight matrix.
        """
        self.original_x_shape = x.shape
        leading = x.shape[0]
        self.x = x.reshape(leading, -1)
        return np.dot(self.x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` and ``db`` and return the gradient w.r.t. the input.

        ``dW = x.T . dout`` and ``db`` is ``dout`` summed over axis 0.
        """
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)

        # The gradient handed upstream must have the same shape as the
        # input originally passed to forward(), so undo the flattening.
        flat_grad = np.dot(dout, self.W.T)
        return flat_grad.reshape(*self.original_x_shape)
    

In [None]:
class SoftmaxWithLoss:
    """Softmax activation combined with a cross-entropy loss.

    NOTE(review): ``softmax`` and ``cross_entropy_error`` are not defined
    in this section; they must be provided elsewhere (defined or imported)
    before ``forward`` is called.
    """

    def __init__(self):
        self.loss = None  # loss value from the last forward() call
        self.y = None     # softmax output from the last forward() call
        self.t = None     # target labels passed to the last forward() call

    def forward(self, x, t):
        """Store ``t`` and ``softmax(x)``, then compute and return the loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss w.r.t. the input of ``forward``.

        Args:
            dout: Upstream gradient of the loss (1 when this is the final
                layer). The result is scaled by it.

        Returns:
            ``dout * (y - t) / batch_size`` with the shape of ``self.y``.
            ``self.t`` may be one-hot rows (same size as ``self.y``) or an
            integer array of class indices, one per row of ``self.y``.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            # One-hot targets: subtract element-wise.
            dx = (self.y - self.t) / batch_size
        else:
            # Class-index targets: subtracting the one-hot encoding means
            # subtracting 1 at each row's target column. Work on a copy so
            # the cached softmax output is not modified.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        # Honour the upstream gradient instead of assuming it is always 1.
        return dx * dout