In [1]:
# Run this in Jupyter Notebook to configure how cell results are displayed.
# With ast_node_interactivity = "all", every bare expression statement in a
# cell is echoed, not only the last one (later cells rely on this to show
# two arrays from a single cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np

## 前向传播
$y_1 = W_1 \times x + b_1$ \
$y_2 = W_2 \times y_1 + b_2$ \
$y_3 = W_3 \times y_2 + b_3$

相当于3层带偏置项的全连接层
```python
import torch
from torch import nn

model = nn.Sequential(
    nn.Linear(3, 4, bias=True),
    nn.Linear(4, 5, bias=True),
    nn.Linear(5, 1, bias=True)
)
```

$L_{loss} = |y_3 - y_{true}|$

### 初始化输入与权重参数

In [2]:
# Regression target (ground truth), stored as a 1x1 matrix.
y_true = np.array([[50]])
# Input sample, stored as a 3x1 column vector.
x = np.array([[1], [2], [3]])

# Weight matrices, all ones: W1 maps 3 -> 4, W2 maps 4 -> 5, W3 maps 5 -> 1.
W1 = np.ones(shape=(4, 3))
W2 = np.ones(shape=(5, 4))
W3 = np.ones(shape=(1, 5))

# Bias column vectors, all zeros, one per layer.
b1 = np.zeros(shape=(4, 1))
b2 = np.zeros(shape=(5, 1))
b3 = np.zeros(shape=(1, 1))

print(f"x.shape: {x.shape}")
print(f"W1.shape: {W1.shape}")
print(f"W2.shape: {W2.shape}")
print(f"W3.shape: {W3.shape}")

print(f"b1.shape: {b1.shape}")
print(f"b2.shape: {b2.shape}")
print(f"b3.shape: {b3.shape}")

x.shape: (3, 1)
W1.shape: (4, 3)
W2.shape: (5, 4)
W3.shape: (1, 5)
b1.shape: (4, 1)
b2.shape: (5, 1)
b3.shape: (1, 1)


### 第1层
$y_1 = W_1 \times x + b_1$ \
其中\
x为[3, 1]的矩阵\
$W_1$为[4, 3]的矩阵\
$b_1$为[4, 1]的矩阵\
$y_1$为[4, 1]的矩阵

In [3]:
# Layer 1 forward pass: affine map of the input, y1 = W1 x + b1.
y1 = np.matmul(W1, x) + b1

print(f"x.shape: {x.shape}")
print(f"W1.shape: {W1.shape}")
print(f"b1.shape: {b1.shape}")
print(f"y1.shape: {y1.shape}")

y1

x.shape: (3, 1)
W1.shape: (4, 3)
b1.shape: (4, 1)
y1.shape: (4, 1)


array([[6.],
       [6.],
       [6.],
       [6.]])

### 第2层
$y_2 = W_2 \times y_1 + b_2$ \
其中\
$y_1$为[4, 1]的矩阵\
$b_2$为[5, 1]的矩阵\
$W_2$为[5, 4]的矩阵\
$y_2$为[5, 1]的矩阵


In [4]:
# Layer 2 forward pass: affine map of the layer-1 output, y2 = W2 y1 + b2.
y2 = np.matmul(W2, y1) + b2

print(f"y1.shape: {y1.shape}")
print(f"W2.shape: {W2.shape}")
print(f"b2.shape: {b2.shape}")
print(f"y2.shape: {y2.shape}")

y2

y1.shape: (4, 1)
W2.shape: (5, 4)
b2.shape: (5, 1)
y2.shape: (5, 1)


array([[24.],
       [24.],
       [24.],
       [24.],
       [24.]])

### 第3层
$y_3 = W_3 \times y_2 + b_3$ \
其中\
$y_2$为[5, 1]的矩阵\
$W_3$为[1, 5]的矩阵\
$b_3$为[1, 1]的矩阵\
$y_3$为[1, 1]的矩阵


In [5]:
# Layer 3 forward pass: affine map of the layer-2 output, y3 = W3 y2 + b3.
y3 = np.matmul(W3, y2) + b3

print(f"y2.shape: {y2.shape}")
print(f"W3.shape: {W3.shape}")
print(f"b3.shape: {b3.shape}")
print(f"y3.shape: {y3.shape}")

y3

y2.shape: (5, 1)
W3.shape: (1, 5)
b3.shape: (1, 1)
y3.shape: (1, 1)


array([[120.]])

### 定义损失函数
$L_{loss} = |y_3 - y_{true}|$

In [6]:
# L1 loss: absolute difference between the network output and the target.
residual = y3 - y_true
L = np.absolute(residual)

print(f"y3.shape: {y3.shape}")
print(f"y_true.shape: {y_true.shape}")
print(f"L.shape: {L.shape}")

print(f"现在的Loss: {L.item()}")

y3.shape: (1, 1)
y_true.shape: (1, 1)
L.shape: (1, 1)
现在的Loss: 70.0


## 反向传播
$L_{loss}$对损失函数的输入(即第3层的输出)$y_3$的梯度为：\
当$y_3 > y_{true}$ 即 $y_3 - y_{true} > 0$时:
$\frac{\partial L_{loss}}{\partial y_3} = 1$ \
当$y_3 < y_{true}$ 即 $y_3 - y_{true} < 0$时:
$\frac{\partial L_{loss}}{\partial y_3} = -1$ \
当$y_3 = y_{true}$时绝对值函数不可导，可取 $[-1, 1]$ 内的任意次梯度(例如PyTorch取0)


In [7]:
# Subgradient of L = |y3 - y_true| with respect to y3: the sign of the residual.
# np.sign gives +1 / -1 on either side of the target and 0 when the prediction
# matches it exactly, so a perfect fit produces no update instead of a
# spurious -1 step away from the target.
# (y3 - y_true) is a single-element array; .item() extracts its scalar and
# int(...) keeps dL_dy3 a plain Python int.
dL_dy3 = int(np.sign((y3 - y_true).item()))

dL_dy3

1

$L_{loss}$对第3层的权重参数$W_3$的梯度为：
$\frac{\partial L_{loss}}{\partial W_3} = \frac{\partial L_{loss}}{\partial y_3} \frac{\partial y_3}{\partial W_3} = \pm1 \times y_2^T$ \
$L_{loss}$对第3层的偏置参数$b_3$的梯度为：
$\frac{\partial L_{loss}}{\partial b_3} = \frac{\partial L_{loss}}{\partial y_3} \frac{\partial y_3}{\partial b_3} = \pm1 \times 1$

In [8]:
# Backprop through layer 3 (y3 = W3 @ y2 + b3): chain the upstream gradient
# dL_dy3 with the local derivatives, y2^T for the weights and 1 for the bias.
dL_dW3, dL_db3 = dL_dy3 * y2.T, dL_dy3 * 1

print(f"y2.T.shape: {y2.T.shape}")
print(f"dL_dW3.shape: {dL_dW3.shape}")

dL_dW3
dL_db3

y2.T.shape: (1, 5)
dL_dW3.shape: (1, 5)


array([[24., 24., 24., 24., 24.]])

1

$L_{loss}$对第2层的权重参数$W_2$的梯度为：
$\frac{\partial L_{loss}}{\partial W_2} = \frac{\partial L_{loss}}{\partial y_3} \frac{\partial y_3}{\partial y_2} \frac{\partial y_2}{\partial W_2} = (\pm1 \times W_3)^T y_1^T$ \
$L_{loss}$对第2层的偏置参数$b_2$的梯度为：
$\frac{\partial L_{loss}}{\partial b_2} = \frac{\partial L_{loss}}{\partial y_3} \frac{\partial y_3}{\partial y_2} \frac{\partial y_2}{\partial b_2} = (\pm1 \times W_3)^T \times 1$

In [9]:
# Backprop into layer 2. The gradient reaching y2 is W3^T scaled by dL_dy3.
# Multiplying it by y1^T gives the weight gradient; it is also the bias
# gradient, because dy2/db2 = 1.
dL_dy2 = (dL_dy3 * W3).T
dL_dW2 = dL_dy2 @ y1.T
dL_db2 = dL_dy2 * 1  # "* 1" yields a fresh array rather than an alias of dL_dy2

print(f"y1.T.shape: {y1.T.shape}")
print(f"(dL_dy3 * W3).T.shape: {(dL_dy3 * W3).T.shape}")
print(f"dL_dW2.shape: {dL_dW2.shape}")
print(f"dL_db2.shape: {dL_db2.shape}")

dL_dW2
dL_db2

y1.T.shape: (1, 4)
(dL_dy3 * W3).T.shape: (5, 1)
dL_dW2.shape: (5, 4)
dL_db2.shape: (5, 1)


array([[6., 6., 6., 6.],
       [6., 6., 6., 6.],
       [6., 6., 6., 6.],
       [6., 6., 6., 6.],
       [6., 6., 6., 6.]])

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.]])

$L_{loss}$对第1层的权重参数$W_1$的梯度为：
$\frac{\partial L_{loss}}{\partial W_1} = \frac{\partial L_{loss}}{\partial y_3} \frac{\partial y_3}{\partial y_2} \frac{\partial y_2}{\partial y_1} \frac{\partial y_1}{\partial W_1} = (\pm1 \times W_3 \times W_2)^T x^T$ \
$L_{loss}$对第1层的偏置参数$b_1$的梯度为：
$\frac{\partial L_{loss}}{\partial b_1} = \frac{\partial L_{loss}}{\partial y_3} \frac{\partial y_3}{\partial y_2} \frac{\partial y_2}{\partial y_1} \frac{\partial y_1}{\partial b_1} = (\pm1 \times W_3 \times W_2)^T \times 1$

In [10]:
# Backprop into layer 1. The gradient reaching y1 is (dL_dy3 * W3 @ W2)^T.
# Multiplying it by x^T gives the weight gradient; it is also the bias
# gradient, because dy1/db1 = 1.
dL_dy1 = (dL_dy3 * W3 @ W2).T
dL_dW1 = dL_dy1 @ x.T
dL_db1 = dL_dy1 * 1  # "* 1" yields a fresh array rather than an alias of dL_dy1

print(f"x.T.shape: {x.T.shape}")
print(f"(dL_dy3 * W3 @ W2).T.shape: {(dL_dy3 * W3 @ W2).T.shape}")
print(f"dL_dW1.shape: {dL_dW1.shape}")
print(f"dL_db1.shape: {dL_db1.shape}")

dL_dW1
dL_db1

x.T.shape: (1, 3)
(dL_dy3 * W3 @ W2).T.shape: (4, 1)
dL_dW1.shape: (4, 3)
dL_db1.shape: (4, 1)


array([[ 5., 10., 15.],
       [ 5., 10., 15.],
       [ 5., 10., 15.],
       [ 5., 10., 15.]])

array([[5.],
       [5.],
       [5.],
       [5.]])

## 梯度下降
$\alpha$ 即学习率 Learning Rate

In [11]:
lr = 0.01 # learning rate: step size applied to each gradient in the updates

### 更新第3层的参数
$W_3 = W_3 - \alpha \times \frac{\partial L_{loss}}{\partial W_3}$\
$b_3 = b_3 - \alpha \times \frac{\partial L_{loss}}{\partial b_3}$

In [12]:
# One gradient-descent step on layer 3: move each parameter against its
# gradient, scaled by the learning rate.
W3, b3 = W3 - lr * dL_dW3, b3 - lr * dL_db3

print(f"dL_dW3.shape: {dL_dW3.shape}")
print(f"b3.shape: {b3.shape}")
print(f"W3.shape: {W3.shape}")

W3
b3

dL_dW3.shape: (1, 5)
b3.shape: (1, 1)
W3.shape: (1, 5)


array([[0.76, 0.76, 0.76, 0.76, 0.76]])

array([[-0.01]])

### 更新第2层的参数
$W_2 = W_2 - \alpha \times \frac{\partial L_{loss}}{\partial W_2}$ \
$b_2 = b_2 - \alpha \times \frac{\partial L_{loss}}{\partial b_2}$

In [13]:
# One gradient-descent step on layer 2: move each parameter against its
# gradient, scaled by the learning rate.
W2, b2 = W2 - lr * dL_dW2, b2 - lr * dL_db2

print(f"dL_dW2.shape: {dL_dW2.shape}")
print(f"W2.shape: {W2.shape}")
print(f"b2.shape: {b2.shape}")

W2
b2

dL_dW2.shape: (5, 4)
W2.shape: (5, 4)
b2.shape: (5, 1)


array([[0.94, 0.94, 0.94, 0.94],
       [0.94, 0.94, 0.94, 0.94],
       [0.94, 0.94, 0.94, 0.94],
       [0.94, 0.94, 0.94, 0.94],
       [0.94, 0.94, 0.94, 0.94]])

array([[-0.01],
       [-0.01],
       [-0.01],
       [-0.01],
       [-0.01]])

### 更新第1层的参数
$W_1 = W_1 - \alpha \times \frac{\partial L_{loss}}{\partial W_1}$ \
$b_1 = b_1 - \alpha \times \frac{\partial L_{loss}}{\partial b_1}$

In [14]:
# One gradient-descent step on layer 1: move each parameter against its
# gradient, scaled by the learning rate.
W1, b1 = W1 - lr * dL_dW1, b1 - lr * dL_db1

print(f"dL_dW1.shape: {dL_dW1.shape}")
print(f"W1.shape: {W1.shape}")
print(f"b1.shape: {b1.shape}")

W1
b1

dL_dW1.shape: (4, 3)
W1.shape: (4, 3)
b1.shape: (4, 1)


array([[0.95, 0.9 , 0.85],
       [0.95, 0.9 , 0.85],
       [0.95, 0.9 , 0.85],
       [0.95, 0.9 , 0.85]])

array([[-0.05],
       [-0.05],
       [-0.05],
       [-0.05]])

## 整体更新过后新的Loss

In [15]:
# Forward pass with the updated parameters, one layer at a time.
y1_new = W1 @ x + b1
y2_new = W2 @ y1_new + b2
y_new = W3 @ y2_new + b3
y_new

array([[74.964]])

In [16]:
# L1 loss of the updated network, printed next to the loss before the step.
L_new = np.absolute(y_new - y_true)
print(f"梯度下降前的Loss: {L.item()}")
print(f"梯度下降后的Loss: {L_new.item()}")

梯度下降前的Loss: 70.0
梯度下降后的Loss: 24.963999999999984


## 多次更新展示

In [17]:
# Reset the target, the input and every parameter to their starting values
# so the multi-epoch run begins from the same state as the single step did.
y_true = np.array([[50]])        # regression target, 1x1
x = np.array([[1], [2], [3]])    # input sample, 3x1 column vector

# Weights start as all ones, biases as all zeros.
W1, b1 = np.ones(shape=(4, 3)), np.zeros(shape=(4, 1))
W2, b2 = np.ones(shape=(5, 4)), np.zeros(shape=(5, 1))
W3, b3 = np.ones(shape=(1, 5)), np.zeros(shape=(1, 1))

print(f"x.shape: {x.shape}")
print(f"W1.shape: {W1.shape}")
print(f"W2.shape: {W2.shape}")
print(f"W3.shape: {W3.shape}")
print(f"b1.shape: {b1.shape}")
print(f"b2.shape: {b2.shape}")
print(f"b3.shape: {b3.shape}")

x.shape: (3, 1)
W1.shape: (4, 3)
W2.shape: (5, 4)
W3.shape: (1, 5)
b1.shape: (4, 1)
b2.shape: (5, 1)
b3.shape: (1, 1)


In [18]:
epochs = 10
lr = 0.002  # learning rate
log_every = 1  # print the loss every `log_every` epochs
L_best = np.inf
y_best = None
epoch_best = None

for epoch in range(epochs):
    # Forward pass through the three affine layers.
    y1 = W1 @ x + b1
    y2 = W2 @ y1 + b2
    y3 = W3 @ y2 + b3

    # L1 loss, measured before this epoch's parameter update is applied.
    L = np.abs(y3 - y_true)
    if L < L_best:  # remember the lowest loss seen so far
        L_best = L
        y_best = y3
        epoch_best = epoch + 1

    # Subgradient of |y3 - y_true| w.r.t. y3: the sign of the residual.
    # It is 0 when the prediction matches the target exactly, so a perfect
    # fit is left untouched instead of being pushed away by a -1 step.
    dL_dy3 = int(np.sign((y3 - y_true).item()))

    # Backward pass. Every gradient is derived from the current parameters
    # before any of them is updated. The gradient reaching each layer's
    # output is also that layer's bias gradient, because dy/db = 1.
    dL_dy2 = (dL_dy3 * W3).T
    dL_dy1 = (dL_dy3 * W3 @ W2).T

    dL_dW3 = dL_dy3 * y2.T
    dL_db3 = dL_dy3
    dL_dW2 = dL_dy2 @ y1.T
    dL_db2 = dL_dy2
    dL_dW1 = dL_dy1 @ x.T
    dL_db1 = dL_dy1

    # Gradient-descent step on all parameters.
    W3 = W3 - lr * dL_dW3
    b3 = b3 - lr * dL_db3
    W2 = W2 - lr * dL_dW2
    b2 = b2 - lr * dL_db2
    W1 = W1 - lr * dL_dW1
    b1 = b1 - lr * dL_db1

    if epoch % log_every == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {L.item():.4f}")

print("训练完成！")
print("#" * 50)
print(f"标签值: {y_true.item():.4f}")
print(f"Loss最低时的Epoch数为: {epoch_best}")
print(f"Loss最低时的模型输出值为: {y_best.item():.4f}")
print(f"对应的Loss值为: {L_best.item():.4f}")

Epoch 1/10, Loss: 70.0000
Epoch 2/10, Loss: 60.0359
Epoch 3/10, Loss: 51.0041
Epoch 4/10, Loss: 42.7937
Epoch 5/10, Loss: 35.3088
Epoch 6/10, Loss: 28.4664
Epoch 7/10, Loss: 22.1944
Epoch 8/10, Loss: 16.4297
Epoch 9/10, Loss: 11.1172
Epoch 10/10, Loss: 6.2082
训练完成！
##################################################
标签值: 50.0000
Loss最低时的Epoch数为: 10
Loss最低时的模型输出值为: 56.2082
对应的Loss值为: 6.2082
