In [1]:
# Lab 10 MNIST and softmax
import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [3]:
# parameters
learning_rate = 0.5
batch_size = 10

In [4]:
# MNIST dataset
mnist_train = dsets.MNIST(root='MNIST_data/',
                          train=True,
                          transform=transforms.ToTensor(),
                          download=True)

mnist_test = dsets.MNIST(root='MNIST_data/',
                         train=False,
                         transform=transforms.ToTensor(),
                         download=True)

In [5]:
# dataset loader
data_loader = torch.utils.data.DataLoader(dataset=mnist_train,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          drop_last=True)

In [6]:
w1 = torch.nn.Parameter(torch.Tensor(784, 30)).to(device)
b1 = torch.nn.Parameter(torch.Tensor(30)).to(device)
w2 = torch.nn.Parameter(torch.Tensor(30, 10)).to(device)
b2 = torch.nn.Parameter(torch.Tensor(10)).to(device)

In [7]:
torch.nn.init.normal_(w1)
torch.nn.init.normal_(b1)
torch.nn.init.normal_(w2)
torch.nn.init.normal_(b2)

Parameter containing:
tensor([ 0.3078, -1.9857,  1.0512,  1.5122, -1.0199, -0.7402, -1.3111,  0.6142,
        -0.6474,  0.1758], requires_grad=True)

In [8]:
def sigmoid(x):
    #  sigmoid function
    return 1.0 / (1.0 + torch.exp(-x))
    # return torch.div(torch.tensor(1), torch.add(torch.tensor(1.0), torch.exp(-x)))

In [9]:
def sigmoid_prime(x):
    # derivative of the sigmoid function
    return sigmoid(x) * (1 - sigmoid(x))

In [10]:
X_test = mnist_test.test_data.view(-1, 28 * 28).float().to(device)[:1000]
Y_test = mnist_test.test_labels.to(device)[:1000]
i = 0
while not i == 10000:
    for X, Y in data_loader:
        i += 1

        # forward
        X = X.view(-1, 28 * 28).to(device)
        Y = torch.zeros((batch_size, 10)).scatter_(1, Y.unsqueeze(1), 1).to(device)    # one-hot
        l1 = torch.add(torch.matmul(X, w1), b1)
        a1 = sigmoid(l1)
        l2 = torch.add(torch.matmul(a1, w2), b2)
        y_pred = sigmoid(l2)

        diff = y_pred - Y

        # Back prop (chain rule)
        d_l2 = diff * sigmoid_prime(l2)
        d_b2 = d_l2
        d_w2 = torch.matmul(torch.transpose(a1, 0, 1), d_l2)

        d_a1 = torch.matmul(d_l2, torch.transpose(w2, 0, 1))
        d_l1 = d_a1 * sigmoid_prime(l1)
        d_b1 = d_l1
        d_w1 = torch.matmul(torch.transpose(X, 0, 1), d_l1)

        w1 = w1 - learning_rate * d_w1
        b1 = b1 - learning_rate * torch.mean(d_b1, 0)
        w2 = w2 - learning_rate * d_w2
        b2 = b2 - learning_rate * torch.mean(d_b2, 0)

        if i % 1000 == 0:
            l1 = torch.add(torch.matmul(X_test, w1), b1)
            a1 = sigmoid(l1)
            l2 = torch.add(torch.matmul(a1, w2), b2)
            y_pred = sigmoid(l2)
            acct_mat = torch.argmax(y_pred, 1) == Y_test
            acct_res = acct_mat.sum()
            print(acct_res.item())

        if i == 10000:
            break

736
862
860
881
874
890
904
923
916
920


# 神经网络前向传播与反向传播公式

## 1. 前向传播

### 1.1 隐藏层的计算

输入层到隐藏层的计算：

$$
\mathbf{l}_1 = \mathbf{X} \cdot \mathbf{W}_1 + \mathbf{b}_1
$$

其中，$\mathbf{X}$ 是输入数据，$\mathbf{W}_1$ 是第一个权重矩阵，$\mathbf{b}_1$ 是第一个偏置。

激活函数：使用 **Sigmoid** 激活函数：

$$
\mathbf{a}_1 = \sigma(\mathbf{l}_1) = \frac{1}{1 + e^{-\mathbf{l}_1}}
$$

### 1.2 输出层的计算

隐藏层到输出层的计算：

$$
\mathbf{l}_2 = \mathbf{a}_1 \cdot \mathbf{W}_2 + \mathbf{b}_2
$$

其中，$\mathbf{W}_2$ 是第二个权重矩阵，$\mathbf{b}_2$ 是第二个偏置。

输出的预测结果：

$$
\hat{\mathbf{Y}} = \sigma(\mathbf{l}_2) = \frac{1}{1 + e^{-\mathbf{l}_2}}
$$

## 2. 损失函数（均方误差 MSE）

损失函数使用 **均方误差**（MSE）来衡量预测值与真实值之间的差异：

$$
\mathcal{L} = \frac{1}{2} \| \hat{\mathbf{Y}} - \mathbf{Y} \|^2
$$

其中，$\hat{\mathbf{Y}}$ 是模型的预测输出，$\mathbf{Y}$ 是真实标签。

## 3. 反向传播（Backpropagation）

### 3.1 输出层的梯度

首先，计算输出层的误差：

$$
\frac{\partial \mathcal{L}}{\partial \hat{\mathbf{Y}}} = \hat{\mathbf{Y}} - \mathbf{Y}
$$

然后，将误差传递给前一层，通过 **Sigmoid** 的导数：

$$
\frac{\partial \mathcal{L}}{\partial \mathbf{l}_2} = (\hat{\mathbf{Y}} - \mathbf{Y}) \cdot \sigma'(\mathbf{l}_2)
$$

其中，Sigmoid 的导数为：

$$
\sigma'(z) = \sigma(z)(1 - \sigma(z))
$$

### 3.2 输出层权重与偏置的更新

计算权重梯度：

$$
\frac{\partial \mathcal{L}}{\partial \mathbf{W}_2} = \mathbf{a}_1^T \cdot \frac{\partial \mathcal{L}}{\partial \mathbf{l}_2}
$$

计算偏置梯度：

$$
\frac{\partial \mathcal{L}}{\partial \mathbf{b}_2} = \text{mean}\left( \frac{\partial \mathcal{L}}{\partial \mathbf{l}_2}, \text{axis=0} \right)
$$

### 3.3 传播到隐藏层

将误差反向传播到隐藏层：

$$
\frac{\partial \mathcal{L}}{\partial \mathbf{a}_1} = \frac{\partial \mathcal{L}}{\partial \mathbf{l}_2} \cdot \mathbf{W}_2^T
$$

然后，计算隐藏层的误差：

$$
\frac{\partial \mathcal{L}}{\partial \mathbf{l}_1} = \frac{\partial \mathcal{L}}{\partial \mathbf{a}_1} \cdot \sigma'(\mathbf{l}_1)
$$

### 3.4 隐藏层权重与偏置的更新

计算权重梯度：

$$
\frac{\partial \mathcal{L}}{\partial \mathbf{W}_1} = \mathbf{X}^T \cdot \frac{\partial \mathcal{L}}{\partial \mathbf{l}_1}
$$

计算偏置梯度：

$$
\frac{\partial \mathcal{L}}{\partial \mathbf{b}_1} = \text{mean}\left( \frac{\partial \mathcal{L}}{\partial \mathbf{l}_1}, \text{axis=0} \right)
$$

## 4. 参数更新（梯度下降）

使用 **梯度下降** 更新权重和偏置：

$$
\mathbf{W}_1 = \mathbf{W}_1 - \eta \cdot \frac{\partial \mathcal{L}}{\partial \mathbf{W}_1}
$$

$$
\mathbf{b}_1 = \mathbf{b}_1 - \eta \cdot \text{mean}\left( \frac{\partial \mathcal{L}}{\partial \mathbf{b}_1}, \text{axis=0} \right)
$$

$$
\mathbf{W}_2 = \mathbf{W}_2 - \eta \cdot \frac{\partial \mathcal{L}}{\partial \mathbf{W}_2}
$$

$$
\mathbf{b}_2 = \mathbf{b}_2 - \eta \cdot \text{mean}\left( \frac{\partial \mathcal{L}}{\partial \mathbf{b}_2}, \text{axis=0} \right)
$$

其中，$\eta$ 是学习率。

---
