In [1]:
import numpy as np
import matplotlib.pyplot as plt
import idx2numpy
from tqdm import tqdm

import torch
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"device: {device}")

device: cuda


In [3]:
# 残差块
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        
        # 如果输入输出维度不匹配，使用1x1卷积调整
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
    
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out += self.shortcut(residual)
        out = self.relu(out)
        return out

In [4]:
class MNIST(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),

            ResidualBlock(32, 32),
            nn.MaxPool2d(kernel_size=2),

            ResidualBlock(32, 64),
            nn.MaxPool2d(kernel_size=2),

            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 10),
        )

        for m in self.net.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.net(x)

In [5]:
def one_hot(labels, num_classes):
    # one-hot 编码
    one_hot_labels = np.zeros((labels.shape[0], num_classes))
    for i in range(labels.shape[0]):
        one_hot_labels[i, labels[i]] = 1
    return one_hot_labels

In [6]:
datapath = "../data/MNIST/train-images.idx3-ubyte"
labelpath = "../data/MNIST/train-labels.idx1-ubyte"
modelpath = "../model/task3/MNIST.pth"

data = idx2numpy.convert_from_file(datapath)    # (60000, 28, 28)
data = np.expand_dims(data, axis=1)             # 添加通道维度 -> (60000, 1, 28, 28)
data = torch.from_numpy(data).float()

label = idx2numpy.convert_from_file(labelpath)
one_hot_labels = one_hot(label, 10)
one_hot_labels = torch.from_numpy(one_hot_labels).float()


  data = torch.from_numpy(data).float()


In [7]:
valid_datapath = "../valid/MNIST/t10k-images.idx3-ubyte"
valid_labelpath = "../valid/MNIST/t10k-labels.idx1-ubyte"

valid_data = idx2numpy.convert_from_file(valid_datapath)
valid_data = np.expand_dims(valid_data, axis=1)
valid_data = torch.from_numpy(valid_data).float()

valid_label = idx2numpy.convert_from_file(valid_labelpath)
valid_one_hot_labels = one_hot(valid_label, 10)
valid_one_hot_labels = torch.from_numpy(valid_one_hot_labels).float()

In [8]:
train_dataset = TensorDataset(data, one_hot_labels)
valid_dataset = TensorDataset(valid_data, valid_one_hot_labels)

In [9]:
epochs = 50            # 训练轮数

batch_size = 128        # 批大小
inital_lr = 0.001       # 初始学习率
lr_patience = 10        # 学习率衰减的耐心
lr_decay = 0.5          # 学习率衰减系数

best_accuracy = 0.0     # 最佳准确率
best_loss = float("inf")  # 最佳损失

In [10]:
train_loader = DataLoader(train_dataset, batch_size, True)
valid_loader = DataLoader(valid_dataset, batch_size, False)

model = MNIST().to(device)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=inital_lr, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=lr_decay, patience=lr_patience)

In [33]:
pbar = tqdm(range(epochs), desc="Training")
for i in pbar:
    model.train()
    running_loss = 0.0
    valid_loss = 0.0
    accuracy = 0.0
    for x, y in train_loader:
        # 加载进GPU
        x = x.to(device)
        y = y.to(device)
        # 清空梯度
        optimizer.zero_grad()
        # 前向传播
        output = model(x)
        # 计算损失
        loss = loss_func(output, y)
        running_loss += loss.item()
        # 反向传播
        loss.backward()
        # 更新参数
        optimizer.step()

    # 计算验证集损失和准确率
    model.eval()
    with torch.no_grad():
        for x, y in valid_loader:
            x = x.to(device)
            y = y.to(device)
            # 预测
            pred = model(x)
            # 计算损失
            loss = loss_func(pred, y)
            valid_loss += loss.item()
            # 计算准确率
            accuracy += torch.sum(torch.argmax(pred, dim=1) == torch.argmax(y, dim=1)).item()

    running_loss /= len(train_loader)
    valid_loss /= len(valid_loader)
    accuracy /= len(valid_loader.dataset)
    scheduler.step(valid_loss)  # 更新学习率

    # if accuracy > best_accuracy:
    #     best_accuracy = accuracy
    #     torch.save(model.state_dict(), modelpath)

    if valid_loss < best_loss:
        best_loss = valid_loss
        best_accuracy = accuracy
        torch.save(model.state_dict(), modelpath)

    pbar.set_postfix(
        loss=running_loss,
        valid_loss=valid_loss,
        best_loss=best_loss,
        accuracy=f"{accuracy*100:.2f}%",
        best_accuracy=f"{best_accuracy*100:.2f}%",
        lr=optimizer.param_groups[0]['lr'],
    )

Training: 100%|██████████| 50/50 [01:39<00:00,  2.00s/it, accuracy=99.60%, best_accuracy=99.54%, best_loss=0.0183, loss=0.00142, lr=0.00025, valid_loss=0.0307]


In [34]:
print("architecture:", model)
print("param", sum(p.numel() for p in model.parameters()))
print("savepath:", modelpath)
print("best_accuracy:", best_accuracy)

architecture: MNIST(
  (net): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): ResidualBlock(
      (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (shortcut): Sequential()
    )
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): ResidualBlock(
      (conv1): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (

In [11]:
# For interview
interview_data_path = "../test/t10k-images.idx3-ubyte"
interview_label_path = "../test/t10k-labels.idx1-ubyte"

interview_data = idx2numpy.convert_from_file(interview_data_path)
interview_data = np.expand_dims(interview_data, axis=1)
interview_data = torch.from_numpy(interview_data).float().to(device)

interview_label = idx2numpy.convert_from_file(interview_label_path)
interview_labels = one_hot(interview_label, 10)
interview_labels = torch.from_numpy(interview_labels).float().to(device)

In [12]:
# For interview
model.load_state_dict(torch.load(modelpath))

model.eval()
with torch.no_grad():
    pred = model(interview_data)
    pred_labels = torch.argmax(pred, dim=1)
    true_labels = torch.argmax(interview_labels, dim=1)
    accuracy = torch.sum(pred_labels == true_labels).item() / len(interview_data)
    print("Interview Accuracy: {:.2f}%".format(accuracy * 100))

Interview Accuracy: 99.54%
