# Assignment 1

This code baseline is inspired by and modified from [this great tutorial](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).

This code can achieve an accuracy of approximately 86.50% on CIFAR-10. Please set up the environment and run your experiments starting from this baseline. You are expected to achieve an accuracy higher than this baseline.

**Task1**——5 points




In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121


In [None]:
import torch

# Report the installed PyTorch build and whether a CUDA device is usable.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {cuda_ok}")
if cuda_ok:
    # Index 0 is the first visible GPU.
    print(f"Device Name: {torch.cuda.get_device_name(0)}")

PyTorch Version: 2.8.0+cu126
CUDA Available: True
Device Name: Tesla T4


In [None]:
# import some necessary packages
import torch
import torch.nn as nn
import torch.optim as optim

import torchvision.datasets as tv_datasets
import torchvision.transforms as tv_transforms

In [None]:
# Experimental setup: compute device, training schedule and optimizer settings.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

num_epochs, batch_size = 128, 64
num_workers, print_every = 2, 200

# The optimizer class is looked up by name on torch.optim later on.
optim_name = "Adam"
optim_kwargs = {"lr": 3e-4, "weight_decay": 1e-6}

# Preprocessing pipeline for input images, one Compose per split.
transformation = {}
for data_type in ("train", "test"):
    is_train = data_type == "train"

    pipeline_steps = []
    if is_train:
        # Random augmentation is applied to the training split only.
        pipeline_steps.extend([
            tv_transforms.RandomRotation(degrees=15),
            tv_transforms.RandomHorizontalFlip(),
            tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        ])
    # Both splits are converted to tensors and normalized per channel
    # with mean 0.5 and std 0.5.
    pipeline_steps.extend([
        tv_transforms.ToTensor(),
        tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])

    transformation[data_type] = tv_transforms.Compose(pipeline_steps)

In [None]:
# Build the CIFAR-10 train/test datasets and their batch loaders.
dataset = {}
loader = {}
for data_type, is_train in (("train", True), ("test", False)):
    # Each split uses its own preprocessing pipeline; data is downloaded
    # into ./data when it is not already there.
    split_data = tv_datasets.CIFAR10(
        root="./data",
        train=is_train,
        download=True,
        transform=transformation[data_type],
    )
    dataset[data_type] = split_data

    # Only the training split is shuffled.
    loader[data_type] = torch.utils.data.DataLoader(
        split_data,
        batch_size=batch_size,
        shuffle=is_train,
        num_workers=num_workers,
    )


100%|██████████| 170M/170M [00:14<00:00, 12.0MB/s]


In [None]:
# Our network architecture: a convolutional feature extractor followed by
# a fully connected classifier, assembled into one flat nn.Sequential.
feature_layers = [
    nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
]
classifier_layers = [
    nn.Flatten(),
    # Three 2x2 poolings shrink a 32x32 input to 4x4 with 256 channels.
    nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(128, 10),
]
net = nn.Sequential(*feature_layers, *classifier_layers)

# Move the model to the selected device.
net.to(device)

# Report the number of trainable parameters, in millions.
trainable_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
print(f"number of parameters: {trainable_params / 1_000_000:.2f}M")

number of parameters: 7.28M


## Start Training

In [None]:
# # the network optimizer
# optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)

# # loss function
# criterion = nn.CrossEntropyLoss()

# # training loop
# net.train()
# for epoch in range(num_epochs):

#     running_loss = 0.0
#     for i, (img, target) in enumerate(loader["train"]):
#         img, target = img.to(device), target.to(device)

#         pred = net(img)
#         loss = criterion(pred, target)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         # print statistics
#         running_loss += loss.item()
#         if i % print_every == print_every - 1:
#             print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
#             running_loss = 0.0

# print("Finished Training")

[epoch=  1, iter=  200] loss: 2.197
[epoch=  1, iter=  400] loss: 1.977
[epoch=  1, iter=  600] loss: 1.856
[epoch=  2, iter=  200] loss: 1.698
[epoch=  2, iter=  400] loss: 1.598
[epoch=  2, iter=  600] loss: 1.543
[epoch=  3, iter=  200] loss: 1.457
[epoch=  3, iter=  400] loss: 1.402
[epoch=  3, iter=  600] loss: 1.379
[epoch=  4, iter=  200] loss: 1.302
[epoch=  4, iter=  400] loss: 1.286
[epoch=  4, iter=  600] loss: 1.220
[epoch=  5, iter=  200] loss: 1.179
[epoch=  5, iter=  400] loss: 1.175
[epoch=  5, iter=  600] loss: 1.160
[epoch=  6, iter=  200] loss: 1.088
[epoch=  6, iter=  400] loss: 1.088
[epoch=  6, iter=  600] loss: 1.071
[epoch=  7, iter=  200] loss: 1.047
[epoch=  7, iter=  400] loss: 1.017
[epoch=  7, iter=  600] loss: 0.990
[epoch=  8, iter=  200] loss: 0.960
[epoch=  8, iter=  400] loss: 0.965
[epoch=  8, iter=  600] loss: 0.948
[epoch=  9, iter=  200] loss: 0.911
[epoch=  9, iter=  400] loss: 0.903
[epoch=  9, iter=  600] loss: 0.902
[epoch= 10, iter=  200] loss

## Evaluating its accuracy

In [None]:
# net.eval()
# correct, total = 0, 0
# with torch.no_grad():
#     for img, target in loader["test"]:
#         img, target = img.to(device), target.to(device)

#         # make prediction
#         pred = net(img)

#         # accumulate
#         total += len(target)
#         correct += (torch.argmax(pred, dim=1) == target).sum().item()

# print(f"Accuracy of the network on the {total} test images: {100 * correct / total:.2f}%")

Accuracy of the network on the 10000 test images: 86.72%


**Task2**——25 points

Due to limited computational resources, all experiments in this task are trained for 40 epochs instead of the baseline's 128. We found that performance was already quite good at 40 epochs, so the shorter schedule is used directly for comparing the experiments.


In [None]:
# # 实验 0: 基线 (40 Epochs)
# # 改动: num_epochs 从 128 改为 40

# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torchvision.datasets as tv_datasets
# import torchvision.transforms as tv_transforms

# # --- 实验配置 ---
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# num_epochs = 40
# batch_size = 64
# num_workers = 2
# print_every = 200
# optim_name = "Adam"
# optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)

# # --- 数据预处理 ---
# transformation = dict()
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     transformation[data_type] = tv_transforms.Compose(([
#         tv_transforms.RandomRotation(degrees=15),
#         tv_transforms.RandomHorizontalFlip(),
#         tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
#     ] if is_train else []) +
#     [
#         tv_transforms.ToTensor(),
#         tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
#     ])

# # --- 数据加载器 ---
# dataset, loader = {}, {}
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     dataset[data_type] = tv_datasets.CIFAR10(root="./data", train=is_train, download=True, transform=transformation[data_type])
#     loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# # --- 网络架构 ---
# net = nn.Sequential(
#     nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Flatten(),
#     nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(128, 10),
# ).to(device)

# # --- 优化器与损失函数 ---
# optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
# criterion = nn.CrossEntropyLoss()

# # --- 训练循环 ---
# print("--- 开始训练: 实验 0 (基线) ---")
# net.train()
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for i, (img, target) in enumerate(loader["train"]):
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         loss = criterion(pred, target)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % print_every == print_every - 1:
#             print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
#             running_loss = 0.0
# print("--- 训练结束 ---")

# # --- 评估 ---
# net.eval()
# correct, total = 0, 0
# with torch.no_grad():
#     for img, target in loader["test"]:
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         total += len(target)
#         correct += (torch.argmax(pred, dim=1) == target).sum().item()
# print(f"实验 0 (基线) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

--- 开始训练: 实验 0 (基线) ---
[epoch=  1, iter=  200] loss: 2.217
[epoch=  1, iter=  400] loss: 1.984
[epoch=  1, iter=  600] loss: 1.856
[epoch=  2, iter=  200] loss: 1.684
[epoch=  2, iter=  400] loss: 1.634
[epoch=  2, iter=  600] loss: 1.579
[epoch=  3, iter=  200] loss: 1.471
[epoch=  3, iter=  400] loss: 1.421
[epoch=  3, iter=  600] loss: 1.398
[epoch=  4, iter=  200] loss: 1.303
[epoch=  4, iter=  400] loss: 1.281
[epoch=  4, iter=  600] loss: 1.258
[epoch=  5, iter=  200] loss: 1.173
[epoch=  5, iter=  400] loss: 1.173
[epoch=  5, iter=  600] loss: 1.134
[epoch=  6, iter=  200] loss: 1.103
[epoch=  6, iter=  400] loss: 1.088
[epoch=  6, iter=  600] loss: 1.067
[epoch=  7, iter=  200] loss: 1.036
[epoch=  7, iter=  400] loss: 1.021
[epoch=  7, iter=  600] loss: 1.000
[epoch=  8, iter=  200] loss: 0.970
[epoch=  8, iter=  400] loss: 0.952
[epoch=  8, iter=  600] loss: 0.949
[epoch=  9, iter=  200] loss: 0.929
[epoch=  9, iter=  400] loss: 0.911
[epoch=  9, iter=  600] loss: 0.916
[epo

In [None]:
# # 实验 1: 引入残差机制
# # 改动: 1. num_epochs=40  2. 网络重写为ResNet风格

# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torchvision.datasets as tv_datasets
# import torchvision.transforms as tv_transforms
# import torch.nn.functional as F

# # --- 实验配置 ---
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# num_epochs = 40
# batch_size = 64
# num_workers = 2
# print_every = 200
# optim_name = "Adam"
# optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)

# # --- 数据预处理 (与基线相同) ---
# transformation = dict()
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     transformation[data_type] = tv_transforms.Compose(([
#         tv_transforms.RandomRotation(degrees=15),
#         tv_transforms.RandomHorizontalFlip(),
#         tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
#     ] if is_train else []) +
#     [
#         tv_transforms.ToTensor(),
#         tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
#     ])

# # --- 数据加载器 (与基线相同) ---
# dataset, loader = {}, {}
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     dataset[data_type] = tv_datasets.CIFAR10(root="./data", train=is_train, download=True, transform=transformation[data_type])
#     loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# # --- 网络架构 (核心修改) ---
# class ResidualBlock(nn.Module):
#     def __init__(self, in_channels, out_channels, stride=1):
#         super().__init__()
#         self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
#         self.bn1 = nn.BatchNorm2d(out_channels)
#         self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
#         self.bn2 = nn.BatchNorm2d(out_channels)
#         self.shortcut = nn.Sequential()
#         if stride != 1 or in_channels != out_channels:
#             self.shortcut = nn.Sequential(nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(out_channels))
#     def forward(self, x):
#         out = F.relu(self.bn1(self.conv1(x)))
#         out = self.bn2(self.conv2(out))
#         out += self.shortcut(x)
#         return F.relu(out)

# net = nn.Sequential(
#     nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True),
#     ResidualBlock(128, 128), nn.MaxPool2d(2), nn.Dropout(0.3),
#     ResidualBlock(128, 256), nn.MaxPool2d(2), nn.Dropout(0.3),
#     ResidualBlock(256, 512),
#     ResidualBlock(512, 512),
#     ResidualBlock(512, 256), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Flatten(),
#     nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(128, 10),
# ).to(device)

# # --- 优化器与损失函数 (与基线相同) ---
# optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
# criterion = nn.CrossEntropyLoss()

# # --- 训练循环 ---
# print("--- 开始训练: 实验 1 (残差机制) ---")
# net.train()
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for i, (img, target) in enumerate(loader["train"]):
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         loss = criterion(pred, target)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % print_every == print_every - 1:
#             print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
#             running_loss = 0.0
# print("--- 训练结束 ---")

# # --- 评估 ---
# net.eval()
# correct, total = 0, 0
# with torch.no_grad():
#     for img, target in loader["test"]:
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         total += len(target)
#         correct += (torch.argmax(pred, dim=1) == target).sum().item()
# print(f"实验 1 (残差机制) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

--- 开始训练: 实验 1 (残差机制) ---
[epoch=  1, iter=  200] loss: 2.167
[epoch=  1, iter=  400] loss: 1.888
[epoch=  1, iter=  600] loss: 1.735
[epoch=  2, iter=  200] loss: 1.550
[epoch=  2, iter=  400] loss: 1.498
[epoch=  2, iter=  600] loss: 1.395
[epoch=  3, iter=  200] loss: 1.303
[epoch=  3, iter=  400] loss: 1.268
[epoch=  3, iter=  600] loss: 1.230
[epoch=  4, iter=  200] loss: 1.164
[epoch=  4, iter=  400] loss: 1.127
[epoch=  4, iter=  600] loss: 1.102
[epoch=  5, iter=  200] loss: 1.062
[epoch=  5, iter=  400] loss: 1.036
[epoch=  5, iter=  600] loss: 1.032
[epoch=  6, iter=  200] loss: 0.999
[epoch=  6, iter=  400] loss: 0.968
[epoch=  6, iter=  600] loss: 0.945
[epoch=  7, iter=  200] loss: 0.921
[epoch=  7, iter=  400] loss: 0.894
[epoch=  7, iter=  600] loss: 0.901
[epoch=  8, iter=  200] loss: 0.859
[epoch=  8, iter=  400] loss: 0.850
[epoch=  8, iter=  600] loss: 0.854
[epoch=  9, iter=  200] loss: 0.814
[epoch=  9, iter=  400] loss: 0.805
[epoch=  9, iter=  600] loss: 0.766
[e

In [None]:
# # 实验 2: 更宽的网络
# # 改动: 1. num_epochs=40  2. 网络各层通道数乘以1.5

# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torchvision.datasets as tv_datasets
# import torchvision.transforms as tv_transforms

# # --- 实验配置 (与基线相同) ---
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# num_epochs = 40
# batch_size = 64
# num_workers = 2
# print_every = 200
# optim_name = "Adam"
# optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)

# # --- 数据预处理 (与基线相同) ---
# transformation = dict()
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     transformation[data_type] = tv_transforms.Compose(([
#         tv_transforms.RandomRotation(degrees=15),
#         tv_transforms.RandomHorizontalFlip(),
#         tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
#     ] if is_train else []) +
#     [
#         tv_transforms.ToTensor(),
#         tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
#     ])

# # --- 数据加载器 (与基线相同) ---
# dataset, loader = {}, {}
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     dataset[data_type] = tv_datasets.CIFAR10(root="./data", train=is_train, download=True, transform=transformation[data_type])
#     loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# # --- 网络架构 (核心修改) ---
# wm = 1.5 # width_multiplier
# c1, c2, c3, c4 = int(128*wm), int(256*wm), int(512*wm), int(256*wm)
# l1, l2, l3, l4 = int(512*wm), int(256*wm), int(128*wm), 10

# net = nn.Sequential(
#     nn.Conv2d(3, c1, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(c1, c2, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(c2, c3, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(c3, c3, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(c3, c4, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Flatten(),
#     nn.Linear(c4 * 4 * 4, l1), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(l1, l2), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(l2, l3), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(l3, l4),
# ).to(device)

# # --- 优化器与损失函数 (与基线相同) ---
# optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
# criterion = nn.CrossEntropyLoss()

# # --- 训练循环 ---
# print("--- 开始训练: 实验 2 (更宽网络) ---")
# net.train()
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for i, (img, target) in enumerate(loader["train"]):
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         loss = criterion(pred, target)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % print_every == print_every - 1:
#             print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
#             running_loss = 0.0
# print("--- 训练结束 ---")

# # --- 评估 ---
# net.eval()
# correct, total = 0, 0
# with torch.no_grad():
#     for img, target in loader["test"]:
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         total += len(target)
#         correct += (torch.argmax(pred, dim=1) == target).sum().item()
# print(f"实验 2 (更宽网络) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

--- 开始训练: 实验 2 (更宽网络) ---
[epoch=  1, iter=  200] loss: 2.173
[epoch=  1, iter=  400] loss: 1.914
[epoch=  1, iter=  600] loss: 1.799
[epoch=  2, iter=  200] loss: 1.602
[epoch=  2, iter=  400] loss: 1.545
[epoch=  2, iter=  600] loss: 1.479
[epoch=  3, iter=  200] loss: 1.391
[epoch=  3, iter=  400] loss: 1.346
[epoch=  3, iter=  600] loss: 1.316
[epoch=  4, iter=  200] loss: 1.248
[epoch=  4, iter=  400] loss: 1.191
[epoch=  4, iter=  600] loss: 1.211
[epoch=  5, iter=  200] loss: 1.134
[epoch=  5, iter=  400] loss: 1.109
[epoch=  5, iter=  600] loss: 1.094
[epoch=  6, iter=  200] loss: 1.052
[epoch=  6, iter=  400] loss: 1.027
[epoch=  6, iter=  600] loss: 1.002
[epoch=  7, iter=  200] loss: 0.963
[epoch=  7, iter=  400] loss: 0.962
[epoch=  7, iter=  600] loss: 0.954
[epoch=  8, iter=  200] loss: 0.896
[epoch=  8, iter=  400] loss: 0.912
[epoch=  8, iter=  600] loss: 0.891
[epoch=  9, iter=  200] loss: 0.850
[epoch=  9, iter=  400] loss: 0.857
[epoch=  9, iter=  600] loss: 0.851
[e

In [1]:
# # 实验 3: 使用AdamW优化器
# # 改动: 1. num_epochs=40  2. optim_name改为"AdamW"  3. 调整weight_decay

# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torchvision.datasets as tv_datasets
# import torchvision.transforms as tv_transforms

# # --- 实验配置 (核心修改) ---
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# num_epochs = 40
# batch_size = 64
# num_workers = 2
# print_every = 200
# optim_name = "AdamW"
# optim_kwargs = dict(lr=3e-4, weight_decay=1e-4) # AdamW使用更典型的weight_decay

# # --- 数据预处理 (与基线相同) ---
# transformation = dict()
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     transformation[data_type] = tv_transforms.Compose(([
#         tv_transforms.RandomRotation(degrees=15),
#         tv_transforms.RandomHorizontalFlip(),
#         tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
#     ] if is_train else []) +
#     [
#         tv_transforms.ToTensor(),
#         tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
#     ])

# # --- 数据加载器 (与基线相同) ---
# dataset, loader = {}, {}
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     dataset[data_type] = tv_datasets.CIFAR10(root="./data", train=is_train, download=True, transform=transformation[data_type])
#     loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# # --- 网络架构 (与基线相同) ---
# net = nn.Sequential(
#     nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Flatten(),
#     nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(128, 10),
# ).to(device)

# # --- 优化器与损失函数 ---
# optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
# criterion = nn.CrossEntropyLoss()

# # --- 训练循环 ---
# print("--- 开始训练: 实验 3 (AdamW优化器) ---")
# net.train()
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for i, (img, target) in enumerate(loader["train"]):
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         loss = criterion(pred, target)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % print_every == print_every - 1:
#             print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
#             running_loss = 0.0
# print("--- 训练结束 ---")

# # --- 评估 ---
# net.eval()
# correct, total = 0, 0
# with torch.no_grad():
#     for img, target in loader["test"]:
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         total += len(target)
#         correct += (torch.argmax(pred, dim=1) == target).sum().item()
# print(f"实验 3 (AdamW优化器) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

100%|██████████| 170M/170M [00:05<00:00, 28.9MB/s]


--- 开始训练: 实验 3 (AdamW优化器) ---
[epoch=  1, iter=  200] loss: 2.191
[epoch=  1, iter=  400] loss: 1.972
[epoch=  1, iter=  600] loss: 1.818
[epoch=  2, iter=  200] loss: 1.664
[epoch=  2, iter=  400] loss: 1.608
[epoch=  2, iter=  600] loss: 1.549
[epoch=  3, iter=  200] loss: 1.469
[epoch=  3, iter=  400] loss: 1.430
[epoch=  3, iter=  600] loss: 1.365
[epoch=  4, iter=  200] loss: 1.300
[epoch=  4, iter=  400] loss: 1.256
[epoch=  4, iter=  600] loss: 1.228
[epoch=  5, iter=  200] loss: 1.183
[epoch=  5, iter=  400] loss: 1.147
[epoch=  5, iter=  600] loss: 1.145
[epoch=  6, iter=  200] loss: 1.096
[epoch=  6, iter=  400] loss: 1.076
[epoch=  6, iter=  600] loss: 1.051
[epoch=  7, iter=  200] loss: 1.010
[epoch=  7, iter=  400] loss: 1.000
[epoch=  7, iter=  600] loss: 0.996
[epoch=  8, iter=  200] loss: 0.960
[epoch=  8, iter=  400] loss: 0.953
[epoch=  8, iter=  600] loss: 0.940
[epoch=  9, iter=  200] loss: 0.886
[epoch=  9, iter=  400] loss: 0.908
[epoch=  9, iter=  600] loss: 0.87

In [2]:
# # 实验 4: 更丰富的数据增强
# # 改动: 1. num_epochs=40  2. 增加了ColorJitter

# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torchvision.datasets as tv_datasets
# import torchvision.transforms as tv_transforms

# # --- 实验配置 (与基线相同) ---
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# num_epochs = 40
# batch_size = 64
# num_workers = 2
# print_every = 200
# optim_name = "Adam"
# optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)

# # --- 数据预处理 (核心修改) ---
# transformation = dict()
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     transformation[data_type] = tv_transforms.Compose(([
#         tv_transforms.RandomRotation(degrees=15),
#         tv_transforms.RandomHorizontalFlip(),
#         tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
#         tv_transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
#     ] if is_train else []) +
#     [
#         tv_transforms.ToTensor(),
#         tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
#     ])

# # --- 数据加载器 ---
# dataset, loader = {}, {}
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     dataset[data_type] = tv_datasets.CIFAR10(root="./data", train=is_train, download=True, transform=transformation[data_type])
#     loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# # --- 网络架构 (与基线相同) ---
# net = nn.Sequential(
#     nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Flatten(),
#     nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(128, 10),
# ).to(device)

# # --- 优化器与损失函数 (与基线相同) ---
# optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
# criterion = nn.CrossEntropyLoss()

# # --- 训练循环 ---
# print("--- 开始训练: 实验 4 (更丰富的数据增强) ---")
# net.train()
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for i, (img, target) in enumerate(loader["train"]):
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         loss = criterion(pred, target)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % print_every == print_every - 1:
#             print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
#             running_loss = 0.0
# print("--- 训练结束 ---")

# # --- 评估 ---
# net.eval()
# correct, total = 0, 0
# with torch.no_grad():
#     for img, target in loader["test"]:
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         total += len(target)
#         correct += (torch.argmax(pred, dim=1) == target).sum().item()
# print(f"实验 4 (更丰富的数据增强) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

--- 开始训练: 实验 4 (更丰富的数据增强) ---
[epoch=  1, iter=  200] loss: 2.219
[epoch=  1, iter=  400] loss: 2.000
[epoch=  1, iter=  600] loss: 1.880
[epoch=  2, iter=  200] loss: 1.713
[epoch=  2, iter=  400] loss: 1.662
[epoch=  2, iter=  600] loss: 1.619
[epoch=  3, iter=  200] loss: 1.521
[epoch=  3, iter=  400] loss: 1.472
[epoch=  3, iter=  600] loss: 1.451
[epoch=  4, iter=  200] loss: 1.374
[epoch=  4, iter=  400] loss: 1.332
[epoch=  4, iter=  600] loss: 1.320
[epoch=  5, iter=  200] loss: 1.253
[epoch=  5, iter=  400] loss: 1.215
[epoch=  5, iter=  600] loss: 1.213
[epoch=  6, iter=  200] loss: 1.180
[epoch=  6, iter=  400] loss: 1.148
[epoch=  6, iter=  600] loss: 1.127
[epoch=  7, iter=  200] loss: 1.099
[epoch=  7, iter=  400] loss: 1.062
[epoch=  7, iter=  600] loss: 1.051
[epoch=  8, iter=  200] loss: 1.026
[epoch=  8, iter=  400] loss: 1.028
[epoch=  8, iter=  600] loss: 1.005
[epoch=  9, iter=  200] loss: 0.980
[epoch=  9, iter=  400] loss: 0.960
[epoch=  9, iter=  600] loss: 0.95

In [3]:
# # 实验 5: 引入注意力机制
# # 改动: 1. num_epochs=40  2. 网络重写为带SE-Block的结构

# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torchvision.datasets as tv_datasets
# import torchvision.transforms as tv_transforms

# # --- 实验配置 (与基线相同) ---
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# num_epochs = 40
# batch_size = 64
# num_workers = 2
# print_every = 200
# optim_name = "Adam"
# optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)

# # --- 数据预处理 (与基线相同) ---
# transformation = dict()
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     transformation[data_type] = tv_transforms.Compose(([
#         tv_transforms.RandomRotation(degrees=15),
#         tv_transforms.RandomHorizontalFlip(),
#         tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
#     ] if is_train else []) +
#     [
#         tv_transforms.ToTensor(),
#         tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
#     ])

# # --- 数据加载器 (与基线相同) ---
# dataset, loader = {}, {}
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     dataset[data_type] = tv_datasets.CIFAR10(root="./data", train=is_train, download=True, transform=transformation[data_type])
#     loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# # --- 网络架构 (核心修改) ---
# class SEBlock(nn.Module):
#     def __init__(self, channel, reduction=16):
#         super().__init__()
#         self.avg_pool = nn.AdaptiveAvgPool2d(1)
#         self.fc = nn.Sequential(
#             nn.Linear(channel, channel // reduction, bias=False), nn.ReLU(inplace=True),
#             nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid())
#     def forward(self, x):
#         b, c, _, _ = x.size()
#         y = self.avg_pool(x).view(b, c)
#         y = self.fc(y).view(b, c, 1, 1)
#         return x * y.expand_as(x)

# class NetWithAttention(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.conv_block1 = nn.Sequential(
#             nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True),
#             SEBlock(128),
#             nn.MaxPool2d(2), nn.Dropout(0.3)
#         )
#         self.conv_block2 = nn.Sequential(
#             nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True),
#             SEBlock(256),
#             nn.MaxPool2d(2), nn.Dropout(0.3)
#         )
#         self.conv_block3 = nn.Sequential(
#             nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
#             nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
#             nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True),
#             SEBlock(256),
#             nn.MaxPool2d(2), nn.Dropout(0.3)
#         )
#         self.fc_block = nn.Sequential(
#             nn.Flatten(),
#             nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
#             nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
#             nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
#             nn.Linear(128, 10),
#         )
#     def forward(self, x):
#         x = self.conv_block1(x)
#         x = self.conv_block2(x)
#         x = self.conv_block3(x)
#         x = self.fc_block(x)
#         return x
# net = NetWithAttention().to(device)

# # --- 优化器与损失函数 (与基线相同) ---
# optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
# criterion = nn.CrossEntropyLoss()

# # --- 训练循环 ---
# print("--- 开始训练: 实验 5 (注意力机制) ---")
# net.train()
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for i, (img, target) in enumerate(loader["train"]):
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         loss = criterion(pred, target)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % print_every == print_every - 1:
#             print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
#             running_loss = 0.0
# print("--- 训练结束 ---")

# # --- 评估 ---
# net.eval()
# correct, total = 0, 0
# with torch.no_grad():
#     for img, target in loader["test"]:
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         total += len(target)
#         correct += (torch.argmax(pred, dim=1) == target).sum().item()
# print(f"实验 5 (注意力机制) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

--- 开始训练: 实验 5 (注意力机制) ---
[epoch=  1, iter=  200] loss: 2.210
[epoch=  1, iter=  400] loss: 1.993
[epoch=  1, iter=  600] loss: 1.828
[epoch=  2, iter=  200] loss: 1.691
[epoch=  2, iter=  400] loss: 1.622
[epoch=  2, iter=  600] loss: 1.598
[epoch=  3, iter=  200] loss: 1.489
[epoch=  3, iter=  400] loss: 1.432
[epoch=  3, iter=  600] loss: 1.411
[epoch=  4, iter=  200] loss: 1.361
[epoch=  4, iter=  400] loss: 1.312
[epoch=  4, iter=  600] loss: 1.288
[epoch=  5, iter=  200] loss: 1.236
[epoch=  5, iter=  400] loss: 1.224
[epoch=  5, iter=  600] loss: 1.190
[epoch=  6, iter=  200] loss: 1.149
[epoch=  6, iter=  400] loss: 1.115
[epoch=  6, iter=  600] loss: 1.116
[epoch=  7, iter=  200] loss: 1.072
[epoch=  7, iter=  400] loss: 1.061
[epoch=  7, iter=  600] loss: 1.051
[epoch=  8, iter=  200] loss: 1.003
[epoch=  8, iter=  400] loss: 1.009
[epoch=  8, iter=  600] loss: 1.024
[epoch=  9, iter=  200] loss: 0.962
[epoch=  9, iter=  400] loss: 0.961
[epoch=  9, iter=  600] loss: 0.951
[

**Task3**——60 points

Next, we performed the same operations as in Task 2 on a new dataset, Tiny ImageNet.

In [4]:
# --- Preparation 1: dataset handling ---

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as tv_datasets
import torchvision.transforms as tv_transforms
import os
from PIL import Image

# Download and unpack the dataset unless the extracted folder already exists.
# The `!` lines are notebook shell escapes, so this cell only runs inside
# IPython/Jupyter.
# NOTE(review): both commands run with -q and their exit status is not checked,
# so "Download complete." is printed even if the download or unzip failed.
if not os.path.exists('tiny-imagenet-200'):
    print("Downloading and unzipping Tiny ImageNet...")
    !wget -q http://cs231n.stanford.edu/tiny-imagenet-200.zip
    !unzip -q tiny-imagenet-200.zip
    print("Download complete.")

class TinyImageNetDataset(torch.utils.data.Dataset):
    """Map-style dataset over an unpacked Tiny ImageNet directory.

    Labels are the zero-based line positions of each class id in
    ``<root>/wnids.txt``. The ``train`` split is read from
    ``<root>/train/<class id>/images/*`` and the ``val`` split from
    ``<root>/val/val_annotations.txt`` (tab-separated; the first two columns
    are the image file name and its class id).

    Args:
        root: Directory that contains ``wnids.txt`` and the split folders.
        split: Either ``'train'`` or ``'val'``.
        transform: Optional callable applied to each loaded RGB image.

    Attributes:
        root: ``<root>/<split>``.
        class_to_idx: Mapping from class id to integer label.
        data: List of ``(image_path, label)`` pairs, in sorted (train) or
            annotation-file (val) order.

    Raises:
        ValueError: If ``split`` is neither ``'train'`` nor ``'val'``.
        KeyError: If a class folder or annotation names a class id that is
            missing from ``wnids.txt``.
    """

    def __init__(self, root, split='train', transform=None):
        # Any other split used to yield an empty dataset without any warning.
        if split not in ('train', 'val'):
            raise ValueError(
                f"Unsupported split {split!r}; expected 'train' or 'val'"
            )

        self.root = os.path.join(root, split)
        self.transform = transform
        self.class_to_idx = {}
        self.data = []

        # Build the class-id -> label mapping from the line order of wnids.txt.
        with open(os.path.join(root, 'wnids.txt'), 'r') as f:
            for i, line in enumerate(f):
                self.class_to_idx[line.strip()] = i

        # Collect image paths and labels.
        if split == 'train':
            # Sorted so that sample order does not depend on the file system.
            for class_name in sorted(os.listdir(self.root)):
                # Skip stray files (e.g. editor/OS metadata) next to the
                # class folders instead of failing on the label lookup.
                if not os.path.isdir(os.path.join(self.root, class_name)):
                    continue
                class_dir = os.path.join(self.root, class_name, 'images')
                label = self.class_to_idx[class_name]
                for img_name in sorted(os.listdir(class_dir)):
                    self.data.append((os.path.join(class_dir, img_name), label))
        else:
            with open(os.path.join(root, 'val', 'val_annotations.txt'), 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    parts = line.split('\t')
                    img_name, class_name = parts[0], parts[1]
                    label = self.class_to_idx[class_name]
                    img_path = os.path.join(root, 'val', 'images', img_name)
                    self.data.append((img_path, label))

    def __len__(self):
        """Return the number of ``(image, label)`` samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        The image is converted to RGB and, if a transform was given, passed
        through it.
        """
        img_path, label = self.data[idx]
        # Close the underlying file as soon as the pixels have been copied
        # instead of leaving the handle to the garbage collector.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, label

Downloading and unzipping Tiny ImageNet...
Download complete.


In [5]:
# --- Preparation 2: define the data augmentation ---

# Per-channel mean and standard deviation used to normalize the inputs
TINY_IMAGENET_MEAN = [0.485, 0.456, 0.406]
TINY_IMAGENET_STD = [0.229, 0.224, 0.225]

# One preprocessing pipeline per split. Tiny ImageNet's 'val' split serves as
# the test set, so the random augmentations are applied to 'train' only.
transformation = {}
for data_type in ("train", "val"):
    is_train = data_type == "train"
    transformation[data_type] = tv_transforms.Compose([
        *(
            (
                tv_transforms.RandomRotation(degrees=15),
                tv_transforms.RandomHorizontalFlip(),
                tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
            )
            if is_train
            else ()
        ),
        tv_transforms.ToTensor(),
        tv_transforms.Normalize(mean=TINY_IMAGENET_MEAN, std=TINY_IMAGENET_STD),
    ])

In [6]:
# # 实验 0: Tiny ImageNet 基线
# # 改动: 1. 使用TinyImageNet的Dataset和transform
# #       2. 修改网络以适配64x64输入和200个分类
# #       3. epoch设置为40

# import torch
# import torch.nn as nn
# import torch.optim as optim

# # --- 实验配置 ---
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# num_epochs = 40
# batch_size = 64
# num_workers = 2
# print_every = 200
# optim_name = "Adam"
# optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)
# data_root = 'tiny-imagenet-200'

# # --- 数据加载器 ---
# dataset, loader = {}, {}
# for data_type in ("train", "val"):
#     is_train = data_type == "train"
#     dataset[data_type] = TinyImageNetDataset(root=data_root, split=data_type, transform=transformation[data_type])
#     loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# # --- 网络架构 (核心修改) ---
# net = nn.Sequential(
#     nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),     # 64x64 -> 32x32
#     nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),  # 32x32 -> 16x16
#     nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),  # 16x16 -> 8x8
#     nn.Flatten(),
#     nn.Linear(256 * 8 * 8, 512), nn.ReLU(inplace=True), nn.Dropout(0.5), # 适配64x64输入
#     nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(128, 200), # 适配200个分类
# ).to(device)

# # --- 优化器与损失函数 ---
# optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
# criterion = nn.CrossEntropyLoss()

# # --- 训练循环 ---
# print("--- 开始训练: 实验 0 (Tiny ImageNet 基线) ---")
# net.train()
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for i, (img, target) in enumerate(loader["train"]):
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         loss = criterion(pred, target)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % print_every == print_every - 1:
#             print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
#             running_loss = 0.0
# print("--- 训练结束 ---")

# # --- 评估 ---
# net.eval()
# correct, total = 0, 0
# with torch.no_grad():
#     for img, target in loader["val"]: # 使用 'val' loader
#         img, target = img.to(device), target.to(device)
#         pred = net(img)
#         total += len(target)
#         correct += (torch.argmax(pred, dim=1) == target).sum().item()
# print(f"实验 0 (Tiny ImageNet 基线) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

--- 开始训练: 实验 0 (Tiny ImageNet 基线) ---
[epoch=  1, iter=  200] loss: 5.300
[epoch=  1, iter=  400] loss: 5.300
[epoch=  1, iter=  600] loss: 5.300
[epoch=  1, iter=  800] loss: 5.300
[epoch=  1, iter= 1000] loss: 5.300
[epoch=  1, iter= 1200] loss: 5.300
[epoch=  1, iter= 1400] loss: 5.300
[epoch=  2, iter=  200] loss: 5.299
[epoch=  2, iter=  400] loss: 5.299
[epoch=  2, iter=  600] loss: 5.299
[epoch=  2, iter=  800] loss: 5.299
[epoch=  2, iter= 1000] loss: 5.299
[epoch=  2, iter= 1200] loss: 5.300
[epoch=  2, iter= 1400] loss: 5.299
[epoch=  3, iter=  200] loss: 5.282
[epoch=  3, iter=  400] loss: 5.235
[epoch=  3, iter=  600] loss: 5.170
[epoch=  3, iter=  800] loss: 5.148
[epoch=  3, iter= 1000] loss: 5.124
[epoch=  3, iter= 1200] loss: 5.098
[epoch=  3, iter= 1400] loss: 5.077
[epoch=  4, iter=  200] loss: 5.049
[epoch=  4, iter=  400] loss: 5.020
[epoch=  4, iter=  600] loss: 5.005
[epoch=  4, iter=  800] loss: 4.990
[epoch=  4, iter= 1000] loss: 4.966
[epoch=  4, iter= 1200] lo

In [None]:
# Experiment 1: Tiny ImageNet with residual connections
# Changes: 1. Network rewritten in a ResNet style with skip connections
#          2. Adapted to Tiny ImageNet
#          3. Number of epochs set to 40

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# --- Experiment configuration ---
# `print_every` is the number of iterations between loss reports; `optim_name`
# is resolved to an optimizer class on torch.optim further down in this cell.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
num_epochs = 40
batch_size = 64
num_workers = 2
print_every = 200
optim_name = "Adam"
optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)
data_root = 'tiny-imagenet-200'

# --- Data loaders (same as the baseline) ---
# Relies on TinyImageNetDataset and `transformation` from earlier cells.
# Only the training split is shuffled.
dataset, loader = {}, {}
for data_type in ("train", "val"):
    is_train = data_type == "train"
    dataset[data_type] = TinyImageNetDataset(root=data_root, split=data_type, transform=transformation[data_type])
    loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# --- Network architecture (core change) ---
class ResidualBlock(nn.Module):
    """Two 3x3 convolutions wrapped by an additive skip connection.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution and, when a projection
            shortcut is created, of that projection as well.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=stride, padding=1
        )
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        # The skip path passes the input through untouched unless the main
        # path changes the channel count or spatial size; in that case a 1x1
        # convolution brings the input to the matching shape.
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Conv2d(
                in_channels, out_channels, kernel_size=1, stride=stride
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(conv2(relu(conv1(x))) + shortcut(x))``."""
        main_path = self.conv2(F.relu(self.conv1(x)))
        # Core idea: add the skip connection before the final activation.
        return F.relu(main_path + self.shortcut(x))

class NetWithResidual(nn.Module):
    """CNN classifier for 64x64 RGB images built from ``ResidualBlock`` stages.

    Three convolutional stages each end in a 2x2 max-pool (64 -> 32 -> 16 -> 8
    spatially), and the resulting 256 x 8 x 8 feature map feeds a four-layer
    fully connected head.

    Args:
        num_classes: Size of the output logit vector. Defaults to 200, the
            class count used for Tiny ImageNet in this notebook.
    """

    def __init__(self, num_classes=200):
        super().__init__()
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2), nn.Dropout(0.3)
        )
        self.conv_block2 = nn.Sequential(
            # stride=1 keeps the spatial size; down-sampling is left to the pool
            ResidualBlock(128, 256, stride=1),
            nn.MaxPool2d(2), nn.Dropout(0.3)
        )
        self.conv_block3 = nn.Sequential(
            ResidualBlock(256, 512, stride=1),
            ResidualBlock(512, 512, stride=1),
            ResidualBlock(512, 256, stride=1),
            nn.MaxPool2d(2), nn.Dropout(0.3)
        )
        self.fc_block = nn.Sequential(
            nn.Flatten(),
            # 256 * 8 * 8 assumes a 64x64 input halved by the three max-pools
            nn.Linear(256 * 8 * 8, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
            nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
            nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.fc_block(x)
        return x

net = NetWithResidual().to(device)

# --- Optimizer and loss function ---
# The optimizer class is looked up by name on torch.optim.
optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
criterion = nn.CrossEntropyLoss()

# --- Training loop ---
print("--- 开始训练: 实验 1 (Tiny ImageNet 残差机制) ---")
net.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (img, target) in enumerate(loader["train"]):
        img, target = img.to(device), target.to(device)
        pred = net(img)
        loss = criterion(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Report the mean loss over the last `print_every` iterations, then
        # reset the accumulator. Iterations after the last full window of an
        # epoch are never reported.
        if i % print_every == print_every - 1:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0
print("--- 训练结束 ---")

# --- Evaluation ---
# Top-1 accuracy on the 'val' split, in eval mode and with gradients disabled.
net.eval()
correct, total = 0, 0
with torch.no_grad():
    for img, target in loader["val"]:
        img, target = img.to(device), target.to(device)
        pred = net(img)
        total += len(target)
        correct += (torch.argmax(pred, dim=1) == target).sum().item()
print(f"实验 1 (Tiny ImageNet 残差机制) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

--- 开始训练: 实验 1 (Tiny ImageNet 残差机制) ---
[epoch=  1, iter=  200] loss: 5.300
[epoch=  1, iter=  400] loss: 5.300
[epoch=  1, iter=  600] loss: 5.289
[epoch=  1, iter=  800] loss: 5.248
[epoch=  1, iter= 1000] loss: 5.226
[epoch=  1, iter= 1200] loss: 5.194
[epoch=  1, iter= 1400] loss: 5.156
[epoch=  2, iter=  200] loss: 5.114
[epoch=  2, iter=  400] loss: 5.090
[epoch=  2, iter=  600] loss: 5.061
[epoch=  2, iter=  800] loss: 5.035
[epoch=  2, iter= 1000] loss: 5.014
[epoch=  2, iter= 1200] loss: 4.980
[epoch=  2, iter= 1400] loss: 4.948
[epoch=  3, iter=  200] loss: 4.899
[epoch=  3, iter=  400] loss: 4.897
[epoch=  3, iter=  600] loss: 4.842
[epoch=  3, iter=  800] loss: 4.812
[epoch=  3, iter= 1000] loss: 4.764
[epoch=  3, iter= 1200] loss: 4.731
[epoch=  3, iter= 1400] loss: 4.695
[epoch=  4, iter=  200] loss: 4.637
[epoch=  4, iter=  400] loss: 4.596
[epoch=  4, iter=  600] loss: 4.598
[epoch=  4, iter=  800] loss: 4.533
[epoch=  4, iter= 1000] loss: 4.526
[epoch=  4, iter= 1200] 

In [None]:
# Experiment 2: Tiny ImageNet with a wider network
# Changes: 1. Channel counts of every layer multiplied by 1.5
#          2. Adapted to Tiny ImageNet
#          3. Number of epochs set to 40

import torch
import torch.nn as nn
import torch.optim as optim
# (assumes TinyImageNetDataset and transformation are already defined)

# --- Experiment configuration ---
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
num_epochs = 40
batch_size = 64
num_workers = 2
print_every = 200
optim_name = "Adam"
optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)
data_root = 'tiny-imagenet-200'

# --- Data loaders (same as the baseline) ---
# Only the training split is shuffled.
dataset, loader = {}, {}
for data_type in ("train", "val"):
    is_train = data_type == "train"
    dataset[data_type] = TinyImageNetDataset(root=data_root, split=data_type, transform=transformation[data_type])
    loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# --- Network architecture (core change) ---
# c1..c4 are the convolution widths and l1..l3 the hidden linear widths, all
# scaled by `wm`; l4 is the fixed number of output classes.
wm = 1.5 # width multiplier
c1, c2, c3, c4 = int(128*wm), int(256*wm), int(512*wm), int(256*wm)
l1, l2, l3, l4 = int(512*wm), int(256*wm), int(128*wm), 200

net = nn.Sequential(
    nn.Conv2d(3, c1, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(c1, c2, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(c2, c3, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(c3, c3, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(c3, c4, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Flatten(),
    nn.Linear(c4 * 8 * 8, l1), nn.ReLU(inplace=True), nn.Dropout(0.5), # sized for 64x64 inputs
    nn.Linear(l1, l2), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(l2, l3), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(l3, l4), # sized for 200 classes
).to(device)

# --- Optimizer and loss function ---
# The optimizer class is looked up by name on torch.optim.
optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
criterion = nn.CrossEntropyLoss()

# --- Training loop ---
print("--- 开始训练: 实验 2 (Tiny ImageNet 更宽网络) ---")
net.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (img, target) in enumerate(loader["train"]):
        img, target = img.to(device), target.to(device)
        pred = net(img)
        loss = criterion(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Report the mean loss over the last `print_every` iterations, then
        # reset the accumulator.
        if i % print_every == print_every - 1:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0
print("--- 训练结束 ---")

# --- Evaluation ---
# Top-1 accuracy on the 'val' split, in eval mode and with gradients disabled.
net.eval()
correct, total = 0, 0
with torch.no_grad():
    for img, target in loader["val"]:
        img, target = img.to(device), target.to(device)
        pred = net(img)
        total += len(target)
        correct += (torch.argmax(pred, dim=1) == target).sum().item()
print(f"实验 2 (Tiny ImageNet 更宽网络) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

In [None]:
# Experiment 3: Tiny ImageNet with the AdamW optimizer
# Changes: 1. optim_name switched to "AdamW", weight_decay adjusted
#          2. Adapted to Tiny ImageNet
#          3. Number of epochs set to 40

import torch
import torch.nn as nn
import torch.optim as optim
# (assumes TinyImageNetDataset and transformation are already defined)

# --- Experiment configuration (core change) ---
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
num_epochs = 40
batch_size = 64
num_workers = 2
print_every = 200
optim_name = "AdamW"
optim_kwargs = dict(lr=3e-4, weight_decay=1e-4) # a more typical weight_decay for AdamW
data_root = 'tiny-imagenet-200'

# --- Data loaders (same as the baseline) ---
# Only the training split is shuffled.
dataset, loader = {}, {}
for data_type in ("train", "val"):
    is_train = data_type == "train"
    dataset[data_type] = TinyImageNetDataset(root=data_root, split=data_type, transform=transformation[data_type])
    loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# --- Network architecture (same as the baseline) ---
# Three max-pools halve a 64x64 input to 8x8 before the 256 * 8 * 8 flatten.
net = nn.Sequential(
    nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Flatten(),
    nn.Linear(256 * 8 * 8, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(128, 200),
).to(device)

# --- Optimizer and loss function ---
# The optimizer class is looked up by name on torch.optim.
optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
criterion = nn.CrossEntropyLoss()

# --- Training loop ---
print("--- 开始训练: 实验 3 (Tiny ImageNet AdamW优化器) ---")
net.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (img, target) in enumerate(loader["train"]):
        img, target = img.to(device), target.to(device)
        pred = net(img)
        loss = criterion(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Report the mean loss over the last `print_every` iterations, then
        # reset the accumulator.
        if i % print_every == print_every - 1:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0
print("--- 训练结束 ---")

# --- Evaluation ---
# Top-1 accuracy on the 'val' split, in eval mode and with gradients disabled.
net.eval()
correct, total = 0, 0
with torch.no_grad():
    for img, target in loader["val"]:
        img, target = img.to(device), target.to(device)
        pred = net(img)
        total += len(target)
        correct += (torch.argmax(pred, dim=1) == target).sum().item()
print(f"实验 3 (Tiny ImageNet AdamW优化器) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

In [None]:
# Experiment 4: Tiny ImageNet with richer data augmentation
# Changes: 1. ColorJitter added to the data augmentation
#          2. Adapted to Tiny ImageNet
#          3. Number of epochs set to 40

import torch
import torch.nn as nn
import torch.optim as optim
# (assumes TinyImageNetDataset and TINY_IMAGENET... are already defined)

# --- Experiment configuration ---
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
num_epochs = 40
batch_size = 64
num_workers = 2
print_every = 200
optim_name = "Adam"
optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)
data_root = 'tiny-imagenet-200'

# --- Data preprocessing (core change) ---
# Kept under a separate name so the shared `transformation` dict is untouched.
# `tv_transforms` comes from an earlier cell. The random augmentations are
# applied to the 'train' split only.
transformation_augmented = dict()
for data_type in ("train", "val"):
    is_train = data_type == "train"
    transformation_augmented[data_type] = tv_transforms.Compose(([
        tv_transforms.RandomRotation(degrees=15),
        tv_transforms.RandomHorizontalFlip(),
        tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        tv_transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1), # newly added
    ] if is_train else []) +
    [
        tv_transforms.ToTensor(),
        tv_transforms.Normalize(mean=TINY_IMAGENET_MEAN, std=TINY_IMAGENET_STD),
    ])

# --- Data loaders ---
# Only the training split is shuffled.
dataset, loader = {}, {}
for data_type in ("train", "val"):
    is_train = data_type == "train"
    dataset[data_type] = TinyImageNetDataset(root=data_root, split=data_type, transform=transformation_augmented[data_type])
    loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# --- Network architecture (same as the baseline) ---
# Three max-pools halve a 64x64 input to 8x8 before the 256 * 8 * 8 flatten.
net = nn.Sequential(
    nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    nn.Flatten(),
    nn.Linear(256 * 8 * 8, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(128, 200),
).to(device)

# --- Optimizer and loss function ---
# The optimizer class is looked up by name on torch.optim.
optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
criterion = nn.CrossEntropyLoss()

# --- Training loop ---
print("--- 开始训练: 实验 4 (Tiny ImageNet 更丰富的数据增强) ---")
net.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (img, target) in enumerate(loader["train"]):
        img, target = img.to(device), target.to(device)
        pred = net(img)
        loss = criterion(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Report the mean loss over the last `print_every` iterations, then
        # reset the accumulator.
        if i % print_every == print_every - 1:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0
print("--- 训练结束 ---")

# --- Evaluation ---
# Top-1 accuracy on the 'val' split, in eval mode and with gradients disabled.
net.eval()
correct, total = 0, 0
with torch.no_grad():
    for img, target in loader["val"]:
        img, target = img.to(device), target.to(device)
        pred = net(img)
        total += len(target)
        correct += (torch.argmax(pred, dim=1) == target).sum().item()
print(f"实验 4 (Tiny ImageNet 更丰富的数据增强) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")

In [None]:
# Experiment 5: Tiny ImageNet with an attention mechanism
# Changes: 1. Network rewritten to include SE blocks
#          2. Adapted to Tiny ImageNet
#          3. Number of epochs set to 40

import torch
import torch.nn as nn
import torch.optim as optim
# (assumes TinyImageNetDataset and transformation are already defined)

# --- Experiment configuration ---
# `print_every` is the number of iterations between loss reports; `optim_name`
# is resolved to an optimizer class on torch.optim further down in this cell.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
num_epochs = 40
batch_size = 64
num_workers = 2
print_every = 200
optim_name = "Adam"
optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)
data_root = 'tiny-imagenet-200'

# --- Data loaders (same as the baseline) ---
# Only the training split is shuffled.
dataset, loader = {}, {}
for data_type in ("train", "val"):
    is_train = data_type == "train"
    dataset[data_type] = TinyImageNetDataset(root=data_root, split=data_type, transform=transformation[data_type])
    loader[data_type] = torch.utils.data.DataLoader(dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers)

# --- Network architecture (core change) ---
class SEBlock(nn.Module):
    """Squeeze-and-excitation style channel gate.

    Each channel is global-average-pooled to a scalar, passed through a
    two-layer bias-free bottleneck ending in a sigmoid, and the resulting
    per-channel weight (between 0 and 1) rescales the input feature map.

    Args:
        channel: Number of channels of the input feature map.
        reduction: Bottleneck ratio; the hidden width is
            ``channel // reduction`` but never less than 1.
    """

    def __init__(self, channel, reduction=16):
        super().__init__()
        # Guard against channel < reduction: a zero-width bottleneck would
        # make the gate a constant 0.5 regardless of the input.
        hidden = max(channel // reduction, 1)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, hidden, bias=False), nn.ReLU(inplace=True),
            nn.Linear(hidden, channel, bias=False), nn.Sigmoid())

    def forward(self, x):
        """Return ``x`` scaled channel-wise by the learned gate (same shape)."""
        b, c, _, _ = x.size()
        # Squeeze: (b, c, h, w) -> (b, c)
        y = self.avg_pool(x).view(b, c)
        # Excite: per-channel weights, reshaped so they broadcast over h and w
        y = self.fc(y).view(b, c, 1, 1)
        return x * y.expand_as(x)

class NetWithAttention(nn.Module):
    """CNN classifier for 64x64 RGB images with an ``SEBlock`` per stage.

    Each of the three convolutional stages applies its convolutions, an
    ``SEBlock`` channel gate, a 2x2 max-pool and dropout (64 -> 32 -> 16 -> 8
    spatially). The resulting 256 x 8 x 8 feature map feeds a four-layer
    fully connected head.

    Args:
        num_classes: Size of the output logit vector. Defaults to 200, the
            class count used for Tiny ImageNet in this notebook.
    """

    def __init__(self, num_classes=200):
        super().__init__()
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True),
            SEBlock(128),
            nn.MaxPool2d(2), nn.Dropout(0.3)
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True),
            SEBlock(256),
            nn.MaxPool2d(2), nn.Dropout(0.3)
        )
        self.conv_block3 = nn.Sequential(
            nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True),
            SEBlock(256),
            nn.MaxPool2d(2), nn.Dropout(0.3)
        )
        self.fc_block = nn.Sequential(
            nn.Flatten(),
            # 256 * 8 * 8 assumes a 64x64 input halved by the three max-pools
            nn.Linear(256 * 8 * 8, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
            nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
            nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.fc_block(x)
        return x
net = NetWithAttention().to(device)

# --- Optimizer and loss function ---
# The optimizer class is looked up by name on torch.optim.
optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)
criterion = nn.CrossEntropyLoss()

# --- Training loop ---
print("--- 开始训练: 实验 5 (Tiny ImageNet 注意力机制) ---")
net.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (img, target) in enumerate(loader["train"]):
        img, target = img.to(device), target.to(device)
        pred = net(img)
        loss = criterion(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Report the mean loss over the last `print_every` iterations, then
        # reset the accumulator. Iterations after the last full window of an
        # epoch are never reported.
        if i % print_every == print_every - 1:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0
print("--- 训练结束 ---")

# --- Evaluation ---
# Top-1 accuracy on the 'val' split, in eval mode and with gradients disabled.
net.eval()
correct, total = 0, 0
with torch.no_grad():
    for img, target in loader["val"]:
        img, target = img.to(device), target.to(device)
        pred = net(img)
        total += len(target)
        correct += (torch.argmax(pred, dim=1) == target).sum().item()
print(f"实验 5 (Tiny ImageNet 注意力机制) 准确率 @ 40 epochs: {100 * correct / total:.2f}%")