In [35]:
!pip install torchinfo



In [36]:
import torch
from torch import nn
from torch.nn import functional as F
from torchinfo import summary
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LambdaLR


In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if device.type == "cuda":
  !nvidia-smi

Using device: cuda
Sun Sep 28 16:47:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   74C    P0             31W /   70W |   14864MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                             

定义模型

In [None]:
class Model(nn.Module):
  def __init__(self): # 使用卷积神经网络
    super().__init__()
    self.conv1 = nn.Conv2d(3,96,kernel_size=11,stride=4)  # 卷积层
    self.pool1 = nn.MaxPool2d(kernel_size=3,stride=2)   # 池化层

    self.conv2 = nn.Conv2d(96,256,kernel_size=5,padding=2)
    self.pool2 = nn.MaxPool2d(kernel_size=3,stride=2)

    self.conv3 = nn.Conv2d(256,384,kernel_size=3,padding=1)
    self.conv4 = nn.Conv2d(384,384,kernel_size=3,padding=1)
    self.conv5 = nn.Conv2d(384,256,kernel_size=3,padding=1)
    self.pool3 = nn.MaxPool2d(kernel_size=3,stride=2)
    #FC
    self.fc1 = nn.Linear(6*6*256,4096)
    self.fc2 = nn.Linear(4096,4096)
    self.fc3 = nn.Linear(4096,100)  # 最后一层输出维度为100，对应CIFAR-100数据集中100个类别的分类任务，这个网络原先为ImageNet(1000分类且数据量更大)设计，这里直接改最后一层输出维度导致过拟合
    # CIFAR-100数据量太小而这个模型太过复杂


  def forward(self,x):
    x = F.relu(self.conv1(x)) # 使用ReLu激活函数，减少梯度消失问题
    x = self.pool1(x)
    x = F.relu(self.conv2(x))
    x = self.pool2(x)
    x = F.relu(self.conv3(x))
    x = F.relu(self.conv4(x))
    x = F.relu(self.conv5(x))
    x = self.pool3(x)
    x = x.view(-1,6*6*256) # 把多维特征图拉平为一维向量，送入全连接层
    x = F.relu(F.dropout(self.fc1(x),0.5)) # Dropout以50%的概率随机置0神经元的输出，用于减少过拟合并降低神经元间的协同依赖（co-adaptation）
    x = F.relu(F.dropout(self.fc2(x),0.5))
    output = self.fc3(x)  # 不要加上softmax,后续使用CrossEntropyLoss作为loss函数会自动进行log-softmax。如果这里使用softmax，会导致梯度消失
    return output

In [39]:
transform = transforms.Compose([   # transform将图片转化成神经网络需要的输入
    transforms.Resize((227,227)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5071,0.4867,0.4408],std=[0.2675,0.2565,0.2761])  # 这是 CIFAR-100 数据集每个通道的均值与标准差
])

In [40]:
train_set = datasets.CIFAR100(root='./data',train=True,download=True,transform=transform)
test_set = datasets.CIFAR100(root='./data',train=False,download=True,transform=transform)

In [41]:
train_loader = DataLoader(train_set,batch_size=128,shuffle=True) # 按批次从训练集中提取出数据，供模型训练
test_loader = DataLoader(test_set,batch_size=128,shuffle=False)

In [42]:
# 快速测试神经网络结构是否搭建成功、数据是否兼容、能否在GPU上跑通。
net = Model().to(device)  # 模型的全部参数和缓存转移到指定设备（通常是GPU）
images, labels = next(iter(train_loader))
images, labels = images.to(device), labels.to(device) # 把数据加载到指定设备
out = net(images)
print(out.shape)

torch.Size([128, 100])


In [43]:
def lr_lambda(epoch):  # 自定义学习率
  warmup_epochs = 5   # 热身
  if epoch < warmup_epochs:
    return (epoch + 1) / warmup_epochs
  else:   #训练到一定程度降低学习率
    return 0.5 ** ((epoch - warmup_epochs) // 20)

In [44]:
loss_fn = nn.CrossEntropyLoss()  # 使用交叉熵损失函数（适用于多分类问题），内部会自动执行 log_softmax + NLLLoss。不需要手动添加 softmax 层
optimizer = torch.optim.SGD(net.parameters(),lr=0.01,momentum=0.9)  #使用带动量的SGD（随机梯度下降）
scheduler = LambdaLR(optimizer=optimizer,lr_lambda=lr_lambda)  # LambdaLR自定义学习率调度器

Train

In [None]:
from tqdm import tqdm

num_epochs = 50

loss_list = []

for epoch in range(num_epochs):
  print(f"lr_lambda(epoch={epoch}): {lr_lambda(epoch):.6f}")
  epoch_loss = 0
  num_batches = 0
  for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
    images, labels = images.to(device), labels.to(device) # 数据加载到指定设备
    optimizer.zero_grad()  # 把梯度清空
    outputs = net(images)  # 取得预测值，正向传播
    loss = loss_fn(outputs,labels) # 计算loss函数
    loss.backward() # 反向传播
    optimizer.step()  # 更新模型参数
    epoch_loss += loss.item()
    num_batches += 1
  avg_loss = epoch_loss / num_batches
  loss_list.append(avg_loss)
  print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
  net.eval() # 切换到评估模式（禁用dropout, BN）
  correct = 0
  total = 0
  scheduler.step()  # 学习率调度器更新
  with torch.no_grad(): # 禁用梯度计算（节省内存 & 加速 & 避免使用测试集训练）
    for images, labels in tqdm(test_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
      images, labels = images.to(device), labels.to(device)
      outputs = net(images)
      preds = outputs.argmax(dim=1) # 获取每行最大值索引（即预测的类别）
      correct += (preds == labels).sum().item()
      total += labels.size(0)
    accuracy = correct / total
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    net.train() # 将模型改为训练模式

    # 保存模型
    torch.save(net.state_dict(), "alexnet_cifar100_latest.pth")
    print(f"已保存模型参数：alexnet_cifar100_latest.pth")

lr_lambda(epoch=0): 0.200000


Epoch 1/50: 100%|██████████| 391/391 [02:19<00:00,  2.81it/s]


Epoch [1/50], Loss: 4.6051
当前学习率: 0.004000


Epoch 1/50: 100%|██████████| 79/79 [00:20<00:00,  3.93it/s]


Test Accuracy: 0.98%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=1): 0.400000


Epoch 2/50: 100%|██████████| 391/391 [02:12<00:00,  2.95it/s]


Epoch [2/50], Loss: 4.6027
当前学习率: 0.006000


Epoch 2/50: 100%|██████████| 79/79 [00:19<00:00,  3.99it/s]


Test Accuracy: 1.53%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=2): 0.600000


Epoch 3/50: 100%|██████████| 391/391 [02:12<00:00,  2.96it/s]


Epoch [3/50], Loss: 4.2407
当前学习率: 0.008000


Epoch 3/50: 100%|██████████| 79/79 [00:20<00:00,  3.92it/s]


Test Accuracy: 8.15%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=3): 0.800000


Epoch 4/50: 100%|██████████| 391/391 [02:13<00:00,  2.93it/s]


Epoch [4/50], Loss: 3.8041
当前学习率: 0.010000


Epoch 4/50: 100%|██████████| 79/79 [00:19<00:00,  4.00it/s]


Test Accuracy: 14.91%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=4): 1.000000


Epoch 5/50: 100%|██████████| 391/391 [02:12<00:00,  2.95it/s]


Epoch [5/50], Loss: 3.4299
当前学习率: 0.010000


Epoch 5/50: 100%|██████████| 79/79 [00:20<00:00,  3.87it/s]


Test Accuracy: 20.07%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=5): 1.000000


Epoch 6/50: 100%|██████████| 391/391 [02:12<00:00,  2.95it/s]


Epoch [6/50], Loss: 3.0642
当前学习率: 0.010000


Epoch 6/50: 100%|██████████| 79/79 [00:20<00:00,  3.91it/s]


Test Accuracy: 26.59%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=6): 1.000000


Epoch 7/50: 100%|██████████| 391/391 [02:12<00:00,  2.95it/s]


Epoch [7/50], Loss: 2.7460
当前学习率: 0.010000


Epoch 7/50: 100%|██████████| 79/79 [00:19<00:00,  4.01it/s]


Test Accuracy: 32.67%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=7): 1.000000


Epoch 8/50: 100%|██████████| 391/391 [02:12<00:00,  2.94it/s]


Epoch [8/50], Loss: 2.4291
当前学习率: 0.010000


Epoch 8/50: 100%|██████████| 79/79 [00:20<00:00,  3.85it/s]


Test Accuracy: 35.01%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=8): 1.000000


Epoch 9/50: 100%|██████████| 391/391 [02:13<00:00,  2.93it/s]


Epoch [9/50], Loss: 2.1603
当前学习率: 0.010000


Epoch 9/50: 100%|██████████| 79/79 [00:20<00:00,  3.87it/s]


Test Accuracy: 39.03%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=9): 1.000000


Epoch 10/50: 100%|██████████| 391/391 [02:12<00:00,  2.94it/s]


Epoch [10/50], Loss: 1.8639
当前学习率: 0.010000


Epoch 10/50: 100%|██████████| 79/79 [00:19<00:00,  4.00it/s]


Test Accuracy: 40.95%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=10): 1.000000


Epoch 11/50: 100%|██████████| 391/391 [02:12<00:00,  2.96it/s]


Epoch [11/50], Loss: 1.6022
当前学习率: 0.010000


Epoch 11/50: 100%|██████████| 79/79 [00:19<00:00,  4.02it/s]


Test Accuracy: 42.83%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=11): 1.000000


Epoch 12/50: 100%|██████████| 391/391 [02:12<00:00,  2.96it/s]


Epoch [12/50], Loss: 1.3309
当前学习率: 0.010000


Epoch 12/50: 100%|██████████| 79/79 [00:19<00:00,  4.00it/s]


Test Accuracy: 42.02%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=12): 1.000000


Epoch 13/50: 100%|██████████| 391/391 [02:13<00:00,  2.94it/s]


Epoch [13/50], Loss: 1.0863
当前学习率: 0.010000


Epoch 13/50: 100%|██████████| 79/79 [00:19<00:00,  3.97it/s]


Test Accuracy: 44.06%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=13): 1.000000


Epoch 14/50: 100%|██████████| 391/391 [02:12<00:00,  2.95it/s]


Epoch [14/50], Loss: 0.8745
当前学习率: 0.010000


Epoch 14/50: 100%|██████████| 79/79 [00:20<00:00,  3.94it/s]


Test Accuracy: 43.33%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=14): 1.000000


Epoch 15/50: 100%|██████████| 391/391 [02:13<00:00,  2.92it/s]


Epoch [15/50], Loss: 0.7218
当前学习率: 0.010000


Epoch 15/50: 100%|██████████| 79/79 [00:19<00:00,  3.98it/s]


Test Accuracy: 42.54%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=15): 1.000000


Epoch 16/50: 100%|██████████| 391/391 [02:12<00:00,  2.95it/s]


Epoch [16/50], Loss: 0.5868
当前学习率: 0.010000


Epoch 16/50: 100%|██████████| 79/79 [00:19<00:00,  3.95it/s]


Test Accuracy: 42.79%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=16): 1.000000


Epoch 17/50: 100%|██████████| 391/391 [02:12<00:00,  2.95it/s]


Epoch [17/50], Loss: 0.5158
当前学习率: 0.010000


Epoch 17/50: 100%|██████████| 79/79 [00:21<00:00,  3.75it/s]


Test Accuracy: 42.67%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=17): 1.000000


Epoch 18/50: 100%|██████████| 391/391 [02:11<00:00,  2.97it/s]


Epoch [18/50], Loss: 0.4141
当前学习率: 0.010000


Epoch 18/50: 100%|██████████| 79/79 [00:20<00:00,  3.90it/s]


Test Accuracy: 42.06%
已保存模型参数：alexnet_cifar100_latest.pth
lr_lambda(epoch=18): 1.000000


Epoch 19/50: 100%|██████████| 391/391 [02:12<00:00,  2.96it/s]


Epoch [19/50], Loss: 0.3740
当前学习率: 0.010000


Epoch 19/50: 100%|██████████| 79/79 [00:20<00:00,  3.92it/s]


Test Accuracy: 42.03%


In [None]:
torch.save(net.state_dict(), "alexnet_cifar100.pth")
print("模型参数已保存为 alexnet_cifar100.pth")

In [None]:
import matplotlib.pyplot as plt
plt.plot(loss_list, marker='o')
plt.xlabel("Epoch")
plt.ylabel("Training Loss")
plt.title("Loss Curve")
plt.grid(True)
plt.show()