# 在 CPU 环境下运行的 CIFAR-10 ResNet 训练笔记本
# 保持原有训练方式与精度，提供详细计时与日志输出

# 目录（按顺序运行）
# 1. 环境与依赖检查（CPU 运行）
# 2. 数据预处理与加载器（CPU 设置）
# 3. 模型定义：带残差块的 ResNet_CIFAR10
# 4. 训练与测试函数：详细日志与计时
# 5. 主程序：CPU 训练流程、学习率调度与模型保存
# 6. 附录：执行顺序说明
# 7. 可视化训练曲线


In [None]:
# 1. 环境与依赖检查（CPU 运行）
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt  # 绘制训练曲线

# Pin execution to the CPU; hosted notebook platforms usually have no GPU.
device = torch.device('cpu')
print(f"使用设备: {device}", f"PyTorch 版本: {torch.__version__}", sep="\n")

# Fixed seed for the NumPy and PyTorch random number generators.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# The cuDNN flag has no effect on a CPU run; it is set explicitly anyway.
torch.backends.cudnn.deterministic = False


In [None]:
# 2. Data preprocessing and loaders (CPU settings)
# Per-channel mean / std applied to both splits after conversion to tensors.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: random crop + horizontal flip augmentation, then
# normalization. The name `transform` is kept for the training pipeline.
transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Evaluation pipeline: no random augmentation, so every evaluation pass sees
# exactly the same images and the reported test metrics are deterministic.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Download (if needed) and load CIFAR-10 under ./data
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

# DataLoaders: num_workers=0 and pin_memory=False for compatibility with
# CPU-only hosted platforms. Only the training split is shuffled.
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=False)

print(f"训练集大小: {len(train_dataset)}, 测试集大小: {len(test_dataset)}")
print(f"批量大小: {batch_size}, 迭代步数/epoch: {len(train_loader)}")


In [None]:
# 3. 模型定义：带残差块的 ResNet_CIFAR10
class ResidualBlock(nn.Module):
    """Basic residual unit: two 3x3 conv + batch-norm layers plus a skip path.

    The skip path is an empty ``nn.Sequential`` (passes its input through)
    unless the block changes the stride or the channel count; in that case a
    1x1 convolution followed by batch norm reshapes the input so it can be
    added to the main path.

    Args:
        in_channels: number of channels of the incoming feature map.
        out_channels: number of channels produced by the block.
        stride: stride of the first 3x3 convolution and of the 1x1 projection.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        needs_projection = stride != 1 or in_channels != out_channels

        # Main path: conv -> bn -> relu -> conv -> bn
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Skip path: project only when the shapes would otherwise mismatch.
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))
        main += self.shortcut(x)
        return self.relu(main)


class ResNetCIFAR10(nn.Module):
    """Simplified four-stage ResNet.

    A 3x3 stem convolution (3 -> 64 channels) is followed by four stages of
    two residual blocks each, with widths 64/128/256/512 and strides 1/2/2/2,
    then global average pooling and a linear classifier.

    Args:
        num_classes: size of the classifier output.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Channel count fed into the next stage; updated by _make_layer.
        self.in_channels = 64

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)

        # Residual stages, registered as layer1 .. layer4 in this order.
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for index, (width, stride) in enumerate(stage_specs, start=1):
            setattr(self, f"layer{index}", self._make_layer(width, num_blocks=2, stride=stride))

        # Head
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, out_channels, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        head = ResidualBlock(self.in_channels, out_channels, stride)
        self.in_channels = out_channels
        tail = [
            ResidualBlock(out_channels, out_channels, 1)
            for _ in range(num_blocks - 1)
        ]
        return nn.Sequential(head, *tail)

    def forward(self, x):
        """Return the classifier output for a batch of images ``x``."""
        features = self.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = stage(features)
        pooled = self.avg_pool(features)
        # Collapse everything except the batch dimension before the classifier.
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)


# Build the network, place it on the selected (CPU) device and show its layout
device_model = ResNetCIFAR10(num_classes=10)
device_model = device_model.to(device)
print(device_model)


In [None]:
# 4. 训练与测试函数：详细日志与计时
def train(model, train_loader, criterion, optimizer, epoch, device):
    """Run one training epoch and report accuracy, loss and timing.

    Args:
        model: network to optimize; it is switched to training mode.
        train_loader: sized iterable yielding ``(inputs, targets)`` batches.
        criterion: loss callable applied to ``(outputs, targets)``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress messages.
        device: device every batch is moved to before the forward pass.

    Returns:
        Tuple ``(epoch_acc, epoch_loss, elapsed, samples_per_sec)``:
        accuracy in percent, mean of the per-batch losses, elapsed seconds,
        and samples processed per second. An empty loader yields 0.0 for
        accuracy, loss and throughput instead of raising ZeroDivisionError.
    """
    model.train()
    num_batches = len(train_loader)  # loop-invariant, computed once
    total_loss = 0.0
    correct = 0
    total = 0
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        # Report every 100 batches and on the final batch to limit log volume.
        if (batch_idx + 1) % 100 == 0 or (batch_idx + 1) == num_batches:
            avg_loss = total_loss / (batch_idx + 1)
            acc = 100.0 * correct / total
            print(f"  [Epoch {epoch:03d}][Batch {batch_idx+1:04d}/{num_batches:04d}] "
                  f"Loss {avg_loss:.4f} | Acc {acc:.2f}%")

    elapsed = time.perf_counter() - start_time
    # Guard the averages so an empty loader reports zeros rather than crashing.
    epoch_loss = total_loss / num_batches if num_batches else 0.0
    epoch_acc = 100.0 * correct / total if total else 0.0
    samples_per_sec = total / elapsed if elapsed > 0 else 0.0
    return epoch_acc, epoch_loss, elapsed, samples_per_sec


def test(model, test_loader, criterion, device):
    """Evaluate the model on a loader without tracking gradients.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        test_loader: sized iterable yielding ``(inputs, targets)`` batches.
        criterion: loss callable applied to ``(outputs, targets)``.
        device: device every batch is moved to before the forward pass.

    Returns:
        Tuple ``(test_acc, test_loss)``: accuracy in percent and the mean of
        the per-batch losses. An empty loader yields ``(0.0, 0.0)`` instead of
        raising ZeroDivisionError.
    """
    model.eval()
    num_batches = len(test_loader)
    total_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    # Guard the averages so an empty loader reports zeros rather than crashing.
    test_loss = total_loss / num_batches if num_batches else 0.0
    test_acc = 100.0 * correct / total if total else 0.0
    return test_acc, test_loss


In [None]:
# 5. Main program: CPU training loop, learning-rate schedule and checkpointing
num_epochs = 200
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(device_model.parameters(), lr=1e-3, weight_decay=1e-4)
# StepLR multiplies the learning rate by gamma every step_size epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

best_test_acc = 0.0
best_model_path = 'resnet_cifar10_cpu_best.pth'

# Per-epoch metric history, kept for later visualization
train_loss_hist, train_acc_hist = [], []
test_loss_hist, test_acc_hist = [], []

print("开始CPU训练（无GPU加速，时间可能较长）...")
for epoch in range(1, num_epochs + 1):
    # Read the LR before scheduler.step() so the log line shows the rate that
    # was actually used while training this epoch.
    current_lr = optimizer.param_groups[0]['lr']

    train_acc, train_loss, elapsed, throughput = train(device_model, train_loader, criterion, optimizer, epoch, device)
    test_acc, test_loss = test(device_model, test_loader, criterion, device)
    scheduler.step()

    # Record history
    train_loss_hist.append(train_loss)
    train_acc_hist.append(train_acc)
    test_loss_hist.append(test_loss)
    test_acc_hist.append(test_acc)

    print(f"Epoch {epoch:03d} | LR {current_lr:.5f} | "
          f"Train Loss {train_loss:.4f} Acc {train_acc:.2f}% | "
          f"Test Loss {test_loss:.4f} Acc {test_acc:.2f}% | "
          f"Epoch Time {elapsed:0.2f}s | Throughput {throughput:0.1f} samples/s")

    # Checkpoint whenever the test accuracy improves on the best seen so far.
    if test_acc > best_test_acc:
        best_test_acc = test_acc
        torch.save({
            'epoch': epoch,
            'model_state_dict': device_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # Saved so a resumed run can restore its position in the LR
            # schedule; existing readers of the other keys are unaffected.
            'scheduler_state_dict': scheduler.state_dict(),
            'best_test_acc': best_test_acc
        }, best_model_path)
        print(f"  >>> 保存最优模型至 {best_model_path}，当前最佳测试准确率: {best_test_acc:.2f}%")

print(f"训练结束，最佳测试准确率: {best_test_acc:.2f}%")


# 6. 附录：执行顺序说明
# 按以下顺序逐个运行各单元：
# 1) 环境与依赖检查（CPU 运行）
# 2) 数据预处理与加载器（CPU 设置）
# 3) 模型定义：带残差块的 ResNet_CIFAR10
# 4) 训练与测试函数：详细日志与计时
# 5) 主程序：CPU 训练流程、学习率调度与模型保存
# 6) 可视化训练曲线（位于本说明之后，需在训练完成后运行）
# 运行完毕后，最佳模型保存在当前目录: resnet_cifar10_cpu_best.pth
# 训练曲线图保存在当前目录: training_curves.png


In [None]:
# 7. Plot the training curves (English titles / labels)
# Epochs are logged 1-based during training, so plot against explicit epoch
# numbers instead of the implicit 0-based list index.
epochs = range(1, len(train_loss_hist) + 1)

plt.figure(figsize=(12, 5))

# Loss curves
plt.subplot(1, 2, 1)
plt.plot(epochs, train_loss_hist, label='Train Loss', color='steelblue', linewidth=2)
plt.plot(epochs, test_loss_hist, label='Test Loss', color='darkorange', linewidth=2)
plt.title('Loss Curves', fontsize=14)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.grid(True, alpha=0.3)
plt.legend()

# Accuracy curves
plt.subplot(1, 2, 2)
plt.plot(epochs, train_acc_hist, label='Train Acc', color='seagreen', linewidth=2)
plt.plot(epochs, test_acc_hist, label='Test Acc', color='firebrick', linewidth=2)
plt.title('Accuracy Curves', fontsize=14)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Accuracy (%)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.legend()

plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig('training_curves.png', dpi=150, bbox_inches='tight')
print("Saved training curves to training_curves.png")
plt.show()
