In [9]:
# Environment and random seeds (for reproducibility)
import os, sys, random, time, platform, json
import numpy as np
import torch

# The seed can be overridden through the SEED environment variable (default 42)
SEED = int(os.environ.get("SEED", 42))
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# cuDNN reproducibility settings
import torch.backends.cudnn as cudnn
cudnn.benchmark = False
cudnn.deterministic = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Print a summary of the runtime environment alongside the seed
print({
    "python": sys.version.split(" ")[0],
    "platform": platform.platform(),
    "pytorch": torch.__version__,
    "cuda_available": torch.cuda.is_available(),
    "cuda_version": torch.version.cuda if torch.cuda.is_available() else None,
    "device": str(device),
})
print("SEED=", SEED)

{'python': '3.10.19', 'platform': 'Windows-10-10.0.22621-SP0', 'pytorch': '2.5.1+cu121', 'cuda_available': True, 'cuda_version': '12.1', 'device': 'cuda'}
SEED= 42


# 复现实验环境与运行说明

本 Notebook 使用 PyTorch>=2 进行 CIFAR-10 图像分类实验。为提高复现性，我们在最前面固定随机种子、打印环境信息，并给出关键开关说明：

- 随机种子：seed 固定，cuDNN 设为 deterministic。
- 设备选择：自动选择 CUDA/GPU 或 CPU。
- 运行产物：基线部分的模型、图像与 CSV 会保存到统一的 RESULTS_DIR 下（后文“改进实验”部分另外使用 `./runs` 与 `./models` 两个目录）。

在训练前后，可参考末尾的“结果表格与总结”与“我学到了什么”。

# Assignment 1

This code baseline is inspired by and modified from [this great tutorial](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).

This code can achieve an accuracy of approximately 86.50% on CIFAR-10. Please set up the environment and run your experiments starting from this baseline. You are expected to achieve an accuracy higher than this baseline.

In [10]:
# 导入必要的库
# torch: PyTorch 的核心张量与自动求导库
import torch
# nn: 神经网络层、损失函数等模块
import torch.nn as nn
# optim: 各类优化器（SGD/Adam 等）
import torch.optim as optim

# torchvision: 计算机视觉常用数据集与图像增广
# tv_datasets: 常见视觉数据集（如 CIFAR-10）
import torchvision.datasets as tv_datasets
# tv_transforms: 图像预处理/数据增强流水线
import torchvision.transforms as tv_transforms

# 额外工具
import os
import random
import numpy as np

# Fix the random seeds for reproducibility.
# The seed is read from the SEED environment variable (default 42), exactly
# like the environment cell at the top of the notebook, so this cell no longer
# silently replaces a user-supplied seed with a hard-coded 42.
SEED = int(os.environ.get("SEED", 42))
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed every visible CUDA device as well
    torch.cuda.manual_seed_all(SEED)


In [11]:
# Experiment parameters and runtime device setup
# Prefer CUDA GPU 0; fall back to the CPU when CUDA is unavailable
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Device log: confirm whether a GPU is actually being used
print(f"Using device: {device}")
if device.type == "cuda":
    try:
        print(f"GPU name: {torch.cuda.get_device_name(0)}")
    except Exception as e:
        # Reading the device name may fail in some environments
        print(f"CUDA available but failed to get device name: {e}")

# Training hyperparameters
num_epochs = 128          # number of training epochs
batch_size = 64           # number of samples per mini-batch
num_workers = 2           # number of DataLoader workers (keep it moderate on Windows)
print_every = 200         # print the training loss every this many iterations

# Optimizer configuration
optim_name = "Adam"       # optimizer class name looked up on torch.optim (e.g. 'SGD', 'Adam')
optim_kwargs = dict(
    lr=3e-4,              # learning rate
    weight_decay=1e-6,    # weight decay (L2 regularisation)
)

# Preprocessing / data-augmentation pipelines for the input images
# Train and test share the same normalisation; the training pipeline additionally
# prepends random augmentations to improve generalisation
transformation = dict()
for data_type in ("train", "test"):
    is_train = data_type=="train"
    # Compose chains the transforms in the listed order
    transformation[data_type] = tv_transforms.Compose(([ 
        # Augmentations applied to the training split only
        tv_transforms.RandomRotation(degrees=15),                 # random rotation
        tv_transforms.RandomHorizontalFlip(),                     # random horizontal flip
        tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)), # random translation
    ] if is_train else []) + 
    [
        tv_transforms.ToTensor(),                                 # convert to a tensor scaled to [0,1]
        # Normalise with mean 0.5 and std 0.5: (x - 0.5) / 0.5 -> roughly rescales to [-1,1]
        tv_transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])

Using device: cuda:0
GPU name: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [12]:
# Results directory and path constants (all artifacts are saved in one place)
import time, shutil
from pathlib import Path

# Timestamp tag used to give every run its own results directory
RUN_TAG = time.strftime("%Y%m%d_%H%M%S")
RESULTS_ROOT = "./results"
RESULTS_DIR = os.path.join(RESULTS_ROOT, f"cifar10_{RUN_TAG}")
os.makedirs(RESULTS_DIR, exist_ok=True)
print(f"Results will be saved to: {RESULTS_DIR}")

# Artifact paths, all located under RESULTS_DIR
BEST_MODEL_PATH = os.path.join(RESULTS_DIR, "best_model_cifar10.pth")
TRAIN_CURVES_PNG = os.path.join(RESULTS_DIR, "cifar10_training_curves.png")
CM_PNG = os.path.join(RESULTS_DIR, "cifar10_confusion_matrix.png")
ABLATION_CSV = os.path.join(RESULTS_DIR, "cifar10_ablation_results.csv")

# Absolute path of the current notebook (used for archiving)
# NOTE(review): this is a hard-coded, machine-specific absolute path; it will not
# exist on other machines - consider making it configurable.
NOTEBOOK_ABS_PATH = "/data/zhangzhikui/githubbase/DL/HW1/CIFAR-10.ipynb"
NOTEBOOK_COPY_PATH = os.path.join(RESULTS_DIR, f"CIFAR-10_{RUN_TAG}.ipynb")

Results will be saved to: ./results\cifar10_20251027_012326


In [13]:
# Prepare the CIFAR-10 datasets and data loaders
dataset, loader = {}, {}
for data_type in ("train", "test"):
    is_train = data_type=="train"
    # CIFAR-10: 50000 training images + 10000 test images, 10 classes
    dataset[data_type] = tv_datasets.CIFAR10(
        root="./data",                 # dataset directory (created automatically if missing)
        train=is_train,                # train / test split
        download=True,                 # download the data if it is not available locally
        transform=transformation[data_type],  # apply the preprocessing defined above
    )
    # The DataLoader yields mini-batches and shuffles the training split
    loader[data_type] = torch.utils.data.DataLoader(
        dataset[data_type],
        batch_size=batch_size,
        shuffle=is_train,              # shuffle the training split only; keep test order fixed
        num_workers=num_workers,       # number of loading workers
    )


Files already downloaded and verified
Files already downloaded and verified


In [14]:
# Define the convolutional neural network (CNN) architecture
net = nn.Sequential(
    # Downsampling stage 1: 3x32x32 input -> intermediate features
    nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    # Downsampling stage 2
    nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    # Deeper convolutional feature extraction
    nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
    # Reduce the channel count and downsample once more
    nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
    # Flatten into the input of the fully connected layers
    nn.Flatten(),
    # Fully connected classification head, regularised with Dropout
    nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
    nn.Linear(128, 10),  # logits for the 10 classes
)

# Move the model to the selected device (GPU/CPU)
net.to(device)

# Report the number of trainable parameters (in millions)
print(f"number of parameters: {sum(p.numel() for p in net.parameters() if p.requires_grad) / 1_000_000:.2f}M")

number of parameters: 7.28M


## Start Training

In [15]:
# Build the optimizer (the class is looked up dynamically on torch.optim by name)
optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)

# Multi-class cross-entropy loss for single-label classification
criterion = nn.CrossEntropyLoss()

# Training loop
net.train()  # switch to training mode (enables the training behaviour of Dropout/BN)
for epoch in range(num_epochs):

    running_loss = 0.0
    for i, (img, target) in enumerate(loader["train"]):
        # Move the batch to the same device as the model
        img, target = img.to(device), target.to(device)

        # Forward pass producing the logits
        pred = net(img)
        loss = criterion(pred, target)

        # Backward pass and parameter update
        optimizer.zero_grad()  # clear the gradients left from the previous step
        loss.backward()        # compute the current gradients
        optimizer.step()       # update the parameters according to the optimizer

        # Bookkeeping and logging: print the mean loss of the last print_every iterations
        running_loss += loss.item()
        if i % print_every == print_every - 1:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0

print("Finished Training")

[epoch=  1, iter=  200] loss: 2.220
[epoch=  1, iter=  400] loss: 1.997
[epoch=  1, iter=  600] loss: 1.841


KeyboardInterrupt: 

## Evaluating its accuracy

In [None]:
# Switch to evaluation mode (disables Dropout / freezes BN statistics)
net.eval()
correct, total = 0, 0
with torch.no_grad():  # no gradients are needed for evaluation (less memory, faster)
    for img, target in loader["test"]:
        img, target = img.to(device), target.to(device)
        
        # Forward inference producing the per-class logits
        pred = net(img)
        
        # Accumulate statistics: compare the top-1 prediction with the true label
        total += len(target)
        correct += (torch.argmax(pred, dim=1) == target).sum().item()

print(f"Accuracy of the network on the {total} test images: {100 * correct / total:.2f}%")

# 改进实验整体流程
- **阶段 1：消融实验（各因素单独 10 轮）**
  - Baseline：简洁 CNN（无残差/注意力/高级增强/优化）
  - 依次只添加一个因素：残差连接、SE 通道注意力、加深网络、强化数据增强、优化策略
  - 记录每个因素带来的验证集增益，筛选出有效改进
- **阶段 2：组合训练（150 轮）**
  - 将阶段 1 证明有效的因素组合到同一配置
  - 使用较长训练计划 + 最佳优化策略，保存最佳模型
- **阶段 3：结果分析**
  - 对比 Baseline 与最终模型
  - 回顾各改进因素的贡献，总结经验

## 阶段 1：10 轮消融实验设置
- 统一使用批量 128、基础学习率 0.1（SGD），不使用学习率调度作为基线。
- 数据集划分：训练集 45k / 验证集 5k（从官方训练集划分）。
- 指标：验证集 top-1 准确率，记录最后 3 轮的平均值以减小波动（注：当前 `run_experiment` 的实现实际记录并汇总的是各轮中的最佳验证准确率 `best_val_acc`）。
- 逐个因素实验时，其余保持与基线一致，便于对比。

In [None]:
# 环境初始化与实验配置
import os
import math
import time
import random
from dataclasses import dataclass, field, asdict
from typing import Dict, Any, List, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR
from torch.cuda.amp import autocast, GradScaler
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader, random_split

# Seed every random number generator used by the improved-experiment section.
# NOTE(review): this re-assigns SEED (and `device` below), which earlier cells
# of the notebook also define.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Runtime device and shared constants
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NUM_CLASSES = 10
DATA_ROOT = './data'
RESULT_DIR = './runs'      # CSV / JSON outputs of the experiments
MODEL_DIR = './models'     # saved model checkpoints
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Log library versions and the selected device
print('Torch', torch.__version__, '| TorchVision', torchvision.__version__)
print('CUDA available:', torch.cuda.is_available(), '| device:', device)


def set_seed(seed: int = 42) -> None:
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        seed: Value handed to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


@dataclass
class ExperimentConfig:
    """Settings for a single training run (optimisation, architecture, augmentation)."""

    name: str = 'baseline'            # label used in log lines and result tables
    epochs: int = 10
    batch_size: int = 128
    base_lr: float = 0.1
    momentum: float = 0.9
    weight_decay: float = 5e-4
    optimizer: str = 'sgd'            # 'sgd' or 'adamw'
    scheduler: str = 'none'           # 'none' | 'cosine' | 'onecycle'
    label_smoothing: float = 0.0
    use_amp: bool = False
    grad_clip: float = 0.0
    max_steps_per_epoch: int = 0      # 0 means no limit; used for quick smoke tests

    # Architecture options
    use_residual: bool = False
    use_se: bool = False
    depth: int = 2                   # blocks per stage
    width: int = 1                   # channel multiplier

    # Data augmentation / regularisation
    use_strong_aug: bool = False
    randaugment_n: int = 0
    randaugment_m: int = 9
    use_mixup: bool = False
    mixup_alpha: float = 0.2
    use_cutmix: bool = False
    cutmix_alpha: float = 1.0
    label_smoothing_for_aug: float = 0.05

    # Miscellaneous
    save_path: str = ''
    seed: int = SEED

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dictionary."""
        return asdict(self)


ModuleNotFoundError: No module named 'pandas'

### 数据集拆分与数据增强策略
- 基线使用 **RandomCrop + HorizontalFlip** 与标准化。
- 强增强在基于 RandAugment 的基础上叠加 Cutout / RandomErasing，并在 batch 级别选择性启用 MixUp 或 CutMix。
- 训练集按照 45k / 5k 划分验证集，保证消融测试公平对比。

In [None]:
# Data loading and augmentation pipeline
# Per-channel mean / std constants passed to the Normalize transform
CIFAR_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR_STD = (0.2023, 0.1994, 0.2010)


class Cutout:
    """Zero out randomly placed square patches of a tensor image.

    Args:
        n_holes: Number of patches drawn per image.
        length: Nominal side length of a patch; each patch extends
            ``length // 2`` pixels on either side of a random centre and is
            clipped at the image border.
    """

    def __init__(self, n_holes: int = 1, length: int = 16):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img: torch.Tensor) -> torch.Tensor:
        if not torch.is_tensor(img):
            raise TypeError('Cutout 需要在 ToTensor 之后使用。')

        height, width = img.shape[1], img.shape[2]
        half = self.length // 2
        keep = torch.ones((height, width), device=img.device)

        for _ in range(self.n_holes):
            # Draw the row first, then the column, for each patch centre
            centre_y = int(torch.randint(0, height, (1,)))
            centre_x = int(torch.randint(0, width, (1,)))
            top, bottom = max(0, centre_y - half), min(height, centre_y + half)
            left, right = max(0, centre_x - half), min(width, centre_x + half)
            keep[top:bottom, left:right] = 0

        # Broadcast the 2-D keep mask over the leading dimension of the image
        return img * keep.expand_as(img)


class TransformSubset(torch.utils.data.Dataset):
    """View onto selected items of a dataset with its own transform.

    Args:
        dataset: Indexable source yielding ``(image, target)`` pairs.
        indices: Positions in ``dataset`` that make up this subset.
        transform: Callable applied to the image, or ``None`` to leave the
            image untouched.
    """

    def __init__(self, dataset, indices: List[int], transform):
        self.dataset, self.indices, self.transform = dataset, indices, transform

    def __len__(self) -> int:
        return len(self.indices)

    def __getitem__(self, idx: int):
        sample, label = self.dataset[self.indices[idx]]
        if self.transform is None:
            return sample, label
        return self.transform(sample), label


def build_transforms(cfg: ExperimentConfig):
    """Create the ``(train, eval)`` transform pipelines for a configuration.

    The training pipeline always applies random crop and horizontal flip,
    followed by tensor conversion and normalisation. With
    ``cfg.use_strong_aug`` it additionally inserts RandAugment (only when
    ``cfg.randaugment_n > 0``) and ColorJitter before the tensor conversion,
    and appends Cutout and RandomErasing after normalisation. The evaluation
    pipeline only converts to a tensor and normalises.

    Returns:
        Tuple of ``(train_transform, eval_transform)``.
    """
    strong = cfg.use_strong_aug

    # Transforms that run before ToTensor
    before_tensor = [T.RandomCrop(32, padding=4), T.RandomHorizontalFlip()]
    if strong and cfg.randaugment_n > 0:
        before_tensor.append(T.RandAugment(cfg.randaugment_n, cfg.randaugment_m))
    if strong:
        before_tensor.append(T.ColorJitter(0.3, 0.3, 0.3, 0.2))

    # Tensor conversion, normalisation and the tensor-level augmentations
    from_tensor = [T.ToTensor(), T.Normalize(CIFAR_MEAN, CIFAR_STD)]
    if strong:
        from_tensor += [
            Cutout(n_holes=1, length=12),
            T.RandomErasing(p=0.25, scale=(0.02, 0.25)),
        ]

    eval_pipeline = T.Compose([T.ToTensor(), T.Normalize(CIFAR_MEAN, CIFAR_STD)])
    return T.Compose(before_tensor + from_tensor), eval_pipeline


def build_dataloaders(cfg: ExperimentConfig, num_workers: int = 0) -> Dict[str, DataLoader]:
    """Build the train / val / test DataLoaders for one experiment.

    The official CIFAR-10 training set is split into a training part and a
    5000-image validation part using a generator with a fixed seed, so every
    configuration sees the same split. The training part uses the training
    transform from ``build_transforms``; the validation part and the official
    test set use the evaluation transform.

    Args:
        cfg: Experiment configuration (``seed``, ``batch_size`` and the
            augmentation switches are read).
        num_workers: Worker count passed to each DataLoader. The default of 0
            is the safer choice on Windows + notebook environments.

    Returns:
        Dict with the keys ``'train'``, ``'val'`` and ``'test'``.
    """
    set_seed(cfg.seed)
    train_transform, eval_transform = build_transforms(cfg)

    # Loaded without a transform so that each split can apply its own pipeline
    base_train = torchvision.datasets.CIFAR10(
        root=DATA_ROOT,
        train=True,
        download=True,
        transform=None
    )
    test_dataset = torchvision.datasets.CIFAR10(
        root=DATA_ROOT,
        train=False,
        download=True,
        transform=eval_transform
    )

    total_len = len(base_train)
    val_len = 5000
    train_len = total_len - val_len
    # Fixed generator seed: the split does not depend on cfg.seed
    generator = torch.Generator().manual_seed(0)
    train_indices, val_indices = random_split(range(total_len), [train_len, val_len], generator=generator)

    train_subset = TransformSubset(base_train, list(train_indices), train_transform)
    val_subset = TransformSubset(base_train, list(val_indices), eval_transform)

    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only machine just triggers a warning, so enable it only with CUDA.
    pin_memory = torch.cuda.is_available()

    def _make_loader(split_dataset, shuffle: bool) -> DataLoader:
        """Create a DataLoader with the settings shared by all splits."""
        return DataLoader(split_dataset, batch_size=cfg.batch_size, shuffle=shuffle,
                          num_workers=num_workers, pin_memory=pin_memory)

    return {
        'train': _make_loader(train_subset, shuffle=True),
        'val': _make_loader(val_subset, shuffle=False),
        'test': _make_loader(test_dataset, shuffle=False),
    }


### 模型结构因子
- **Residual**：stage 内使用残差跳连，缓解梯度消失。
- **SE Attention**：引入通道注意力，自适应重标特征。
- **Depth / Width**：通过 `depth` 和 `width` 超参调整网络容量。
- 所有变化都在同一个 `TinyCIFARNet` 架构中切换，保证实验公平。

In [None]:
# 模型构建
class SEModule(nn.Module):
    """Squeeze-and-excitation style channel gate.

    The input is average-pooled to 1x1, passed through two 1x1 convolutions
    (ReLU in between) and a sigmoid; the result rescales the input channels.

    Args:
        ch: Number of input (and output) channels.
        reduction: Divisor for the bottleneck width, which is
            ``max(ch // reduction, 4)``.
    """

    def __init__(self, ch: int, reduction: int = 16):
        super().__init__()
        bottleneck = max(ch // reduction, 4)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(ch, bottleneck, kernel_size=1, bias=True)
        self.conv2 = nn.Conv2d(bottleneck, ch, kernel_size=1, bias=True)

    def forward(self, x):
        squeezed = self.conv1(self.pool(x))
        excited = self.conv2(F.relu(squeezed, inplace=True))
        gate = torch.sigmoid(excited)
        return x * gate


class ConvBlock(nn.Module):
    """Two 3x3 conv + BatchNorm layers with optional SE gate and skip connection.

    Args:
        in_ch: Input channel count.
        out_ch: Output channel count.
        stride: Stride of the first convolution (and of the projection
            shortcut, when one is created).
        residual: Add a skip connection around the block.
        use_se: Apply an ``SEModule`` to the output of the second BatchNorm.

    When ``residual`` is set and the stride or channel count changes the
    tensor shape, the skip path is a 1x1 conv + BatchNorm projection;
    otherwise ``shortcut`` is ``None`` and the input itself is added.
    """

    def __init__(self, in_ch: int, out_ch: int, stride: int = 1,
                 residual: bool = False, use_se: bool = False):
        super().__init__()
        self.residual = residual
        self.use_se = use_se

        self.conv1 = nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_ch)
        self.se = SEModule(out_ch) if use_se else nn.Identity()

        shape_changes = stride != 1 or in_ch != out_ch
        if residual and shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_ch),
            )
        else:
            self.shortcut = None

    def forward(self, x):
        features = F.relu(self.bn1(self.conv1(x)), inplace=True)
        features = self.se(self.bn2(self.conv2(features)))
        if self.residual:
            skip = self.shortcut(x) if self.shortcut is not None else x
            features = features + skip
        return F.relu(features, inplace=True)


class TinyCIFARNet(nn.Module):
    """Configurable three-stage CNN classifier.

    A conv stem is followed by three stages of ``ConvBlock``s (the second and
    third stage start with a stride-2 block) and a global-average-pool +
    linear head producing ``NUM_CLASSES`` logits. ``cfg.width`` multiplies the
    base channel counts (32, 64, 128), ``cfg.depth`` sets the blocks per
    stage, and ``cfg.use_residual`` / ``cfg.use_se`` are forwarded to every
    block.
    """

    def __init__(self, cfg: ExperimentConfig):
        super().__init__()
        ch1, ch2, ch3 = [base * cfg.width for base in (32, 64, 128)]
        block_opts = dict(residual=cfg.use_residual, use_se=cfg.use_se)

        self.stem = nn.Sequential(
            nn.Conv2d(3, ch1, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(ch1),
            nn.ReLU(inplace=True),
        )
        self.stage1 = self._make_stage(ch1, ch1, cfg.depth, stride=1, **block_opts)
        self.stage2 = self._make_stage(ch1, ch2, cfg.depth, stride=2, **block_opts)
        self.stage3 = self._make_stage(ch2, ch3, cfg.depth, stride=2, **block_opts)
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(ch3, NUM_CLASSES),
        )
        self._init_weights()

    def _make_stage(self, in_ch: int, out_ch: int, depth: int,
                    stride: int, residual: bool, use_se: bool) -> nn.Sequential:
        """Stack blocks for one stage; only the first changes channels/stride.

        At least one block is always created, even when ``depth`` is below 1.
        """
        layers = []
        for position in range(max(depth, 1)):
            is_first = position == 0
            layers.append(ConvBlock(in_ch if is_first else out_ch, out_ch,
                                    stride=stride if is_first else 1,
                                    residual=residual, use_se=use_se))
        return nn.Sequential(*layers)

    def _init_weights(self):
        """Kaiming-normal convs, unit BatchNorm scale, small-normal linear weights."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
                continue
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.zeros_(module.bias)

    def forward(self, x):
        for part in (self.stem, self.stage1, self.stage2, self.stage3, self.head):
            x = part(x)
        return x


def build_model(cfg: ExperimentConfig) -> nn.Module:
    """Instantiate ``TinyCIFARNet`` for ``cfg`` and move it to the global ``device``."""
    return TinyCIFARNet(cfg).to(device)


In [None]:
# 训练与评估工具
class AverageMeter:
    """Accumulate a weighted running sum and expose its mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the accumulated sum and count."""
        self.sum, self.count = 0.0, 0

    def update(self, value: float, n: int = 1):
        """Add ``value`` with weight ``n`` (e.g. a batch mean and the batch size)."""
        self.count += n
        self.sum += value * n

    @property
    def avg(self) -> float:
        """Weighted mean of all updates; 0.0 when nothing has been recorded."""
        return self.sum / self.count if self.count != 0 else 0.0


def soft_cross_entropy(logits: torch.Tensor, soft_targets: torch.Tensor) -> torch.Tensor:
    """Cross-entropy between logits and soft (probability) targets.

    Args:
        logits: Raw scores; softmax is taken over dimension 1.
        soft_targets: Target weights with the same shape as ``logits``.

    Returns:
        Scalar tensor: the mean over dimension 0 of the per-sample losses.
    """
    per_sample = torch.sum(soft_targets * F.log_softmax(logits, dim=1), dim=1)
    return -per_sample.mean()


def one_hot(targets: torch.Tensor, num_classes: int, smoothing: float = 0.0) -> torch.Tensor:
    """Encode class indices as (optionally label-smoothed) one-hot rows.

    The target class receives ``1 - smoothing``; each of the other
    ``num_classes - 1`` classes receives ``smoothing / (num_classes - 1)``.

    Args:
        targets: 1-D tensor of integer class indices.
        num_classes: Number of columns in the encoding.
        smoothing: Probability mass moved away from the target class.

    Returns:
        Tensor of shape ``(len(targets), num_classes)`` on the device of
        ``targets``, built without gradient tracking.
    """
    with torch.no_grad():
        off_value = smoothing / (num_classes - 1)
        encoded = torch.full((targets.size(0), num_classes), off_value,
                             device=targets.device)
        encoded.scatter_(1, targets.unsqueeze(1), 1.0 - smoothing)
    return encoded


def prepare_optimizer(model: nn.Module, cfg: ExperimentConfig, steps_per_epoch: int):
    """Create the optimizer and (optional) LR scheduler described by ``cfg``.

    ``cfg.optimizer == 'adamw'`` selects AdamW; any other value selects SGD
    with Nesterov momentum. ``cfg.scheduler`` may be ``'cosine'`` (annealed
    over ``cfg.epochs``) or ``'onecycle'`` (``cfg.epochs * steps_per_epoch``
    steps); any other value yields no scheduler.

    Returns:
        Tuple ``(optimizer, scheduler)`` where ``scheduler`` may be ``None``.
    """
    params = model.parameters()
    if cfg.optimizer == 'adamw':
        opt = optim.AdamW(params, lr=cfg.base_lr, weight_decay=cfg.weight_decay)
    else:
        opt = optim.SGD(params, lr=cfg.base_lr, momentum=cfg.momentum,
                        weight_decay=cfg.weight_decay, nesterov=True)

    if cfg.scheduler == 'cosine':
        return opt, CosineAnnealingLR(opt, T_max=cfg.epochs)
    if cfg.scheduler == 'onecycle':
        return opt, OneCycleLR(opt, max_lr=cfg.base_lr,
                               steps_per_epoch=steps_per_epoch, epochs=cfg.epochs)
    return opt, None


def apply_mixup_cutmix(inputs: torch.Tensor, targets: torch.Tensor, cfg: ExperimentConfig):
    """Apply MixUp or CutMix to a batch and build the matching soft targets.

    MixUp takes precedence when both are enabled. When neither applies (the
    flag is off or its alpha is not positive) the inputs are returned
    unchanged together with the label-smoothed one-hot targets.

    Args:
        inputs: Image batch of shape ``(batch, channels, height, width)``.
        targets: Integer class indices for the batch.
        cfg: Reads ``use_mixup``, ``mixup_alpha``, ``use_cutmix``,
            ``cutmix_alpha`` and ``label_smoothing_for_aug``.

    Returns:
        Tuple ``(mixed_inputs, soft_targets)``. The ``inputs`` argument is
        never modified in place.
    """
    soft_targets = one_hot(targets, NUM_CLASSES, smoothing=cfg.label_smoothing_for_aug)

    if cfg.use_mixup and cfg.mixup_alpha > 0:
        lam = float(np.random.beta(cfg.mixup_alpha, cfg.mixup_alpha))
        index = torch.randperm(inputs.size(0), device=inputs.device)
        mixed = lam * inputs + (1 - lam) * inputs[index]
        soft_targets = lam * soft_targets + (1 - lam) * soft_targets[index]
        return mixed, soft_targets

    if cfg.use_cutmix and cfg.cutmix_alpha > 0:
        lam = float(np.random.beta(cfg.cutmix_alpha, cfg.cutmix_alpha))
        batch_size, _, h, w = inputs.size()
        index = torch.randperm(batch_size, device=inputs.device)

        # Side lengths of the pasted box so that its area is about (1 - lam)
        cut_rat = math.sqrt(1.0 - lam)
        cut_w = int(w * cut_rat)
        cut_h = int(h * cut_rat)
        cx = int(np.random.randint(w))
        cy = int(np.random.randint(h))
        # Clip the box to the image; plain Python ints are used so slicing and
        # the arithmetic below do not involve NumPy scalar types.
        x1 = min(max(cx - cut_w // 2, 0), w)
        y1 = min(max(cy - cut_h // 2, 0), h)
        x2 = min(max(cx + cut_w // 2, 0), w)
        y2 = min(max(cy + cut_h // 2, 0), h)

        # Paste into a copy so the caller's batch is left untouched
        mixed = inputs.clone()
        mixed[:, :, y1:y2, x1:x2] = inputs[index, :, y1:y2, x1:x2]

        # Recompute lam from the clipped box so the labels match the pixels
        lam = 1.0 - ((x2 - x1) * (y2 - y1) / (w * h))
        soft_targets = lam * soft_targets + (1 - lam) * soft_targets[index]
        return mixed, soft_targets

    return inputs, soft_targets


def train_one_epoch(model: nn.Module, loaders: Dict[str, DataLoader], cfg: ExperimentConfig,
                    optimizer, scheduler=None, scaler: Optional[GradScaler] = None) -> Dict[str, float]:
    """Train ``model`` for one pass over ``loaders['train']``.

    Args:
        model: Network to train; switched to training mode here.
        loaders: Dict of DataLoaders; only the ``'train'`` entry is used.
        cfg: Controls MixUp/CutMix, label smoothing, AMP, gradient clipping,
            the scheduler type and an optional cap on steps per epoch.
        optimizer: Optimizer whose step is taken once per batch.
        scheduler: Optional LR scheduler; stepped per batch for ``'onecycle'``
            and once at the end of the epoch for ``'cosine'``.
        scaler: Optional ``GradScaler``; only used when ``cfg.use_amp`` is set.

    Returns:
        Dict with the sample-weighted ``'train_loss'`` and ``'train_acc'``.
        With MixUp/CutMix the accuracy is measured against the argmax of the
        soft targets.
    """
    model.train()
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    train_loader = loaders['train']
    criterion = nn.CrossEntropyLoss(label_smoothing=cfg.label_smoothing)

    for step, (inputs, targets) in enumerate(train_loader):
        # Optional early stop of the epoch (max_steps_per_epoch == 0 disables it)
        if cfg.max_steps_per_epoch and (step >= cfg.max_steps_per_epoch):
            break
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)

        use_soft = cfg.use_mixup or cfg.use_cutmix
        if use_soft:
            inputs, soft_targets = apply_mixup_cutmix(inputs, targets, cfg)
        with autocast(enabled=cfg.use_amp):
            outputs = model(inputs)
            if use_soft:
                # Mixed batches: soft-target loss; accuracy uses the dominant label
                loss = soft_cross_entropy(outputs, soft_targets)
                hard_targets = torch.argmax(soft_targets, dim=1)
            elif cfg.label_smoothing > 0.0:
                # Label smoothing via explicit soft targets built by one_hot
                soft_targets = one_hot(targets, NUM_CLASSES, smoothing=cfg.label_smoothing)
                loss = soft_cross_entropy(outputs, soft_targets)
                hard_targets = targets
            else:
                loss = criterion(outputs, targets)
                hard_targets = targets

        if scaler is not None and cfg.use_amp:
            scaler.scale(loss).backward()
            if cfg.grad_clip > 0:
                # Gradients are unscaled before clipping so the threshold
                # applies to their true magnitude
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            if cfg.grad_clip > 0:
                nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
            optimizer.step()

        # The one-cycle schedule advances once per batch
        if scheduler is not None and cfg.scheduler == 'onecycle':
            scheduler.step()

        loss_meter.update(loss.item(), inputs.size(0))
        preds = outputs.argmax(dim=1)
        acc = (preds == hard_targets).float().mean().item()
        acc_meter.update(acc, inputs.size(0))

    # The cosine schedule advances once per epoch
    if scheduler is not None and cfg.scheduler == 'cosine':
        scheduler.step()

    return {'train_loss': loss_meter.avg, 'train_acc': acc_meter.avg}


def evaluate(model: nn.Module, loader: DataLoader) -> Tuple[float, float]:
    """Compute the mean cross-entropy loss and top-1 accuracy over ``loader``.

    The model is put in evaluation mode and run without gradient tracking.
    Both metrics are weighted by batch size.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``; both are 0.0 for an empty loader.
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss()
    loss_total, acc_total, seen = 0.0, 0.0, 0

    with torch.no_grad():
        for batch_inputs, batch_targets in loader:
            batch_inputs = batch_inputs.to(device, non_blocking=True)
            batch_targets = batch_targets.to(device, non_blocking=True)
            logits = model(batch_inputs)
            batch_loss = loss_fn(logits, batch_targets)
            batch_acc = (logits.argmax(dim=1) == batch_targets).float().mean().item()

            n = batch_inputs.size(0)
            loss_total += batch_loss.item() * n
            acc_total += batch_acc * n
            seen += n

    if seen == 0:
        return 0.0, 0.0
    return loss_total / seen, acc_total / seen


### 训练循环封装
封装单次实验（训练 + 验证）的流程，便于在消融与主训练中重复调用。支持：
- 自动写入日志（DataFrame）
- 可选 AMP、梯度裁剪、调度器
- 训练完成后返回验证集最优指标与对应权重路径

In [None]:
# 单次实验执行

def run_experiment(cfg: ExperimentConfig, loaders: Dict[str, DataLoader] | None = None,
                   track_test: bool = False, verbose: bool = True) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """Train and validate one configuration, tracking the best epoch.

    Args:
        cfg: Experiment configuration.
        loaders: Optional pre-built DataLoaders with the keys ``'train'``,
            ``'val'`` and ``'test'``; built from ``cfg`` when omitted.
        track_test: Evaluate the best-validation weights on the test loader
            after training.
        verbose: Print one progress line per epoch.

    Returns:
        Tuple ``(history_df, summary)``. ``history_df`` holds one row per
        epoch. ``summary`` contains ``config``, ``best_val_acc``,
        ``best_epoch``, ``hist``, ``test_loss``, ``test_acc``, ``total_time``
        and ``model_path`` (``None`` when nothing was saved).
    """
    import copy  # local import: needed only to snapshot the best checkpoint

    if loaders is None:
        loaders = build_dataloaders(cfg)
    set_seed(cfg.seed)
    model = build_model(cfg)
    steps_per_epoch = len(loaders['train'])
    optimizer, scheduler = prepare_optimizer(model, cfg, steps_per_epoch)
    scaler = GradScaler(enabled=cfg.use_amp)

    history: List[Dict[str, Any]] = []
    best_val = 0.0
    best_state = None
    start_time = time.time()

    for epoch in range(1, cfg.epochs + 1):
        epoch_start = time.time()
        train_metrics = train_one_epoch(model, loaders, cfg, optimizer, scheduler, scaler)
        val_loss, val_acc = evaluate(model, loaders['val'])
        record = {
            'epoch': epoch,
            'train_loss': train_metrics['train_loss'],
            'train_acc': train_metrics['train_acc'],
            'val_loss': val_loss,
            'val_acc': val_acc,
            'lr': optimizer.param_groups[0]['lr'],
            'epoch_time': time.time() - epoch_start,
        }
        history.append(record)
        if verbose:
            print(f"[{cfg.name}] Epoch {epoch:03d}/{cfg.epochs} | "
                  f"train_acc={train_metrics['train_acc']:.3f} | val_acc={val_acc:.3f} | "
                  f"time={record['epoch_time']:.1f}s")
        # Always keep the first epoch so a checkpoint exists even if the
        # validation accuracy never rises above 0.
        if best_state is None or val_acc > best_val:
            best_val = val_acc
            # state_dict() returns references to the live tensors, which later
            # epochs keep updating in place. Deep-copy them so the snapshot
            # really holds the weights of the best epoch.
            best_state = {
                'model': copy.deepcopy(model.state_dict()),
                'optimizer': copy.deepcopy(optimizer.state_dict()),
                'epoch': epoch
            }

    history_df = pd.DataFrame(history)
    test_metrics = {'test_loss': None, 'test_acc': None}
    if track_test:
        # Restore the best weights when available (none exist if epochs == 0)
        if best_state is not None:
            model.load_state_dict(best_state['model'])
        test_loss, test_acc = evaluate(model, loaders['test'])
        test_metrics = {'test_loss': test_loss, 'test_acc': test_acc}

    summary = {
        'config': cfg.to_dict(),
        'best_val_acc': best_val,
        'best_epoch': best_state['epoch'] if best_state else None,
        'hist': history_df,
        **test_metrics,
        'total_time': time.time() - start_time
    }

    if cfg.save_path and best_state is not None:
        # Make sure the target directory exists before writing the checkpoint
        save_dir = os.path.dirname(cfg.save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        torch.save(best_state, cfg.save_path)
        summary['model_path'] = cfg.save_path
    else:
        summary['model_path'] = None

    return history_df, summary


### 阶段 1：10 轮消融实验
- **Baseline**：无残差/SE，depth=2，width=1，轻量增强，仅 SGD。
- 单因素实验（每个 10 轮）：
  1. `+Residual`：启用残差连接。
  2. `+SE`：在基线（不启用残差）上单独启用 SE 通道注意力；残差与 SE 可各自独立开关。
  3. `+Deeper/Wider`：将 `depth=3`，`width=2`。
  4. `+Augmentation`：开启 RandAugment + ColorJitter + Cutout + RandomErasing，并在 batch 级别启用 MixUp。
  5. `+Optimizer`：使用 AdamW + Cosine 调度 + AMP + Label Smoothing。
- 运行后保存结果表格到 `runs/cifar10_ablation_summary.csv`，供阶段 2 选择组合。

In [None]:
# Stage 1: run the ablation experiments
# Reference configuration that every single-factor ablation is derived from
baseline_cfg = ExperimentConfig(
    name='baseline', epochs=10, batch_size=128, base_lr=0.1,
    optimizer='sgd', scheduler='none', label_smoothing=0.0,
    use_amp=False, grad_clip=0.0, use_residual=False, use_se=False,
    depth=2, width=1, use_strong_aug=False, use_mixup=False, use_cutmix=False,
    randaugment_n=0, randaugment_m=9, weight_decay=5e-4
)


def clone_config(base: ExperimentConfig, name: str, **overrides) -> ExperimentConfig:
    """Return a new config copied from ``base`` with ``overrides`` applied.

    Args:
        base: Configuration to copy; it is not modified.
        name: Name assigned to the new configuration.
        **overrides: Field values that replace those of ``base``.
    """
    merged = {**base.to_dict(), **overrides, 'name': name}
    return ExperimentConfig(**merged)


# (factor key, configuration) pairs: the baseline plus one variant per factor
ablation_configs = [
    ('baseline', baseline_cfg),
    ('residual', clone_config(baseline_cfg, 'abl_residual', use_residual=True)),
    ('se', clone_config(baseline_cfg, 'abl_se', use_se=True)),
    ('deep_wide', clone_config(baseline_cfg, 'abl_deepwide', depth=3, width=2)),
    ('augmentation', clone_config(baseline_cfg, 'abl_aug', use_strong_aug=True,
                                  randaugment_n=2, randaugment_m=9, use_mixup=True,
                                  use_cutmix=False, label_smoothing_for_aug=0.05)),
    ('optimizer', clone_config(baseline_cfg, 'abl_opt', optimizer='adamw', base_lr=3e-4,
                               scheduler='cosine', label_smoothing=0.1,
                               use_amp=True, grad_clip=1.0))
]

RUN_ABLATION = True  # when True, running this cell starts the 6 experiments; set to False to skip them
ablation_history: Dict[str, pd.DataFrame] = {}   # factor key -> per-epoch history
ablation_summary: List[Dict[str, Any]] = []      # one summary row per experiment

if RUN_ABLATION:
    total = len(ablation_configs)
    for idx, (factor, cfg) in enumerate(ablation_configs, start=1):
        print('=' * 80)
        print(f'[Stage 1] 进度: {idx}/{total} | 因子: {factor} | 配置名: {cfg.name}')
        print(f"  residual={cfg.use_residual} | se={cfg.use_se} | depth={cfg.depth} | width={cfg.width}")
        print(f"  strong_aug={cfg.use_strong_aug} | optimizer={cfg.optimizer} | lr={cfg.base_lr} | scheduler={cfg.scheduler}")
        hist, summary = run_experiment(cfg)
        ablation_history[factor] = hist
        ablation_summary.append({
            'factor': factor,
            'name': cfg.name,
            'best_val_acc': summary['best_val_acc'],
            'best_epoch': summary['best_epoch'],
            'total_time_min': summary['total_time'] / 60.0,
            'config': cfg.to_dict()
        })
        print(f"完成 {factor} | best_val_acc={summary['best_val_acc']*100:.2f}% @ epoch {summary['best_epoch']}")

    # Persist the summary table so that stage 2 can reload it from disk
    ablation_df = pd.DataFrame(ablation_summary)
    ablation_path = os.path.join(RESULT_DIR, 'cifar10_ablation_summary.csv')
    ablation_df.to_csv(ablation_path, index=False)
    print('=' * 80)
    print(f'Ablation summary saved to {ablation_path}')
else:
    print('设置 RUN_ABLATION = True 并重新运行此单元以启动消融实验。')


### 阶段 2：组合主要训练
- 根据阶段 1 结果，筛选验证集准确率超过 Baseline 的因素。
- 将这些因素合并，形成主训练配置（150 轮）。
- 默认开启：AMP、AdamW、Cosine 调度、MixUp（若在阶段 1 中表现优秀）。
- 保存最好模型权重到 `models/cifar10_best.pth`。

In [None]:
# Combined main training
# Config overrides applied for each ablation factor selected in stage 1
FACTOR_PATCHES = {
    'residual': {'use_residual': True},
    'se': {'use_se': True},
    'deep_wide': {'depth': 3, 'width': 2},
    'augmentation': {
        'use_strong_aug': True,
        'randaugment_n': 2,
        'randaugment_m': 9,
        'use_mixup': True,
        'use_cutmix': False,
        'label_smoothing_for_aug': 0.05
    },
    'optimizer': {
        'optimizer': 'adamw',
        'base_lr': 3e-4,
        'scheduler': 'cosine',
        'label_smoothing': 0.1,
        'use_amp': True,
        'grad_clip': 1.0
    }
}


def load_ablation_table() -> pd.DataFrame:
    """Return the Stage 1 ablation results as a DataFrame.

    A non-empty ``ablation_summary`` global (in-memory results) takes priority;
    otherwise ``cifar10_ablation_summary.csv`` under ``RESULT_DIR`` is read.

    Raises:
        FileNotFoundError: if there are no in-memory results and the CSV is missing.
    """
    records = globals().get('ablation_summary', ())
    if len(records) > 0:
        return pd.DataFrame(records)

    csv_file = os.path.join(RESULT_DIR, 'cifar10_ablation_summary.csv')
    if not os.path.exists(csv_file):
        raise FileNotFoundError('未找到消融实验结果，请先运行上一单元或读取 CSV。')
    return pd.read_csv(csv_file)


# Stage 2 is opt-in. Note that (re-)running this cell resets main_history / main_summary
# to None before the flag is checked.
RUN_MAIN_TRAIN = False
main_history = None
main_summary = None

if RUN_MAIN_TRAIN:
    ablation_df = load_ablation_table()
    # The baseline row is the reference every other factor is compared against.
    baseline_row = ablation_df[ablation_df['factor'] == 'baseline']
    if baseline_row.empty:
        raise ValueError('消融结果中缺少 baseline，请确认阶段 1 已成功运行。')
    baseline_acc = baseline_row['best_val_acc'].max()
    # Keep only factors whose best validation accuracy is strictly above the baseline's.
    selected = ablation_df[ablation_df['best_val_acc'] > baseline_acc]['factor'].tolist()
    print('Selected factors:', selected)

    # Start from the baseline config with the main-run overrides passed here
    # (name, epochs, batch size, checkpoint path, weight decay).
    combined_cfg = clone_config(
        baseline_cfg,
        'main_training',
        epochs=150,
        batch_size=128,
        save_path=os.path.join(MODEL_DIR, 'cifar10_best.pth'),
        weight_decay=3e-4
    )
    # Apply each selected factor's patch attribute-by-attribute; factors without an
    # entry in FACTOR_PATCHES contribute nothing.
    for factor in selected:
        for key, value in FACTOR_PATCHES.get(factor, {}).items():
            setattr(combined_cfg, key, value)

    # Whenever MixUp or CutMix is enabled, keep label_smoothing_for_aug at 0.05 or higher.
    if combined_cfg.use_mixup or combined_cfg.use_cutmix:
        combined_cfg.label_smoothing_for_aug = max(combined_cfg.label_smoothing_for_aug, 0.05)

    # Save the final config as JSON; the confusion-matrix cell reads this file back.
    config_path = os.path.join(RESULT_DIR, 'cifar10_main_config.json')
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(combined_cfg.to_dict(), f, ensure_ascii=False, indent=2)
    print(f'Final main config saved to {config_path}')

    print('Final main config:')
    print(combined_cfg)
    main_history, main_summary = run_experiment(combined_cfg, track_test=True)
    # main_history is written with .to_csv, so it is expected to be a DataFrame.
    main_history.to_csv(os.path.join(RESULT_DIR, 'cifar10_main_history.csv'), index=False)
    print('Main training finished. Best val acc:', main_summary['best_val_acc'])
else:
    print('设置 RUN_MAIN_TRAIN = True 并运行此单元以执行 150 轮主训练。')


In [None]:
# Result-summary helpers.
# Artifact locations shared by the reporting cells below; the file names match the
# ones written by the Stage 1 / Stage 2 cells.
ABLATION_CSV = os.path.join(RESULT_DIR, 'cifar10_ablation_summary.csv')
MAIN_HISTORY_CSV = os.path.join(RESULT_DIR, 'cifar10_main_history.csv')
BEST_MODEL_PATH = os.path.join(MODEL_DIR, 'cifar10_best.pth')


def get_ablation_df() -> pd.DataFrame:
    """Return the ablation summary as a DataFrame.

    Uses the ``ablation_summary`` global when it exists and is non-empty,
    otherwise falls back to the CSV at ``ABLATION_CSV``.

    Raises:
        FileNotFoundError: if neither in-memory results nor the CSV are available.
    """
    records = globals().get('ablation_summary', ())
    if len(records) > 0:
        return pd.DataFrame(records)
    if not os.path.exists(ABLATION_CSV):
        raise FileNotFoundError('缺少 ablation summary，请先运行阶段 1 单元。')
    return pd.read_csv(ABLATION_CSV)


def get_main_history() -> pd.DataFrame:
    """Return the main-training history as a DataFrame.

    Returns the ``main_history`` global itself when it is a DataFrame,
    otherwise reads the CSV at ``MAIN_HISTORY_CSV``.

    Raises:
        FileNotFoundError: if no in-memory history exists and the CSV is missing.
    """
    cached = globals().get('main_history')
    if isinstance(cached, pd.DataFrame):
        return cached
    if not os.path.exists(MAIN_HISTORY_CSV):
        raise FileNotFoundError('未找到主训练历史，请先执行阶段 2 单元。')
    return pd.read_csv(MAIN_HISTORY_CSV)


def summarize_factors(ablation_df: pd.DataFrame) -> pd.DataFrame:
    """Rank ablation runs and add each run's gain over the baseline.

    Args:
        ablation_df: Ablation summary with at least the columns ``factor`` and
            ``best_val_acc``; one row must have ``factor == 'baseline'``.

    Returns:
        A copy of ``ablation_df`` (the input is not modified) with an extra
        ``gain_vs_baseline`` column (``best_val_acc`` minus the baseline's best
        ``best_val_acc``), sorted by ``best_val_acc`` in descending order.

    Raises:
        ValueError: if a required column is missing, or if there is no baseline
            row (previously this silently produced an all-NaN gain column).
    """
    if 'best_val_acc' not in ablation_df.columns:
        raise ValueError('ablation summary 需要包含 best_val_acc 列。')
    if 'factor' not in ablation_df.columns:
        raise ValueError('ablation summary 需要包含 factor 列。')

    baseline_scores = ablation_df.loc[ablation_df['factor'] == 'baseline', 'best_val_acc']
    if baseline_scores.empty:
        # max() of an empty selection is NaN, which would poison every gain value.
        raise ValueError('消融结果中缺少 baseline，无法计算相对增益。')
    baseline_acc = float(baseline_scores.max())

    ranked = ablation_df.copy()
    ranked['gain_vs_baseline'] = ranked['best_val_acc'] - baseline_acc
    return ranked.sort_values('best_val_acc', ascending=False)


In [None]:
# Tabular summary of the ablation and main-training results
from IPython.display import display

# Ablation table. Best-effort: any failure is reported and leaves ablation_summary_df as None,
# which the later plotting/export cells check for.
try:
    ablation_df = get_ablation_df()
    ablation_summary_df = summarize_factors(ablation_df)
    print('Ablation summary (降序)：')
    display(
        ablation_summary_df.loc[
            :, ['factor', 'best_val_acc', 'gain_vs_baseline', 'best_epoch', 'total_time_min']
        ]
    )
except Exception as exc:
    ablation_summary_df = None
    print(f'无法载入消融结果: {exc}')

# Main-training overview: the best-validation epoch versus the last recorded epoch.
try:
    main_hist_df = get_main_history()
    best_val = main_hist_df.loc[main_hist_df['val_acc'].idxmax()]
    main_overview = pd.DataFrame({
        'metric': ['best_val_acc', 'final_val_acc'],
        'value': [best_val['val_acc'], main_hist_df['val_acc'].iloc[-1]],
        'epoch': [int(best_val['epoch']), int(main_hist_df['epoch'].iloc[-1])],
    })
    print('Main training overview:')
    display(main_overview)
except Exception as exc:
    main_hist_df = None
    print(f'无法载入主训练历史: {exc}')


In [None]:
# Learning-curve visualization (loss on the left, accuracy on the right)
if main_hist_df is None:
    print('尚未获得主训练历史，跳过绘图。')
else:
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

    # Left panel: train / validation loss per epoch.
    axes[0].plot(main_hist_df['epoch'], main_hist_df['train_loss'], label='train_loss')
    axes[0].plot(main_hist_df['epoch'], main_hist_df['val_loss'], label='val_loss')
    axes[0].set(xlabel='Epoch', ylabel='Loss', title='Loss Curve')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Right panel: train / validation accuracy per epoch.
    axes[1].plot(main_hist_df['epoch'], main_hist_df['train_acc'], label='train_acc')
    axes[1].plot(main_hist_df['epoch'], main_hist_df['val_acc'], label='val_acc')
    # The best-accuracy reference line needs the in-memory summary of a Stage 2 run.
    if globals().get('main_summary') is not None:
        axes[1].axhline(y=main_summary['best_val_acc'], color='red', linestyle='--', label='best_val_acc')
    axes[1].set(xlabel='Epoch', ylabel='Accuracy', title='Accuracy Curve')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    fig.tight_layout()
    curve_path = os.path.join(RESULT_DIR, 'cifar10_main_curves.png')
    fig.savefig(curve_path, dpi=200)
    print(f'学习曲线已保存到 {curve_path}')


### 阶段 3：结果分析与可视化
- 与 Baseline 对比最终模型性能。
- 输出每个因素的贡献条形图。
- 绘制主训练的学习曲线（accuracy/loss）。
- （可选）计算测试集混淆矩阵。

In [None]:
# Factor-contribution bar chart: validation-accuracy gain of every non-baseline factor.
if ablation_summary_df is not None:
    contrib_df = ablation_summary_df[ablation_summary_df['factor'] != 'baseline']
    if not contrib_df.empty:
        plt.figure(figsize=(8, 4))
        plt.bar(contrib_df['factor'], contrib_df['gain_vs_baseline'])
        plt.ylabel('Δ val_acc vs baseline')
        plt.title('Factor Contribution (validation gain)')
        plt.grid(True, axis='y', alpha=0.3)
        for idx, val in enumerate(contrib_df['gain_vs_baseline']):
            # Put the label outside the bar: above positive bars, below negative ones
            # (gains can be negative because all non-baseline factors are plotted).
            plt.text(idx, val, f'{val:.3f}', ha='center', va='bottom' if val >= 0 else 'top')
        contrib_path = os.path.join(RESULT_DIR, 'cifar10_factor_gains.png')
        plt.tight_layout()
        plt.savefig(contrib_path, dpi=200)
        print(f'因素贡献图已保存到 {contrib_path}')
    else:
        # contrib_df holds every non-baseline row (not only the winners), so an empty
        # frame means the summary contains nothing but the baseline.
        print('消融结果中只有 baseline，没有可对比的因素。')
else:
    print('尚无消融总结，无法绘制贡献图。')


In [None]:
# (Optional) Confusion matrix on the test set
try:
    from sklearn.metrics import confusion_matrix, classification_report
except ImportError:
    # scikit-learn is optional; without it the whole cell is skipped below.
    confusion_matrix = None
    classification_report = None

if os.path.exists(BEST_MODEL_PATH) and confusion_matrix is not None:
    # Rebuild the model from the saved main config when available, else from the baseline.
    config_path = os.path.join(RESULT_DIR, 'cifar10_main_config.json')
    if os.path.exists(config_path):
        with open(config_path, 'r', encoding='utf-8') as f:
            cfg_loaded = ExperimentConfig(**json.load(f))
    else:
        cfg_loaded = clone_config(baseline_cfg, 'eval')

    loaders_eval = build_dataloaders(cfg_loaded)
    # Inputs are moved to `device` in the loop below, so the model must live there too;
    # .to(device) is a no-op if build_model already placed it.
    model_eval = build_model(cfg_loaded).to(device)
    # NOTE(review): torch.load unpickles the file — only load checkpoints you trust
    # (consider weights_only=True if the checkpoint contains nothing but tensors).
    state = torch.load(BEST_MODEL_PATH, map_location=device)
    # Accept either a raw state_dict or a checkpoint dict that stores it under 'model'.
    model_eval.load_state_dict(state['model'] if isinstance(state, dict) and 'model' in state else state)
    model_eval.eval()

    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, targets in loaders_eval['test']:
            inputs = inputs.to(device)
            outputs = model_eval(inputs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.numpy())

    # Class names must be strings: classification_report measures len() of each name,
    # so the integer fallback is converted explicitly.
    test_dataset = loaders_eval['test'].dataset
    if hasattr(test_dataset, 'classes'):
        tick_labels = [str(name) for name in test_dataset.classes]
    else:
        tick_labels = [str(class_id) for class_id in range(NUM_CLASSES)]
    # Pin the label set so the matrix is always len(tick_labels) x len(tick_labels),
    # even if some class never occurs in the labels or predictions.
    # Assumes class ids are 0..n-1 in tick_labels order — TODO confirm for non-CIFAR datasets.
    label_ids = list(range(len(tick_labels)))

    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, cmap='Blues')
    plt.title('Confusion Matrix (Test)')
    plt.xlabel('Predicted'); plt.ylabel('True')
    plt.colorbar()
    plt.xticks(range(len(tick_labels)), tick_labels, rotation=45, ha='right')
    plt.yticks(range(len(tick_labels)), tick_labels)
    # Use white text on the dark (high-count) cells so the numbers stay readable.
    thresh = cm.max() / 2.0
    for i in range(len(tick_labels)):
        for j in range(len(tick_labels)):
            plt.text(j, i, cm[i, j], ha='center', va='center',
                     color='white' if cm[i, j] > thresh else 'black')
    plt.tight_layout()
    cm_path = os.path.join(RESULT_DIR, 'cifar10_confusion_matrix.png')
    plt.savefig(cm_path, dpi=200)
    plt.show()

    if classification_report is not None:
        print(classification_report(all_labels, all_preds, labels=label_ids, target_names=tick_labels))
    print(f'混淆矩阵已保存到 {cm_path}')
else:
    print('缺少最佳模型或 sklearn，跳过混淆矩阵绘制。')


### 消融结果备注
运行阶段 1 后，可在此单元下方自动加载 `cifar10_ablation_summary.csv` 并生成贡献分析图。若需要重复实验，可清空 `runs/` 目录并重新运行相应单元。

In [None]:
# Quick look at the ablation CSV (optional)
if not os.path.exists(ABLATION_CSV):
    print('尚未生成 ablation CSV。')
else:
    tmp_df = pd.read_csv(ABLATION_CSV)
    display(tmp_df)


In [None]:
# (Optional) Archive the current notebook into the results directory
import shutil

# Resolved against the current working directory.
NOTEBOOK_ABS_PATH = os.path.abspath('CIFAR-10.ipynb')
NOTEBOOK_COPY_PATH = os.path.join(RESULT_DIR, 'CIFAR10_notebook_backup.ipynb')
try:
    # Best-effort copy: a failure is reported but does not stop the notebook.
    shutil.copy2(src=NOTEBOOK_ABS_PATH, dst=NOTEBOOK_COPY_PATH)
    print(f'Notebook archived to: {NOTEBOOK_COPY_PATH}')
except Exception as exc:
    print(f'归档失败: {exc}')


## 实验总结与分析（待根据实测结果补充）
- Baseline：简洁 CNN + 轻量增强，10 轮即可复现课程参考精度。
- 阶段 1：通过 5 个因素的消融实验量化增益，筛选表现优于 Baseline 的方案。
- 阶段 2：将有效因素（如 Residual、SE、Deeper/Wider、强化增强、AdamW+Cosine 等）组合，训练 150 轮并保存最佳模型。
- 阶段 3：对比 Baseline 与最终模型精度、绘制学习曲线与因素增益图，并可生成混淆矩阵辅助分析。

> 建议：在完成全部实验后，将关键数值（最佳验证/测试准确率、各因素增益）填入课程报告的表格中，同时结合曲线与混淆矩阵撰写文字分析。

In [None]:
# Export the ablation summary as Markdown (for the report)
if ablation_summary_df is not None:
    md_path = os.path.join(RESULT_DIR, 'cifar10_ablation_summary.md')
    try:
        # Render before opening the file so a failure cannot leave an empty file behind.
        md_text = ablation_summary_df.to_markdown(index=False)
    except ImportError as exc:
        # DataFrame.to_markdown depends on the optional `tabulate` package; treat its
        # absence like the other optional steps in this notebook and skip the export.
        print(f'无法导出 Markdown（缺少可选依赖 tabulate）: {exc}')
    else:
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(md_text)
        print(f'消融结果已导出到 {md_path}')
else:
    print('暂无消融数据可导出。')


In [None]:
# Run metadata (optional): snapshot of seed, device, headline metrics and artifact paths.
meta = {
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'seed': SEED,
    'device': str(device),
    # Metrics are None unless a non-empty main_summary exists in the notebook namespace.
    **{
        field: (main_summary[field] if globals().get('main_summary') else None)
        for field in ('best_val_acc', 'best_epoch')
    },
    'artifacts': {
        'result_dir': RESULT_DIR,
        'model_dir': MODEL_DIR,
        # File artifacts are recorded only if they exist on disk, otherwise None.
        **{
            label: (location if os.path.exists(location) else None)
            for label, location in (
                ('best_model_path', BEST_MODEL_PATH),
                ('ablation_csv', ABLATION_CSV),
                ('main_history_csv', MAIN_HISTORY_CSV),
            )
        },
    },
}
meta_path = os.path.join(RESULT_DIR, 'cifar10_run_meta.json')
with open(meta_path, mode='w', encoding='utf-8') as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
print(f'元数据已写入 {meta_path}')


# 我学到了什么（反思）

在本次 CIFAR-10 任务中，我从一个简洁的 CNN 出发，逐步引入以下因素并进行对比：

- 残差与 SE 注意力：缓解退化并提升特征表达；
- 更深/更宽的结构：带来容量提升但需要配合正则与调参；
- 更强的数据增强（含 Cutout/MixUp/RandomErasing）：有效抑制过拟合；
- AdamW 与 Warmup+Cosine：更稳定的优化与更好的最终性能；
- Label Smoothing：在类别间相似时可提升泛化表现。

结合训练曲线与消融结果，我理解到“配方”需要整体协同：增强强度、epoch 数、正则化与学习率调度彼此影响，不可孤立看待。

## 快速冒烟测试（Smoke Test）
为验证改进后的代码在本机可直接运行，这里进行一次极简训练：
- 只训练 1 个 epoch
- 每个 epoch 仅跑 5 个 mini-batches（max_steps_per_epoch=5）
- 使用较小的 batch_size=64

期望：无异常报错，输出训练与验证的基本指标。

In [None]:
# Run the smoke test: a minimal configuration used only to check that the pipeline
# executes end to end.
cfg = ExperimentConfig(
    name='smoke_baseline',
    epochs=1,                 # a single epoch
    batch_size=64,
    base_lr=0.05,
    optimizer='sgd',
    scheduler='none',
    use_amp=False,
    max_steps_per_epoch=5,    # per the notes above: only 5 mini-batches per epoch
    use_residual=False,
    use_se=False,
    depth=2,
    width=1,
    use_strong_aug=False,
)

# Loaders are built explicitly with num_workers=0 and passed to run_experiment.
loaders = build_dataloaders(cfg, num_workers=0)
# NOTE: this rebinds the notebook-level names cfg / hist / summary used by the Stage 1 cell.
hist, summary = run_experiment(cfg, loaders, track_test=False, verbose=True)

print('\n=== Smoke Test Summary ===')
# Print only the headline fields of the summary dict.
print({k: v for k, v in summary.items() if k in ['best_val_acc', 'best_epoch', 'total_time']})
print('History (tail):')
print(hist.tail())
