In [19]:
# 1. 环境初始化、高级配置与辅助函数

# 核心库导入
import os, sys, random, time, platform, json, math
from dataclasses import dataclass, asdict
from typing import Dict, Any, List, Tuple, Optional

# 科学计算与深度学习库
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR
from torch.cuda.amp import autocast, GradScaler
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader, random_split
from IPython.display import display

# --- Initial setup ---
# Global random seed: passed to set_seed() below and used as the default
# value of ExperimentConfig.seed.
SEED = 42
def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs so that runs are repeatable.

    When CUDA is available, all GPU generators are seeded as well and cuDNN
    is switched to deterministic mode with autotuning (benchmark) disabled.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for run-to-run determinism.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(SEED) # seed every RNG once at notebook start

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Unified path configuration ---
# Each kernel start gets its own timestamped results directory.
RUN_TAG = time.strftime("%Y%m%d_%H%M%S")
RESULTS_ROOT = "./results"
RESULTS_DIR = os.path.join(RESULTS_ROOT, f"cifar10_{RUN_TAG}")
os.makedirs(RESULTS_DIR, exist_ok=True)
print(f"Results will be saved to: {RESULTS_DIR}")

# Artifact paths shared by the cells below
BEST_MODEL_PATH = os.path.join(RESULTS_DIR, "best_model_cifar10.pth")
ABLATION_CSV = os.path.join(RESULTS_DIR, "cifar10_ablation_results.csv")
MAIN_HISTORY_CSV = os.path.join(RESULTS_DIR, 'cifar10_main_history.csv')
NOTEBOOK_ABS_PATH = "/data/zhangzhikui/githubbase/DL/HW1/CIFAR-10.ipynb" # machine-specific absolute path; adjust it to your own checkout
NOTEBOOK_COPY_PATH = os.path.join(RESULTS_DIR, f"CIFAR-10_{RUN_TAG}.ipynb")
NUM_CLASSES = 10
DATA_ROOT = './data'

# --- Print environment information ---
print({
    "python": sys.version.split(" ")[0],
    "platform": platform.platform(),
    "pytorch": torch.__version__,
    "torchvision": torchvision.__version__,
    "cuda_available": torch.cuda.is_available(),
    "cuda_version": torch.version.cuda if torch.cuda.is_available() else None,
    "device": str(device),
    "seed": SEED
})


# --- 实验配置 ---
@dataclass
class ExperimentConfig:
    """Hyper-parameters and feature switches for one training run.

    The fields are read by the transform/dataloader builders, the model
    constructor, the optimizer/scheduler factory and the training loop.
    """
    name: str = 'baseline'
    epochs: int = 10
    batch_size: int = 128
    base_lr: float = 0.1
    momentum: float = 0.9
    weight_decay: float = 5e-4
    optimizer: str = 'sgd'            # 'sgd' or 'adamw'
    scheduler: str = 'none'           # 'none' | 'cosine' | 'onecycle'
    label_smoothing: float = 0.0
    use_amp: bool = False
    grad_clip: float = 0.0
    max_steps_per_epoch: int = 0      # 0 = no limit; a positive value caps batches per epoch (quick smoke tests)

    # Architecture switches
    use_residual: bool = False
    use_se: bool = False
    depth: int = 2                   # blocks per stage
    width: int = 1                   # channel multiplier

    # Data augmentation / regularisation
    use_strong_aug: bool = False
    randaugment_n: int = 0
    randaugment_m: int = 9
    use_mixup: bool = False
    mixup_alpha: float = 0.2
    use_cutmix: bool = False
    cutmix_alpha: float = 1.0
    label_smoothing_for_aug: float = 0.05

    # Misc
    save_path: str = ''
    seed: int = SEED

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

# --- 辅助函数 ---
def clone_config(base: ExperimentConfig, name: str, **overrides) -> ExperimentConfig:
    """Build a new config from ``base`` with ``overrides`` applied and ``name`` set.

    ``base`` itself is left untouched; the copy is rebuilt from its dict form.
    """
    merged = {**base.to_dict(), **overrides, 'name': name}
    return ExperimentConfig(**merged)


Results will be saved to: ./results/cifar10_20251027_103538
{'python': '3.10.0', 'platform': 'Linux-5.15.0-139-generic-x86_64-with-glibc2.31', 'pytorch': '2.5.1+cu121', 'torchvision': '0.20.1+cu121', 'cuda_available': True, 'cuda_version': '12.1', 'device': 'cuda', 'seed': 42}


In [20]:
# 2. Dataset split and data-augmentation strategy

# Data loading and augmentation pipeline.
# Per-channel mean / std handed to T.Normalize in build_transforms
# (presumably CIFAR-10 training-set statistics — TODO confirm the source).
CIFAR_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR_STD = (0.2023, 0.1994, 0.2010)


class Cutout:
    """Zero out randomly placed square patches of a tensor image.

    ``n_holes`` patches are drawn per call; each patch is centred on a random
    pixel, spans ``length // 2`` pixels on either side of it and is clipped
    at the image border. The same mask is applied to every channel.
    """

    def __init__(self, n_holes: int = 1, length: int = 16):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img: torch.Tensor) -> torch.Tensor:
        if not torch.is_tensor(img):
            raise TypeError('Cutout 需要在 ToTensor 之后使用。')
        height, width = img.shape[1], img.shape[2]
        half = self.length // 2
        keep = torch.ones((height, width), device=img.device)
        for _ in range(self.n_holes):
            # Row is drawn before column so the RNG stream is consumed in a fixed order.
            cy = int(torch.randint(0, height, (1,)))
            cx = int(torch.randint(0, width, (1,)))
            top, bottom = max(0, cy - half), min(height, cy + half)
            left, right = max(0, cx - half), min(width, cx + half)
            keep[top:bottom, left:right] = 0
        return img * keep.expand_as(img)


class TransformSubset(torch.utils.data.Dataset):
    """View over ``dataset`` restricted to ``indices`` with its own transform.

    Items are fetched from the wrapped dataset as ``(image, target)`` pairs;
    ``transform`` (when not ``None``) is applied to the image only.
    """

    def __init__(self, dataset, indices: List[int], transform):
        self.dataset, self.indices, self.transform = dataset, indices, transform

    def __len__(self) -> int:
        return len(self.indices)

    def __getitem__(self, idx: int):
        sample, label = self.dataset[self.indices[idx]]
        if self.transform is None:
            return sample, label
        return self.transform(sample), label


def build_transforms(cfg: ExperimentConfig):
    """Create the ``(train, eval)`` transform pipelines for ``cfg``.

    Training always uses random crop (padding 4) and horizontal flip. With
    ``cfg.use_strong_aug`` it additionally applies RandAugment (only when
    ``cfg.randaugment_n > 0``) and ColorJitter before tensor conversion, and
    Cutout plus RandomErasing after normalisation. Evaluation only converts
    to a tensor and normalises.
    """
    strong = cfg.use_strong_aug

    # Stage operating on the raw (pre-ToTensor) image.
    image_stage: List[Any] = [T.RandomCrop(32, padding=4), T.RandomHorizontalFlip()]
    if strong and cfg.randaugment_n > 0:
        image_stage.append(T.RandAugment(cfg.randaugment_n, cfg.randaugment_m))
    if strong:
        image_stage.append(T.ColorJitter(0.3, 0.3, 0.3, 0.2))

    # Stage operating on the normalised tensor.
    tensor_stage: List[Any] = [T.ToTensor(), T.Normalize(CIFAR_MEAN, CIFAR_STD)]
    if strong:
        tensor_stage += [
            Cutout(n_holes=1, length=12),
            T.RandomErasing(p=0.25, scale=(0.02, 0.25)),
        ]

    eval_pipeline = T.Compose([T.ToTensor(), T.Normalize(CIFAR_MEAN, CIFAR_STD)])
    return T.Compose(image_stage + tensor_stage), eval_pipeline


def build_dataloaders(cfg: ExperimentConfig, num_workers: int = 0) -> Dict[str, DataLoader]:
    """Download CIFAR-10 if needed and return ``train`` / ``val`` / ``test`` loaders.

    The 50k training images are split into train and a 5000-image validation
    subset with a generator seeded with 0, so the split is the same for every
    config. The train view uses the augmenting transform; val and test use the
    evaluation transform. ``num_workers=0`` is the conservative default for
    notebook environments.
    """
    torch.manual_seed(cfg.seed)  # re-seed the global torch RNG with the config's seed
    train_tf, eval_tf = build_transforms(cfg)

    # The raw training set carries no transform: the train/val views attach their own.
    raw_train = torchvision.datasets.CIFAR10(
        root=DATA_ROOT, train=True, download=True, transform=None
    )
    test_set = torchvision.datasets.CIFAR10(
        root=DATA_ROOT, train=False, download=True, transform=eval_tf
    )

    n_total = len(raw_train)
    n_val = 5000
    split_rng = torch.Generator().manual_seed(0)  # fixed split, independent of cfg.seed
    train_part, val_part = random_split(
        range(n_total), [n_total - n_val, n_val], generator=split_rng
    )

    train_set = TransformSubset(raw_train, list(train_part), train_tf)
    val_set = TransformSubset(raw_train, list(val_part), eval_tf)

    def make_loader(dataset, shuffle: bool) -> DataLoader:
        return DataLoader(dataset, batch_size=cfg.batch_size, shuffle=shuffle,
                          num_workers=num_workers, pin_memory=True)

    loaders = {
        'train': make_loader(train_set, True),
        'val': make_loader(val_set, False),
        'test': make_loader(test_set, False),
    }
    print(f"DataLoaders created: train={len(train_set)}, val={len(val_set)}, test={len(test_set)}")
    return loaders


## 3. 模型结构
- **Residual**：stage 内使用残差跳连，缓解梯度消失。
- **SE Attention**：引入通道注意力，自适应重标特征。
- **Depth / Width**：通过 `depth` 和 `width` 超参调整网络容量。
- 所有变化都在同一个 `TinyCIFARNet` 架构中切换，保证实验公平。


In [21]:
# 模型构建
class SEModule(nn.Module):
    """Channel gate: global average pool -> 1x1 bottleneck -> sigmoid scale.

    The bottleneck width is ``ch // reduction`` but never smaller than 4.
    The output has the same shape as the input, with each channel multiplied
    by its gate value in (0, 1).
    """

    def __init__(self, ch: int, reduction: int = 16):
        super().__init__()
        squeezed = max(ch // reduction, 4)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(ch, squeezed, kernel_size=1, bias=True)
        self.conv2 = nn.Conv2d(squeezed, ch, kernel_size=1, bias=True)

    def forward(self, x):
        gate = self.conv1(self.pool(x))
        gate = self.conv2(F.relu(gate, inplace=True))
        return x * torch.sigmoid(gate)


class ConvBlock(nn.Module):
    """Two 3x3 conv + BatchNorm layers with optional SE gate and skip connection.

    ``stride`` applies to the first convolution only. When ``residual`` is set
    and the input/output shapes differ (stride != 1 or channel change), the
    skip path is a 1x1 conv + BatchNorm projection; otherwise it is the
    identity. The block always ends with a ReLU.
    """

    def __init__(self, in_ch: int, out_ch: int, stride: int = 1,
                 residual: bool = False, use_se: bool = False):
        super().__init__()
        self.residual = residual
        self.use_se = use_se
        self.conv1 = nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_ch)
        self.se = SEModule(out_ch) if use_se else nn.Identity()

        # A projection is only needed when the skip tensor would not match the main path.
        needs_projection = residual and (stride != 1 or in_ch != out_ch)
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_ch)
            )
        else:
            self.shortcut = None

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)), inplace=True)
        y = self.se(self.bn2(self.conv2(y)))
        if self.residual:
            skip = self.shortcut(x) if self.shortcut is not None else x
            y = y + skip
        return F.relu(y, inplace=True)


class TinyCIFARNet(nn.Module):
    """Small three-stage CNN whose capacity and block type are set by ``cfg``.

    Layout: 3x3 conv stem -> stage1 (stride 1) -> stage2 (stride 2) ->
    stage3 (stride 2) -> global average pool -> linear classifier with
    ``NUM_CLASSES`` outputs. Base channel widths (32, 64, 128) are multiplied
    by ``cfg.width``; every stage holds ``cfg.depth`` blocks (at least one).
    """

    def __init__(self, cfg: ExperimentConfig):
        super().__init__()
        c1, c2, c3 = (base * cfg.width for base in (32, 64, 128))
        block_opts = dict(residual=cfg.use_residual, use_se=cfg.use_se)

        self.stem = nn.Sequential(
            nn.Conv2d(3, c1, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(c1),
            nn.ReLU(inplace=True)
        )
        self.stage1 = self._make_stage(c1, c1, cfg.depth, stride=1, **block_opts)
        self.stage2 = self._make_stage(c1, c2, cfg.depth, stride=2, **block_opts)
        self.stage3 = self._make_stage(c2, c3, cfg.depth, stride=2, **block_opts)
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(c3, NUM_CLASSES)
        )
        self._init_weights()

    def _make_stage(self, in_ch: int, out_ch: int, depth: int,
                    stride: int, residual: bool, use_se: bool) -> nn.Sequential:
        """Stack ``depth`` blocks; only the first changes channels/stride."""
        first = ConvBlock(in_ch, out_ch, stride=stride,
                          residual=residual, use_se=use_se)
        rest = [ConvBlock(out_ch, out_ch, stride=1, residual=residual, use_se=use_se)
                for _ in range(1, depth)]
        return nn.Sequential(first, *rest)

    def _init_weights(self):
        """Kaiming-normal conv weights, unit/zero BatchNorm, small-normal linear."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
                continue
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.zeros_(module.bias)

    def forward(self, x):
        features = self.stem(x)
        for stage in (self.stage1, self.stage2, self.stage3):
            features = stage(features)
        return self.head(features)


def build_model(cfg: ExperimentConfig) -> nn.Module:
    """Instantiate ``TinyCIFARNet`` for ``cfg`` and move it to the global ``device``."""
    return TinyCIFARNet(cfg).to(device)


In [22]:
# 4. 训练与评估工具

class AverageMeter:
    """Running weighted mean: accumulates ``value * n`` and the total ``n``."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything accumulated so far."""
        self.sum, self.count = 0.0, 0

    def update(self, value: float, n: int = 1):
        """Add ``value`` with weight ``n`` (e.g. a batch mean and its batch size)."""
        self.count += n
        self.sum += value * n

    @property
    def avg(self) -> float:
        """Weighted mean of all updates, or 0.0 when nothing was recorded."""
        return self.sum / self.count if self.count != 0 else 0.0


def soft_cross_entropy(logits: torch.Tensor, soft_targets: torch.Tensor) -> torch.Tensor:
    """Batch-mean cross entropy between ``logits`` and a target distribution.

    Both tensors are indexed as ``(sample, class)``; the class axis is dim 1.
    """
    weighted_log_probs = torch.sum(soft_targets * F.log_softmax(logits, dim=1), dim=1)
    return weighted_log_probs.mean().neg()


def one_hot(targets: torch.Tensor, num_classes: int, smoothing: float = 0.0) -> torch.Tensor:
    """Encode integer labels as (optionally smoothed) one-hot rows.

    The true class receives ``1 - smoothing``; the remaining mass is split
    evenly over the other ``num_classes - 1`` classes. The result lives on the
    same device as ``targets`` and is built without gradient tracking.
    """
    off_value = smoothing / (num_classes - 1)
    with torch.no_grad():
        encoded = torch.full((targets.size(0), num_classes), off_value,
                             device=targets.device)
        encoded.scatter_(1, targets.unsqueeze(1), 1.0 - smoothing)
    return encoded


def prepare_optimizer(model: nn.Module, cfg: ExperimentConfig, steps_per_epoch: int):
    """Create the optimizer and (optional) LR scheduler described by ``cfg``.

    Args:
        model: network whose parameters are optimised.
        cfg: reads ``optimizer``, ``base_lr``, ``momentum``, ``weight_decay``,
            ``scheduler``, ``epochs`` and ``max_steps_per_epoch``.
        steps_per_epoch: number of batches the train loader yields per epoch.

    Returns:
        ``(optimizer, scheduler)``; ``scheduler`` is ``None`` unless
        ``cfg.scheduler`` is ``'cosine'`` (stepped once per epoch by the
        training loop) or ``'onecycle'`` (stepped once per batch).
    """
    if cfg.optimizer == 'adamw':
        optimizer = optim.AdamW(model.parameters(), lr=cfg.base_lr,
                                weight_decay=cfg.weight_decay)
    else:
        # Any other value falls back to SGD. Nesterov is only legal with a
        # positive momentum; torch.optim.SGD raises ValueError otherwise.
        optimizer = optim.SGD(model.parameters(), lr=cfg.base_lr,
                              momentum=cfg.momentum, weight_decay=cfg.weight_decay,
                              nesterov=cfg.momentum > 0)

    scheduler = None
    if cfg.scheduler == 'cosine':
        scheduler = CosineAnnealingLR(optimizer, T_max=cfg.epochs)
    elif cfg.scheduler == 'onecycle':
        # The training loop stops each epoch after cfg.max_steps_per_epoch
        # batches (when non-zero), so size the cycle by the steps actually run.
        if cfg.max_steps_per_epoch:
            steps_per_epoch = min(steps_per_epoch, cfg.max_steps_per_epoch)
        scheduler = OneCycleLR(optimizer, max_lr=cfg.base_lr,
                               steps_per_epoch=steps_per_epoch, epochs=cfg.epochs)
    return optimizer, scheduler


def apply_mixup_cutmix(inputs: torch.Tensor, targets: torch.Tensor, cfg: ExperimentConfig):
    """Apply MixUp or CutMix to a batch and return ``(inputs, soft_targets)``.

    Soft targets start as one-hot rows smoothed by
    ``cfg.label_smoothing_for_aug``. MixUp takes precedence: when both
    ``cfg.use_mixup`` and ``cfg.use_cutmix`` are enabled, only MixUp runs.
    If neither applies, the inputs are returned unchanged together with the
    smoothed one-hot targets.

    The caller's ``inputs`` tensor is never modified; CutMix works on a copy.
    ``inputs`` is unpacked as a 4-D batch with height and width as the last
    two dimensions.
    """
    soft_targets = one_hot(targets, NUM_CLASSES, smoothing=cfg.label_smoothing_for_aug)

    if cfg.use_mixup and cfg.mixup_alpha > 0:
        lam = float(np.random.beta(cfg.mixup_alpha, cfg.mixup_alpha))
        index = torch.randperm(inputs.size(0), device=inputs.device)
        mixed = lam * inputs + (1 - lam) * inputs[index]
        soft_targets = lam * soft_targets + (1 - lam) * soft_targets[index]
        return mixed, soft_targets

    if cfg.use_cutmix and cfg.cutmix_alpha > 0:
        lam = float(np.random.beta(cfg.cutmix_alpha, cfg.cutmix_alpha))
        batch_size, _, h, w = inputs.size()
        index = torch.randperm(batch_size, device=inputs.device)
        # Box side lengths scale with sqrt(1 - lam) so its area is ~(1 - lam) of the image.
        cut_rat = math.sqrt(1.0 - lam)
        cut_w = int(w * cut_rat)
        cut_h = int(h * cut_rat)
        cx = np.random.randint(w)
        cy = np.random.randint(h)
        # Plain ints keep the slicing and the area ratio free of NumPy scalar types.
        x1 = int(np.clip(cx - cut_w // 2, 0, w))
        y1 = int(np.clip(cy - cut_h // 2, 0, h))
        x2 = int(np.clip(cx + cut_w // 2, 0, w))
        y2 = int(np.clip(cy + cut_h // 2, 0, h))
        # Paste into a copy so the batch passed in by the caller stays intact.
        mixed = inputs.clone()
        mixed[:, :, y1:y2, x1:x2] = inputs[index, :, y1:y2, x1:x2]
        # Recompute lam from the clipped box so labels match the pasted area exactly.
        lam = 1.0 - ((x2 - x1) * (y2 - y1) / (w * h))
        soft_targets = lam * soft_targets + (1 - lam) * soft_targets[index]
        return mixed, soft_targets

    return inputs, soft_targets


def train_one_epoch(model: nn.Module, loaders: Dict[str, DataLoader], cfg: ExperimentConfig,
                    optimizer, scheduler=None, scaler: Optional[GradScaler] = None) -> Dict[str, float]:
    """Train ``model`` for one pass over ``loaders['train']``.

    Args:
        model: network to optimise (switched to train mode here).
        loaders: dict of data loaders; only ``'train'`` is used.
        cfg: run configuration (AMP, MixUp/CutMix, label smoothing, gradient
            clipping, scheduler type, ``max_steps_per_epoch`` cap).
        optimizer: optimizer updating ``model``'s parameters.
        scheduler: optional LR scheduler. ``'onecycle'`` is stepped after
            every batch, ``'cosine'`` once at the end of the epoch.
        scaler: optional gradient scaler, used only when ``cfg.use_amp`` is set.

    Returns:
        ``{'train_loss': ..., 'train_acc': ...}`` as sample-weighted means.
        With MixUp/CutMix, accuracy is measured against the dominant
        (argmax) class of the mixed soft target.
    """
    model.train()
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    train_loader = loaders['train']
    criterion = nn.CrossEntropyLoss(label_smoothing=cfg.label_smoothing)

    for step, (inputs, targets) in enumerate(train_loader):
        # Optional cap on batches per epoch (0 means unlimited).
        if cfg.max_steps_per_epoch and (step >= cfg.max_steps_per_epoch):
            break
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)

        use_soft = cfg.use_mixup or cfg.use_cutmix
        if use_soft:
            inputs, soft_targets = apply_mixup_cutmix(inputs, targets, cfg)
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast context.
        # device_type stays 'cuda' so mixed precision is applied on CUDA only.
        with torch.autocast(device_type='cuda', enabled=cfg.use_amp):
            outputs = model(inputs)
            if use_soft:
                loss = soft_cross_entropy(outputs, soft_targets)
                hard_targets = torch.argmax(soft_targets, dim=1)
            elif cfg.label_smoothing > 0.0:
                soft_targets = one_hot(targets, NUM_CLASSES, smoothing=cfg.label_smoothing)
                loss = soft_cross_entropy(outputs, soft_targets)
                hard_targets = targets
            else:
                loss = criterion(outputs, targets)
                hard_targets = targets

        if scaler is not None and cfg.use_amp:
            scaler.scale(loss).backward()
            if cfg.grad_clip > 0:
                # Gradients must be unscaled before their norm is clipped.
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            if cfg.grad_clip > 0:
                nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
            optimizer.step()

        if scheduler is not None and cfg.scheduler == 'onecycle':
            scheduler.step()

        loss_meter.update(loss.item(), inputs.size(0))
        preds = outputs.argmax(dim=1)
        acc = (preds == hard_targets).float().mean().item()
        acc_meter.update(acc, inputs.size(0))

    if scheduler is not None and cfg.scheduler == 'cosine':
        scheduler.step()

    return {'train_loss': loss_meter.avg, 'train_acc': acc_meter.avg}


def evaluate(model: nn.Module, loader: DataLoader) -> Tuple[float, float]:
    """Return ``(mean_loss, mean_accuracy)`` of ``model`` over ``loader``.

    The model is put in eval mode and run without gradient tracking. Both
    values are weighted by batch size; the loss is plain cross entropy
    (no label smoothing).
    """
    model.eval()
    losses, accuracies = AverageMeter(), AverageMeter()
    loss_fn = nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device, non_blocking=True)
            batch_y = batch_y.to(device, non_blocking=True)
            logits = model(batch_x)
            n_samples = batch_x.size(0)
            losses.update(loss_fn(logits, batch_y).item(), n_samples)
            hits = logits.argmax(dim=1) == batch_y
            accuracies.update(hits.float().mean().item(), n_samples)
    return losses.avg, accuracies.avg


## 5. 训练循环封装
封装单次实验（训练 + 验证）的流程，便于在消融与主训练中重复调用。支持：
- 自动写入日志（DataFrame）
- 可选 AMP、梯度裁剪、调度器
- 训练完成后返回验证集最优指标与对应权重路径


In [23]:
# 单次实验执行

def run_experiment(cfg: ExperimentConfig, loaders: Dict[str, DataLoader] | None = None,
                   track_test: bool = False, verbose: bool = True) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """Train one configuration, tracking the epoch with the best validation accuracy.

    Args:
        cfg: experiment configuration.
        loaders: pre-built loaders with ``'train'``, ``'val'`` and ``'test'``
            entries; built from ``cfg`` when ``None``.
        track_test: when True, reload the best weights and evaluate on the
            test loader after training.
        verbose: print one progress line per epoch.

    Returns:
        ``(history_df, summary)``. ``history_df`` has one row per epoch.
        ``summary`` holds ``config``, ``best_val_acc``, ``best_epoch``,
        ``hist``, ``test_loss``, ``test_acc``, ``total_time`` and
        ``model_path`` (``None`` when nothing was saved).
    """
    import copy  # local import so this cell does not depend on an extra top-level import

    if loaders is None:
        loaders = build_dataloaders(cfg)
    set_seed(cfg.seed)
    model = build_model(cfg)
    steps_per_epoch = len(loaders['train'])
    optimizer, scheduler = prepare_optimizer(model, cfg, steps_per_epoch)
    scaler = GradScaler(enabled=cfg.use_amp)

    history: List[Dict[str, Any]] = []
    best_val = 0.0
    best_state = None
    start_time = time.time()

    for epoch in range(1, cfg.epochs + 1):
        epoch_start = time.time()
        train_metrics = train_one_epoch(model, loaders, cfg, optimizer, scheduler, scaler)
        val_loss, val_acc = evaluate(model, loaders['val'])
        record = {
            'epoch': epoch,
            'train_loss': train_metrics['train_loss'],
            'train_acc': train_metrics['train_acc'],
            'val_loss': val_loss,
            'val_acc': val_acc,
            'lr': optimizer.param_groups[0]['lr'],
            'epoch_time': time.time() - epoch_start,
        }
        history.append(record)
        if verbose:
            print(f"[{cfg.name}] Epoch {epoch:03d}/{cfg.epochs} | "
                  f"train_acc={train_metrics['train_acc']:.3f} | val_acc={val_acc:.3f} | "
                  f"time={record['epoch_time']:.1f}s")

        improved = val_acc > best_val
        # Always keep a snapshot of the first epoch so best_state is never None
        # after training, even if validation accuracy stays at 0.
        if improved or best_state is None:
            if improved:
                best_val = val_acc
            # state_dict() returns references to the live tensors; without a
            # copy, later optimizer steps would overwrite the "best" weights.
            best_state = {
                'model': {key: value.detach().cpu().clone()
                          for key, value in model.state_dict().items()},
                'optimizer': copy.deepcopy(optimizer.state_dict()),
                'epoch': epoch
            }

    history_df = pd.DataFrame(history)
    test_metrics = {'test_loss': None, 'test_acc': None}
    if track_test:
        # With zero epochs there is no snapshot; evaluate the untrained model as-is.
        if best_state is not None:
            model.load_state_dict(best_state['model'])
        test_loss, test_acc = evaluate(model, loaders['test'])
        test_metrics = {'test_loss': test_loss, 'test_acc': test_acc}

    summary = {
        'config': cfg.to_dict(),
        'best_val_acc': best_val,
        'best_epoch': best_state['epoch'] if best_state else None,
        'hist': history_df,
        **test_metrics,
        'total_time': time.time() - start_time
    }

    # Only write a checkpoint when there is something to save.
    if cfg.save_path and best_state is not None:
        torch.save(best_state, cfg.save_path)
        summary['model_path'] = cfg.save_path
    else:
        summary['model_path'] = None

    return history_df, summary


## 6. 阶段 1：消融实验
- **Baseline**：无残差/SE，depth=2，width=1，轻量增强，仅 SGD。
- 单因素实验（每个实验的轮数由 `baseline_cfg.epochs` 决定，当前代码中为 50 轮）：
  1. `+Residual`：启用残差连接。
  2. `+SE`：在残差分支基础上启用 SE（保持是否残差可独立开关）。
  3. `+Deeper/Wider`：将 `depth=3`，`width=2`。
  4. `+Augmentation`：开启 RandAugment + ColorJitter + Cutout + RandomErasing，并同时启用 MixUp。
  5. `+Optimizer`：使用 AdamW + Cosine 调度 + AMP + Label Smoothing。
- 运行后保存结果表格到 `results/cifar10_.../cifar10_ablation_results.csv`，供阶段 2 选择组合。


In [None]:
# Stage 1: run the ablation experiments.
# Every entry below is the baseline with exactly one factor changed, so each
# result can be compared directly against the baseline run.
baseline_cfg = ExperimentConfig(
    name='baseline', epochs=50, batch_size=128, base_lr=0.1,
    optimizer='sgd', scheduler='none', label_smoothing=0.0,
    use_amp=False, grad_clip=0.0, use_residual=False, use_se=False,
    depth=2, width=1, use_strong_aug=False, use_mixup=False, use_cutmix=False,
    randaugment_n=0, randaugment_m=9, weight_decay=5e-4
)

# (factor key, config) pairs; the factor keys match FACTOR_PATCHES used in stage 2.
ablation_configs = [
    ('baseline', baseline_cfg),
    ('residual', clone_config(baseline_cfg, 'abl_residual', use_residual=True)),
    ('se', clone_config(baseline_cfg, 'abl_se', use_se=True)),
    ('deep_wide', clone_config(baseline_cfg, 'abl_deepwide', depth=3, width=2)),
    ('augmentation', clone_config(baseline_cfg, 'abl_aug', use_strong_aug=True,
                                  randaugment_n=2, randaugment_m=9, use_mixup=True,
                                  use_cutmix=False, label_smoothing_for_aug=0.05)),
    ('optimizer', clone_config(baseline_cfg, 'abl_opt', optimizer='adamw', base_lr=3e-4,
                               scheduler='cosine', label_smoothing=0.1,
                               use_amp=True, grad_clip=1.0))
]

# --- Control switch ---
RUN_ABLATION = True  # set to True and run this cell to start the 6 experiments
# ---

# Per-factor epoch histories and one summary row per factor.
ablation_history: Dict[str, pd.DataFrame] = {}
ablation_summary: List[Dict[str, Any]] = []

if RUN_ABLATION:
    total = len(ablation_configs)
    for idx, (factor, cfg) in enumerate(ablation_configs, start=1):
        print('=' * 80)
        print(f'[Stage 1] 进度: {idx}/{total} | 因子: {factor} | 配置名: {cfg.name}')
        print(f"  residual={cfg.use_residual} | se={cfg.use_se} | depth={cfg.depth} | width={cfg.width}")
        print(f"  strong_aug={cfg.use_strong_aug} | optimizer={cfg.optimizer} | lr={cfg.base_lr} | scheduler={cfg.scheduler}")
        # No loaders are passed, so run_experiment builds fresh ones for this config.
        hist, summary = run_experiment(cfg)
        ablation_history[factor] = hist
        ablation_summary.append({
            'factor': factor,
            'name': cfg.name,
            'best_val_acc': summary['best_val_acc'],
            'best_epoch': summary['best_epoch'],
            'total_time_min': summary['total_time'] / 60.0,
            'config': cfg.to_dict()
        })
        print(f"完成 {factor} | best_val_acc={summary['best_val_acc']*100:.2f}% @ epoch {summary['best_epoch']}")

    # Persist the summary so stage 2 can run from the CSV in a later session.
    ablation_df = pd.DataFrame(ablation_summary)
    ablation_df.to_csv(ABLATION_CSV, index=False)
    print('=' * 80)
    print(f'Ablation summary saved to {ABLATION_CSV}')
else:
    print('设置 RUN_ABLATION = True 并重新运行此单元以启动消融实验。')


[Stage 1] 进度: 1/6 | 因子: baseline | 配置名: baseline
  residual=False | se=False | depth=2 | width=1
  strong_aug=False | optimizer=sgd | lr=0.1 | scheduler=none
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
DataLoaders created: train=45000, val=5000, test=10000
DataLoaders created: train=45000, val=5000, test=10000


  scaler = GradScaler(enabled=cfg.use_amp)
  with autocast(enabled=cfg.use_amp):


[baseline] Epoch 001/10 | train_acc=0.425 | val_acc=0.449 | time=26.2s
[baseline] Epoch 002/10 | train_acc=0.641 | val_acc=0.625 | time=25.0s
[baseline] Epoch 002/10 | train_acc=0.641 | val_acc=0.625 | time=25.0s


KeyboardInterrupt: 

## 7. 阶段 2：组合主要训练
- 根据阶段 1 结果，筛选验证集准确率超过 Baseline 的因素。
- 将这些因素合并，形成主训练配置（150 轮）。
- AMP、AdamW、Cosine 调度与 MixUp 并非无条件开启：仅当对应因素（`optimizer`、`augmentation`）在阶段 1 中的验证集准确率高于 Baseline 时，才会并入主训练配置。
- 保存最好模型权重到 `results/.../best_model_cifar10.pth`。


In [None]:
# Combined main training.
# Maps each ablation factor key to the config fields it overrides; the values
# mirror the overrides used for the corresponding stage 1 ablation config.
FACTOR_PATCHES = {
    'residual': {'use_residual': True},
    'se': {'use_se': True},
    'deep_wide': {'depth': 3, 'width': 2},
    'augmentation': {
        'use_strong_aug': True,
        'randaugment_n': 2,
        'randaugment_m': 9,
        'use_mixup': True,
        'use_cutmix': False,
        'label_smoothing_for_aug': 0.05
    },
    'optimizer': {
        'optimizer': 'adamw',
        'base_lr': 3e-4,
        'scheduler': 'cosine',
        'label_smoothing': 0.1,
        'use_amp': True,
        'grad_clip': 1.0
    }
}


def load_ablation_table() -> pd.DataFrame:
    """Return the ablation summary as a DataFrame.

    The in-memory ``ablation_summary`` list is preferred when it exists and is
    non-empty; otherwise the table is read from ``ABLATION_CSV``.

    Raises:
        FileNotFoundError: if neither source is available.
    """
    have_in_memory = 'ablation_summary' in globals() and len(ablation_summary) > 0
    if have_in_memory:
        return pd.DataFrame(ablation_summary)
    if not os.path.exists(ABLATION_CSV):
        raise FileNotFoundError(f'未找到消融实验结果 {ABLATION_CSV}，请先运行上一单元或读取 CSV。')
    return pd.read_csv(ABLATION_CSV)

# --- Control switch ---
RUN_MAIN_TRAIN = True
# ---

# Filled in by the main training run; stay None when the run is skipped.
main_history = None
main_summary = None

if RUN_MAIN_TRAIN:
    ablation_df = load_ablation_table()
    baseline_row = ablation_df[ablation_df['factor'] == 'baseline']
    if baseline_row.empty:
        raise ValueError('消融结果中缺少 baseline，请确认阶段 1 已成功运行。')
    baseline_acc = baseline_row['best_val_acc'].max()
    # Keep only factors that strictly beat the baseline's best validation accuracy.
    selected = ablation_df[ablation_df['best_val_acc'] > baseline_acc]['factor'].tolist()
    print('Selected factors with gain > baseline:', selected)

    # Start from the baseline config, then layer each selected factor's patch on top.
    combined_cfg = clone_config(
        baseline_cfg,
        'main_training',
        epochs=150,
        batch_size=128,
        save_path=BEST_MODEL_PATH,
        weight_decay=3e-4
    )
    for factor in selected:
        for key, value in FACTOR_PATCHES.get(factor, {}).items():
            setattr(combined_cfg, key, value)

    # When a mixing augmentation is active, enforce a floor of 0.05 on its target smoothing.
    if combined_cfg.use_mixup or combined_cfg.use_cutmix:
        combined_cfg.label_smoothing_for_aug = max(combined_cfg.label_smoothing_for_aug, 0.05)

    # Persist the final config so later cells can rebuild the same architecture.
    config_path = os.path.join(RESULTS_DIR, 'cifar10_main_config.json')
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(combined_cfg.to_dict(), f, ensure_ascii=False, indent=2)
    print(f'Final main config saved to {config_path}')

    print('Final main config:')
    print(combined_cfg)
    main_history, main_summary = run_experiment(combined_cfg, track_test=True)
    main_history.to_csv(MAIN_HISTORY_CSV, index=False)
    print('Main training finished. Best val acc:', main_summary['best_val_acc'])
else:
    print('设置 RUN_MAIN_TRAIN = True 并运行此单元以执行 150 轮主训练。')


In [None]:
# 8. 结果分析与可视化

# 结果汇总辅助函数
def get_ablation_df() -> pd.DataFrame:
    """Return the ablation summary, from memory if present, else from ``ABLATION_CSV``.

    Raises:
        FileNotFoundError: if there is no in-memory summary and no CSV on disk.
    """
    records = globals().get('ablation_summary', ())
    if len(records) > 0:
        return pd.DataFrame(ablation_summary)
    if not os.path.exists(ABLATION_CSV):
        raise FileNotFoundError(f'缺少 ablation summary {ABLATION_CSV}，请先运行阶段 1 单元。')
    return pd.read_csv(ABLATION_CSV)


def get_main_history() -> pd.DataFrame:
    """Return the main-training history, from memory if present, else from ``MAIN_HISTORY_CSV``.

    Raises:
        FileNotFoundError: if no in-memory DataFrame exists and the CSV is missing.
    """
    cached = globals().get('main_history')
    if isinstance(cached, pd.DataFrame):
        return cached
    if not os.path.exists(MAIN_HISTORY_CSV):
        raise FileNotFoundError(f'未找到主训练历史 {MAIN_HISTORY_CSV}，请先执行阶段 2 单元。')
    return pd.read_csv(MAIN_HISTORY_CSV)


def summarize_factors(ablation_df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``gain_vs_baseline`` column and sort by ``best_val_acc`` descending.

    The gain is each row's ``best_val_acc`` minus the best accuracy among rows
    whose ``factor`` is ``'baseline'``. The input frame is not modified.

    Raises:
        ValueError: if the ``best_val_acc`` column is missing.
    """
    if 'best_val_acc' not in ablation_df.columns:
        raise ValueError('ablation summary 需要包含 best_val_acc 列。')
    is_baseline = ablation_df['factor'] == 'baseline'
    reference_acc = float(ablation_df.loc[is_baseline, 'best_val_acc'].max())
    return (
        ablation_df
        .assign(gain_vs_baseline=ablation_df['best_val_acc'] - reference_acc)
        .sort_values('best_val_acc', ascending=False)
    )


In [None]:
# Tabular summary of the ablation results and the main training run.
from IPython.display import display

# Ablation table: on any failure, print the reason and leave
# ablation_summary_df as None so later cells can check for it.
try:
    ablation_df = get_ablation_df()
    ablation_summary_df = summarize_factors(ablation_df)
    print('Ablation summary (降序)：')
    display(ablation_summary_df[['factor', 'best_val_acc', 'gain_vs_baseline', 'best_epoch', 'total_time_min']])
except Exception as exc:
    print(f'无法载入消融结果: {exc}')
    ablation_summary_df = None

# Main-training overview: best epoch vs. last epoch on the validation set.
try:
    main_hist_df = get_main_history()
    best_val = main_hist_df.loc[main_hist_df['val_acc'].idxmax()]
    main_overview = pd.DataFrame([
        {
            'metric': 'best_val_acc',
            'value': best_val['val_acc'],
            'epoch': int(best_val['epoch'])
        },
        {
            'metric': 'final_val_acc',
            'value': main_hist_df['val_acc'].iloc[-1],
            'epoch': int(main_hist_df['epoch'].iloc[-1])
        }
    ])
    print('Main training overview:')
    display(main_overview)
except Exception as exc:
    print(f'无法载入主训练历史: {exc}')
    main_hist_df = None


In [None]:
# Learning-curve visualisation for the main training run.
# matplotlib is imported here because no earlier cell in the notebook imports
# it; without this the cell fails with NameError on Restart & Run All.
import matplotlib.pyplot as plt

if main_hist_df is not None:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: training vs. validation loss.
    axes[0].plot(main_hist_df['epoch'], main_hist_df['train_loss'], label='train_loss')
    axes[0].plot(main_hist_df['epoch'], main_hist_df['val_loss'], label='val_loss')
    axes[0].set_xlabel('Epoch'); axes[0].set_ylabel('Loss'); axes[0].set_title('Loss Curve')
    axes[0].legend(); axes[0].grid(True, alpha=0.3)

    # Right panel: training vs. validation accuracy, with the best value marked.
    axes[1].plot(main_hist_df['epoch'], main_hist_df['train_acc'], label='train_acc')
    axes[1].plot(main_hist_df['epoch'], main_hist_df['val_acc'], label='val_acc')
    if 'main_summary' in globals() and main_summary is not None:
        axes[1].axhline(main_summary['best_val_acc'], color='red', linestyle='--', label='best_val_acc')
    axes[1].set_xlabel('Epoch'); axes[1].set_ylabel('Accuracy'); axes[1].set_title('Accuracy Curve')
    axes[1].legend(); axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    # RESULTS_DIR is the results directory defined in the setup cell
    # (the previous name RESULT_DIR was never defined).
    curve_path = os.path.join(RESULTS_DIR, 'cifar10_main_curves.png')
    plt.savefig(curve_path, dpi=200)
    print(f'学习曲线已保存到 {curve_path}')
    plt.show()
else:
    print('尚未获得主训练历史，跳过绘图。')


In [None]:
# Factor-contribution bar chart: validation gain of each factor over the baseline.
import matplotlib.pyplot as plt
import seaborn as sns

try:
    ablation_df_plot = get_ablation_df()
    contrib_df = summarize_factors(ablation_df_plot)
    # Drop the baseline row and renumber so row order equals bar order.
    contrib_df = contrib_df[contrib_df['factor'] != 'baseline'].reset_index(drop=True)

    if not contrib_df.empty:
        plt.figure(figsize=(10, 5))
        sns.barplot(x='factor', y='gain_vs_baseline', data=contrib_df, palette='viridis')
        plt.ylabel('Δ val_acc vs baseline')
        plt.title('Factor Contribution (Validation Gain)')
        plt.grid(True, axis='y', alpha=0.3)
        # Bars sit at x = 0, 1, 2, ... in row order, so labels are placed by
        # position rather than by the frame's original index labels.
        for bar_pos, gain in enumerate(contrib_df['gain_vs_baseline']):
            plt.text(bar_pos, gain, f'{gain:.3f}',
                     color='black', ha="center",
                     va='bottom' if gain >= 0 else 'top')

        contrib_path = os.path.join(RESULTS_DIR, 'cifar10_factor_gains.png')
        plt.tight_layout()
        plt.savefig(contrib_path, dpi=200)
        print(f'因素贡献图已保存到 {contrib_path}')
        plt.show()
    else:
        print('没有高于 baseline 的因素。')
except Exception as e:
    print(f'尚无消融总结，无法绘制贡献图: {e}')


In [None]:
# (Optional) confusion matrix on the test set.
# sklearn is optional: when it is missing, the cell prints a notice and skips.
try:
    from sklearn.metrics import confusion_matrix, classification_report
except ImportError:
    confusion_matrix = None
    classification_report = None

if os.path.exists(BEST_MODEL_PATH) and confusion_matrix is not None:
    # Rebuild the architecture from the config saved by the main-training cell.
    config_path = os.path.join(RESULTS_DIR, 'cifar10_main_config.json')
    if os.path.exists(config_path):
        with open(config_path, 'r', encoding='utf-8') as f:
            cfg_loaded = ExperimentConfig(**json.load(f))
    else:
        # Without the config file, fall back to a guessed architecture.
        # NOTE(review): the checkpoint only loads if this guess matches how it was trained — confirm.
        cfg_loaded = clone_config(baseline_cfg, 'eval', use_residual=True, use_se=True, depth=3, width=2)

    loaders_eval = build_dataloaders(cfg_loaded, num_workers=0)
    model_eval = build_model(cfg_loaded)
    state = torch.load(BEST_MODEL_PATH, map_location=device)
    # Accept either a checkpoint dict with a 'model' entry or a bare state_dict.
    model_eval.load_state_dict(state['model'] if isinstance(state, dict) and 'model' in state else state)
    model_eval.eval()

    # Collect predictions and labels over the whole test set.
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, targets in loaders_eval['test']:
            inputs = inputs.to(device)
            outputs = model_eval(inputs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.numpy())

    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(10, 8))
    # Use the dataset's class names when it exposes them, else numeric ticks.
    tick_labels = loaders_eval['test'].dataset.classes if hasattr(loaders_eval['test'].dataset, 'classes') else range(NUM_CLASSES)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title('Confusion Matrix (Test Set)')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    
    cm_path = os.path.join(RESULTS_DIR, 'cifar10_confusion_matrix.png')
    plt.tight_layout()
    plt.savefig(cm_path, dpi=200)
    plt.show()

    if classification_report is not None:
        print(classification_report(all_labels, all_preds, target_names=tick_labels))
    print(f'混淆矩阵已保存到 {cm_path}')
else:
    print(f'缺少最佳模型 ({BEST_MODEL_PATH}) 或 sklearn，跳过混淆矩阵绘制。')


In [None]:
# 9. 快速检查与归档

# Quick look at the ablation CSV written by stage 1 (optional).
if os.path.exists(ABLATION_CSV):
    tmp_df = pd.read_csv(ABLATION_CSV)
    display(tmp_df)
else:
    print('尚未生成 ablation CSV。')


In [None]:
# (Optional) archive a copy of this notebook next to the run's results.
import shutil

# Best effort: a missing or unreadable source path is reported, not raised.
try:
    shutil.copy2(NOTEBOOK_ABS_PATH, NOTEBOOK_COPY_PATH)
    print(f'Notebook archived to: {NOTEBOOK_COPY_PATH}')
except Exception as exc:
    print(f'归档失败: {exc}')


## 10. 实验总结与分析（待根据实测结果补充）
- **阶段 1：消融实验** 通过 5 个因素的消融实验量化增益，筛选表现优于 Baseline 的方案。
- **阶段 2：组合训练** 将有效因素（如 Residual、SE、Deeper/Wider、强化增强、AdamW+Cosine 等）组合，训练 150 轮并保存最佳模型。
- **阶段 3：结果分析** 对比 Baseline 与最终模型精度、绘制学习曲线与因素增益图，并可生成混淆矩阵辅助分析。

> 建议：在完成全部实验后，将关键数值（最佳验证/测试准确率、各因素增益）填入课程报告的表格中，同时结合曲线与混淆矩阵撰写文字分析。


In [None]:
# Export the ablation summary as a Markdown table (for the report).
# globals().get() also covers the case where the table-summary cell was never
# run, in which case ablation_summary_df does not exist at all.
if globals().get('ablation_summary_df') is not None:
    # RESULTS_DIR is the results directory defined in the setup cell
    # (the previous name RESULT_DIR was never defined).
    md_path = os.path.join(RESULTS_DIR, 'cifar10_ablation_summary.md')
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write(ablation_summary_df.to_markdown(index=False))
    print(f'消融结果已导出到 {md_path}')
else:
    print('暂无消融数据可导出。')


In [None]:
# Run metadata (optional): snapshot of key results and artifact locations.
meta = {
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'seed': SEED,
    'device': str(device),
    # Main-training results are None when that stage was skipped or not run.
    'best_val_acc': main_summary['best_val_acc'] if ('main_summary' in globals() and main_summary) else None,
    'best_epoch': main_summary['best_epoch'] if ('main_summary' in globals() and main_summary) else None,
    # Each artifact path is recorded only if the file actually exists on disk.
    'artifacts': {
        'result_dir': RESULTS_DIR,
        'best_model_path': BEST_MODEL_PATH if os.path.exists(BEST_MODEL_PATH) else None,
        'ablation_csv': ABLATION_CSV if os.path.exists(ABLATION_CSV) else None,
        'main_history_csv': MAIN_HISTORY_CSV if os.path.exists(MAIN_HISTORY_CSV) else None
    }
}
meta_path = os.path.join(RESULTS_DIR, 'cifar10_run_meta.json')
with open(meta_path, 'w', encoding='utf-8') as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
print(f'元数据已写入 {meta_path}')


# 我学到了什么（反思）

在本次 CIFAR-10 任务中，我从一个简洁的 CNN 出发，逐步引入以下因素并进行对比：

- 残差与 SE 注意力：缓解退化并提升特征表达；
- 更深/更宽的结构：带来容量提升但需要配合正则与调参；
- 更强的数据增强（含 Cutout/MixUp/RandomErasing）：有效抑制过拟合；
- AdamW 与 Cosine 退火调度（当前代码未实现 Warmup）：更稳定的优化与更好的最终性能；
- Label Smoothing：在类别间相似时可提升泛化表现；

结合训练曲线与消融结果，我理解到“配方”需要整体协同：增强强度、epoch 数、正则化与学习率日程彼此影响，不可孤立看待。

## 11. 快速冒烟测试（Smoke Test）
为验证改进后的代码在本机可直接运行，这里进行一次极简训练：
- 只训练 1 个 epoch
- 每个 epoch 仅跑 5 个 mini-batches（max_steps_per_epoch=5）
- 使用较小的 batch_size=64
期望：无异常报错，输出训练与验证的基本指标。


In [None]:
# Run the smoke test: one epoch capped at 5 mini-batches on the plain baseline
# architecture, to check that the pipeline runs end to end.
cfg = ExperimentConfig(
    name='smoke_baseline',
    epochs=1,
    batch_size=64,
    base_lr=0.05,
    optimizer='sgd',
    scheduler='none',
    use_amp=False,
    max_steps_per_epoch=5,
    use_residual=False,
    use_se=False,
    depth=2,
    width=1,
    use_strong_aug=False,
)

# Loaders are built once here and passed in, so run_experiment does not rebuild them.
loaders = build_dataloaders(cfg, num_workers=0)
hist, summary = run_experiment(cfg, loaders, track_test=False, verbose=True)

print('\n=== Smoke Test Summary ===')
print({k: v for k, v in summary.items() if k in ['best_val_acc', 'best_epoch', 'total_time']})
print('History (tail):')
print(hist.tail())
