In [12]:
import argparse
import sys

def parse_args():
    # 在 Jupyter 环境下，跳过 argparse 解析
    if 'ipykernel_launcher' in sys.argv[0]:
        sys.argv = sys.argv[:1]  # 只保留脚本名，不处理后续参数

    parser = argparse.ArgumentParser(description="Your script description")
    parser.add_argument('--outf', type=str, help="Output file path")
    args = parser.parse_args()
    return args

args = parse_args()
print(args)

Namespace(outf=None)


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import argparse
from resnet import ResNet18
import os
from colorama import init, Fore, Style

# 初始化 colorama
init()

# 设备配置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 自动选择GPU或CPU

# 命令行参数配置
parser = argparse.ArgumentParser(description="PyTorch CIFAR10 Training")
parser.add_argument(
    "--outf",
    default="./model/",
    help="Folder to output images and model checkpoints",
)  # 模型保存路径配置
args = parser.parse_args()

# 超参数设置
EPOCH = 200  # 总训练轮数
pre_epoch = 0  # 预训练轮数，从0开始
BATCH_SIZE = 128  # 每批训练样本数,得到 391 批次 (ceil(50000/128))
LR = 0.01  # 初始学习率

# 准备数据集并预处理
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),  # 随机裁剪
    transforms.RandomHorizontalFlip(),     # 随机水平翻转
    transforms.ColorJitter(
        brightness=0.1,    # 亮度变化范围
        contrast=0.1,      # 对比度变化范围
    ),
    transforms.RandomRotation(10),         # 随机旋转，最大角度±10度
    transforms.RandomAffine(
        degrees=0,
        translate=(0.1, 0.1),    # 随机平移，最大幅度为10%
        scale=(0.9, 1.1),        # 随机缩放，范围是原尺寸的90%-110%
    ),
    transforms.ToTensor(),
    transforms.Normalize(
        (0.4914, 0.4822, 0.4465),
        (0.2462, 0.2424, 0.2609)
    ),
    transforms.RandomErasing(p=0.2)  # 随机遮挡，概率为0.2
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        (0.4942, 0.4851, 0.4504),
        (0.2023, 0.1994, 0.2010)
    ),
])

# 数据加载配置
trainset = torchvision.datasets.CIFAR10(
    root="./data", train=True, download=False, transform=transform_train
)  # 加载训练集
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2
)  # 配置训练数据加载器，启用多进程

testset = torchvision.datasets.CIFAR10(
    root="./data", train=False, download=False, transform=transform_test
)  # 加载测试集
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=2
)  # 配置测试数据加载器

# CIFAR-10的标签
classes = (
    "plane", "car", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck"
)

# 模型与优化器配置
net = ResNet18().to(device)  # 创建ResNet18模型并移至指定设备
criterion = nn.CrossEntropyLoss()  # 使用交叉熵损失函数
optimizer = optim.SGD(
    net.parameters(), lr=LR, momentum=0.9, weight_decay=5e-4
)  # SGD优化器：学习率=0.01，动量=0.9，L2正则化=5e-4

# 训练主循环
if __name__ == "__main__":
    os.makedirs(args.outf, exist_ok=True)
    os.makedirs("./best", exist_ok=True)  # 创建保存最佳模型的文件夹
    best_acc = 90  # 将初始最佳准确率设置为一个较低的值
    
    # 确保best文件夹存在并且可写
    if not os.access("./best", os.W_OK):
        print(f"{Fore.RED}警告：无法写入best文件夹，请检查权限{Style.RESET_ALL}")
        exit(1)

    print(f"{Fore.GREEN}Start Training, ResNet-18!{Style.RESET_ALL}")
    print(f"{Fore.YELLOW}{'='*50}{Style.RESET_ALL}")

    with open("acc.txt", "w") as f_acc, open("log.txt", "w") as f_log:
        for epoch in range(pre_epoch, EPOCH):
            # 每个Epoch开始时的分隔显示
            print(f"\n{Fore.CYAN}{'='*20} Epoch: {epoch + 1} {'='*20}{Style.RESET_ALL}")
            net.train()
            sum_loss, correct, total = 0.0, 0.0, 0.0
            length = len(trainloader)

            for i, (inputs, labels) in enumerate(trainloader):
                inputs, labels = inputs.to(device), labels.to(device)  # 数据转移到指定设备
                optimizer.zero_grad()  # 清空所有参数的梯度

                outputs = net(inputs)  # 前向传播
                loss = criterion(outputs, labels)  # 计算损失值
                loss.backward()  # 反向传播，计算梯度
                optimizer.step()  # 更新网络参数

                # 统计训练指标
                sum_loss += loss.item()  # 累积损失值
                _, predicted = torch.max(outputs.data, 1)  # 获取预测结果
                total += labels.size(0)  # 累积样本总数
                correct += predicted.eq(labels).sum().item()  # 累积正确预测数

                # 修改训练状态的打印格式
                if (i + 1) % 100 == 0:  # 每100个batch打印一次
                    print(
                        f"[{Fore.BLUE}Batch: {i + 1}/{length}{Style.RESET_ALL}] "
                        f"Loss: {Fore.RED}{sum_loss / (i + 1):.3f}{Style.RESET_ALL} | "
                        f"Acc: {Fore.GREEN}{100. * correct / total:.3f}%{Style.RESET_ALL}"
                    )

            # 每个Epoch结束时打印总结
            print(f"\n{Fore.YELLOW}Epoch {epoch + 1} 训练完成:{Style.RESET_ALL}")
            print(f"平均损失: {Fore.RED}{sum_loss / length:.3f}{Style.RESET_ALL}")
            print(f"训练准确率: {Fore.GREEN}{100. * correct / total:.3f}%{Style.RESET_ALL}")
            
            # 测试阶段
            print(f"\n{Fore.YELLOW}正在测试 Epoch {epoch + 1} ...{Style.RESET_ALL}")
            with torch.no_grad():
                net.eval()
                correct, total = 0, 0
                
                for images, labels in testloader:
                    images, labels = images.to(device), labels.to(device)
                    outputs = net(images)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += predicted.eq(labels).sum().item()
                
                acc = 100. * correct / total
                print(f"\n{Fore.CYAN}Epoch {epoch + 1} 测试结果:{Style.RESET_ALL}")
                print(f"测试准确率: {Fore.GREEN}{acc:.3f}%{Style.RESET_ALL}")

                # 保存模型和更新最佳记录
                print(f"\n{Fore.CYAN}正在保存 Epoch {epoch + 1} 模型...{Style.RESET_ALL}")
                model_path = os.path.join(args.outf, f"net_{epoch + 1:03d}.pth")
                torch.save(net.state_dict(), model_path)
                
                if acc > best_acc:
                    best_acc = acc
                    print(f"{Fore.GREEN}【新记录】最佳准确率：{acc:.3f}%{Style.RESET_ALL}")
                    # 保存最佳模型到best文件夹
                    best_model_path = f"./best/best_model_{acc:.2f}.pth"
                    try:
                        torch.save({
                            'epoch': epoch + 1,
                            'model_state_dict': net.state_dict(),
                            'accuracy': acc,
                            'optimizer_state_dict': optimizer.state_dict(),  # 添加优化器状态
                        }, best_model_path)
                        print(f"{Fore.GREEN}最佳模型已保存至: {best_model_path}{Style.RESET_ALL}")
                        
                        # 保存最佳准确率记录
                        best_acc_path = "./best/best_acc.txt"
                        with open(best_acc_path, "w") as f_best:
                            f_best.write(
                                f"EPOCH={epoch + 1}, best_acc={best_acc:.3f}%\n"
                                f"Model saved as: {best_model_path}"
                            )
                        print(f"{Fore.GREEN}最佳准确率记录已保存至: {best_acc_path}{Style.RESET_ALL}")
                    except Exception as e:
                        print(f"{Fore.RED}保存最佳模型时发生错误: {str(e)}{Style.RESET_ALL}")
            
            # 每个Epoch的结束分隔线
            print(f"\n{Fore.YELLOW}{'='*50}{Style.RESET_ALL}\n")

        print(f"{Fore.GREEN}Training Finished, Total EPOCH={EPOCH}{Style.RESET_ALL}")

Start Training, ResNet-18!

[Batch: 100/391] Loss: 1.895 | Acc: 29.172%
[Batch: 200/391] Loss: 1.742 | Acc: 35.445%
[Batch: 300/391] Loss: 1.649 | Acc: 39.260%

Epoch 1 训练完成:
平均损失: 1.573
训练准确率: 42.256%

正在测试 Epoch 1 ...

Epoch 1 测试结果:
测试准确率: 51.300%

正在保存 Epoch 1 模型...



[Batch: 100/391] Loss: 1.256 | Acc: 55.211%
[Batch: 200/391] Loss: 1.213 | Acc: 56.703%
[Batch: 300/391] Loss: 1.182 | Acc: 57.867%

Epoch 2 训练完成:
平均损失: 1.149
训练准确率: 59.036%

正在测试 Epoch 2 ...

Epoch 2 测试结果:
测试准确率: 63.710%

正在保存 Epoch 2 模型...



[Batch: 100/391] Loss: 1.010 | Acc: 64.188%
[Batch: 200/391] Loss: 0.987 | Acc: 65.074%
[Batch: 300/391] Loss: 0.972 | Acc: 65.602%

Epoch 3 训练完成:
平均损失: 0.954
训练准确率: 66.224%

正在测试 Epoch 3 ...

Epoch 3 测试结果:
测试准确率: 66.930%

正在保存 Epoch 3 模型...



[Batch: 100/391] Loss: 0.865 | Acc: 69.367%
[Batch: 200/391] Loss: 0.861 | Acc: 69.406%
[Batch: 300/391] Loss: 0.847 | Acc: 69.867%

Epoch 4 训练完成:
平均损失: 0.829
训练准确率: 70.610%

正在测试 Epoch 4 ...

Epoch 4 测试结果:
测试准确率: 75.420%

正在保存 Epoch 4 模