# VGG简单复现

## 导入所需的包

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
import torchvision
from torchvision import transforms
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
%matplotlib widget
from d2l import torch as d2l
import random
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import time

### 在命令行输入`tensorboard --logdir=runs`

In [2]:
# 百度来的，不然下载不动。。
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

## 计算CIFAR10数据集RGB三个通道的均值和方差

In [3]:
cifar_train = torchvision.datasets.CIFAR10(root="../data", train=True, download=True)
print(cifar_train.data.shape) # (50000, 32, 32, 3)
cifardata = cifar_train.data / 255
mean = cifardata.mean(axis=(0, 1, 2))
std = cifardata.std(axis=(0, 1, 2))
print(mean, std) # [0.49139968 0.48215841 0.44653091] [0.24703223 0.24348513 0.26158784]

Files already downloaded and verified
(50000, 32, 32, 3)
[0.49139968 0.48215841 0.44653091] [0.24703223 0.24348513 0.26158784]


In [4]:
mean, std = [0.491, 0.482, 0.446], [0.247, 0.243, 0.261]

## 定义训练数据集

#### 预处理顺序：将正方形图片裁剪成长宽都为$S$，再从中裁剪出64\*64的一块（CIFAR10的图片大小为32\*32，适度缩小网络以适应数据集），随机水平翻转，随机扰动色彩，转化为张量，归一化。

In [5]:
class TrainDataset(data.Dataset):
    def __init__(self, S):
        super().__init__()
        self.dataset = torchvision.datasets.CIFAR10(
            root="../data", train=True, download=True)
        
        self.S = S
        
        self.trans = [transforms.RandomCrop(64),
                      transforms.RandomHorizontalFlip(p=0.5),
                      transforms.ColorJitter(brightness=0.1,
                                             contrast=0.1,
                                             saturation=0.1,
                                             hue=0),
                      transforms.ToTensor(),
                      transforms.Normalize(mean, std, inplace=True)]
        self.trans = transforms.Compose(self.trans)
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, index):
        if isinstance(self.S, int):
            resize = transforms.Resize(max(64, self.S))
        elif isinstance(self.S, list):
            assert(len(self.S) == 2)
            resize = transforms.Resize(
                random.randint(self.S[0], self.S[1]))

        return (self.trans(resize(self.dataset[index][0])),
                self.dataset[index][1])

## 定义测试数据集

#### 预处理顺序：将图片尺寸缩放为$Q$，水平翻转（可选），转化为张量，归一化。

In [6]:
class TestDataset(data.Dataset):
    def __init__(self, Q, horizontal_flip=False):
        super().__init__()
        self.dataset = torchvision.datasets.CIFAR10(
            root="../data", train=False, download=True)
        
        self.Q = Q
        
        self.trans = [transforms.ToTensor(),
                      transforms.Normalize(mean, std, inplace=True)]
        if horizontal_flip:
            self.trans.insert(0, transforms.RandomHorizontalFlip(p=1))
        self.trans = transforms.Compose(self.trans)
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, index):
        assert(isinstance(self.Q, int))
        resize = transforms.Resize(max(64, self.Q))
            
        return (self.trans(resize(self.dataset[index][0])),
                self.dataset[index][1])

## 定义VGG块

In [7]:
class VGG_block(nn.Module):
    def __init__(self, in_channels, out_channels, num_3x3, conv_1x1=False, batch_norm=False):
        super().__init__()
        layers = []
        layers += [nn.Conv2d(in_channels, out_channels,
                             kernel_size=3, stride=1, padding=1),
                   nn.BatchNorm2d(out_channels),
                   nn.ReLU(inplace=True)] if batch_norm else \
                  [nn.Conv2d(in_channels, out_channels,
                             kernel_size=3, stride=1, padding=1),
                   nn.ReLU(inplace=True)]
        if num_3x3 > 1:
            for i in range(1, num_3x3):
                layers += [nn.Conv2d(out_channels, out_channels,
                                     kernel_size=3, stride=1, padding=1),
                           nn.BatchNorm2d(out_channels),
                           nn.ReLU(inplace=True)] if batch_norm else \
                          [nn.Conv2d(out_channels, out_channels,
                                     kernel_size=3, stride=1, padding=1),
                           nn.ReLU(inplace=True)]
        if conv_1x1:
            layers += [nn.Conv2d(out_channels, out_channels,
                                 kernel_size=1, stride=1, padding=0),
                       nn.BatchNorm2d(out_channels),
                       nn.ReLU(inplace=True)] if batch_norm else \
                      [nn.Conv2d(out_channels, out_channels,
                                 kernel_size=1, stride=1, padding=0),
                       nn.ReLU(inplace=True)]
        layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        
        self.block = nn.Sequential(*layers)
    
    def forward(self, X):
        return self.block(X)

## 定义VGG网络（缩小版）

#### 输入尺寸为64\*64，输出尺寸为10。

In [8]:
class VGG_mini(nn.Module):
    def __init__(self, configuration, batch_norm=False, dropout=0.5):
        super().__init__()
        self.configurations = {
            'A': [[3,   8,  1, False],
                  [8,   16, 1, False],
                  [16,  32, 2, False],
                  [32,  64, 2, False],
                  [64,  64, 2, False]],
            
            'B': [[3,   8,  2, False],
                  [8,   16, 2, False],
                  [16,  32, 2, False],
                  [32,  64, 2, False],
                  [64,  64, 2, False]],
            
            'C': [[3,   8,  2, False],
                  [8,   16, 2, False],
                  [16,  32, 2, True],
                  [32,  64, 2, True],
                  [64,  64, 2, True]],
            
            'D': [[3,   8,  2, False],
                  [8,   16, 2, False],
                  [16,  32, 3, False],
                  [32,  64, 3, False],
                  [64,  64, 3, False]],
            
            'E': [[3,   8,  2, False],
                  [8,   16, 2, False],
                  [16,  32, 4, False],
                  [32,  64, 4, False],
                  [64,  64, 4, False]]
        }
        self.configuration = configuration
        self.batch_norm = batch_norm
        self.blocks = []
        for arg_list in self.configurations[self.configuration]:
            self.blocks.append(VGG_block(*arg_list, self.batch_norm))
        self.blocks = nn.Sequential(*self.blocks)
        
        # 用全卷积代替全连接
        self.FC = nn.Sequential(
            nn.Conv2d(64, 512, kernel_size=2),
            nn.Dropout2d(p=dropout, inplace=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=1),
            nn.Dropout2d(p=dropout, inplace=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 10, kernel_size=1),
            # 把后两个空间维度（H、W）合并成一个
            nn.Flatten(start_dim=2, end_dim=-1)
        )
    
    def print_num_params(self):
        """打印网络参数数量"""
        total_params = sum(p.numel() for p in self.parameters())
        print(f'{total_params:,} total parameters.')
        total_trainable_params = sum(
            p.numel() for p in self.parameters() if p.requires_grad)
        print(f'{total_trainable_params:,} trainable parameters.')
        
    def forward(self, X):
        output = self.blocks(X)
        output = self.FC(output)
        # 这里为了实现简单，把空间平均的操作放在softmax前面了，原文是先softmax再空间平均
        # 输出维度为 (`batch_size`, 10)
        return output.mean(dim=2)

## 根据给定的图片缩放策略和$S$，计算$Q$。

#### 4.1 Single Scale Evaluation
-   $S$值固定时，$Q=S$。
-   $S$值可变（$S\in[S_{min},S_{max}]$）时，$Q=0.5(S_{min}+S_{max})$。

#### 4.2 Multi-Scale Evaluation
-   $S$值固定时，$Q=\{S-32,\ S,\ S+32\}$
-   $S$值可变时，$Q=\{S_{min}, \ 0.5(S_{min}+S_{max}), \ S_{max}\}$

In [9]:
def single_scale_eval_SQ(S):
    if isinstance(S, int):
        Q = S
    elif isinstance(S, list):
        assert(len(S) == 2)
        Q = int(0.5*(S[0] + S[1]))
    return S, Q

def multi_scale_eval_SQ(S):
    if isinstance(S, int):
        # 因为输入图片本来就不大，改成 +-8 了
        Q = [S-8, S, S+8]
    elif isinstance(S, list):
        assert(len(S) == 2)
        Q = [S[0], int(0.5*(S[0] + S[1])), S[1]]
    return S, Q

In [10]:
def get_S_and_Q(S, single_scale_eval):
    if single_scale_eval:
        return single_scale_eval_SQ(S)
    else:
        return multi_scale_eval_SQ(S)

## 定义性能评估类

In [11]:
class Evaluater:
    def __init__(self, S, batch_size, mode='single'):
        self.single_scale = mode=='single'
        self.S, self.Q = get_S_and_Q(S, self.single_scale)
        if self.single_scale:
            # 未开启水平翻转和开启水平翻转两个数据集
            self.datasets = [TestDataset(self.Q, False), TestDataset(self.Q, True)]
        else:
            self.datasets = []
            # 对Q中的每个尺寸都使用未开启水平翻转和开启水平翻转两个数据集，共6个数据集
            for q in self.Q:
                self.datasets += [TestDataset(q, False), TestDataset(q, True)]
        # 每个数据集创建一个dataloader
        self.dataloaders = [data.DataLoader(dataset,
                                            batch_size=batch_size,
                                            shuffle=False,
                                            num_workers=8) for dataset in self.datasets]
    def evaluate(self, net, criterion):
        net.eval()
        loss, accuracy = [], []
        outputs = {}
        with torch.no_grad():
            # 对每个dataloader都过一遍
            for dataloader in self.dataloaders:
                for i, (input, _) in enumerate(dataloader):
                    input = input.to(device)
                    output = net(input)
                    # 把网络的输出存储起来
                    try:
                        outputs[i] += F.softmax(output, dim=1)
                    except KeyError:
                        outputs[i] = F.softmax(output, dim=1)
            # 网络的输出收集完毕后，用第一个dataloader的target计算精度和loss
            for i, (_, target) in enumerate(self.dataloaders[0]):
                target = target.to(device)
                loss.append(criterion(outputs[i] / len(self.datasets), target))
                accuracy.append((outputs[i].argmax(dim=1)==target).sum() / target.shape[0])
        # 计算在所有batch上loss和accuracy的均值
        loss = torch.tensor(loss).mean().item()
        accuracy = torch.tensor(accuracy).mean().item()
        return loss, accuracy

## 访问优化器的学习率

In [12]:
def get_lr(optimizer):
    return (optimizer.state_dict()['param_groups'][0]['lr'])

## 定义训练函数

In [13]:
def train_VGG(net,
              batch_size,
              num_epochs,
              lr,
              evaluater,
              S=[64, 128],
              weight_decay=5e-4):

    writer = SummaryWriter(f'runs/VGG-mini-{net.configuration}'+('-batchnorm' if net.batch_norm else ''))
    cifar_train = TrainDataset(S)
    train_iter = data.DataLoader(cifar_train, batch_size=batch_size,
                                 shuffle=True, num_workers=8)
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.normal_(m.weight, mean=0, std=0.1)
    net.apply(init_weights)
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=3,
                                                           threshold=1e-3,
                                                           verbose=True)
    criterion = nn.CrossEntropyLoss()
    timer, num_batches = d2l.Timer(), len(train_iter)
    for epoch in range(num_epochs):
        tic = time.time()
        # 训练损失之和，训练准确率之和，范例数
        metric = d2l.Accumulator(3)
        net.train()
        for i, (input, target) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            input, target = input.to(device), target.to(device)
            output = net(input)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(loss * input.shape[0],
                           d2l.accuracy(output, target),
                           input.shape[0])
            timer.stop()
            train_loss = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            
            # if (i+1) % (num_batches//20) == 0:
            #     print(f"loss: {train_loss:.4f}, acc: {train_acc:.4f}")
        valid_loss, valid_acc = evaluater.evaluate(net, criterion)
        writer.add_scalar('train/loss', train_loss, global_step=epoch+1)
        writer.add_scalar('train/accuracy', train_acc, global_step=epoch+1)
        writer.add_scalar('valid/loss', valid_loss, global_step=epoch+1)
        writer.add_scalar('valid/accuracy', valid_acc, global_step=epoch+1)
        writer.add_scalar('learning rate', get_lr(optimizer), global_step=epoch+1)
        scheduler.step(valid_loss)
        toc = time.time()
        print(f"epoch {epoch+1:2d}, train loss: {train_loss:.4f}, train accuracy: {train_acc:.4f}, \
valid loss: {valid_loss:.4f}, valid accuracy: {valid_acc:.4f}, time: {toc-tic:.4f}")
    valid_loss, valid_acc = evaluater.evaluate(net, criterion)
    print(f'train loss {train_loss:.3f}, train acc {train_acc:.3f}, '
          f'valid loss {valid_loss:.3f}, valid acc {valid_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')

## 创建VGG网络实例，查看网络参数数量，创建evaluater实例，查看网络结构

In [14]:
net = VGG_mini(configuration='A',
               batch_norm=False,
               dropout=0.5).to(device)
net.print_num_params()
S = [64, 128]
mode = 'multi'
evaluater = Evaluater(S, mode=mode, batch_size=256)
net

543,930 total parameters.
543,930 trainable parameters.
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


VGG_mini(
  (blocks): Sequential(
    (0): VGG_block(
      (block): Sequential(
        (0): Conv2d(3, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      )
    )
    (1): VGG_block(
      (block): Sequential(
        (0): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      )
    )
    (2): VGG_block(
      (block): Sequential(
        (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): ReLU(inplace=True)
        (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      )
    )
    (3): VGG_block(
      (block): Sequential(
        (0): Conv2d(32, 64, ker

### 在训练之前测试一下，精度应该在10%左右

In [15]:
evaluater.evaluate(net, nn.CrossEntropyLoss())

(2.3025765419006348, 0.09912109375)

## 训练VGG

In [16]:
train_VGG(net,
          batch_size=256,
          num_epochs=70,
          lr=1e-3,
          evaluater=evaluater,
          S=[64, 128],
          weight_decay=5e-4)

Files already downloaded and verified
epoch  1, train loss: 2.3268, train accuracy: 0.1846, valid loss: 2.2290, valid accuracy: 0.3321, time: 12.4156
epoch  2, train loss: 1.8299, train accuracy: 0.3136, valid loss: 2.1952, valid accuracy: 0.4158, time: 12.7650
epoch  3, train loss: 1.7002, train accuracy: 0.3664, valid loss: 2.1736, valid accuracy: 0.4367, time: 12.5378
epoch  4, train loss: 1.6095, train accuracy: 0.4079, valid loss: 2.1324, valid accuracy: 0.4966, time: 12.7743
epoch  5, train loss: 1.5532, train accuracy: 0.4349, valid loss: 2.1350, valid accuracy: 0.5127, time: 12.7556
epoch  6, train loss: 1.4987, train accuracy: 0.4547, valid loss: 2.0994, valid accuracy: 0.5097, time: 12.9219
epoch  7, train loss: 1.4517, train accuracy: 0.4731, valid loss: 2.1000, valid accuracy: 0.5331, time: 12.9392
epoch  8, train loss: 1.4067, train accuracy: 0.4964, valid loss: 2.0801, valid accuracy: 0.5525, time: 12.8054
epoch  9, train loss: 1.3816, train accuracy: 0.5025, valid loss: 

### 训练完再评估一下

In [17]:
evaluater.evaluate(net, nn.CrossEntropyLoss())

(1.7888615131378174, 0.827832043170929)

## 保存模型

In [18]:
torch.save(net, f'VGG-mini-{net.configuration}' + ('-batchnorm.pth' if net.batch_norm else '.pth'))