# GoogLeNet简单复现（Kaggle狗狗品种分类）

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
import torchvision
from torchvision import transforms
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
%matplotlib inline
from d2l import torch as d2l
import random
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import time
import pandas as pd
from PIL import Image

## [kaggle狗狗品种分类竞赛地址](https://www.kaggle.com/c/dog-breed-identification)

In [2]:
# Root directory of the Kaggle dog-breed-identification data; the cells below
# read 'labels.csv', 'sample_submission.csv', 'train/' and 'test/' beneath it.
path = '../data/dog-breed-identification/'

## 读取训练集图片id和标签

In [3]:
# Training label table; the code below reads its 'id' column (image file stem)
# and 'breed' column (class name).
train_csv = pd.read_csv(path + 'labels.csv')
train_csv

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever
...,...,...
10217,ffd25009d635cfd16e793503ac5edef0,borzoi
10218,ffd3f636f7f379c51ba3648a9ff8254f,dandie_dinmont
10219,ffe2ca6c940cddfee68fa3cc6c63213f,airedale
10220,ffe5f6d8e2bff356e9482a80a6e29aac,miniature_pinscher


In [4]:
# Show (rows, columns) of the training label table.
print(train_csv.shape)

(10222, 2)


## 将类别按字母顺序排序，并读取前十个类别

In [6]:
# Alphabetically sorted list of the distinct breed names; a breed's position in
# this list is used as its integer class index later in the notebook.
label_list = sorted(train_csv['breed'].unique().tolist())
label_list[:10]

['affenpinscher',
 'afghan_hound',
 'african_hunting_dog',
 'airedale',
 'american_staffordshire_terrier',
 'appenzeller',
 'australian_terrier',
 'basenji',
 'basset',
 'beagle']

## 读取测试集图片id

In [7]:
# The sample submission doubles as the list of test image ids ('id' column).
test_csv = pd.read_csv(path + 'sample_submission.csv')

In [8]:
# Show (rows, columns) of the sample submission table.
print(test_csv.shape)

(10357, 121)


## 在训练集上计算三个通道的均值和标准差

均值：\[0.4736, 0.4504, 0.3909\]

标准差：\[0.2655, 0.2607, 0.2650\]

In [9]:
# images = []
# for i in range(len(train_csv['id'])):
#     image = (transforms.ToTensor()(Image.open(path + 'train/' + train_csv['id'][i] + '.jpg'))).to(torch.float32).flatten(1, 2)
#     images.append(image)
# flattened_image = torch.cat(images, dim=1)
# print(flattened_image.shape)                                    # torch.Size([3, 1882650608])
# print(flattened_image.mean(dim=1), flattened_image.std(dim=1))  # tensor([0.4736, 0.4504, 0.3909]) tensor([0.2655, 0.2607, 0.2650])

## 定义训练数据集
#### 数据预处理：随机裁剪并缩放（插值方法随机），随机水平翻转，扰动色彩，转化为张量，标准化

In [10]:
class TrainDataset(data.Dataset):
    """Training-time view over a dataset of ``(image, label)`` pairs.

    Each fetched image is augmented with a random resized crop to 224
    (interpolation mode picked at random per sample), a random horizontal
    flip and colour jitter, then converted to a tensor and normalised with
    the channel statistics hard-coded below.
    """

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
        modes = transforms.InterpolationMode
        # Candidate interpolation modes for the per-sample random resized crop.
        self.interpolation_modes = [modes.NEAREST, modes.BILINEAR, modes.BICUBIC]
        flip = transforms.RandomHorizontalFlip(p=0.5)
        jitter = transforms.ColorJitter(brightness=0.2,
                                        contrast=0.2,
                                        saturation=0.2,
                                        hue=0.2)
        normalize = transforms.Normalize(mean=[0.4736, 0.4504, 0.3909],
                                         std=[0.2655, 0.2607, 0.2650],
                                         inplace=True)
        # Applied after the random resized crop performed in __getitem__.
        self.trans = transforms.Compose([flip,
                                         jitter,
                                         transforms.ToTensor(),
                                         normalize])

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return ``(augmented_tensor, label)`` for the sample at ``index``."""
        image, label = self.dataset[index]
        # The crop transform is rebuilt per sample so that the interpolation
        # mode can vary from one sample to the next.
        mode = self.interpolation_modes[random.randint(0, 2)]
        resized_crop = transforms.RandomResizedCrop(
            size=224,
            scale=(0.08, 1.0),
            ratio=(0.75, 1.3333333333333333),
            interpolation=mode)
        cropped = resized_crop(image)
        return self.trans(cropped), label

## 定义验证数据集
#### 数据预处理：将短边缩放到256，在中心裁剪出224\*224的一块，转化为张量，标准化

In [11]:
class ValidDataset(data.Dataset):
    """Validation-time view over a dataset of ``(image, label)`` pairs.

    Preprocessing is deterministic: resize the shorter side to 256, take the
    central 224x224 crop, convert to a tensor and normalise with the channel
    statistics hard-coded below.
    """

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
        steps = [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.4736, 0.4504, 0.3909],
                                 std=[0.2655, 0.2607, 0.2650],
                                 inplace=True),
        ]
        self.trans = transforms.Compose(steps)

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return ``(preprocessed_tensor, label)`` for the sample at ``index``."""
        sample = self.dataset[index]
        image, label = sample
        tensor = self.trans(image)
        return tensor, label

## 定义加载图片的训练验证集，主要是将图片加载出来并配合`data.random_split`将训练集随机分成训练集和验证集

In [12]:
class TrainValidDataset(data.Dataset):
    """Raw labelled images listed in the module-level ``train_csv`` table.

    Items are ``(PIL image, class index)`` pairs with no preprocessing, so the
    dataset can be split with ``data.random_split`` and each part wrapped with
    its own transform pipeline afterwards.
    """

    def __init__(self):
        super().__init__()
        # Breed name -> class index (position in the sorted ``label_list``).
        # Built once so each lookup is O(1) instead of a list scan per sample.
        self._label_to_index = {breed: i for i, breed in enumerate(label_list)}

    def __len__(self):
        """Return the number of rows in ``train_csv``."""
        return train_csv.shape[0]

    def __getitem__(self, index):
        """Load the image for row ``index`` and return ``(image, label)``."""
        image_path = path + 'train/' + train_csv['id'][index] + '.jpg'
        # Close the file deterministically and force three channels: the
        # downstream Normalize transforms are configured with 3-channel
        # statistics and would fail on greyscale / CMYK / RGBA input.
        with Image.open(image_path) as raw:
            image = raw.convert('RGB')
        label = self._label_to_index[train_csv['breed'][index]]
        return image, label

## 定义测试数据集
#### 数据预处理：
1. 将图片短边缩放到如下四个尺寸：\[256, 288, 320, 352\]。
2. 对于宽而矮的图片，裁出左中右三个正方形；对于窄而高的图片，裁出上中下三个正方形。
3. 对于每个正方形子图，在四个角落和正中央裁出五张224\*224的图片，并把原正方形子图缩放为224\*224，共6张图片，及其水平翻转版本。

#### 一张图片共产生4\*3\*6\*2=144张测试样本。

In [13]:
class TestDataset(data.Dataset):
    """Test images expanded into a stack of multi-scale, multi-crop views.

    For every size in ``sizes`` the shorter image side is resized to that
    size, three square sub-images are cut out (see ``get_3_crops``) and each
    square yields 12 views of 224x224: the square resized as a whole plus its
    mirror, and the ten views of ``transforms.TenCrop`` (four corners and
    centre, each with its mirror). With the default four sizes this gives
    4 * 3 * 12 = 144 views per image.

    Args:
        sizes: Target lengths of the shorter side. Every value must be at
            least 224 so that the 224x224 crops fit inside the squares.

    Raises:
        ValueError: If ``sizes`` is empty or contains a value below 224.
    """

    def __init__(self, sizes=(256, 288, 320, 352)):
        super().__init__()
        self.sizes = list(sizes)
        if not self.sizes:
            raise ValueError('sizes must contain at least one value')
        if any(size < 224 for size in self.sizes):
            raise ValueError(f'every size must be >= 224, got {self.sizes}')
        # The two-step "list of crops -> stacked tensor" pattern follows the
        # transforms.TenCrop example in the torchvision documentation.
        # A bound method is used instead of an anonymous lambda because
        # lambdas cannot be pickled.
        self.trans = transforms.Compose([transforms.Lambda(self.get_144_samples),
                                         transforms.Lambda(self._stack_crops),
                                         transforms.Normalize(mean=[0.4736, 0.4504, 0.3909],
                                                              std=[0.2655, 0.2607, 0.2650],
                                                              inplace=True)])

    def _stack_crops(self, crops):
        """Convert a list of PIL crops into one stacked tensor."""
        to_tensor = transforms.ToTensor()
        return torch.stack([to_tensor(crop) for crop in crops])

    def get_144_samples(self, image):
        """Return the list of 224x224 views for one image.

        The list has ``len(self.sizes) * 3 * 12`` entries, i.e. 144 with the
        default sizes.
        """
        samples = []
        resize_to_input = transforms.Resize(224)
        mirror = transforms.RandomHorizontalFlip(1)  # p=1: always flips
        ten_crop = transforms.TenCrop(224)
        for size in self.sizes:
            resized_image = transforms.Resize(size)(image)
            for crop in self.get_3_crops(resized_image):
                whole = resize_to_input(crop)
                samples.append(whole)
                samples.append(mirror(whole))
                samples += ten_crop(crop)
        # Internal invariant: 12 views per square, 3 squares per size.
        assert len(samples) == len(self.sizes) * 3 * 12
        return samples

    def get_3_crops(self, image):
        """Cut three square sub-images out of ``image``.

        Wide images give the left, middle and right squares; tall images give
        the top, middle and bottom squares; a square image is returned three
        times unchanged in size.
        """
        x, y = image.size
        if x > y:
            boxes = [[0, 0, y, y], [(x-y)//2, 0, (x+y)//2, y], [(x-y), 0, x, y]]
        elif x < y:
            boxes = [[0, 0, x, x], [0, (y-x)//2, x, (y+x)//2], [0, (y-x), x, y]]
        else:
            boxes = [[0, 0, x, y], [0, 0, x, y], [0, 0, x, y]]
        return [image.crop(box) for box in boxes]

    def __len__(self):
        """Return the number of rows in the module-level ``test_csv`` table."""
        return test_csv.shape[0]

    def __getitem__(self, index):
        """Load test image ``index`` and return its normalised stack of views."""
        image_path = path + 'test/' + test_csv['id'][index] + '.jpg'
        # Force three channels to match the 3-channel Normalize statistics and
        # close the file as soon as the pixels are read.
        with Image.open(image_path) as raw:
            image = raw.convert('RGB')
        # Original author's measurement: roughly 0.16-0.19 s per image on an
        # Intel i7-10870H.
        return self.trans(image)

## 定义网络基本块CBR：卷积  $\rightarrow$  批归一化（可选）$\rightarrow$  ReLU

In [14]:
class CBR(nn.Module):
    """Basic unit: Conv2d -> optional BatchNorm2d -> ReLU.

    When ``batch_norm`` is true the convolution is created without a bias
    term and a ``BatchNorm2d`` layer is inserted after it.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=1,
                 padding='same',
                 stride=1,
                 batch_norm=False):
        super().__init__()
        self.batch_norm = batch_norm
        use_bias = not batch_norm
        self.conv = nn.Conv2d(in_channels,
                              out_channels,
                              kernel_size=kernel_size,
                              padding=padding,
                              stride=stride,
                              bias=use_bias)
        if batch_norm:
            self.bn = nn.BatchNorm2d(out_channels)
        self.ReLU = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply convolution, optional batch normalisation, then ReLU."""
        features = self.conv(x)
        if not self.batch_norm:
            return self.ReLU(features)
        return self.ReLU(self.bn(features))

## 定义Inception块
![InceptionBlock](../pictures/inception_block.svg)

In [15]:
class InceptionBlock(nn.Module):
    """Inception block: four parallel branches concatenated along channels.

    Args:
        in_channels: Number of input channels.
        branch_1x1: Output channels of the 1x1 branch.
        branch_3x3: ``[reduce, out]`` channels of the 1x1 -> 3x3 branch.
        branch_5x5: ``[reduce, out]`` channels of the 1x1 -> 5x5 branch.
        branch_pool: Output channels of the max-pool -> 1x1 branch.
        batch_norm: Whether every ``CBR`` unit uses batch normalisation.
    """

    def __init__(self,
                 in_channels:int,
                 branch_1x1:int,
                 branch_3x3:list,
                 branch_5x5:list,
                 branch_pool:int,
                 batch_norm:bool):
        super().__init__()
        self.branch_1x1 = CBR(in_channels, branch_1x1, batch_norm=batch_norm)
        self.branch_3x3 = nn.Sequential(
            CBR(in_channels, branch_3x3[0], batch_norm=batch_norm),
            CBR(branch_3x3[0], branch_3x3[1], kernel_size=3, batch_norm=batch_norm),
        )
        self.branch_5x5 = nn.Sequential(
            CBR(in_channels, branch_5x5[0], batch_norm=batch_norm),
            CBR(branch_5x5[0], branch_5x5[1], kernel_size=5, batch_norm=batch_norm),
        )
        self.branch_pool = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, padding=1, stride=1),
            CBR(in_channels, branch_pool, batch_norm=batch_norm),
        )

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them on dim 1."""
        branch_outputs = (
            self.branch_1x1(x),
            self.branch_3x3(x),
            self.branch_5x5(x),
            self.branch_pool(x),
        )
        output = torch.cat(branch_outputs, dim=1)
        # Sanity check: every branch must preserve the spatial size.
        assert output.shape[-2:] == x.shape[-2:]
        return output

## 定义GoogLeNet
![GoogLeNet-Table1](../pictures/GoogLeNet-Table1.png)

In [16]:
class GoogLeNet(nn.Module):
    """Simplified GoogLeNet classifier built from ``CBR`` and ``InceptionBlock``.

    Five convolutional stages are followed by a flatten / dropout / linear
    head. The last stage ends in global average pooling with 1024 channels.

    Args:
        batch_norm: Whether every ``CBR`` unit uses batch normalisation.
        dropout: Dropout probability applied before the final linear layer.
        num_classes: Number of output logits (default 120, the value that
            was previously hard-coded).
    """

    def __init__(self, batch_norm=False, dropout=0.4, num_classes=120):
        super().__init__()
        self.batch_norm = batch_norm
        self.stage_1 = nn.Sequential(
            CBR(3, 64, kernel_size=7, stride=2, padding=3, batch_norm=batch_norm),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

        # NOTE(review): Table 1 of the GoogLeNet paper appears to list a 1x1
        # "reduce" convolution (64 channels) before this 3x3 convolution; it
        # is not present here - confirm the simplification is intended.
        self.stage_2 = nn.Sequential(
            CBR(64, 192, kernel_size=3, batch_norm=batch_norm),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

        # Each block's in_channels equals the sum of the previous block's four
        # branch outputs (e.g. 64 + 128 + 32 + 32 = 256).
        self.stage_3 = nn.Sequential(
            InceptionBlock(192, 64,  [96,  128], [16, 32],  32,  batch_norm),
            InceptionBlock(256, 128, [128, 192], [32, 96],  64,  batch_norm),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

        self.stage_4 = nn.Sequential(
            InceptionBlock(480, 192, [96,  208], [16, 48],  64,  batch_norm),
            InceptionBlock(512, 160, [112, 224], [24, 64],  64,  batch_norm),
            InceptionBlock(512, 128, [128, 256], [24, 64],  64,  batch_norm),
            InceptionBlock(512, 112, [144, 288], [32, 64],  64,  batch_norm),
            InceptionBlock(528, 256, [160, 320], [32, 128], 128, batch_norm),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

        self.stage_5 = nn.Sequential(
            InceptionBlock(832, 256, [160, 320], [32, 128], 128, batch_norm),
            InceptionBlock(832, 384, [192, 384], [48, 128], 128, batch_norm),
            nn.AdaptiveAvgPool2d((1, 1)))

        # 384 + 384 + 128 + 128 = 1024 channels leave the last Inception block.
        self.FC = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(p=dropout),
            nn.Linear(1024, num_classes))

    def print_num_params(self):
        """Print the total and the trainable parameter counts."""
        total_params = sum(p.numel() for p in self.parameters())
        print(f'{total_params:,} total parameters.')
        total_trainable_params = sum(
            p.numel() for p in self.parameters() if p.requires_grad)
        print(f'{total_trainable_params:,} trainable parameters.')

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        stages = (self.stage_1, self.stage_2, self.stage_3,
                  self.stage_4, self.stage_5)
        for stage in stages:
            x = stage(x)
        return self.FC(x)

## 将训练集随机分成训练集和验证集，实例化测试集

In [17]:
# Split the labelled images into 8688 training samples and the remainder for
# validation. The validation size is derived from the dataset length instead
# of a hard-coded total, so random_split does not fail if the CSV row count
# differs; the fixed seed keeps the split reproducible.
full_dataset = TrainValidDataset()
num_train = 8688
train_dataset, valid_dataset = data.random_split(full_dataset,
                                                 [num_train, len(full_dataset) - num_train],
                                                 generator=torch.Generator().manual_seed(42))
# Wrap each part with its own preprocessing pipeline.
train_dataset, valid_dataset = TrainDataset(train_dataset), ValidDataset(valid_dataset)

In [18]:
# Test set with multi-scale / multi-crop expansion of every image.
test_dataset = TestDataset()

In [19]:
def evaluate_loss_acc(net, data_iter, criterion, device=device):
    """Evaluate ``net`` on ``data_iter`` and return ``(mean loss, accuracy)``.

    The network is switched to evaluation mode and is left in that mode.
    The loss is averaged per sample: every batch loss is weighted by its
    batch size, matching how the training loop accumulates its loss, so a
    smaller final batch is not over-weighted. This assumes ``criterion``
    returns the mean over the batch, as the ``nn.CrossEntropyLoss()`` used in
    this notebook does by default.

    Args:
        net: Model to evaluate.
        data_iter: Iterable of ``(input, target)`` batches.
        criterion: Loss function called as ``criterion(output, target)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(loss, accuracy)`` averaged over all evaluated samples.

    Raises:
        ValueError: If ``data_iter`` yields no samples.
    """
    net.eval()  # evaluation mode
    total_loss, num_correct, num_samples = 0.0, 0.0, 0
    with torch.no_grad():
        for inputs, target in data_iter:
            inputs = inputs.to(device)
            target = target.to(device)
            output = net(inputs)
            batch_size = target.numel()
            total_loss += float(criterion(output, target).item()) * batch_size
            num_correct += d2l.accuracy(output, target)
            num_samples += batch_size
    if num_samples == 0:
        raise ValueError('data_iter yielded no samples to evaluate')
    return total_loss / num_samples, num_correct / num_samples

In [20]:
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group."""
    first_group = optimizer.param_groups[0]
    return first_group['lr']

## 定义GoogLeNet训练函数

In [21]:
def train_GoogLeNet(net,
                    batch_size,
                    lr,
                    num_epochs,
                    weight_decay=5e-4):
    """Train ``net`` on the module-level train/validation datasets.

    All ``nn.Linear`` and ``nn.Conv2d`` weights are re-initialised from a
    normal distribution (std 0.1) first, so any previous training of ``net``
    is discarded. Optimisation uses Adam with cosine-annealed learning rate.
    Per-epoch train/validation loss, accuracy and the learning rate are
    written to TensorBoard under ``runs/GoogLeNet`` and printed.

    Args:
        net: Model to train; expected to already live on ``device``.
        batch_size: Batch size for both data loaders.
        lr: Initial learning rate.
        num_epochs: Number of epochs, also used as ``T_max`` of the scheduler.
        weight_decay: Weight decay passed to Adam.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1.
    """
    if num_epochs < 1:
        raise ValueError(f'num_epochs must be at least 1, got {num_epochs}')

    def init_weights(m):
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            nn.init.normal_(m.weight, mean=0, std=0.1)

    writer = SummaryWriter('runs/GoogLeNet')
    try:
        train_iter = data.DataLoader(train_dataset, batch_size=batch_size,
                                     shuffle=True, num_workers=8)
        valid_iter = data.DataLoader(valid_dataset, batch_size=batch_size,
                                     shuffle=False, num_workers=8)
        net.apply(init_weights)
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=lr,
                                     weight_decay=weight_decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
        criterion = nn.CrossEntropyLoss()
        timer = d2l.Timer()
        for epoch in range(num_epochs):
            tic = time.time()
            # Accumulates: summed loss, number of correct predictions, samples.
            metric = d2l.Accumulator(3)
            net.train()
            for inputs, target in train_iter:
                timer.start()
                optimizer.zero_grad()
                inputs, target = inputs.to(device), target.to(device)
                output = net(inputs)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
                with torch.no_grad():
                    # Weight the batch-mean loss by the batch size so the
                    # epoch average is per sample.
                    metric.add(loss * inputs.shape[0],
                               d2l.accuracy(output, target),
                               inputs.shape[0])
                timer.stop()
            train_loss = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            valid_loss, valid_acc = evaluate_loss_acc(net, valid_iter, criterion, device)
            writer.add_scalar('train/loss', train_loss, global_step=epoch+1)
            writer.add_scalar('train/accuracy', train_acc, global_step=epoch+1)
            writer.add_scalar('valid/loss', valid_loss, global_step=epoch+1)
            writer.add_scalar('valid/accuracy', valid_acc, global_step=epoch+1)
            # Log the rate used during this epoch before the scheduler moves on.
            writer.add_scalar('learning rate', get_lr(optimizer), global_step=epoch+1)
            scheduler.step()
            toc = time.time()
            print(f"epoch {epoch+1:3d}, train loss: {train_loss:.4f}, train accuracy: {train_acc:.4f}, "
                  f"valid loss: {valid_loss:.4f}, valid accuracy: {valid_acc:.4f}, time: {toc-tic:.4f}")
    finally:
        # Flush pending events and release the log file even if training fails.
        writer.close()
    print(f'train loss {train_loss:.3f}, train acc {train_acc:.3f}, '
          f'valid loss {valid_loss:.3f}, valid acc {valid_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')

In [22]:
# Build the network with batch normalisation and dropout 0.5, move it to the
# selected device, report its parameter counts and display its structure.
net = GoogLeNet(batch_norm=True, dropout=0.5).to(device)
net.print_num_params()
net

## 训练GoogLeNet

In [23]:
# Run training; note that train_GoogLeNet re-initialises the weights of all
# Linear / Conv2d layers of `net` before it starts.
train_GoogLeNet(net,
                batch_size=256,
                lr=1e-3,
                num_epochs=200,
                weight_decay=5e-4)

In [24]:
# Saves the whole module object (pickled), not just a state_dict, so loading
# the file later requires the same class definitions to be importable.
torch.save(net, 'GoogLeNet.pth')

In [25]:
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): the file holds a fully pickled module; unpickling executes
# arbitrary code, so only load checkpoints you created yourself. Recent
# PyTorch releases default torch.load to weights_only=True, in which case this
# call needs weights_only=False (or a state_dict checkpoint) - confirm against
# the installed version.
net = torch.load('GoogLeNet.pth', map_location=device).to(device)

In [26]:
# batch_size=1: every dataset item is already a stack of views of one image,
# which Test() feeds to the network as a single batch.
test_dataloader = data.DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=8)

## 定义测试函数，生成和`sample_submission.csv`格式相同的`DataFrame`

In [27]:
def Test(net, one_hot=False):
    """Run inference on the test loader and build a submission table.

    For each test image the softmax outputs of all its views are averaged
    and stored as a float16 tensor.

    Args:
        net: Trained model, already on ``device``.
        one_hot: If true, each row holds 1.0 for the arg-max class and 0.0
            elsewhere; otherwise each row holds the averaged probabilities.

    Returns:
        Tuple ``(outputs, submission)``: the list of per-image averaged
        probability tensors and a ``DataFrame`` with columns ``'id'`` followed
        by the breed names in ``label_list``.
    """
    net.eval()
    column = ['id'] + label_list
    outputs = []
    for batch in test_dataloader:
        # Drop the loader's batch dimension: the views become the batch.
        views = batch.to(device).squeeze(0)
        with torch.no_grad():
            logits = net(views)
        averaged = F.softmax(logits, dim=1).mean(dim=0)
        outputs.append(averaged.to(torch.float16))
    print('Inference done! Building submission...')

    def build_row(i, output):
        """Return the submission row for test image ``i``."""
        image_id = test_csv['id'][i]
        if one_hot:
            pred = int(output.argmax())
            row = [image_id] + [0.] * (test_csv.shape[1] - 1)
            row[pred+1] = 1.  # +1 skips the leading 'id' entry
        else:
            row = [image_id] + list(output.cpu().numpy())
        return pd.Series(row, index=column)

    rows = [build_row(i, output) for i, output in enumerate(outputs)]
    submission = pd.DataFrame(rows)
    return outputs, submission

In [28]:
# Predict on the test set; one_hot=True writes a hard 0/1 prediction per image.
outputs, submission = Test(net, one_hot=True)

Inference done! Building submission...


## 生成提交文件

In [29]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission_onehot.csv', index=False)