


改进思路:

- NiN认为如果使用了全连接层，可能会完全放弃表征的**空间结构**。且全连接层容易造成过拟合. 

- 所以相对于AlexNet, NiN完全取消了全连接层, 替换其为NiN块+全局平均汇聚层(将通道维度视为不同特征)


ResNet块:

- ResNet沿用了VGG完整的3x3卷积层设计。残差块里首先有2个输出通道数相同的3x3卷积层, 每个卷积层后接一个批量规范化层; 第一个批量规范化层后接ReLU激活函数, 第二个批量规范化层的输出先与跨层数据通路相加, 再经过最后的ReLU激活函数。

- 根据输出与输入的通道数(及高宽)是否一致, 跨层数据通路分为两种:
  - 第一种跨层数据通路(一样): 将输入直接加在最后的ReLU激活函数前
  - 第二种跨层数据通路(不一样): 将输入通过1x1卷积层后(改变通道数), 再做相加运算

![](../image/resnet-block.svg)

整体架构:



- 第一个模块和GoogLeNet中的一样(7x7的卷积层, 3x3的最大汇聚层), 区别是在ReLU前加了批量规范化层.

- 共8个残差块(Residual), 每2个组成一个模块, 共4个模块
  
  第一个模块的2个残差块一样, 通道数同输入通道数一致。 
  
  之后的每个模块在第一个残差块里将上一个模块的通道数翻倍，并将高和宽减半, 第二个残差块通道数同输入通道数一致.

- 最后有一个全局平均汇聚层, 将512个通道各自汇聚成一个标量, 展平后经全连接层(512→10)输出10个类别的对数几率(logits); 对数几率经softmax后才是属于各类别的概率.

![](../image/resnet18.svg)

In [1]:
import torch
# torchvision.datasets.FashionMNIST
import torchvision
# 修改数据集格式
from torchvision import transforms
# data.DataLoader
from torch.utils import data
# nn块
from torch import nn

In [2]:
# ----------- Hyperparameters -----------
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Mini-batch size, SGD learning rate and number of training epochs.
batch_size, lr, num_epochs = 128, 3e-2, 10

cuda


In [3]:
# Preprocessing pipeline: resize every image to 224x224, then convert it to a
# tensor. Wrapping the steps in Compose lets them be passed as `transform=`.
trans = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Fashion-MNIST training split (downloaded into ../data when missing).
mnist_train_totensor = torchvision.datasets.FashionMNIST(
    root="../data", train=True, download=True, transform=trans)

# Fashion-MNIST test split, with the same preprocessing.
mnist_test_totensor = torchvision.datasets.FashionMNIST(
    root="../data", train=False, download=True, transform=trans)

# Shape of the first preprocessed training image (1 channel, 224x224 after
# the resize above).
mnist_train_totensor[0][0].shape

torch.Size([1, 224, 224])

In [4]:
# Training loader: reshuffle the samples every epoch and read them with
# 4 worker processes.
train_iter = data.DataLoader(
    mnist_train_totensor, batch_size, shuffle=True, num_workers=4)
# Test loader: evaluation does not benefit from shuffling, so keep the dataset
# order; this makes the batches (including the smaller final one) reproducible.
test_iter = data.DataLoader(
    mnist_test_totensor, batch_size, shuffle=False, num_workers=4)

In [5]:
# 制造Residual块
# input_channels: 输入通道数
# num_channels: 输出通道数
# use_1x1conv: 根据通道一样不一样. False表示第一种, True表示第二种
# strides: 图像用作缩小吗
class Residual(nn.Module):
    def __init__(self, input_channels, num_channels,
                 use_1x1conv=False, strides=1):
        super().__init__()
        # 2个3x3的卷积层
        self.conv1 = nn.Conv2d(input_channels, num_channels,
                               kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels,
                               kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(input_channels, num_channels,
                                   kernel_size=1, stride=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)
        self.relu = nn.ReLU()

    def forward(self, X):
        Y = self.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        Y += X
        return self.relu(Y)


def resnet_block(input_channels, num_channels, num_residuals,
                 first_block=False):
    """Build the list of Residual blocks that make up one ResNet stage.

    Args:
        input_channels: Number of channels entering the stage.
        num_channels: Number of channels produced by every block of the stage.
        num_residuals: Number of Residual blocks in the stage.
        first_block: True for the stage that directly follows the stem; its
            first block keeps the spatial size instead of halving it.

    Returns:
        A list of Residual modules, ready to be unpacked into nn.Sequential.
    """
    blk = []
    for i in range(num_residuals):
        if i > 0:
            # Every block after the first keeps channels and resolution.
            blk.append(Residual(num_channels, num_channels))
        elif not first_block:
            # First block of a later stage: change the channel count and
            # halve height/width, projecting the shortcut with a 1x1 conv.
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=True, strides=2))
        elif input_channels != num_channels:
            # First stage with a different channel count: the shortcut must
            # still be projected, but the resolution is kept (stride 1).
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=True))
        else:
            blk.append(Residual(num_channels, num_channels))
    return blk


# Stem: 7x7 convolution (stride 2), batch normalization, ReLU, then a 3x3
# max pooling (stride 2).
b1 = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Four stages of two residual blocks each. The first stage keeps 64 channels
# and the resolution; each later stage doubles the channels and uses stride 2
# in its first block.
b2 = nn.Sequential(*resnet_block(64, 64, 2, first_block=True))
b3 = nn.Sequential(*resnet_block(64, 128, 2))
b4 = nn.Sequential(*resnet_block(128, 256, 2))
b5 = nn.Sequential(*resnet_block(256, 512, 2))

# Head: global average pooling over the 512 channels, flatten, then a linear
# layer producing 10 outputs.
stages = (b1, b2, b3, b4, b5)
head = (nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten(), nn.Linear(512, 10))
net = nn.Sequential(*stages, *head).to(device)
net

Sequential(
  (0): Sequential(
    (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (1): Sequential(
    (0): Residual(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
    )
    (1): Residual(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): Ba

In [6]:
# Sanity check: feed one random 1x224x224 input through the network and print
# the output shape after every top-level layer.
X = torch.rand(size=(1, 1, 224, 224), dtype=torch.float32).to(device)
# Pad the layer name to the longest class name; a fixed width of 15 is too
# narrow for "AdaptiveAvgPool2d" and makes the name run into the shape.
name_width = max((len(layer.__class__.__name__) for layer in net), default=0) + 1
# Only shapes are inspected, so there is no need to record gradients.
with torch.no_grad():
    for layer in net:
        X = layer(X)
        print(f'output shape: {layer.__class__.__name__: <{name_width}}{X.shape}')

output shape: Sequential     torch.Size([1, 64, 56, 56])
output shape: Sequential     torch.Size([1, 64, 56, 56])
output shape: Sequential     torch.Size([1, 128, 28, 28])
output shape: Sequential     torch.Size([1, 256, 14, 14])
output shape: Sequential     torch.Size([1, 512, 7, 7])
output shape: AdaptiveAvgPool2dtorch.Size([1, 512, 1, 1])
output shape: Flatten        torch.Size([1, 512])
output shape: Linear         torch.Size([1, 10])


In [7]:
def init_weights(m):
    """Apply Xavier-uniform initialization to linear and 2-D conv weights.

    Intended for use with ``net.apply``. Modules of any other type are left
    untouched, and biases are not modified.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of nn.Linear / nn.Conv2d.
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(m.weight)


# Re-initialize the weights of every linear / convolution layer in the network.
net.apply(init_weights)
# Cross-entropy loss for classification.
loss = nn.CrossEntropyLoss()
# Plain SGD over all network parameters with the configured learning rate.
optimizer = torch.optim.SGD(params=net.parameters(), lr=lr)

In [8]:
def train_loop(train_iter, net, loss, optimizer):
    """Train ``net`` for one pass over ``train_iter``.

    Args:
        train_iter: Iterable of ``(X, y)`` batches that supports ``len()``.
        net: Model to train; updated in place.
        loss: Callable ``loss(y_hat, y)`` returning a scalar tensor.
        optimizer: Optimizer holding the parameters of ``net``.

    Returns:
        The mean of the per-batch loss values over the pass.
    """
    # Make sure layers such as batch normalization run in training mode, even
    # if an earlier evaluation left the model in eval mode.
    net.train()
    # Number of batches in one pass.
    num_batchs = len(train_iter)
    # Running sum of the per-batch losses.
    total_train_loss = 0
    for batch, (X, y) in enumerate(train_iter):
        # Move the batch to the device selected at module level.
        X, y = X.to(device), y.to(device)
        # Forward pass and loss for this batch.
        y_hat = net(X)
        train_loss = loss(y_hat, y)
        total_train_loss += train_loss.item()

        # Backpropagation and parameter update.
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Progress indicator, rewritten in place on a single line.
        print(f"\r[{batch+1:>8d}/{num_batchs:>8d}]  ", end='')

    return total_train_loss / num_batchs

In [9]:
# --------- Training
# Run the configured number of epochs and report the mean batch loss of each.
for epoch in range(num_epochs):
    total_train_loss = train_loop(
        train_iter=train_iter, net=net, loss=loss, optimizer=optimizer)
    print(f'epoch {epoch + 1}, total_train_loss {total_train_loss:f}')

[     469/     469]  epoch 1, total_train_loss 0.493547
[     469/     469]  epoch 2, total_train_loss 0.277788
[     469/     469]  epoch 3, total_train_loss 0.218713
[     469/     469]  epoch 4, total_train_loss 0.178624
[     469/     469]  epoch 5, total_train_loss 0.143290
[     469/     469]  epoch 6, total_train_loss 0.110940
[     469/     469]  epoch 7, total_train_loss 0.081509
[     469/     469]  epoch 8, total_train_loss 0.054200
[     469/     469]  epoch 9, total_train_loss 0.033889
[     469/     469]  epoch 10, total_train_loss 0.021092


In [10]:
# ---------- Evaluation
def test_net(test_iter, net, loss):
    """Evaluate ``net`` on ``test_iter`` and print accuracy and mean loss.

    Args:
        test_iter: Iterable of ``(X, y)`` batches that supports ``len()``.
        net: Model to evaluate; it is switched to eval mode and left there.
        loss: Callable ``loss(y_hat, y)`` returning a scalar tensor.
    """
    # Number of batches in the test set.
    num_batchs = len(test_iter)
    # Running sum of per-batch losses, correct predictions and samples seen.
    total_test_loss, total_correct, num_samples = 0, 0, 0
    # Evaluation mode (affects layers such as batch normalization).
    net.eval()
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch, (X, y) in enumerate(test_iter):
            # Move the batch to the device selected at module level.
            X, y = X.to(device), y.to(device)
            y_hat = net(X)

            total_test_loss += loss(y_hat, y).item()
            # Count predictions whose arg-max class equals the label.
            total_correct += (y_hat.argmax(1) == y).sum().item()
            num_samples += len(X)

            # Progress indicator, rewritten in place on a single line.
            print(f"\r[{batch+1:>8d}/{num_batchs:>8d}]  ", end='')

    total_test_loss /= num_batchs
    # Divide by the number of samples rather than averaging per-batch
    # accuracies, so a smaller final batch is not over-weighted.
    total_correct /= num_samples
    print(
        f"\nTest: Accuracy: {total_correct:.1%}, Avg loss: {total_test_loss:f}")


test_net(test_iter, net, loss)

[      79/      79]  
Test: Accuracy: 88.8%, Avg loss: 0.581759
