![](../image/nin.svg)


改进思路:

- NiN认为如果使用了全连接层，可能会完全放弃表征的**空间结构**。且全连接层容易造成过拟合. 

- 所以相对于AlexNet, NiN完全取消了全连接层, 替换其为NiN块+全局平均汇聚层(将通道维度视为不同特征)


NiN块:

- NiN块以一个普通卷积层(第一层的卷积窗口形状通常由用户设置)开始，后面是两个1x1的卷积层。每个卷积层后都跟ReLU.

- 输出通道数由第一层卷积层调节(后面2个1x1卷积层都是相同输入输出通道数的)

- 1x1卷积层的意义: 相当于在每个像素位置上独立作用的全连接层(只在通道维度上做全连接), 因此既引入了逐像素的非线性变换, 又保留了空间结构

整体架构:

- 4个NiN块. 前3个NiN块的第一层卷积窗口的形状参照AlexNet, 分别使用窗口形状为11x11,5x5,3x3的卷积核; 第4个NiN块同样使用3x3的卷积核, 其输出通道数等于标签类别数10.

- 前3个NiN块后各有一个最大汇聚层，汇聚窗口形状为3x3，步幅为2。第3个最大汇聚层之后还有一个丢弃概率为0.5的Dropout层; 第4个NiN块后不接最大汇聚层, 而是直接接全局平均汇聚层。

- 最后有一个全局平均汇聚层, 将10通道的结果, 每个通道汇聚成一个标量, 得到10个类别的对数几率(logits). 对数几率是未归一化的得分, 并不是概率, 经过softmax之后才是属于各类别的概率.

In [1]:
import torch
# torchvision.datasets.FashionMNIST
import torchvision
# 修改数据集格式
from torchvision import transforms
# data.DataLoader
from torch.utils import data
# nn块
from torch import nn

In [2]:
# ----------- hyperparameters -----------
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)
# Number of samples per mini-batch.
batch_size = 128
# Learning rate handed to the optimizer.
lr = 3e-2
# Number of full passes over the training set.
num_epochs = 10

cuda


In [3]:
# Preprocessing pipeline: upscale every image to 224x224 and convert it to a
# tensor. Wrapping the steps in a Compose object lets it be passed directly
# as ``transform=trans``.
trans = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])
# Training split (downloaded into ../data when it is not there yet).
mnist_train_totensor = torchvision.datasets.FashionMNIST(
    root="../data", train=True, download=True, transform=trans)
# Test split, preprocessed with the same pipeline.
mnist_test_totensor = torchvision.datasets.FashionMNIST(
    root="../data", train=False, download=True, transform=trans)
# The raw images are 28x28; after the Resize above one sample is
# (1, 224, 224), which the expression below displays.
mnist_train_totensor[0][0].shape

torch.Size([1, 224, 224])

In [4]:
# Training batches are drawn in a freshly shuffled order every epoch.
# num_workers=4 loads the data with 4 worker processes.
train_iter = data.DataLoader(
    mnist_train_totensor, batch_size, shuffle=True, num_workers=4)
# The evaluation metrics do not depend on sample order, so the test set is
# read sequentially; this keeps evaluation runs deterministic.
test_iter = data.DataLoader(
    mnist_test_totensor, batch_size, shuffle=False, num_workers=4)

In [5]:
def nin_block(in_channels, out_channels, kernel_size, strides, padding):
    """Build one NiN block: a regular convolution followed by two 1x1 convolutions.

    Every convolution is followed by a ReLU.

    Args:
        in_channels: number of input channels of the first convolution.
        out_channels: number of output channels of the block; the two 1x1
            convolutions keep this channel count.
        kernel_size: kernel size of the first convolution.
        strides: stride of the first convolution.
        padding: padding of the first convolution.

    Returns:
        An ``nn.Sequential`` holding the six layers in order.
    """
    # The leading convolution is the only layer configured by the caller.
    layers = [
        nn.Conv2d(in_channels, out_channels, kernel_size, strides, padding),
        nn.ReLU(),
    ]
    # Two identical 1x1 convolution + ReLU stages that mix channels per pixel.
    for _ in range(2):
        layers.append(nn.Conv2d(out_channels, out_channels, kernel_size=1))
        layers.append(nn.ReLU())
    return nn.Sequential(*layers)


# Full NiN model: three NiN blocks each followed by 3x3/stride-2 max pooling,
# dropout, a last NiN block with one output channel per class, then global
# average pooling and flattening.
net = nn.Sequential(
    nin_block(in_channels=1, out_channels=96, kernel_size=11, strides=4, padding=0),
    nn.MaxPool2d(kernel_size=3, stride=2),

    nin_block(in_channels=96, out_channels=256, kernel_size=5, strides=1, padding=2),
    nn.MaxPool2d(kernel_size=3, stride=2),

    nin_block(in_channels=256, out_channels=384, kernel_size=3, strides=1, padding=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(p=0.5),

    # There are 10 label classes, hence 10 output channels.
    nin_block(in_channels=384, out_channels=10, kernel_size=3, strides=1, padding=1),

    # Collapse each channel's feature map to a single value.
    nn.AdaptiveAvgPool2d((1, 1)),

    # Turn the 4-D output into a 2-D one of shape (batch size, 10).
    nn.Flatten(),
)
# Move all parameters to the selected device.
net = net.to(device)
net

Sequential(
  (0): Sequential(
    (0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1))
    (3): ReLU()
    (4): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1))
    (5): ReLU()
  )
  (1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (2): Sequential(
    (0): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (3): ReLU()
    (4): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (5): ReLU()
  )
  (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1))
    (3): ReLU()
    (4): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1))
    (5): ReLU()
  )
  (5): MaxPool2d(kernel_size=3, stri

In [6]:
# Push one random 1x1x224x224 input through the network layer by layer and
# print the output shape after each top-level layer.
X = torch.rand(size=(1, 1, 224, 224), dtype=torch.float32).to(device)
# Only the shapes are of interest here, so gradient tracking is switched off.
with torch.no_grad():
    for layer in net:
        X = layer(X)
        # Pad to 18 columns: the longest layer name, "AdaptiveAvgPool2d", has
        # 17 characters and would otherwise run into the shape.
        print(f'output shape: {layer.__class__.__name__: <18}{X.shape}')

output shape: Sequential     torch.Size([1, 96, 54, 54])
output shape: MaxPool2d      torch.Size([1, 96, 26, 26])
output shape: Sequential     torch.Size([1, 256, 26, 26])
output shape: MaxPool2d      torch.Size([1, 256, 12, 12])
output shape: Sequential     torch.Size([1, 384, 12, 12])
output shape: MaxPool2d      torch.Size([1, 384, 5, 5])
output shape: Dropout        torch.Size([1, 384, 5, 5])
output shape: Sequential     torch.Size([1, 10, 5, 5])
output shape: AdaptiveAvgPool2dtorch.Size([1, 10, 1, 1])
output shape: Flatten        torch.Size([1, 10])


In [7]:
def init_weights(m):
    """Apply Xavier-uniform initialization to the weight of a linear or conv layer.

    Intended to be passed to ``nn.Module.apply``. Modules that are neither
    ``nn.Linear`` nor ``nn.Conv2d`` (nor subclasses of them) are left
    untouched, and biases are never modified.

    Args:
        m: the module to (possibly) initialize in place.
    """
    # isinstance also covers subclasses, which an exact type() comparison
    # would silently skip.
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(m.weight)


# Re-initialize the weights of every linear / convolutional layer in the net.
net.apply(init_weights)
# Cross-entropy loss computed from the network outputs and the class labels.
loss = nn.CrossEntropyLoss()
# Plain SGD over all network parameters with the configured learning rate.
optimizer = torch.optim.SGD(params=net.parameters(), lr=lr)

In [8]:
def train_loop(train_iter, net, loss, optimizer):
    """Train ``net`` for one pass over ``train_iter``.

    Batches are moved to the module-level ``device`` before the forward pass.
    A carriage-return progress counter is printed after every batch.

    Args:
        train_iter: iterable of ``(X, y)`` batches that also supports ``len()``.
        net: the model to train; it is switched to training mode.
        loss: callable ``loss(y_hat, y)`` returning a scalar tensor.
        optimizer: optimizer used to update the model parameters.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    # Number of batches in one pass.
    num_batchs = len(train_iter)
    # Running sum of the per-batch losses.
    total_train_loss = 0
    # Make sure layers such as Dropout run in training mode, even if an
    # earlier evaluation left the model in eval mode.
    net.train()
    for batch, (X, y) in enumerate(train_iter):
        # Move the batch to the target device.
        X, y = X.to(device), y.to(device)
        # Forward pass for this batch.
        y_hat = net(X)

        train_loss = loss(y_hat, y)
        total_train_loss += train_loss.item()

        # Backpropagation and parameter update.
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Progress indicator, overwritten in place on the same line.
        print(f"\r[{batch+1:>8d}/{num_batchs:>8d}]  ", end='')

    return total_train_loss / num_batchs

In [9]:
# --------- training
# Run one training pass per epoch and report the loss value it returns.
for epoch in range(num_epochs):
    total_train_loss = train_loop(
        train_iter=train_iter, net=net, loss=loss, optimizer=optimizer)
    print(f'epoch {epoch + 1}, total_train_loss {total_train_loss:f}')

[     469/     469]  epoch 1, total_train_loss 2.306483
[     469/     469]  epoch 2, total_train_loss 2.302024
[     469/     469]  epoch 3, total_train_loss 2.196620
[     469/     469]  epoch 4, total_train_loss 1.227250
[     469/     469]  epoch 5, total_train_loss 0.776235
[     469/     469]  epoch 6, total_train_loss 0.633952
[     469/     469]  epoch 7, total_train_loss 0.607365
[     469/     469]  epoch 8, total_train_loss 0.549156
[     469/     469]  epoch 9, total_train_loss 0.475780
[     469/     469]  epoch 10, total_train_loss 0.441250


In [12]:
# ---------- evaluation
def test_net(test_iter, net, loss):
    """Evaluate ``net`` on ``test_iter`` and print accuracy and average loss.

    Batches are moved to the module-level ``device``. The model is switched
    to evaluation mode and stays in that mode afterwards. A carriage-return
    progress counter is printed after every batch.

    Args:
        test_iter: iterable of ``(X, y)`` batches that also supports ``len()``.
        net: the model to evaluate.
        loss: callable ``loss(y_hat, y)`` returning a scalar tensor.
    """
    # Number of batches in one pass.
    num_batchs = len(test_iter)
    # Running sum of per-batch losses, correct predictions and samples seen.
    total_test_loss, total_correct, total_samples = 0, 0, 0
    # Evaluation mode (disables Dropout).
    net.eval()
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch, (X, y) in enumerate(test_iter):
            # Move the batch to the target device.
            X, y = X.to(device), y.to(device)
            y_hat = net(X)

            total_test_loss += loss(y_hat, y).item()
            # Count samples whose highest-scoring class matches the label.
            total_correct += (y_hat.argmax(1) == y).sum().item()
            total_samples += len(X)

            # Progress indicator, overwritten in place on the same line.
            print(f"\r[{batch+1:>8d}/{num_batchs:>8d}]  ", end='')

    # The reported loss is the mean of the per-batch loss values.
    total_test_loss /= num_batchs
    # Accuracy is taken over all samples rather than averaged per batch, so
    # a smaller final batch does not get disproportionate weight.
    accuracy = total_correct / total_samples
    print(
        f"\nTest: Accuracy: {accuracy:.1%}, Avg loss: {total_test_loss:f}")
    
# Evaluate the trained network on the test set.
test_net(test_iter=test_iter, net=net, loss=loss)

[      79/      79]  
Test: Accuracy: 86.0%, Avg loss: 0.390855
