# 第4章 卷积神经网络

### 4.1 二维卷积层

In [1]:
import torch
from torch import nn
def corr2d(X, K):
    # 行、列值
    h, w = K.shape
    # 卷积结果的存放位置
    Y = torch.zeros((X.shape[0]-h+1, X.shape[1]-w+1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i:i+h, j:j+w] * K).sum()
    return Y

In [2]:
X = torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
K = torch.tensor([[0, 1], [2, 3]])
corr2d(X, K)

tensor([[19., 25.],
        [37., 43.]])

In [3]:
class Conv2D(nn.Module):
    def __init__(self, kernel_size):
        super(Conv2D, self).__init__()
        self.weight = nn.Parameter(torch.randn(kernel_size))
        self.bias = nn.Parameter(torch.randn(1))
    def forward(self, x):
        return corr2d(x, self.weight) + self.bias

In [4]:
X = torch.ones(6, 8)
X[:, 2:6] = 0
X

tensor([[1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.]])

In [5]:
K = torch.tensor([[1, -1]])

In [6]:
Y = corr2d(X, K.float())
Y

tensor([[ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.]])

In [7]:
conv2d = Conv2D(kernel_size=(1, 2))
step = 30
lr = 0.01
for i in range(step):
    Y_hat = conv2d(X)
    l = ((Y_hat - Y)**2).sum()
    l.backward()
    # 梯度下降
    conv2d.weight.data -= lr * conv2d.weight.grad
    conv2d.bias.data -= lr * conv2d.bias.grad
    # 梯度清零
    conv2d.weight.grad.fill_(0)
    conv2d.bias.grad.fill_(0)
    if (i+1)%5 == 0:
        print('Step %d, loss %.3f' % (i+1, l.item()))

Step 5, loss 5.936
Step 10, loss 1.648
Step 15, loss 0.459
Step 20, loss 0.128
Step 25, loss 0.036
Step 30, loss 0.010


In [8]:
print('weight: ', conv2d.weight.data)
print('bias: ', conv2d.bias.data)

weight:  tensor([[ 0.9746, -0.9748]])
bias:  tensor([9.9195e-05])


### 4.2 填充和步幅

In [9]:
# 定义一个函数来计算卷积层。它对输入和输出做相应的升维和降维
def comp_conv2d(conv2d, X):
    # (1，1)代表批量大小和通道数(“多输入通道和多输出通道”一节将介绍)均为1
    # (1, 1)+(8, 8)=(1, 1, 8, 8)
    X = X.view((1, 1)+X.shape)
    Y = conv2d(X)
    # 排除不关心的前两维：批量和通道
    return Y.view(Y.shape[2:])
# 注意这里是两侧分别填充1行或列，所以在两侧一共填充2行或列
conv2d = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)
X = torch.rand(8, 8)
comp_conv2d(conv2d, X).shape

torch.Size([8, 8])

In [10]:
# 使用高为5、宽为3的卷积核。在高和宽两侧的填充数分别为2和1
conv2d = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(5, 3), padding=(2, 1))
comp_conv2d(conv2d, X).shape

torch.Size([8, 8])

In [11]:
conv2d = nn.Conv2d(1, 1, kernel_size=3, padding=1, stride=2)
comp_conv2d(conv2d, X).shape

torch.Size([4, 4])

In [12]:
conv2d = nn.Conv2d(1, 1, kernel_size=(3, 5), padding=(0, 1), stride=(3, 4))
comp_conv2d(conv2d, X).shape

torch.Size([2, 2])

### 4.3 多输入通道和多输出通道

In [13]:
import d2lzh as d2l
def corr2d_multi_in(X, K):
    # 沿着X和K的第0维（通道维）分别计算再相加
    res = d2l.corr2d(X[0, :, :], K[0, :, :])
    for i in range(1, X.shape[0]):
        res += d2l.corr2d(X[i, :, :], K[i, :, :])
    return res

In [14]:
X = torch.tensor([
    [[0, 1, 2], [3, 4, 5], [6, 7, 8]], 
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
], dtype=torch.float)
K = torch.tensor([
    [[0, 1], [2, 3]], 
    [[1, 2], [3, 4]]
], dtype=torch.float)
corr2d_multi_in(X, K)

tensor([[ 56.,  72.],
        [104., 120.]])

In [15]:
def corr2d_multi_in_out(X, K):
    # 对K的第0维遍历，每次同输入X做互相关计算。所有结果使用stack函数合并在一起
    return torch.stack([corr2d_multi_in(X, k) for k in K])

In [16]:
K = torch.stack([K, K+1, K+2])
K.shape

torch.Size([3, 2, 2, 2])

In [17]:
corr2d_multi_in_out(X, K)

tensor([[[ 56.,  72.],
         [104., 120.]],

        [[ 76., 100.],
         [148., 172.]],

        [[ 96., 128.],
         [192., 224.]]])

In [18]:
def corr2d_multi_in_out_1x1(X, K):
    # 通道、高、宽
    c_i, h, w = X.shape
    # 输出通道数目
    c_o = K.shape[0]
    # 通道、高*宽
    # 一个通道是一个特征
    X = X.view(c_i, h*w)
    K = K.view(c_o, c_i)
    # 全连接层的矩阵乘法
    Y = torch.mm(K, X)
    return Y.view(c_o, h, w)

In [19]:
X = torch.rand(3, 3, 3)
K = torch.rand(2, 3, 1, 1)
Y1 = corr2d_multi_in_out_1x1(X, K)
Y2 = corr2d_multi_in_out(X, K)
(Y1-Y2).norm().item() < 1e-6

True

### 4.4 池化层

In [20]:
def pool2d(X, pool_size, mode='max'):
    X = X.float()
    p_h, p_w = pool_size
    # 存储池化计算结果
    Y = torch.zeros(X.shape[0]-p_h+1, X.shape[1]-p_w+1)
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            if mode=='max':
                Y[i, j] = X[i: i+p_h, j: j+p_w].max()
            elif mode=='avg':
                Y[i, j] = X[i: i+p_h, j: j+p_w].mean()
    return Y

In [21]:
X = torch.tensor([
    [0, 1, 2], 
    [3, 4, 5], 
    [6, 7, 8]
])
pool2d(X, (2, 2))

tensor([[4., 5.],
        [7., 8.]])

In [22]:
pool2d(X, (2, 2), mode='avg')

tensor([[2., 3.],
        [5., 6.]])

In [23]:
X = torch.arange(16, dtype=torch.float).view((1, 1, 4, 4))
X

tensor([[[[ 0.,  1.,  2.,  3.],
          [ 4.,  5.,  6.,  7.],
          [ 8.,  9., 10., 11.],
          [12., 13., 14., 15.]]]])

In [24]:
pool2d = nn.MaxPool2d(3)
pool2d(X)

tensor([[[[10.]]]])

In [25]:
pool2d = nn.MaxPool2d(3, padding=1, stride=2)
pool2d(X)

tensor([[[[ 5.,  7.],
          [13., 15.]]]])

In [26]:
pool2d = nn.MaxPool2d((2, 4), padding=(1, 2), stride=(2, 3))
pool2d(X)

tensor([[[[ 1.,  3.],
          [ 9., 11.],
          [13., 15.]]]])

In [27]:
X = torch.cat((X, X+1), dim=1)
X

tensor([[[[ 0.,  1.,  2.,  3.],
          [ 4.,  5.,  6.,  7.],
          [ 8.,  9., 10., 11.],
          [12., 13., 14., 15.]],

         [[ 1.,  2.,  3.,  4.],
          [ 5.,  6.,  7.,  8.],
          [ 9., 10., 11., 12.],
          [13., 14., 15., 16.]]]])

In [28]:
pool2d = nn.MaxPool2d(3, padding=1, stride=2)
pool2d(X)

tensor([[[[ 5.,  7.],
          [13., 15.]],

         [[ 6.,  8.],
          [14., 16.]]]])

### 4.5 LeNet-卷积神经网络

In [29]:
from torch import nn, optim
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv = nn.Sequential(
            # in_channels, out_channels, kernel_size
            nn.Conv2d(1, 6, 5),
            nn.Sigmoid(),
            # kernel_size, stride
            nn.MaxPool2d(2, 2),
            nn.Conv2d(6, 16, 5),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2)
        )
        self.fc = nn.Sequential(
            nn.Linear(16*4*4, 120), 
            nn.Sigmoid(), 
            nn.Linear(120, 84), 
            nn.Sigmoid(), 
            nn.Linear(84, 10)
        )
    def forward(self, img):
        feature = self.conv(img)
        output = self.fc(feature.view(img.shape[0], -1))
        return output

In [30]:
net = LeNet()
print(net)

LeNet(
  (conv): Sequential(
    (0): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
    (1): Sigmoid()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
    (4): Sigmoid()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=256, out_features=120, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=120, out_features=84, bias=True)
    (3): Sigmoid()
    (4): Linear(in_features=84, out_features=10, bias=True)
  )
)


In [31]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

In [32]:
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.8646, train acc 0.318, test acc 0.587, time 4.7 sec
epoch 2, loss 0.9301, train acc 0.646, test acc 0.685, time 4.3 sec
epoch 3, loss 0.7464, train acc 0.720, test acc 0.727, time 4.2 sec
epoch 4, loss 0.6674, train acc 0.741, test acc 0.744, time 4.2 sec
epoch 5, loss 0.6137, train acc 0.757, test acc 0.760, time 4.3 sec


### 4.6 AlexNet-深度卷积神经网络

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class AlexNet(nn.Module):
    def __init__(self):
        super(AlexNet, self).__init__()
        self.conv = nn.Sequential(
            # in_channels, out_channels, kernel_size, stride
            nn.Conv2d(1, 96, 11, 4), 
            nn.ReLU(), 
            # kernel_size, stride
            nn.MaxPool2d(3, 2), 
            # 减小卷积窗口，使用填充为2来使得输入与输出的高和宽一致，且增大输出通道数
            nn.Conv2d(96, 256, 5, 1, 2),
            nn.ReLU(), 
            nn.MaxPool2d(3, 2), 
            # 连续3个卷积层，且使用更小的卷积窗口。除了最后的卷积层外，进一步增大了输出通道数。
            # 前两个卷积层后不使用池化层来减小输入的高和宽
            nn.Conv2d(256, 384, 3, 1, 1),
            nn.ReLU(), 
            nn.Conv2d(384, 384, 3, 1, 1), 
            nn.ReLU(), 
            nn.Conv2d(384, 256, 3, 1, 1), 
            nn.ReLU(), 
            nn.MaxPool2d(3, 2)
        )
        # 这里全连接层的输出个数比LeNet中的大数倍。使用丢弃层来缓解过拟合
        self.fc = nn.Sequential(
            nn.Linear(256*5*5, 4096), 
            nn.ReLU(), 
            nn.Dropout(0.5), 
            nn.Linear(4096, 4096), 
            nn.ReLU(), 
            nn.Dropout(0.5), 
            # 输出层。由于这里使用Fashion-MNIST，所以用类别数为10，而非论文中的1000
            nn.Linear(4096, 10)
        )
    def forward(self, img):
        feature = self.conv(img)
        output = self.fc(feature.view(img.shape[0], -1))
        return output

In [34]:
net = AlexNet()
print(net)

AlexNet(
  (conv): Sequential(
    (0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU()
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=6400, out_features=4096, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=4096, out_features=4096, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5)
    (6): Linear(in_features=4096, o

In [35]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)

In [36]:
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.6961, train acc 0.732, test acc 0.841, time 26.5 sec
epoch 2, loss 0.3328, train acc 0.874, test acc 0.884, time 26.5 sec
epoch 3, loss 0.2833, train acc 0.893, test acc 0.894, time 26.1 sec
epoch 4, loss 0.2503, train acc 0.907, test acc 0.903, time 26.5 sec
epoch 5, loss 0.2291, train acc 0.915, test acc 0.906, time 26.6 sec


### 4.7 VGG-使用重复元素的网络

In [37]:
def vgg_block(num_convs, in_channels, out_channels):
    blk = []
    for i in range(num_convs):
        if i==0:
            blk.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        else:
            blk.append(nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1))
        blk.append(nn.ReLU())
    # 这里会使宽高减半
    blk.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*blk)

In [38]:
conv_arch = ((1, 1, 64), (1, 64, 128), (2, 128, 256), (2, 256, 512), (2, 512, 512))
# 经过5个vgg_block，宽高会减半5次，变成224/2**5=224/32=7
# c * w * h
fc_features = 512*7*7
fc_hidden_units = 4096

In [39]:
def vgg(conv_arch, fc_features, fc_hidden_units=4096):
    net = nn.Sequential()
    # 卷积层部分
    for i, (num_convs, in_channels, out_channels) in enumerate(conv_arch):
        # 每经过一个vgg_block都会使宽高减半
        net.add_module('vgg_block_' + str(i+1), vgg_block(num_convs, in_channels, out_channels))
    # 全连接部分
    net.add_module('fc', nn.Sequential(
        d2l.FlattenLayer(), 
        nn.Linear(fc_features, fc_hidden_units), 
        nn.ReLU(), 
        nn.Dropout(0.5), 
        nn.Linear(fc_hidden_units, fc_hidden_units), 
        nn.ReLU(), 
        nn.Dropout(0.5), 
        nn.Linear(fc_hidden_units, 10)
    ))
    return net

In [42]:
net = vgg(conv_arch, fc_features, fc_hidden_units)
X = torch.rand(1, 1, 224, 224)
# named_children获取一级子模块及其名字（named_modules会返回所有子模块，包括子模块的子模块）
for name, blk in net.named_children():
    X = blk(X)
    print(name, 'output shape: ', X.shape)

vgg_block_1 output shape:  torch.Size([1, 64, 112, 112])
vgg_block_2 output shape:  torch.Size([1, 128, 56, 56])
vgg_block_3 output shape:  torch.Size([1, 256, 28, 28])
vgg_block_4 output shape:  torch.Size([1, 512, 14, 14])
vgg_block_5 output shape:  torch.Size([1, 512, 7, 7])
fc output shape:  torch.Size([1, 10])


In [43]:
ratio = 4
small_conv_arch = ((1, 1, 64//ratio), (1, 64//ratio, 128//ratio), (2, 128//ratio, 256//ratio), (2, 256//ratio, 512//ratio), (2, 512//ratio, 512//ratio))
net = vgg(small_conv_arch, fc_features//ratio, fc_hidden_units//ratio)

In [44]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.7833, train acc 0.702, test acc 0.839, time 44.3 sec
epoch 2, loss 0.3814, train acc 0.860, test acc 0.873, time 44.5 sec
epoch 3, loss 0.3125, train acc 0.885, test acc 0.895, time 44.5 sec
epoch 4, loss 0.2730, train acc 0.899, test acc 0.904, time 44.6 sec
epoch 5, loss 0.2428, train acc 0.911, test acc 0.912, time 44.5 sec


### 4.8 NiN-网络中的网络

In [45]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def nin_block(in_channels, out_channels, kernel_size, stride, padding):
    blk = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding), 
        nn.ReLU(), 
        # 相当于全连接层
        nn.Conv2d(out_channels, out_channels, kernel_size=1), 
        nn.ReLU(), 
        nn.Conv2d(out_channels, out_channels, kernel_size=1), 
        nn.ReLU()
    )
    return blk

In [46]:
import torch.nn.functional as F
class GlobalAvgPool2d(nn.Module):
    # 全局平均池化层可通过将池化窗口形状设置成输入的高和宽实现
    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()
    def forward(self, x):
        return F.avg_pool2d(x, kernel_size=x.size()[2:])

In [47]:
net = nn.Sequential(
    nin_block(1, 96, kernel_size=11, stride=4, padding=0), 
    nn.MaxPool2d(kernel_size=3, stride=2), 
    nin_block(96, 256, kernel_size=5, stride=1, padding=2), 
    nn.MaxPool2d(kernel_size=3, stride=2),
    nin_block(256, 384, kernel_size=3, stride=1, padding=1), 
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(0.5), 
    # 标签类别数是10
    nin_block(384, 10, kernel_size=3, stride=1, padding=1), 
    GlobalAvgPool2d(), 
    # 将四维的输出转成二维的输出，其形状为(批量大小, 10)
    d2l.FlattenLayer()
)

In [48]:
X = torch.rand(1, 1, 224, 224)
for name, blk in net.named_children():
    X = blk(X)
    print(name, 'output shape: ', X.shape)

0 output shape:  torch.Size([1, 96, 54, 54])
1 output shape:  torch.Size([1, 96, 26, 26])
2 output shape:  torch.Size([1, 256, 26, 26])
3 output shape:  torch.Size([1, 256, 12, 12])
4 output shape:  torch.Size([1, 384, 12, 12])
5 output shape:  torch.Size([1, 384, 5, 5])
6 output shape:  torch.Size([1, 384, 5, 5])
7 output shape:  torch.Size([1, 10, 5, 5])
8 output shape:  torch.Size([1, 10, 1, 1])
9 output shape:  torch.Size([1, 10])


In [49]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
lr, num_epochs = 0.002, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.2753, train acc 0.524, test acc 0.726, time 31.9 sec
epoch 2, loss 0.6357, train acc 0.767, test acc 0.805, time 32.0 sec
epoch 3, loss 0.5100, train acc 0.815, test acc 0.832, time 32.2 sec
epoch 4, loss 0.4403, train acc 0.840, test acc 0.843, time 32.0 sec
epoch 5, loss 0.3973, train acc 0.854, test acc 0.856, time 32.0 sec


### 4.9 GoogLeNet-含并行连结的网络

In [51]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class Inception(nn.Module):
    # c1-c4为每条路线里的层的输出通道数
    def __init__(self, in_c, c1, c2, c3, c4):
        super(Inception, self).__init__()
        # 线路1，单1*1卷积层
        self.p1_1 = nn.Conv2d(in_c, c1, kernel_size=1)
        # 线路2，1*1卷积层后接3*3卷积层
        self.p2_1 = nn.Conv2d(in_c, c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
        # 线路3，1*1卷积层后接5*5卷积层
        self.p3_1 = nn.Conv2d(in_c, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2)
        # 线路4，3*3最大池化层后接1*1卷积层
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_c, c4, kernel_size=1)
    def forward(self, x):
        p1 = F.relu(self.p1_1(x))
        p2 = F.relu(self.p2_2(self.p2_1(x)))
        p3 = F.relu(self.p3_2(self.p3_1(x)))
        p4 = F.relu(self.p4_2(self.p4_1(x)))
        # 在通道维上连结输出
        return torch.cat((p1, p2, p3, p4), dim=1)

In [52]:
b1 = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3), 
    nn.ReLU(), 
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)

In [53]:
b2 = nn.Sequential(
    nn.Conv2d(64, 64, kernel_size=1), 
    nn.ReLU(), 
    nn.Conv2d(64, 192, kernel_size=3, padding=1), 
    nn.ReLU(), 
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)

In [54]:
b3 = nn.Sequential(
    Inception(192, 64, (96, 128), (16, 32), 32), 
    Inception(256, 128, (128, 192), (32, 96), 64),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)

In [55]:
b4 = nn.Sequential(
    Inception(480, 192, (96, 208), (16, 48), 64),
    Inception(512, 160, (112, 224), (24, 64), 64),
    Inception(512, 128, (128, 256), (24, 64), 64),
    Inception(512, 112, (144, 288), (32, 64), 64),
    Inception(528, 256, (160, 320), (32, 128), 128),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)

In [57]:
b5 = nn.Sequential(
    Inception(832, 256, (160, 320), (32, 128), 128),
    Inception(832, 384, (192, 384), (48, 128), 128),
    GlobalAvgPool2d()
)
net = nn.Sequential(
    b1, b2, b3, b4, b5, 
    d2l.FlattenLayer(), 
    nn.Linear(1024, 10)
)

In [58]:
X = torch.rand(1, 1, 96, 96)
for blk in net.children():
    X = blk(X)
    print('output shape: ', X.shape)

output shape:  torch.Size([1, 64, 24, 24])
output shape:  torch.Size([1, 192, 12, 12])
output shape:  torch.Size([1, 480, 6, 6])
output shape:  torch.Size([1, 832, 3, 3])
output shape:  torch.Size([1, 1024, 1, 1])
output shape:  torch.Size([1, 1024])
output shape:  torch.Size([1, 10])


In [59]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 2.4547, train acc 0.099, test acc 0.100, time 23.0 sec
epoch 2, loss 1.6381, train acc 0.344, test acc 0.496, time 23.1 sec
epoch 3, loss 0.7035, train acc 0.733, test acc 0.726, time 23.1 sec
epoch 4, loss 0.5071, train acc 0.815, test acc 0.824, time 23.1 sec
epoch 5, loss 0.4271, train acc 0.842, test acc 0.826, time 23.1 sec


### 4.10 批量归一化

In [1]:
import torch
from torch import nn

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # 判断当前模式是训练模式还是预测模式
    if not is_training:
        # 如果是在预测模式下，直接使用传入的移动平均所得的均值和方差
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        # 前一层需要为全连接层或卷积层
        assert len(X.shape) in (2, 4)
        # 全连接层
        if len(X.shape) == 2:
            # 沿纵向求均值，(1, 特征个数)
            # 注意：逐特征求均值
            mean = X.mean(dim=0)
            # 广播
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 使用二维卷积层的情况，计算通道维上（axis=1）的均值和方差。这里我们需要保持
            # X的形状以便后面可以做广播运算
            mean = X.mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
            var = ((X - mean) ** 2).mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
        # 训练模式下用当前的均值和方差做标准化
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # 一阶指数平滑算法
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    # 拉伸和偏移
    Y = gamma * X_hat + beta
    return Y, moving_mean, moving_var

In [3]:
class BatchNorm(nn.Module):
    def __init__(self, num_features, num_dims):
        super(BatchNorm, self).__init__()
        # 全连接层
        if num_dims==2:
            shape = (1, num_features)
        # 卷积层
        else:
            shape = (1, num_features, 1, 1)
        # 参与求梯度和迭代的拉伸和偏移参数，分别初始化成0和1
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # 不参与求梯度和迭代的变量，全在内存上初始化成0
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.zeros(shape)
    def forward(self, X):
        # 如果X不在显存上，将moving_mean和moving_var复制到X所在显存上
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # 保存更新过的moving_mean和moving_var
        # Module实例的traning属性默认为true, 调用.eval()后设成false
        Y, self.moving_mean, self.moving_var = batch_norm(self.training, X, self.gamma, self.beta, self.moving_mean, self.moving_var, eps=1e-5, momentum=0.9)
        return Y

In [5]:
import d2lzh as d2l
net = nn.Sequential(
    # in_channels, out_channels, kernel_size
    nn.Conv2d(1, 6, 5), 
    BatchNorm(6, num_dims=4), 
    nn.Sigmoid(), 
    # kernel_size, stride
    nn.MaxPool2d(2, 2), 
    nn.Conv2d(6, 16, 5), 
    BatchNorm(16, num_dims=4), 
    nn.Sigmoid(), 
    nn.MaxPool2d(2, 2), 
    d2l.FlattenLayer(), 
    nn.Linear(16*4*4, 120), 
    BatchNorm(120, num_dims=2), 
    nn.Sigmoid(), 
    nn.Linear(120, 84), 
    BatchNorm(84, num_dims=2), 
    nn.Sigmoid(), 
    nn.Linear(84, 10)
)

In [6]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.0072, train acc 0.788, test acc 0.827, time 6.7 sec
epoch 2, loss 0.4596, train acc 0.862, test acc 0.836, time 6.5 sec
epoch 3, loss 0.3726, train acc 0.876, test acc 0.852, time 6.5 sec
epoch 4, loss 0.3339, train acc 0.886, test acc 0.866, time 6.6 sec
epoch 5, loss 0.3132, train acc 0.890, test acc 0.855, time 6.3 sec


In [7]:
net[1].gamma.view((-1,)), net[1].beta.view((-1,))

(tensor([1.1173, 0.9368, 0.9685, 1.1043, 1.0498, 0.9036], device='cuda:0',
        grad_fn=<ViewBackward>),
 tensor([ 0.2380,  0.0031, -0.4098,  0.3822, -0.6539, -0.1376], device='cuda:0',
        grad_fn=<ViewBackward>))

In [8]:
net = nn.Sequential(
    # in_channels, out_channels, kernel_size
    nn.Conv2d(1, 6, 5), 
    nn.BatchNorm2d(6), 
    nn.Sigmoid(), 
    # kernel_size, stride
    nn.MaxPool2d(2, 2), 
    nn.Conv2d(6, 16, 5), 
    nn.BatchNorm2d(16), 
    nn.Sigmoid(), 
    nn.MaxPool2d(2, 2), 
    d2l.FlattenLayer(), 
    nn.Linear(16*4*4, 120), 
    nn.BatchNorm1d(120), 
    nn.Sigmoid(), 
    nn.Linear(120, 84), 
    nn.BatchNorm1d(84), 
    nn.Sigmoid(), 
    nn.Linear(84, 10)
)

In [9]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.3550, train acc 0.765, test acc 0.806, time 5.2 sec
epoch 2, loss 0.5887, train acc 0.857, test acc 0.760, time 5.9 sec
epoch 3, loss 0.4131, train acc 0.875, test acc 0.810, time 6.6 sec
epoch 4, loss 0.3566, train acc 0.883, test acc 0.837, time 6.0 sec
epoch 5, loss 0.3254, train acc 0.891, test acc 0.821, time 5.4 sec


### 4.11 ResNet-残差网络

In [20]:
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class Residual(nn.Module):
    # 输入通道数、输出通道数、是否使用1x1卷积核、步长
    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        # 3x3搭配1步长，特征图大小不变
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return F.relu(Y+X)

In [21]:
blk = Residual(3, 3)
X = torch.rand((4, 3, 6, 6))
blk(X).shape

torch.Size([4, 3, 6, 6])

In [22]:
blk = Residual(3, 6, use_1x1conv=True, stride=2)
X = torch.rand((4, 3, 6, 6))
blk(X).shape

torch.Size([4, 6, 3, 3])

In [23]:
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3), 
    nn.BatchNorm2d(64), 
    nn.ReLU(), 
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)

In [24]:
def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    if first_block:
        # 第一个模块的通道数同输入通道数一致
        assert in_channels == out_channels
    blk = []
    for i in range(num_residuals):
        if i==0 and not first_block:
            blk.append(Residual(in_channels, out_channels, use_1x1conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return nn.Sequential(*blk)

In [25]:
net.add_module('resnet_block1', resnet_block(64, 64, 2, first_block=True))
net.add_module('resnet_block2', resnet_block(64, 128, 2))
net.add_module('resnet_block3', resnet_block(128, 256, 2))
net.add_module('resnet_block4', resnet_block(256, 512, 2))

In [26]:
# GlobalAvgPool2d的输出: (Batch, 512, 1, 1)
net.add_module('global_avg_pool', d2l.GlobalAvgPool2d())
net.add_module('fc', nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10)))

In [27]:
X = torch.rand((1, 1, 224, 224))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

0  output shape:	 torch.Size([1, 64, 112, 112])
1  output shape:	 torch.Size([1, 64, 112, 112])
2  output shape:	 torch.Size([1, 64, 112, 112])
3  output shape:	 torch.Size([1, 64, 56, 56])
resnet_block1  output shape:	 torch.Size([1, 64, 56, 56])
resnet_block2  output shape:	 torch.Size([1, 128, 28, 28])
resnet_block3  output shape:	 torch.Size([1, 256, 14, 14])
resnet_block4  output shape:	 torch.Size([1, 512, 7, 7])
global_avg_pool  output shape:	 torch.Size([1, 512, 1, 1])
fc  output shape:	 torch.Size([1, 10])


In [28]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.4296, train acc 0.843, test acc 0.848, time 10.8 sec
epoch 2, loss 0.2982, train acc 0.890, test acc 0.886, time 10.7 sec
epoch 3, loss 0.2583, train acc 0.904, test acc 0.873, time 10.8 sec
epoch 4, loss 0.2299, train acc 0.915, test acc 0.899, time 10.9 sec
epoch 5, loss 0.2084, train acc 0.923, test acc 0.903, time 10.9 sec


### 4.12 DenseNet-稠密连接网络

In [1]:
import torch
from torch import nn
def conv_block(in_channels, out_channels):
    blk = nn.Sequential(
        nn.BatchNorm2d(in_channels), 
        nn.ReLU(), 
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
    )
    return blk

In [2]:
class DenseBlock(nn.Module):
    def __init__(self, num_convs, in_channels, out_channels):
        super(DenseBlock, self).__init__()
        net = []
        for i in range(num_convs):
            in_c = in_channels + i*out_channels
            net.append(conv_block(in_c, out_channels))
        self.net = nn.ModuleList(net)
        # 计算输出通道数
        self.out_channels = in_channels + num_convs*out_channels
    def forward(self, X):
        for blk in self.net:
            Y = blk(X)
            # 在通道维上将输入和输出连结
            X = torch.cat((X, Y), dim=1)
        return X

In [3]:
blk = DenseBlock(2, 3, 10)
X = torch.rand(4, 3, 8, 8)
Y = blk(X)
Y.shape

torch.Size([4, 23, 8, 8])

In [4]:
def transition_block(in_channels, out_channels):
    blk = nn.Sequential(
        nn.BatchNorm2d(in_channels), 
        nn.ReLU(), 
        nn.Conv2d(in_channels, out_channels, kernel_size=1), 
        nn.AvgPool2d(kernel_size=2, stride=2)
    )
    return blk

In [5]:
blk = transition_block(23, 10)
blk(Y).shape

torch.Size([4, 10, 4, 4])

In [6]:
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3), 
    nn.BatchNorm2d(64), 
    nn.ReLU(), 
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)

In [7]:
# num_channels为当前的通道数
num_channels, growth_rate = 64, 32
num_convs_in_dense_blocks = [4, 4, 4, 4]
for i, num_convs in enumerate(num_convs_in_dense_blocks):
    DB = DenseBlock(num_convs, num_channels, growth_rate)
    net.add_module('DenseBlock_%d' % i, DB)
    # 上一个稠密块的输出通道数
    num_channels = DB.out_channels
    # 在稠密块之间加入通道数减半的过渡层
    if i != len(num_convs_in_dense_blocks)-1:
        net.add_module('transition_block_%d' % i, transition_block(num_channels, num_channels//2))
        num_channels = num_channels // 2

In [8]:
import d2lzh as d2l
net.add_module('BN', nn.BatchNorm2d(num_channels))
net.add_module('relu', nn.ReLU())
# GlobalAvgPool2d的输出: (Batch, num_channels, 1, 1)
net.add_module('global_avg_pool', d2l.GlobalAvgPool2d())
net.add_module('fc', nn.Sequential(
    d2l.FlattenLayer(), 
    nn.Linear(num_channels, 10)
))

In [9]:
X = torch.rand((1, 1, 96, 96))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

0  output shape:	 torch.Size([1, 64, 48, 48])
1  output shape:	 torch.Size([1, 64, 48, 48])
2  output shape:	 torch.Size([1, 64, 48, 48])
3  output shape:	 torch.Size([1, 64, 24, 24])
DenseBlock_0  output shape:	 torch.Size([1, 192, 24, 24])
transition_block_0  output shape:	 torch.Size([1, 96, 12, 12])
DenseBlock_1  output shape:	 torch.Size([1, 224, 12, 12])
transition_block_1  output shape:	 torch.Size([1, 112, 6, 6])
DenseBlock_2  output shape:	 torch.Size([1, 240, 6, 6])
transition_block_2  output shape:	 torch.Size([1, 120, 3, 3])
DenseBlock_3  output shape:	 torch.Size([1, 248, 3, 3])
BN  output shape:	 torch.Size([1, 248, 3, 3])
relu  output shape:	 torch.Size([1, 248, 3, 3])
global_avg_pool  output shape:	 torch.Size([1, 248, 1, 1])
fc  output shape:	 torch.Size([1, 10])


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.5218, train acc 0.828, test acc 0.833, time 12.7 sec
epoch 2, loss 0.3087, train acc 0.887, test acc 0.869, time 12.4 sec
epoch 3, loss 0.2656, train acc 0.902, test acc 0.890, time 12.5 sec
epoch 4, loss 0.2360, train acc 0.914, test acc 0.894, time 12.2 sec
epoch 5, loss 0.2186, train acc 0.919, test acc 0.871, time 12.3 sec
