# 5.11 残差网络（ResNet）

In [2]:
import time
import torch
from torch import nn, optim
import torch.nn.functional as F
#import collenctions
import sys
sys.path.append("..") 
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(torch.__version__)
print(device)

1.2.0
cuda


## 5.11.2 残差块

In [3]:
class Residual(nn.Module):  # 本类已保存在d2lzh_pytorch包中方便以后使用
    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return F.relu(Y + X)

In [4]:
blk = Residual(3, 3)
X = torch.rand((4, 3, 6, 6))
blk(X).shape

torch.Size([4, 3, 6, 6])

In [5]:
blk = Residual(3, 6, use_1x1conv=True, stride=2)
blk(X).shape

torch.Size([4, 6, 3, 3])

In [6]:
print(blk)

Residual(
  (conv1): Conv2d(3, 6, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (conv2): Conv2d(6, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(3, 6, kernel_size=(1, 1), stride=(2, 2))
  (bn1): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


## 5.11.2 ResNet模型

In [8]:
net = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
        nn.BatchNorm2d(64), 
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

In [9]:
def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    if first_block:
        assert in_channels == out_channels # 第一个模块的通道数同输入通道数一致
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(in_channels, out_channels, use_1x1conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return nn.Sequential(*blk)

In [10]:
net.add_module("resnet_block1", resnet_block(64, 64, 2, first_block=True))
net.add_module("resnet_block2", resnet_block(64, 128, 2))
net.add_module("resnet_block3", resnet_block(128, 256, 2))
net.add_module("resnet_block4", resnet_block(256, 512, 2))

In [11]:
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d()) # GlobalAvgPool2d的输出: (Batch, 512, 1, 1)
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10))) 

In [12]:
net

Sequential(
  (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (resnet_block1): Sequential(
    (0): Residual(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Residual(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, a

In [9]:
X = torch.rand((1, 1, 224, 224))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

0  output shape:	 torch.Size([1, 64, 112, 112])
1  output shape:	 torch.Size([1, 64, 112, 112])
2  output shape:	 torch.Size([1, 64, 112, 112])
3  output shape:	 torch.Size([1, 64, 56, 56])
resnet_block1  output shape:	 torch.Size([1, 64, 56, 56])
resnet_block2  output shape:	 torch.Size([1, 128, 28, 28])
resnet_block3  output shape:	 torch.Size([1, 256, 14, 14])
resnet_block4  output shape:	 torch.Size([1, 512, 7, 7])
global_avg_pool  output shape:	 torch.Size([1, 512, 1, 1])
fc  output shape:	 torch.Size([1, 10])


## 5.11.3 获取数据和训练模型

In [10]:
batch_size = 256
# 如出现“out of memory”的报错信息，可减小batch_size或resize
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.4064, train acc 0.852, test acc 0.871, time 25.0 sec
epoch 2, loss 0.1227, train acc 0.909, test acc 0.902, time 25.3 sec
epoch 3, loss 0.0687, train acc 0.924, test acc 0.861, time 45.6 sec
epoch 4, loss 0.0447, train acc 0.934, test acc 0.907, time 48.4 sec
epoch 5, loss 0.0312, train acc 0.942, test acc 0.914, time 49.3 sec


## Myresnet

In [17]:
class myresidual(nn.Module):
    def __init__(self,in_channels,out_channels,use_11=False,stride=1):
        super(myresidual,self).__init__()
        self.b1=nn.Conv2d(in_channels,out_channels,(3,3),stride,1)
        self.b2=nn.Conv2d(out_channels,out_channels,(3,3),1,1)
        if use_11:
            self.b3=nn.Conv2d(in_channels,out_channels,(1,1),stride)
        else:
            self.b3=None
        self.bn1=nn.BatchNorm2d(out_channels)
        self.bn2=nn.BatchNorm2d(out_channels)
    def forward(self,x):
        y=F.relu(self.bn1(self.b1(x)))
        y=self.bn2(self.b2(y))
        if self.b3:
            x=self.b3(x)
        return F.relu(x+y)

In [18]:
def myresnet_block(in_channels,out_channels,num_residuals,first_block=False):
    if first_block:
        assert in_channels == out_channels
    blk=[]
    for i in range(num_residuals):
        if i==0 and not first_block:
            blk.append(myresidual(in_channels,out_channels,use_11=True,stride=2))
        else:
            blk.append(myresidual(out_channels,out_channels))
        return nn.Sequential(*blk)

In [29]:
import collections
net=nn.Sequential(collections.OrderedDict([
    ("a",nn.Linear(2,4)),
    ("b",nn.Linear(4,6))
    
]))

In [40]:
net=nn.Sequential(collections.OrderedDict([
                  ("init",nn.Sequential(nn.Conv2d(1,64,(7,7),stride=2,padding=3),
                      nn.BatchNorm2d(64),
                      nn.ReLU(),
                      nn.MaxPool2d((3,3),stride=2,padding=1))),
                  ("resnet1",myresnet_block(64,64,2,first_block=True)),
                  ("resnet2",myresnet_block(64,128,2)),
                  ("resnet3",myresnet_block(128,256,2)),
                  ("resnet4",myresnet_block(256,512,2)),
                  ("global_avg",d2l.GlobalAvgPool2d()),
                  ("fc",nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10)))
                 ]))

In [43]:
net.to(device)
net

Sequential(
  (init): Sequential(
    (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(3, 3), stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (resnet1): Sequential(
    (0): myresidual(
      (b1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (b2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (resnet2): Sequential(
    (0): myresidual(
      (b1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (b2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (b3): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2))
      (bn1): BatchNorm2d(128

In [42]:
X = torch.rand((1, 1, 224, 224))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

init  output shape:	 torch.Size([1, 64, 56, 56])
resnet1  output shape:	 torch.Size([1, 64, 56, 56])
resnet2  output shape:	 torch.Size([1, 128, 28, 28])
resnet3  output shape:	 torch.Size([1, 256, 14, 14])
resnet4  output shape:	 torch.Size([1, 512, 7, 7])
global_avg  output shape:	 torch.Size([1, 512, 1, 1])
fc  output shape:	 torch.Size([1, 10])


In [44]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.3661, train acc 0.866, test acc 0.888, time 16.9 sec
epoch 2, loss 0.1154, train acc 0.915, test acc 0.889, time 17.1 sec
epoch 3, loss 0.0640, train acc 0.929, test acc 0.915, time 17.1 sec
epoch 4, loss 0.0394, train acc 0.942, test acc 0.918, time 17.1 sec
epoch 5, loss 0.0269, train acc 0.950, test acc 0.902, time 17.2 sec
