In [56]:
#googlenet
#Inception
#Inception 块使用1*1卷积减少通道数从而降低模型复杂度
import time
import torch
from torch import nn,optim
import torch.nn.functional as F

import sys
sys.path.append("..")
import d2l.torch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Inception(nn.Module):
    #c1 - c4 为每一条线路里的层的输出通道数,这些作为超参可以自己定义
    def __init__(self,in_c,c1,c2,c3,c4):
        super(Inception,self).__init__()
        #线路1,单1*1卷积层
        self.p1_1 = nn.Conv2d(in_c,c1,kernel_size=1)
        #线路2,1*1卷积层后接3*3
        self.p2_1 = nn.Conv2d(in_c,c2[0],kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0],c2[1],kernel_size=3,padding=1)
        #线路3,1*1卷积层后接5*5
        self.p3_1 = nn.Conv2d(in_c,c3[0],kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0],c3[1],kernel_size=5,padding=2)
        #线路4,3*3最大池化接1*1卷积层
        self.p4_1 = nn.MaxPool2d(kernel_size=3,stride=1,padding=1)
        self.p4_2 = nn.Conv2d(in_c,c4,kernel_size=1)
    def forward(self,x):
        p1 = F.relu(self.p1_1(x))
        p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))
        p3 = F.relu(self.p3_2(F.relu(self.p3_1(x))))
        p4 = F.relu(self.p4_2(F.relu(self.p4_1(x))))
        return torch.cat((p1,p2,p3,p4),dim=1)
        

In [57]:
import torch.nn.functional as F
class GlobalAvgPool2d(nn.Module):
    # 全局平均池化层，可将窗口设置成输入的宽高实现
    #全局平均池化则直接把整幅feature maps（它的个数等于类别个数）进行平均池化，然后输入到softmax层中得到对应的每个类别的得分。
    def __init__(self):
        super(GlobalAvgPool2d,self).__init__()
    def forward(self,x):
        return F.avg_pool2d(x,kernel_size=x.size()[2:])
#第一模块使用一个64通道的7×7卷积层。
b1 = nn.Sequential(nn.Conv2d(1,64,kernel_size=7,stride=2,padding=3),
                  nn.ReLU(),
                  nn.MaxPool2d(kernel_size=3,stride=2,padding=1))
#第二模块使用2个卷积层：首先是64通道的1×1卷积层，然后是将通道增大3倍的3×3卷积层。
b2 = nn.Sequential(nn.Conv2d(64,64,kernel_size=1),
                   nn.Conv2d(64,192,kernel_size=3,padding=1),
                   nn.MaxPool2d(kernel_size=3,stride=2,padding=1)
                  )
#第三模块串联2个完整的Inception块。
##第一个输出通道数是63+128+32+32=256,第三第二分别将通道数减少到16/192=1/12,96/192=1/2
b3 = nn.Sequential(Inception(192,64,(96,128),(16,32),32),
                   Inception(256,128,(128,192),(32,96),64),#第二个是128+192+96+64=480
                   nn.MaxPool2d(kernel_size=3,stride=2,padding=1)
                  )
#第四个模块串联了5个Inception
b4 = nn.Sequential(Inception(480,192,(96,208),(16,48),64),#输出512
                   Inception(512,160,(112,224),(24,64),64),#输出512
                   Inception(512,128,(128,256),(24,64),64),#512
                   Inception(512,112,(144,288),(32,64),64),#528
                   Inception(528,256,(160,320),(32,128),128),#832
                   nn.MaxPool2d(kernel_size=3,stride=2,padding=1))
#第五个模块是串联了两个Inception
b5 = nn.Sequential(Inception(832,256,(160,320),(32,128),128),#832
                   Inception(832,384,(192,384),(48,128),128),#1024
                   GlobalAvgPool2d())


In [58]:
net = nn.Sequential(b1,b2,b3,b4,b5,nn.Flatten(),nn.Linear(1024,1024))
x = torch.rand(1,1,96,96)
for blk in net.children():
    x = blk(x)
    print('output shape',x.shape)

output shape torch.Size([1, 64, 24, 24])
output shape torch.Size([1, 192, 12, 12])
output shape torch.Size([1, 480, 6, 6])
output shape torch.Size([1, 832, 3, 3])
output shape torch.Size([1, 1024, 1, 1])
output shape torch.Size([1, 1024])
output shape torch.Size([1, 1024])


In [59]:
batch_size = 128
train_iter,test_iter = d2l.load_data_fashion_mnist(batch_size,resize=96)
lr,num_epochs = 0.001,5
optimizer = torch.optim.Adam(net.parameters(),lr=lr)
d2l.train_ch5(net,train_iter,test_iter,batch_size,optimizer,device,num_epochs)

training on cuda
epoch 1, loss 2.3679, train acc 0.098, test acc 0.100, time 21.9 sec
epoch 2, loss 2.3094, train acc 0.098, test acc 0.100, time 21.9 sec
epoch 3, loss 2.3110, train acc 0.104, test acc 0.100, time 21.9 sec
epoch 4, loss 2.3156, train acc 0.105, test acc 0.120, time 21.9 sec
epoch 5, loss 0.8400, train acc 0.675, test acc 0.813, time 22.0 sec


In [None]:
#对输入数据做了标准化处理：处理后的任意一个特征在数据集中所有样本上的均值为0、标准差为1。
#批量归一化利用小批量上的均值和标准差，不断调整神经网络中间输出，从而使整个神经网络在各层的中间输出的数值更稳定
#对全连接层做批量归一化ϕ(BN(x)),其中x = Wu+b(仿射变化)
#首先，对小批量B求均值和方差,B = {x_1,x_2,...,x_m},μ_B,σ_B^2
#使用按元素开方和按元素除法对x(i)标准化:x^(i)←x(i)−μ_B/根号(σ_^B^2+ϵ),
#批量归一化层引入了两个可以学习的模型参数，拉伸（scale）参数 γ 和偏移（shift）参数 β。这两个参数和x(i)形状相同，皆为d维向量。
#它们与x(i)分别做按元素乘法（符号⊙）和加法计算：

#CNN 
#如果卷积计算输出多个通道，我们需要对这些通道的输出分别做批量归一化，且每个通道都拥有独立的拉伸和偏移参数，并均为标量。
#设小批量中有mm个样本。在单个通道上，假设卷积计算输出的高和宽分别为p和q。我们需要对该通道中m×p×q个元素同时做批量归一化。

In [1]:
# import time
# import torch
# from torch import nn,optim
# import torch.nn.functional as F

# import sys
# sys.path.append("..")
# import d2l.torch as d2l
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# def batch_norm(is_training,X,gamma,beta,moving_mean,moving_var,eps,momentum):
#     #判断当前模式是训练模式还是预测模式
#     if not is_training:
#         #预测模式
#         x_hat = (X-moving_mean)/torch.sqrt(moving_var+eps) #直接用传进来的参数
#     else:
#         assert len(X.shape) in (2,4)
#         if len(X.shape) == 2:
#             #使用全连接层的情况,计算特征维度上的均值和方差
#             mean = X.mean(dim=0)
#             var = ((X - mean)**2).mean(dim=0)
#         else:
#             # 使用CNN(样本数,通道,宽,高),计算通道维度上(axis=1)的均值和方差,要保持X的形状方便以后做广播运算
#             mean = X.mean(dim=0,keepdim=True).mean(dim=2,keepdim=True).mean(dim=3,keepdim=True)
#             var = ((X-mean)**2).mean(dim=0,keepdim=True).mean(dim=2,keepdim=True).mean(dim=3,keepdim=True)
#         #训练模式下用当前的均值和方差做标准化
#         x_hat = (X-mean)/torch.sqrt(var+eps)
#         #更新移动平均和均值方差
#         moving_mean = momentum*moving_mean+(1.0-momentum)*mean
#         moving_var = momentum*moving_var+(1.0-momentum)*var
        
#     Y = gamma * x_hat + beta #拉伸和便宜
#     return Y,moving_mean,moving_var

import time
import torch
from torch import nn, optim
import torch.nn.functional as F

import sys
sys.path.append("..") 
import d2l.torch as d2l
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # 判断当前模式是训练模式还是预测模式
    if not is_training:
        # 如果是在预测模式下，直接使用传入的移动平均所得的均值和方差
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # 使用全连接层的情况，计算特征维上的均值和方差
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 使用二维卷积层的情况(样本数,通道,宽,高)，计算通道维上（axis=1）的均值和方差。这里我们需要保持
            # X的形状以便后面可以做广播运算
            mean = X.mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
            var = ((X - mean) ** 2).mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
        # 训练模式下用当前的均值和方差做标准化
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # 更新移动平均的均值和方差
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # 拉伸和偏移
    return Y, moving_mean, moving_var
        
            

In [2]:
# #batch norm层自定义
# #它保存参与求梯度和迭代的拉伸参数gamma和偏移参数beta，同时也维护移动平均得到的均值和方差，以便能够在模型预测时被使用。
# class BatchNorm(nn.Module):
#     def __init__(self,num_features,num_dims):
#         super(BatchNorm,self).__init__()
#         if num_dims == 2:
#             shape = (1,num_features)
#         else:
#             shape = (1,num_features,1,1)
#         #参与求梯度和迭代的拉伸和偏移系数,分别初始化为0和1
#         self.gamma = nn.Parameter(torch.ones(shape))
#         self.beta = nn.Parameter(torch.zeros(shape))
#         #不参与求梯度和迭代的变量
#         self.moving_mean = torch.zeros(shape)
#         self.moving_var = torch.zeros(shape)
#     def forward(self,X):
#         if self.moving_mean.device!=X.device:
#             self.moving_mean = self.moving_mean.to(X.device)
#             self.moving_var = self.moving_var.to(X.device)
#         #保存更新过的moving_mean和moving_var
#         Y,self.moving_mean,self.moving_var = batch_norm(self.training,X,self.gamma,self.beta,
#                                                         self.moving_mean,self.moving_var,eps=1e-5,momentum=0.9)
#         return Y
class BatchNorm(nn.Module):
    def __init__(self, num_features, num_dims):
        super(BatchNorm, self).__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # 参与求梯度和迭代的拉伸和偏移参数，分别初始化成0和1
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # 不参与求梯度和迭代的变量，全在内存上初始化成0
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.zeros(shape)

    def forward(self, X):
        # 如果X不在内存上，将moving_mean和moving_var复制到X所在显存上
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # 保存更新过的moving_mean和moving_var, Module实例的traning属性默认为true, 调用.eval()后设成false
        Y, self.moving_mean, self.moving_var = batch_norm(self.training, 
            X, self.gamma, self.beta, self.moving_mean,
            self.moving_var, eps=1e-5, momentum=0.9)
        return Y
        

In [3]:
#批量归一化LeNet
net = nn.Sequential(
    nn.Conv2d(1,6,5),#in_channels,out_channels,kernel_size
    BatchNorm(6,num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2,2),
    nn.Conv2d(6,16,5),
    BatchNorm(16,num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2,2),
    nn.Flatten(),
    nn.Linear(16*4*4,120),
    BatchNorm(120,num_dims=2),
    nn.Sigmoid(),
    nn.Linear(120,84),
    BatchNorm(84,num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84,10)
)
# class LeNet(nn.Module):
#     def __init__(self):
#         super(LeNet,self).__init__()
#         self.conv = nn.Sequential(
#             nn.Conv2d(1,6,5),#in_channels,out_channels,kernel_size
#             BatchNorm(6,num_dims=4),
#             nn.Sigmoid(),
#             nn.MaxPool2d(2,2),
#             nn.Conv2d(6,16,5),
#             BatchNorm(16,num_dims=4),
#             nn.Sigmoid(),
#             nn.MaxPool2d(2,2)
#         )
#         self.fc = nn.Sequential(
#             nn.Linear(16*4*4,120),
#             BatchNorm(120,num_dims=2),
#             nn.Sigmoid(),
#             nn.Linear(120,84),
#             BatchNorm(84,num_dims=2),
#             nn.Sigmoid(),
#             nn.Linear(84,10)
#         )
#     def forward(self,img):
#         feature = self.conv(img)
#         output = self.fc(feature.view(img.shape[0],-1))
#         return output
# net = LeNet()

In [4]:
batch_size = 128
train_iter,test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
lr,num_epochs = 0.001,5
optimizer = torch.optim.Adam(net.parameters(),lr=lr)
d2l.train_ch5(net,train_iter,test_iter,batch_size,optimizer,device,num_epochs)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


training on cuda:1


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


epoch 1, loss 0.7776, train acc 0.816, test acc 0.806, time 3.8 sec


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [120, 84]], which is output 0 of TBackward, is at version 470; expected version 469 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [5]:
net = nn.Sequential(
            nn.Conv2d(1, 6, 5), # in_channels, out_channels, kernel_size
            BatchNorm(6, num_dims=4),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2), # kernel_size, stride
            nn.Conv2d(6, 16, 5),
            BatchNorm(16, num_dims=4),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.Linear(16*4*4, 120),
            BatchNorm(120, num_dims=2),
            nn.Sigmoid(),
            nn.Linear(120, 84),
            BatchNorm(84, num_dims=2),
            nn.Sigmoid(),
            nn.Linear(84, 10)
        )

In [6]:
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
#....算了

training on cuda:1
epoch 1, loss 0.8004, train acc 0.804, test acc 0.832, time 3.9 sec


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [120, 84]], which is output 0 of TBackward, is at version 470; expected version 469 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [9]:
net = nn.Sequential(
            nn.Conv2d(1, 6, 5), # in_channels, out_channels, kernel_size
            nn.BatchNorm2d(6),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2), # kernel_size, stride
            nn.Conv2d(6, 16, 5),
            nn.BatchNorm2d(16),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.Linear(16*4*4, 120),
            nn.BatchNorm1d(120),
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.BatchNorm1d(84),
            nn.Sigmoid(),
            nn.Linear(84, 10)
        )

In [10]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on cuda:1
epoch 1, loss 0.9969, train acc 0.787, test acc 0.834, time 1.8 sec
epoch 2, loss 0.5017, train acc 0.832, test acc 0.854, time 1.7 sec
epoch 3, loss 0.3896, train acc 0.861, test acc 0.864, time 1.8 sec
epoch 4, loss 0.3595, train acc 0.870, test acc 0.863, time 1.8 sec
epoch 5, loss 0.3362, train acc 0.880, test acc 0.874, time 1.9 sec


In [19]:
#resNet
#残差块
#首先有2个有相同输出通道数的3×3卷积层。每个卷积层后接一个批量归一化层和ReLU激活函数。
#然后我们将输入跳过这两个卷积运算后直接加在最后的ReLU激活函数前。
#如果想改变通道数，就需要引入一个额外的1×1卷积层来将输入变换成需要的形状后再做相加运算。
class Residual(nn.Module):
    def __init__(self,in_channels,out_channels,use_oneConv=False,stride=1):
        super(Residual,self).__init__()
        self.conv1 = nn.Conv2d(in_channels,out_channels,kernel_size=3,padding=1,stride=stride)
        self.conv2 = nn.Conv2d(out_channels,out_channels,kernel_size=3,padding=1)
        if use_oneConv:#使用1*1卷积
            self.conv3 = nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
    def forward(self,X):
        Y1 = F.relu(self.bn1(self.conv1(X)))
        Y2 = self.bn2(self.conv2(Y1))
        if self.conv3:#是否要改变通道数
            X = self.conv3(X)
        return F.relu(Y2+X)
            
    

In [12]:
blk = Residual(3,3)
X = torch.rand((4,3,6,6))
blk(X).shape

torch.Size([4, 3, 6, 6])

In [16]:
blk = Residual(3,6,use_oneConv=True,stride=2)#可以通过1*1卷积增加通道数,用stride减半宽高
blk(X).shape

torch.Size([4, 6, 3, 3])

In [20]:
#ResNet模型
#跟之前介绍的GoogLeNet中的一样：在输出通道数为64、步幅为2的7×7卷积层后接步幅为2的3×3的最大池化层。
#不同之处在于ResNet每个卷积层后增加的批量归一化层。
net = nn.Sequential(
    nn.Conv2d(1,64,kernel_size=7,stride=2),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3,stride=2,padding=1)
)
#4个残差块组成的模块,每个模块使用若干个同样输出通道数的残差快
#第一个模块的通道数同输入通道数一致。由于之前已经使用了步幅为2的最大池化层，所以无须减小高和宽。
#除了第一个模块 之后每个模块在第一个残差块里将上一个模块的通道数翻倍，并将高和宽减半。
def resnet_block(in_channels,out_channels,num_residuals,first_block=False):
    if first_block:
        assert in_channels==out_channels #第一个模块通道数和输入通道一致
    blk = []
    for i in range(num_residuals):
        if (i==0) and not first_block:#如果没有first
            blk.append(Residual(in_channels,out_channels,use_oneConv=True,stride=2))
        else:
            blk.append(Residual(out_channels,out_channels))
    return nn.Sequential(*blk)


In [21]:
net.add_module("resnet_block1",resnet_block(64,64,2,first_block=True))
net.add_module("resnet_block2",resnet_block(64,128,2))#通道翻倍宽高减半
net.add_module("resnet_block3",resnet_block(128,256,2))
net.add_module("resnet_block4",resnet_block(256,512,2))
net.add_module("global_avg_pool",d2l.GlobalAvgPool2d())#(Batch,512,1,1)
net.add_module("fc",nn.Sequential(nn.Flatten(),nn.Linear(512,10)))
#每个Residual有两个Conv(不算1*1),每个block共4个,加上最开始的卷积和最后的fc一共是18层,ResNet18
X = torch.rand((1,1,224,224))
for name ,layer in net.named_children():
    X = layer(X)
    print(name,'output shape:\t',X.shape)

0 output shape:	 torch.Size([1, 64, 109, 109])
1 output shape:	 torch.Size([1, 64, 109, 109])
2 output shape:	 torch.Size([1, 64, 109, 109])
3 output shape:	 torch.Size([1, 64, 55, 55])
resnet_block1 output shape:	 torch.Size([1, 64, 55, 55])
resnet_block2 output shape:	 torch.Size([1, 128, 28, 28])
resnet_block3 output shape:	 torch.Size([1, 256, 14, 14])
resnet_block4 output shape:	 torch.Size([1, 512, 7, 7])
global_avg_pool output shape:	 torch.Size([1, 512, 1, 1])
fc output shape:	 torch.Size([1, 10])


In [22]:
batch_size = 256
train_iter,test_iter = d2l.load_data_fashion_mnist(batch_size,resize=96)
lr,num_epochs = 0.001,5
optimizer = torch.optim.Adam(net.parameters(),lr=lr)
d2l.train_ch5(net,train_iter,test_iter,batch_size,optimizer,device,num_epochs)

training on cuda:1
epoch 1, loss 0.4185, train acc 0.846, test acc 0.879, time 17.4 sec
epoch 2, loss 1.6600, train acc 0.398, test acc 0.668, time 17.2 sec
epoch 3, loss 0.6397, train acc 0.754, test acc 0.789, time 17.3 sec
epoch 4, loss 0.4653, train acc 0.823, test acc 0.832, time 17.4 sec
epoch 5, loss 0.3657, train acc 0.863, test acc 0.872, time 17.5 sec


In [24]:
#DenseNet
#DenseNet里模块B的输出不是像ResNet那样和模块A的输出相加，而是在通道维上连结。这样模块A的输出可以直接传入模块B后面的层。
#DenseNet的主要构建模块是稠密块（dense block）和过渡层（transition layer）。
#前者定义了输入和输出是如何连结的，后者则用来控制通道数，使之不过大。
def conv_blk(in_channels,out_channels):
    blk = nn.Sequential(nn.BatchNorm2d(in_channels),
                        nn.ReLU(),
                        nn.Conv2d(in_channels,out_channels,kernel_size=3,padding=1))
    return blk

In [35]:
#稠密块由多个conv_block组成，每块使用相同的输出通道数。但在前向计算时，我们将每块的输入和输出在通道维上连结。
class DenseBlock(nn.Module):
    def __init__(self,num_convs,in_channels,out_channels):
        super(DenseBlock,self).__init__()
        net = []
        for i in range(num_convs):
            in_c = in_channels+i*out_channels
            net.append(conv_blk(in_c,out_channels))
        self.net = nn.ModuleList(net)
        self.out_channels = in_channels + num_convs * out_channels
    def forward(self,X):
        for blk in self.net:
           Y = blk(X)
           X = torch.cat((X,Y),dim=1)#通道维上将输入和输出连接
        return X

In [36]:
blk = DenseBlock(2,3,10)
X = torch.rand(4,3,8,8)
Y = blk(X)
Y.shape#3+2*10

torch.Size([4, 23, 8, 8])

In [37]:
#过渡层
#过渡层用来控制模型复杂度。它通过1×1卷积层来减小通道数，并使用步幅为2的平均池化层减半高和宽，从而进一步降低模型复杂度。
def transition_block(in_channels,out_channels):
    blk = nn.Sequential(
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels,out_channels,kernel_size=1),
        nn.AvgPool2d(kernel_size=2,stride=2)
        
    )
    return blk

In [38]:
blk = transition_block(23,10)
blk(Y).shape

torch.Size([4, 10, 4, 4])

In [39]:
#DenseNet模型
#DenseNet首先使用同ResNet一样的单卷积层和最大池化层。
net = nn.Sequential(
    nn.Conv2d(1,64,kernel_size=7,stride=2,padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3,stride=2,padding=1)
)
#DenseNet使用的是4个稠密块。同ResNet一样，我们可以设置每个稠密块使用多少个卷积层。
#这里我们设成4，从而与上一节的ResNet-18保持一致。稠密块里的卷积层通道数（即增长率）设为32，所以每个稠密块将增加128个通道。
num_channels,grow_rate = 64,32
num_convs_in_dense_blk = [4,4,4,4]

for i,num_convs in enumerate(num_convs_in_dense_blk):
    DB = DenseBlock(num_convs,num_channels,grow_rate)
    net.add_module("DenseBlock_%d" % i,DB)
    #上一个稠密块输出的通道数
    num_channels = DB.out_channels
    #在稠密块上加上通道减半的过渡层
    if i != len(num_convs_in_dense_blk)-1:#如果没到最后一层
        net.add_module("transition_%d" % i,transition_block(num_channels,num_channels//2))
        num_channels = num_channels // 2
#和ResNet一样最后接上全局池化层
net.add_module("BN",nn.BatchNorm2d(num_channels))
net.add_module("relu",nn.ReLU())
net.add_module("global_avg_pool",d2l.GlobalAvgPool2d())
net.add_module("fc",nn.Sequential(nn.Flatten(),nn.Linear(num_channels,10)))

In [40]:
X = torch.rand((1,1,96,96))
#打印每层信息
for name,layer in net.named_children():
    X = layer(X)
    print(name,'output shape\t',X.shape)

0 output shape	 torch.Size([1, 64, 48, 48])
1 output shape	 torch.Size([1, 64, 48, 48])
2 output shape	 torch.Size([1, 64, 48, 48])
3 output shape	 torch.Size([1, 64, 24, 24])
DenseBlock_0 output shape	 torch.Size([1, 192, 24, 24])
transition_0 output shape	 torch.Size([1, 96, 12, 12])
DenseBlock_1 output shape	 torch.Size([1, 224, 12, 12])
transition_1 output shape	 torch.Size([1, 112, 6, 6])
DenseBlock_2 output shape	 torch.Size([1, 240, 6, 6])
transition_2 output shape	 torch.Size([1, 120, 3, 3])
DenseBlock_3 output shape	 torch.Size([1, 248, 3, 3])
BN output shape	 torch.Size([1, 248, 3, 3])
relu output shape	 torch.Size([1, 248, 3, 3])
global_avg_pool output shape	 torch.Size([1, 248, 1, 1])
fc output shape	 torch.Size([1, 10])


In [None]:
batch_size = 256
train_iter,test_iter = d2l.load_data_fashion_mnist(batch_size,resize=96)
lr,num_epochs = 0.001,5
optimizer = torch.optim.Adam(net.parameters(),lr=lr)
d2l.train_ch5(net,train_iter,test_iter,batch_size,optimizer,device,num_epochs)