In [2]:
import time
import torch
from torch import nn, optim
import torch.nn.functional as F

In [3]:
import sys
sys.path.append("..") 
import d2lzh_pytorch as d2l
# Select the CUDA device when torch reports one is available, otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the notebook displays the selected device.
device


device(type='cpu')

### 5.10.2 从零开始实现

In [4]:
def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Apply batch normalization to ``X`` and return the updated running statistics.

    Args:
        is_training: When true, normalize with the statistics of the current
            mini-batch and update the moving averages. When false, normalize
            with ``moving_mean`` / ``moving_var`` and leave them untouched.
        X: Input tensor. In training mode it must be 2-D (fully connected
            layer output) or 4-D (2-D convolution output, channels on axis 1).
        gamma: Scale applied to the normalized input; must broadcast against ``X``.
        beta: Shift added after scaling; must broadcast against ``X``.
        moving_mean: Running mean used in prediction mode.
        moving_var: Running variance used in prediction mode.
        eps: Small constant added to the variance before the square root.
        momentum: Weight given to the *previous* moving statistic; the current
            batch statistic is weighted by ``1 - momentum``.

    Returns:
        A tuple ``(Y, moving_mean, moving_var)``. The returned moving statistics
        produced in training mode are detached from the autograd graph.

    Raises:
        AssertionError: In training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if not is_training:
        # Prediction mode: use the running statistics that were passed in.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Fully connected layer: statistics per feature, reduced over the batch.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 2-D convolution: statistics per channel (axis 1), reduced over the
            # batch and both spatial axes. keepdim=True keeps the result
            # broadcastable against X.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # Training mode: normalize with the statistics of the current batch.
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the moving averages outside of autograd. `mean` and `var` carry
        # gradient history from X; letting it flow into the running statistics
        # would chain every iteration's graph together and leak memory.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var


In [5]:
class BatchNorm(nn.Module):
    """Custom batch normalization layer.

    Holds the learnable scale ``gamma`` and shift ``beta`` (both updated by the
    optimizer) and maintains moving averages of the mean and variance so that
    they can be used when the model is in prediction mode.

    Args:
        num_features: Number of outputs for a fully connected layer, or number
            of output channels for a convolutional layer.
        num_dims: 2 for fully connected layers, 4 for convolutional layers.
        eps: Small constant added to the variance for numerical stability.
        momentum: Weight given to the previous moving statistic when it is
            updated (the current batch statistic gets ``1 - momentum``).
    """

    def __init__(self, num_features, num_dims, eps=1e-5, momentum=0.9):
        super(BatchNorm, self).__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        self.eps = eps
        self.momentum = momentum
        # Learnable scale and shift, initialized to 1 and 0 respectively.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running statistics are registered as buffers: they receive no
        # gradient, but they follow the module through .to(device) and are
        # saved/restored with state_dict(). The variance starts at 1 so that a
        # layer evaluated before any training step does not divide by sqrt(eps).
        self.register_buffer("moving_mean", torch.zeros(shape))
        self.register_buffer("moving_var", torch.ones(shape))

    def forward(self, X):
        # Module.training defaults to True and becomes False after .eval().
        Y, new_mean, new_var = batch_norm(
            self.training, X, self.gamma, self.beta,
            self.moving_mean, self.moving_var,
            eps=self.eps, momentum=self.momentum)
        if self.training:
            # Store the updated statistics without autograd history so that the
            # graphs of earlier iterations are not kept alive.
            self.moving_mean = new_mean.detach()
            self.moving_var = new_var.detach()
        return Y


In [6]:
# LeNet (introduced in section 5.5, "Convolutional Neural Networks (LeNet)")
# with batch normalization added: a BatchNorm layer follows every convolutional
# or fully connected layer except the output layer, and precedes the activation.
net = nn.Sequential(
    # Convolutional stage 1
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Fully connected head
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    # Output layer: 10 outputs, no normalization or activation
    nn.Linear(in_features=84, out_features=10),
)

In [7]:
# Hyperparameters and data iterators.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

# Learning rate, number of epochs, and an Adam optimizer over all parameters of `net`.
lr = 0.001
num_epochs = 5
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)


In [8]:
# Train the modified model (the network built with the custom BatchNorm layers).
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cpu
epoch 1, loss 1.0024, train acc 0.784, test acc 0.797, time 21.0 sec
epoch 2, loss 0.4667, train acc 0.860, test acc 0.835, time 21.2 sec
epoch 3, loss 0.3730, train acc 0.877, test acc 0.858, time 20.9 sec
epoch 4, loss 0.3336, train acc 0.887, test acc 0.803, time 21.0 sec
epoch 5, loss 0.3091, train acc 0.893, test acc 0.821, time 21.1 sec


In [9]:
# Inspect the scale parameter gamma and the shift parameter beta learned by the
# first batch normalization layer (net[1]), each flattened to 1-D for display.
net[1].gamma.view((-1,)), net[1].beta.view((-1,))


(tensor([0.9733, 1.1356, 1.0413, 1.0178, 0.9710, 1.2437],
        grad_fn=<ViewBackward0>),
 tensor([-0.0511,  0.4107,  0.3980, -0.4629, -0.5643,  0.2038],
        grad_fn=<ViewBackward0>))

### 5.10.3 简洁实现

In [10]:
# PyTorch's nn module ships BatchNorm1d and BatchNorm2d, which are simpler to use.
# They serve fully connected and convolutional layers respectively, and both
# take the num_features value of their input as an argument.
net = nn.Sequential(
    # Convolutional stage 1
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Fully connected head
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    # Output layer: 10 outputs, no normalization or activation
    nn.Linear(in_features=84, out_features=10),
)


In [11]:
# Train with the same hyperparameters as the from-scratch version.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

lr = 0.001
num_epochs = 5
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)

# Run training.
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)


training on  cpu
epoch 1, loss 1.0065, train acc 0.781, test acc 0.817, time 13.6 sec
epoch 2, loss 0.4618, train acc 0.862, test acc 0.856, time 13.6 sec
epoch 3, loss 0.3688, train acc 0.878, test acc 0.858, time 13.7 sec
epoch 4, loss 0.3290, train acc 0.887, test acc 0.821, time 13.7 sec
epoch 5, loss 0.3073, train acc 0.894, test acc 0.856, time 13.7 sec
