## 1.从零开始实现

In [1]:
import gluonbook as gb
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import nn

In [2]:
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch-normalize ``X`` and return the (possibly refreshed) running stats.

    The mode is taken from ``autograd.is_training()``. In training mode the
    statistics of the current batch normalize ``X`` and are blended into the
    running averages. Otherwise the running averages passed in are used
    directly and returned untouched.

    Args:
        X: Input with 2 dimensions (fully connected case) or 4 dimensions
            (2-D convolution case, channels on axis 1). The rank is only
            checked in training mode.
        gamma: Scale applied to the normalized input.
        beta: Shift added after scaling.
        moving_mean: Running mean carried over from earlier calls.
        moving_var: Running variance carried over from earlier calls.
        eps: Constant added to the variance before taking the square root.
        momentum: Weight kept on the previous running value; the current
            batch statistic receives ``1.0 - momentum``.

    Returns:
        A tuple ``(Y, moving_mean, moving_var)``.

    Raises:
        AssertionError: In training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if autograd.is_training():
        rank = len(X.shape)
        assert rank in (2, 4)
        if rank == 4:
            # Convolutional input: reduce over every axis except the channel
            # axis and keep the reduced dims so the results broadcast with X.
            reduce_axes = (0, 2, 3)
            mean = X.mean(axis=reduce_axes, keepdims=True)
            var = ((X - mean) ** 2).mean(axis=reduce_axes, keepdims=True)
        else:
            # Fully connected input: per-feature statistics over the batch.
            mean = X.mean(axis=0)
            var = ((X - mean) ** 2).mean(axis=0)
        # Normalize with the statistics of the current batch.
        X_hat = (X - mean) / nd.sqrt(var + eps)
        # Fold the batch statistics into the running averages.
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    else:
        # Prediction mode: rely solely on the running averages handed in.
        X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
    # Scale and shift the normalized values.
    Y = gamma * X_hat + beta
    return Y, moving_mean, moving_var

In [3]:
class BatchNorm(nn.Block):
    """Custom batch-normalization layer built on top of ``batch_norm``.

    The layer owns two learnable parameters (``gamma`` and ``beta``) and two
    running statistics (``moving_mean`` and ``moving_var``) that are plain
    NDArrays and therefore take no part in gradient computation or updates.

    Args:
        num_features: Number of features (fully connected input) or channels
            (convolutional input) being normalized.
        num_dims: Rank of the input. ``2`` selects parameter shape
            ``(1, num_features)``; any other value selects
            ``(1, num_features, 1, 1)``.
        **kwargs: Forwarded to ``nn.Block``.
    """

    def __init__(self, num_features, num_dims, **kwargs):
        super(BatchNorm, self).__init__(**kwargs)
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Learnable scale and shift: gamma starts at 1 and beta at 0, so the
        # layer initially leaves the normalized values unchanged.
        self.gamma = self.params.get('gamma', shape=shape, init=init.One())
        self.beta = self.params.get('beta', shape=shape, init=init.Zero())
        # Running statistics, created on the default context and moved to the
        # input's context on demand in forward().
        self.moving_mean = nd.zeros(shape)
        # The running variance starts at one, not zero: with a zero start, a
        # prediction-mode pass before the statistics have warmed up would
        # divide by sqrt(eps) and blow the activations up. One is also the
        # default used by Gluon's built-in nn.BatchNorm.
        self.moving_var = nd.ones(shape)

    def forward(self, X):
        """Normalize ``X`` and store the updated running statistics."""
        # Keep the running statistics on the same context as the input.
        if self.moving_mean.context != X.context:
            self.moving_mean = self.moving_mean.copyto(X.context)
            self.moving_var = self.moving_var.copyto(X.context)
        # batch_norm returns the refreshed running statistics; save them for
        # the next call.
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma.data(), self.beta.data(), self.moving_mean,
            self.moving_var, eps=1e-5, momentum=0.9)
        return Y

In [4]:
# LeNet with a batch-normalization layer placed before every sigmoid
# activation, using the custom BatchNorm layer.
net = nn.Sequential()
# First convolutional stage.
net.add(nn.Conv2D(6, kernel_size=5),
        BatchNorm(6, num_dims=4),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
# Second convolutional stage.
net.add(nn.Conv2D(16, kernel_size=5),
        BatchNorm(16, num_dims=4),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
# Two hidden fully connected stages.
net.add(nn.Dense(120),
        BatchNorm(120, num_dims=2),
        nn.Activation('sigmoid'))
net.add(nn.Dense(84),
        BatchNorm(84, num_dims=2),
        nn.Activation('sigmoid'))
# Output layer with 10 units.
net.add(nn.Dense(10))

In [7]:
# Train the modified model.
lr = 1.0
num_epochs = 5
batch_size = 256
ctx = gb.try_gpu()
net.initialize(init=init.Xavier(), ctx=ctx)
trainer = gluon.Trainer(
    net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)
gb.train_ch5(
    net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

training on cpu(0)
epoch 1, loss 0.6807, train acc 0.756, test acc 0.729, time 16.7 sec
epoch 2, loss 0.3950, train acc 0.857, test acc 0.862, time 16.0 sec
epoch 3, loss 0.3493, train acc 0.874, test acc 0.881, time 15.5 sec
epoch 4, loss 0.3250, train acc 0.882, test acc 0.844, time 15.6 sec
epoch 5, loss 0.3076, train acc 0.888, test acc 0.840, time 15.7 sec


## 2.Gluon实现

In [8]:
# Same LeNet layout, this time with Gluon's built-in nn.BatchNorm, which is
# constructed without a feature count or input rank.
net = nn.Sequential()
# First convolutional stage.
net.add(nn.Conv2D(6, kernel_size=5),
        nn.BatchNorm(),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
# Second convolutional stage.
net.add(nn.Conv2D(16, kernel_size=5),
        nn.BatchNorm(),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
# Two hidden fully connected stages.
net.add(nn.Dense(120),
        nn.BatchNorm(),
        nn.Activation('sigmoid'))
net.add(nn.Dense(84),
        nn.BatchNorm(),
        nn.Activation('sigmoid'))
# Output layer with 10 units.
net.add(nn.Dense(10))

In [9]:
# Train with the same hyperparameters as the previous run.
net.initialize(init=init.Xavier(), ctx=ctx)
trainer = gluon.Trainer(
    net.collect_params(), 'sgd', {'learning_rate': lr})
gb.train_ch5(
    net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

training on cpu(0)
epoch 1, loss 0.6482, train acc 0.770, test acc 0.843, time 15.7 sec
epoch 2, loss 0.4007, train acc 0.854, test acc 0.861, time 15.8 sec
epoch 3, loss 0.3489, train acc 0.873, test acc 0.874, time 15.4 sec
epoch 4, loss 0.3216, train acc 0.884, test acc 0.857, time 15.5 sec
epoch 5, loss 0.3018, train acc 0.890, test acc 0.877, time 16.1 sec
