<a href="https://colab.research.google.com/github/yananma/5_programs_per_day/blob/master/02163.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 5.10 批量归一化

### 5.10.2 从零开始实现

In [0]:
import torch 
from torch import nn, optim 
import torch.nn.functional as F 
import d2l 
import time 
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch-normalize ``X`` and return the updated running statistics.

    Args:
        is_training: When true, normalize with the statistics of the current
            batch and fold them into the running averages. When false,
            normalize with ``moving_mean`` / ``moving_var`` and return them
            unchanged.
        X: Input tensor. In training mode it must be 2-D (statistics are
            reduced over dim 0) or 4-D (statistics are reduced over dims
            0, 2 and 3, i.e. one value per index of dim 1).
        gamma: Scale applied to the normalized input (broadcast against ``X``).
        beta: Shift added after scaling (broadcast against ``X``).
        moving_mean: Running mean used for inference.
        moving_var: Running variance used for inference.
        eps: Small constant added to the variance before the square root.
        momentum: Weight kept on the previous running statistic; the current
            batch statistic receives ``1 - momentum``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)`` where ``Y`` is
        ``gamma * X_hat + beta``. The returned running statistics never carry
        autograd history.

    Raises:
        AssertionError: In training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if not is_training:
        # Inference: use the accumulated statistics, leave them untouched.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # One statistic per index of dim 1; keepdim so it broadcasts
            # against X in the normalization below.
            reduce_dims = (0, 2, 3)
            mean = X.mean(dim=reduce_dims, keepdim=True)
            var = ((X - mean) ** 2).mean(dim=reduce_dims, keepdim=True)
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # The running averages are bookkeeping, not part of the loss. Updating
        # them outside autograd prevents every iteration's graph from being
        # chained onto (and kept alive by) the stored statistics.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta
    return Y, moving_mean, moving_var

In [0]:
class BatchNorm(nn.Module):
    """Batch-normalization layer with learnable scale/shift and running statistics.

    Args:
        num_features: Size of dim 1 of the inputs this layer will normalize.
        num_dims: ``2`` for 2-D inputs (parameters shaped ``(1, num_features)``);
            any other value selects the 4-D layout
            ``(1, num_features, 1, 1)``.

    Attributes:
        gamma: Learnable scale, initialized to ones.
        beta: Learnable shift, initialized to zeros.
        moving_mean: Running mean (buffer), initialized to zeros.
        moving_var: Running variance (buffer), initialized to ones.
    """

    def __init__(self, num_features, num_dims):
        super(BatchNorm, self).__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Trainable affine parameters.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running statistics are registered as buffers so they are saved in
        # state_dict() and follow the module through .to()/.cuda(), while
        # staying out of parameters() and therefore out of the optimizer.
        self.register_buffer('moving_mean', torch.zeros(shape))
        # Unit variance is the neutral starting point: a zero running variance
        # would make inference divide by sqrt(eps) before any training step.
        self.register_buffer('moving_var', torch.ones(shape))

    def forward(self, X):
        """Normalize ``X`` and refresh the running statistics.

        Uses batch statistics when ``self.training`` is true and the stored
        running statistics otherwise.

        Args:
            X: Input tensor; expected on the same device as the module.

        Returns:
            The normalized, scaled and shifted tensor.
        """
        Y, moving_mean, moving_var = batch_norm(
            self.training, X, self.gamma, self.beta,
            self.moving_mean, self.moving_var, eps=1e-5, momentum=0.9)
        # Running statistics are not trainable: store them without autograd
        # history so no graph is retained between iterations.
        self.moving_mean = moving_mean.detach()
        self.moving_var = moving_var.detach()
        return Y

#### 1 使用批量归一化层的 LeNet

In [0]:
# LeNet variant: every convolution / hidden linear layer is followed by the
# hand-written BatchNorm layer before its sigmoid activation.
net = nn.Sequential(
    # Convolutional stage 1
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    BatchNorm(num_features=6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    BatchNorm(num_features=16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Fully connected head
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    BatchNorm(num_features=120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    BatchNorm(num_features=84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)

In [11]:
# Data: mini-batches of 256 from d2l's Fashion-MNIST loader.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

# Optimization: Adam with learning rate 1e-3 for 5 epochs.
lr = 0.001
num_epochs = 5
optimizer = optim.Adam(params=net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.9796, train acc 0.794, test acc 0.802, time 6.4 sec
epoch 2, loss 0.4457, train acc 0.869, test acc 0.858, time 6.4 sec
epoch 3, loss 0.3565, train acc 0.883, test acc 0.849, time 6.3 sec
epoch 4, loss 0.3253, train acc 0.887, test acc 0.872, time 6.3 sec
epoch 5, loss 0.3031, train acc 0.894, test acc 0.874, time 6.3 sec


In [12]:
# Scale (gamma) and shift (beta) of the layer at index 1 of `net`, flattened to 1-D.
tuple(param.view(-1) for param in (net[1].gamma, net[1].beta))

(tensor([0.9841, 0.9523, 1.0258, 1.2424, 1.1701, 1.0703], device='cuda:0',
        grad_fn=<ViewBackward>),
 tensor([ 0.1280, -0.3909,  0.0480,  0.5539,  0.0341, -0.1078], device='cuda:0',
        grad_fn=<ViewBackward>))

### 5.10.3 简洁实现

In [0]:
# Same LeNet layout, now using PyTorch's built-in batch-normalization layers:
# BatchNorm2d after the convolutions, BatchNorm1d after the linear layers.
net = nn.Sequential(
    # Convolutional stage 1
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.BatchNorm2d(num_features=6),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.BatchNorm2d(num_features=16),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Fully connected head
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    nn.BatchNorm1d(num_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.BatchNorm1d(num_features=84),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)

In [14]:
# Data: mini-batches of 256 from d2l's Fashion-MNIST loader.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

# Optimization: Adam with learning rate 1e-3 for 5 epochs.
lr = 0.001
num_epochs = 5
optimizer = optim.Adam(params=net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.9948, train acc 0.784, test acc 0.804, time 5.8 sec
epoch 2, loss 0.4557, train acc 0.865, test acc 0.847, time 5.8 sec
epoch 3, loss 0.3670, train acc 0.879, test acc 0.841, time 5.8 sec
epoch 4, loss 0.3294, train acc 0.888, test acc 0.854, time 5.8 sec
epoch 5, loss 0.3093, train acc 0.892, test acc 0.836, time 5.8 sec
