<a href="https://colab.research.google.com/github/yananma/5_programs_per_day/blob/master/0522.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!apt-get update && sudo apt-get install -y build-essential git libgfortran3
!wget https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda-repo-ubuntu1804-10-0-local-10.0.130-410.48_1.0-1_amd64
!sudo dpkg -i cuda-repo-ubuntu1804-10-0-local-10.0.130-410.48_1.0-1_amd64.deb
!sudo apt-key add /var/cuda-repo-<version>/7fa2af80.pub
!sudo apt-get update
!sudo apt-get install cuda        

In [0]:
# Print the version file of the CUDA toolkit found under /usr/local/cuda.
!cat /usr/local/cuda/version.txt

CUDA Version 10.0.130


In [0]:
# Install MXNet and the d2lzh helper package, then import both.
# NOTE(review): mxnet-cu100 is presumably the CUDA 10.0 build matching the toolkit above — confirm.
!pip install mxnet-cu100 d2lzh
import mxnet as mx
import d2lzh as d2l

In [0]:
import d2lzh as d2l 
from mxnet import autograd, gluon, init, nd 
from mxnet.gluon import loss as gloss, nn 
import time 

In [0]:
# LeNet-style model: two sigmoid convolution + max-pool stages, then three dense layers.
net = nn.Sequential()
# Convolutional feature extractor.
net.add(nn.Conv2D(channels=6, kernel_size=5, activation='sigmoid'))
net.add(nn.MaxPool2D(pool_size=2, strides=2))
net.add(nn.Conv2D(channels=16, kernel_size=5, activation='sigmoid'))
net.add(nn.MaxPool2D(pool_size=2, strides=2))
# Dense classifier head ending in 10 outputs.
net.add(nn.Dense(120, activation='sigmoid'))
net.add(nn.Dense(84, activation='sigmoid'))
net.add(nn.Dense(10))

In [0]:
# Push one random (1, 1, 28, 28) input through the net layer by layer,
# printing each layer's name and the shape of its output.
X = nd.random.uniform(shape=(1, 1, 28, 28))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

conv0 output shape:	 (1, 6, 24, 24)
pool0 output shape:	 (1, 6, 12, 12)
conv1 output shape:	 (1, 16, 8, 8)
pool1 output shape:	 (1, 16, 4, 4)
dense0 output shape:	 (1, 120)
dense1 output shape:	 (1, 84)
dense2 output shape:	 (1, 10)


In [0]:
# Build the Fashion-MNIST training and test iterators via the d2lzh helper.
batch_size = 256 
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

Downloading /root/.mxnet/datasets/fashion-mnist/train-images-idx3-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-images-idx3-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/train-labels-idx1-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-labels-idx1-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/t10k-images-idx3-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/t10k-images-idx3-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/t10k-labels-idx1-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/t10k-labels-idx1-ubyte.gz...


In [0]:
def try_gpu():
    """Return the default GPU context if it is usable, otherwise the CPU context.

    Usability is probed by allocating a one-element array on the GPU; an
    `MXNetError` raised by that probe triggers the CPU fallback.
    """
    try:
        device = mx.gpu()
        # The allocation is only a probe; its result is discarded.
        nd.zeros((1,), ctx=device)
        return device
    except mx.base.MXNetError:
        return mx.cpu()

In [0]:
# Pick the compute context with try_gpu() and display it.
ctx = try_gpu()
ctx 

gpu(0)

In [0]:
# Create a context object for GPU index 1 and display it.
# NOTE(review): `cttx` is not used by any later cell visible in this notebook —
# confirm whether this scratch cell is still needed.
cttx = mx.gpu(1)
cttx

gpu(1)

In [0]:
def evaluate_accuracy(data_iter, net, ctx):
    """Return the fraction of examples in `data_iter` that `net` classifies correctly.

    Args:
        data_iter: Iterable yielding `(X, y)` batches.
        net: Callable model; its prediction is the argmax of its output along axis 1.
        ctx: Context every batch is copied to before the forward pass.

    Returns:
        Number of correct predictions divided by the number of labels seen.
    """
    # The running count lives on `ctx`, so it is only read back once at the end.
    correct = nd.array([0], ctx=ctx)
    seen = 0
    for features, labels in data_iter:
        features = features.as_in_context(ctx)
        # Labels are cast to float32 before being compared with the argmax output.
        labels = labels.as_in_context(ctx).astype('float32')
        predictions = net(features).argmax(axis=1)
        correct += (predictions == labels).sum()
        seen += labels.size
    return correct.asscalar() / seen

In [0]:
def train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs):
    """Train `net` with softmax cross-entropy and print per-epoch statistics.

    Args:
        net: Model to train; called on each batch of features.
        train_iter: Iterable yielding `(X, y)` training batches.
        test_iter: Iterable passed to `evaluate_accuracy` after every epoch.
        batch_size: Value handed to `trainer.step` on every iteration.
        trainer: Optimizer wrapper whose `step` applies the parameter update.
        ctx: Context that every batch is copied to.
        num_epochs: Number of passes over `train_iter`.
    """
    print('training on', ctx)
    criterion = gloss.SoftmaxCrossEntropyLoss()
    report = 'epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
    for epoch in range(num_epochs):
        loss_total, correct_total, seen = 0.00, 0.00, 0
        start = time.time()
        for features, labels in train_iter:
            features = features.as_in_context(ctx)
            labels = labels.as_in_context(ctx)
            with autograd.record():
                outputs = net(features)
                batch_loss = criterion(outputs, labels).sum()
            batch_loss.backward()
            trainer.step(batch_size)
            # Cast labels to float32 before comparing them with the argmax output.
            labels = labels.astype('float32')
            loss_total += batch_loss.asscalar()
            correct_total += (outputs.argmax(axis=1) == labels).sum().asscalar()
            seen += labels.size
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        # The reported time therefore includes the test-set evaluation.
        elapsed = time.time() - start
        print(report % (epoch + 1, loss_total / seen, correct_total / seen, test_acc, elapsed))

In [0]:
# Re-initialize the parameters with Xavier on `ctx`, build a plain SGD trainer,
# and train with the train_ch5 defined in this notebook.
lr, num_epochs = 0.9, 5 
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

training on gpu(0)
epoch 1, loss 2.3202, train acc 0.101, test acc 0.100, time 7.9 sec
epoch 2, loss 1.9381, train acc 0.260, test acc 0.590, time 8.0 sec
epoch 3, loss 0.9748, train acc 0.612, test acc 0.699, time 7.7 sec
epoch 4, loss 0.7639, train acc 0.702, test acc 0.738, time 7.8 sec
epoch 5, loss 0.6612, train acc 0.738, test acc 0.762, time 7.8 sec


## 5.6 深度卷积神经网络 ( AlexNet )

In [0]:
# Show the GPU status table reported by nvidia-smi.
!nvidia-smi

Fri Nov  8 05:24:31 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.50       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
import d2lzh as d2l 
from mxnet import gluon, init, nd 
from mxnet.gluon import data as gdata, nn 
import os 
import sys 

# AlexNet-style model: five ReLU convolutions with three max-pools,
# then two 4096-unit dense layers with dropout and a 10-way output.
net = nn.Sequential()
# Large 11x11 stride-4 convolution first, then pooling.
net.add(nn.Conv2D(96, kernel_size=11, strides=4, activation='relu'))
net.add(nn.MaxPool2D(pool_size=3, strides=2))
# Smaller 5x5 window with padding 2.
net.add(nn.Conv2D(256, kernel_size=5, padding=2, activation='relu'))
net.add(nn.MaxPool2D(pool_size=3, strides=2))
# Three consecutive 3x3 convolutions, pooled only after the last one.
net.add(nn.Conv2D(384, kernel_size=3, padding=1, activation='relu'))
net.add(nn.Conv2D(384, kernel_size=3, padding=1, activation='relu'))
net.add(nn.Conv2D(256, kernel_size=3, padding=1, activation='relu'))
net.add(nn.MaxPool2D(pool_size=3, strides=2))
# Dense head; each hidden layer is followed by dropout with rate 0.5.
net.add(nn.Dense(4096, activation='relu'))
net.add(nn.Dropout(0.5))
net.add(nn.Dense(4096, activation='relu'))
net.add(nn.Dropout(0.5))
net.add(nn.Dense(10))

In [0]:
# Push one random (1, 1, 224, 224) input through the net layer by layer,
# printing each layer's name and the shape of its output.
X = nd.random.uniform(shape=(1, 1, 224, 224))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

conv2 output shape:	 (1, 96, 54, 54)
pool2 output shape:	 (1, 96, 26, 26)
conv3 output shape:	 (1, 256, 26, 26)
pool3 output shape:	 (1, 256, 12, 12)
conv4 output shape:	 (1, 384, 12, 12)
conv5 output shape:	 (1, 384, 12, 12)
conv6 output shape:	 (1, 256, 12, 12)
pool4 output shape:	 (1, 256, 5, 5)
dense3 output shape:	 (1, 4096)
dropout0 output shape:	 (1, 4096)
dense4 output shape:	 (1, 4096)
dropout1 output shape:	 (1, 4096)
dense5 output shape:	 (1, 10)


In [0]:
def load_data_fashion_mnist(batch_size, resize=None, root=os.path.join('~', '.mxnet', 'datasets', 'fashion-mnist')):
    """Build the Fashion-MNIST training and test data loaders.

    Args:
        batch_size: Number of examples per mini-batch for both loaders.
        resize: Optional size passed to `transforms.Resize`; when falsy no
            resize step is added.
        root: Dataset directory; a leading `~` is expanded to the home directory.

    Returns:
        `(train_iter, test_iter)`. Only the training loader shuffles its data.
    """
    root = os.path.expanduser(root)
    transforms = gdata.vision.transforms
    # Optional resize first, then conversion with ToTensor.
    steps = []
    if resize:
        steps.append(transforms.Resize(resize))
    steps.append(transforms.ToTensor())
    transformer = transforms.Compose(steps)
    mnist_train = gdata.vision.FashionMNIST(root=root, train=True)
    mnist_test = gdata.vision.FashionMNIST(root=root, train=False)
    # Worker processes for loading are avoided on Windows, where Gluon's
    # multi-process DataLoader is not supported; elsewhere keep 4 workers.
    num_workers = 0 if sys.platform.startswith('win32') else 4
    train_iter = gdata.DataLoader(
        mnist_train.transform_first(transformer), batch_size, shuffle=True, num_workers=num_workers
    )
    test_iter = gdata.DataLoader(
        mnist_test.transform_first(transformer), batch_size, shuffle=False, num_workers=num_workers
    )
    return train_iter, test_iter

# Load Fashion-MNIST with the local helper, requesting a resize of 224.
batch_size = 128 
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=224)

In [0]:
# Re-initialize with Xavier on the selected context, build an SGD trainer,
# and train with the d2lzh version of train_ch5.
lr, num_epochs, ctx = 0.01, 5, d2l.try_gpu()
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

training on gpu(0)
epoch 1, loss 1.2992, train acc 0.521, test acc 0.754, time 102.5 sec
epoch 2, loss 0.6595, train acc 0.754, test acc 0.810, time 94.8 sec
epoch 3, loss 0.5430, train acc 0.798, test acc 0.840, time 94.9 sec
epoch 4, loss 0.4802, train acc 0.823, test acc 0.841, time 94.9 sec
epoch 5, loss 0.4359, train acc 0.840, test acc 0.860, time 94.9 sec


## 5.7 使用重复元素的网络 ( VGG )

In [0]:
import d2lzh as d2l 
from mxnet import gluon, init, nd 
from mxnet.gluon import nn 

def vgg_block(num_convs, num_channels):
    """Build one VGG stage: `num_convs` 3x3 ReLU convolutions, then a 2x2 stride-2 max-pool.

    Args:
        num_convs: How many convolution layers the stage contains.
        num_channels: Output channel count of every convolution in the stage.

    Returns:
        An `nn.Sequential` holding the convolutions followed by the pooling layer.
    """
    stage = nn.Sequential()
    convs = [
        nn.Conv2D(num_channels, kernel_size=3, padding=1, activation='relu')
        for _ in range(num_convs)
    ]
    stage.add(*convs, nn.MaxPool2D(pool_size=2, strides=2))
    return stage

In [0]:
# One (num_convs, num_channels) pair per VGG stage, in the order vgg() consumes them.
conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))

In [0]:
def vgg(conv_arch):
    """Assemble a VGG-style network from a sequence of stage specifications.

    Args:
        conv_arch: Iterable of `(num_convs, num_channels)` pairs, one per stage.

    Returns:
        An `nn.Sequential` with one `vgg_block` per pair, followed by two
        4096-unit ReLU dense layers (each with 0.5 dropout) and a 10-unit output layer.
    """
    model = nn.Sequential()
    for stage_spec in conv_arch:
        num_convs, num_channels = stage_spec
        model.add(vgg_block(num_convs, num_channels))
    # Fully connected classifier head.
    model.add(nn.Dense(4096, activation='relu'))
    model.add(nn.Dropout(0.5))
    model.add(nn.Dense(4096, activation='relu'))
    model.add(nn.Dropout(0.5))
    model.add(nn.Dense(10))
    return model
# Instantiate the full-width network from the stage list above.
net = vgg(conv_arch)

In [0]:
# Push one random (1, 1, 224, 224) input through the net block by block,
# printing each block's name and the shape of its output.
net.initialize()
X = nd.random.uniform(shape=(1, 1, 224, 224))
for blk in net:
    X = blk(X)
    print(blk.name, 'output shape:\t', X.shape)

sequential3 output shape:	 (1, 64, 112, 112)
sequential4 output shape:	 (1, 128, 56, 56)
sequential5 output shape:	 (1, 256, 28, 28)
sequential6 output shape:	 (1, 512, 14, 14)
sequential7 output shape:	 (1, 512, 7, 7)
dense6 output shape:	 (1, 4096)
dropout2 output shape:	 (1, 4096)
dense7 output shape:	 (1, 4096)
dropout3 output shape:	 (1, 4096)
dense8 output shape:	 (1, 10)


In [0]:
# Build a narrower network: same number of convolutions per stage,
# with every channel count integer-divided by `ratio`.
ratio = 4
small_conv_arch = [
    (num_convs, num_channels // ratio)
    for num_convs, num_channels in conv_arch
]
net = vgg(small_conv_arch)

In [0]:
# Initialize the narrow VGG with Xavier on the selected context, load
# Fashion-MNIST with a resize of 224, and train with plain SGD.
lr, num_epochs, batch_size, ctx = 0.005, 5, 128, d2l.try_gpu()
net.initialize(ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

training on gpu(0)
epoch 1, loss 1.6113, train acc 0.405, test acc 0.724, time 177.0 sec
epoch 2, loss 0.7265, train acc 0.728, test acc 0.802, time 167.3 sec
epoch 3, loss 0.5771, train acc 0.788, test acc 0.838, time 167.7 sec
epoch 4, loss 0.5059, train acc 0.816, test acc 0.852, time 167.6 sec
epoch 5, loss 0.4593, train acc 0.833, test acc 0.861, time 167.6 sec


## 5.8 网络中的网络 ( NiN )

In [0]:
import d2lzh as d2l 
from mxnet import gluon, init, nd 
from mxnet.gluon import nn 

def nin_block(num_channels, kernel_size, strides, padding):
    """Build one NiN block: a configurable ReLU convolution followed by two 1x1 ReLU convolutions.

    Args:
        num_channels: Output channel count of all three convolutions.
        kernel_size: Kernel size of the first convolution.
        strides: Stride of the first convolution.
        padding: Padding of the first convolution.

    Returns:
        An `nn.Sequential` containing the three convolution layers.
    """
    block = nn.Sequential()
    block.add(nn.Conv2D(num_channels, kernel_size=kernel_size, strides=strides,
                        padding=padding, activation='relu'))
    # Two further convolutions with a 1x1 kernel.
    for _ in range(2):
        block.add(nn.Conv2D(num_channels, kernel_size=1, activation='relu'))
    return block

In [0]:
# NiN model: NiN blocks alternating with 3x3 stride-2 max-pools. There is no dense layer:
# the last block emits 10 channels, which are globally average-pooled and flattened.
net = nn.Sequential()
net.add(nin_block(96, kernel_size=11, strides=4, padding=0), 
    nn.MaxPool2D(pool_size=3, strides=2), 
    nin_block(256, kernel_size=5, strides=1, padding=2), 
    nn.MaxPool2D(pool_size=3, strides=2), 
    nin_block(384, kernel_size=3, strides=1, padding=1), 
    nn.MaxPool2D(pool_size=3, strides=2), nn.Dropout(0.5), 
    nin_block(10, kernel_size=3, strides=1, padding=1), 
    nn.GlobalAvgPool2D(), 
    nn.Flatten())

In [0]:
# Push one random (1, 1, 224, 224) input through the net layer by layer,
# printing each layer's name and the shape of its output.
X = nd.random.uniform(shape=(1, 1, 224, 224))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

sequential1 output shape:	 (1, 96, 54, 54)
pool0 output shape:	 (1, 96, 26, 26)
sequential2 output shape:	 (1, 256, 26, 26)
pool1 output shape:	 (1, 256, 12, 12)
sequential3 output shape:	 (1, 384, 12, 12)
pool2 output shape:	 (1, 384, 5, 5)
dropout0 output shape:	 (1, 384, 5, 5)
sequential4 output shape:	 (1, 10, 5, 5)
pool3 output shape:	 (1, 10, 1, 1)
flatten0 output shape:	 (1, 10)


In [0]:
# Re-initialize NiN with Xavier on the selected context, load Fashion-MNIST
# with a resize of 224, and train with plain SGD.
lr, num_epochs, batch_size, ctx = 0.1, 5, 128, d2l.try_gpu()
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

Downloading /root/.mxnet/datasets/fashion-mnist/train-images-idx3-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-images-idx3-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/train-labels-idx1-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-labels-idx1-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/t10k-images-idx3-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/t10k-images-idx3-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/t10k-labels-idx1-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/t10k-labels-idx1-ubyte.gz...
training on gpu(0)
epoch 1, loss 2.2271, train acc 0.174, test acc 0.272, time 147.2 sec
epoch 2, loss 1.4489, train acc 0.477, test acc 0.662, time 139.1 sec
epoch 3, loss 0.9954, train acc 0.645, test acc 0.661, ti

## 5.9 含并行连接的网络 ( GoogLeNet )

In [0]:
import d2lzh as d2l 
from mxnet import gluon, init, nd 
from mxnet.gluon import nn 

class Inception(nn.Block):
    """Inception block: four parallel paths whose outputs are concatenated along dim 1.

    Args:
        c1: Output channels of path 1 (a single 1x1 convolution).
        c2: Pair; `c2[0]` channels for path 2's 1x1 convolution, `c2[1]` for its 3x3 convolution.
        c3: Pair; `c3[0]` channels for path 3's 1x1 convolution, `c3[1]` for its 5x5 convolution.
        c4: Output channels of the 1x1 convolution that follows path 4's 3x3 max-pool.
        **kwargs: Forwarded to `nn.Block.__init__`.
    """

    def __init__(self, c1, c2, c3, c4, **kwargs):
        super().__init__(**kwargs)
        # Path 1: 1x1 convolution only.
        self.p1_1 = nn.Conv2D(c1, kernel_size=1, activation='relu')
        # Path 2: 1x1 convolution, then 3x3 convolution with padding 1.
        self.p2_1 = nn.Conv2D(c2[0], kernel_size=1, activation='relu')
        self.p2_2 = nn.Conv2D(c2[1], kernel_size=3, padding=1, activation='relu')
        # Path 3: 1x1 convolution, then 5x5 convolution with padding 2.
        self.p3_1 = nn.Conv2D(c3[0], kernel_size=1, activation='relu')
        self.p3_2 = nn.Conv2D(c3[1], kernel_size=5, padding=2, activation='relu')
        # Path 4: 3x3 stride-1 max-pool, then 1x1 convolution.
        self.p4_1 = nn.MaxPool2D(pool_size=3, strides=1, padding=1)
        self.p4_2 = nn.Conv2D(c4, kernel_size=1, activation='relu')

    def forward(self, x):
        """Run `x` through all four paths and concatenate the results along dim 1."""
        branches = (
            self.p1_1(x),
            self.p2_2(self.p2_1(x)),
            self.p3_2(self.p3_1(x)),
            self.p4_2(self.p4_1(x)),
        )
        return nd.concat(*branches, dim=1)


In [0]:
# Stage 1: 7x7 stride-2 ReLU convolution followed by a 3x3 stride-2 max-pool.
b1 = nn.Sequential()
b1.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3, activation='relu'), 
    nn.MaxPool2D(pool_size=3, strides=2, padding=1))

In [0]:
# Stage 2: 1x1 convolution (64 channels), 3x3 convolution (192 channels), then a 3x3 stride-2 max-pool.
b2 = nn.Sequential()
b2.add(nn.Conv2D(64, kernel_size=1, activation='relu'), 
    nn.Conv2D(192, kernel_size=3, padding=1, activation='relu'), 
    nn.MaxPool2D(pool_size=3, strides=2, padding=1))

In [0]:
# Stage 3: two Inception blocks followed by a 3x3 stride-2 max-pool.
b3 = nn.Sequential()
b3.add(Inception(64, (96, 128), (16, 32), 32), 
    Inception(128, (128, 192), (32, 96), 64), 
    nn.MaxPool2D(pool_size=3, strides=2, padding=1))

In [0]:
# Stage 4: five Inception blocks followed by a 3x3 stride-2 max-pool.
b4 = nn.Sequential()
b4.add(Inception(192, (96, 208), (16, 48), 64),
       Inception(160, (112, 224), (24, 64), 64),
       Inception(128, (128, 256), (24, 64), 64),
       Inception(112, (144, 288), (32, 64), 64),
       Inception(256, (160, 320), (32, 128), 128),
       nn.MaxPool2D(pool_size=3, strides=2, padding=1))

In [0]:
# Stage 5: two Inception blocks followed by global average pooling.
b5 = nn.Sequential()
b5.add(Inception(256, (160, 320), (32, 128), 128),
       Inception(384, (192, 384), (48, 128), 128),
       nn.GlobalAvgPool2D())

In [0]:
# Chain the five stages and finish with a 10-unit dense output layer.
net = nn.Sequential()
net.add(b1, b2, b3, b4, b5, nn.Dense(10))

In [0]:
# Push one random (1, 1, 96, 96) input through the net stage by stage,
# printing each stage's name and the shape of its output.
X = nd.random.uniform(shape=(1, 1, 96, 96))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

sequential8 output shape:	 (1, 64, 24, 24)
sequential9 output shape:	 (1, 192, 12, 12)
sequential10 output shape:	 (1, 480, 6, 6)
sequential11 output shape:	 (1, 832, 3, 3)
sequential12 output shape:	 (1, 1024, 1, 1)
dense0 output shape:	 (1, 10)


In [0]:
# Re-initialize with Xavier on the selected context, load Fashion-MNIST
# with a resize of 96, and train with plain SGD.
lr, num_epochs, batch_size, ctx = 0.1, 5, 128, d2l.try_gpu()
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

training on gpu(0)
epoch 1, loss 1.9761, train acc 0.248, test acc 0.602, time 118.4 sec
epoch 2, loss 0.8583, train acc 0.659, test acc 0.786, time 117.9 sec
epoch 3, loss 0.5071, train acc 0.809, test acc 0.842, time 118.0 sec
epoch 4, loss 0.4012, train acc 0.847, test acc 0.869, time 117.7 sec
epoch 5, loss 0.3496, train acc 0.867, test acc 0.870, time 117.9 sec


## 5.10 批量归一化

In [0]:
import d2lzh as d2l 
from mxnet import autograd, gluon, init, nd 
from mxnet.gluon import nn 

def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Apply batch normalization and return the output with updated running statistics.

    In training mode (as reported by `autograd.is_training()`) the input is
    normalized with the statistics of the current batch and the running
    statistics are updated; otherwise the running statistics are used as-is.

    Args:
        X: Input with 2 or 4 dimensions (checked only in training mode).
        gamma: Scale applied to the normalized input.
        beta: Shift added after scaling.
        moving_mean: Running mean.
        moving_var: Running variance.
        eps: Constant added to the variance before the square root.
        momentum: Weight of the previous running statistic in the update.

    Returns:
        `(Y, moving_mean, moving_var)`.
    """
    if autograd.is_training():
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # 2-D input: statistics over axis 0.
            batch_mean = X.mean(axis=0)
            batch_var = ((X - batch_mean) ** 2).mean(axis=0)
        else:
            # 4-D input: statistics over axes 0, 2 and 3, keeping the reduced
            # dimensions so they broadcast against X.
            batch_mean = X.mean(axis=(0, 2, 3), keepdims=True)
            batch_var = ((X - batch_mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)

        normalized = (X - batch_mean) / nd.sqrt(batch_var + eps)
        # Exponential moving average of the batch statistics.
        moving_mean = momentum * moving_mean + (1.0 - momentum) * batch_mean
        moving_var = momentum * moving_var + (1.0 - momentum) * batch_var
    else:
        normalized = (X - moving_mean) / nd.sqrt(moving_var + eps)
    return gamma * normalized + beta, moving_mean, moving_var

In [0]:
class BatchNorm(nn.Block):
    """Batch-normalization layer built on the `batch_norm` function.

    `gamma` (initialized to ones) and `beta` (initialized to zeros) are
    registered parameters; the running mean and variance are plain arrays
    that start at zero.

    Args:
        num_features: Size of the feature/channel dimension.
        num_dims: 2 selects parameter shape `(1, num_features)`; any other
            value selects `(1, num_features, 1, 1)`.
        **kwargs: Forwarded to `nn.Block.__init__`.
    """

    def __init__(self, num_features, num_dims, **kwargs):
        super().__init__(**kwargs)
        shape = (1, num_features) if num_dims == 2 else (1, num_features, 1, 1)
        # Scale and shift take part in gradient updates.
        self.gamma = self.params.get('gamma', shape=shape, init=init.One())
        self.beta = self.params.get('beta', shape=shape, init=init.Zero())
        # Running statistics are not registered as parameters.
        self.moving_mean = nd.zeros(shape)
        self.moving_var = nd.zeros(shape)

    def forward(self, X):
        """Normalize `X` and store the updated running statistics on the layer."""
        target = X.context
        # Move the running statistics to the input's context when they differ.
        if self.moving_mean.context != target:
            self.moving_mean = self.moving_mean.copyto(target)
            self.moving_var = self.moving_var.copyto(target)

        Y, updated_mean, updated_var = batch_norm(
            X, self.gamma.data(), self.beta.data(),
            self.moving_mean, self.moving_var,
            eps=1e-5, momentum=0.9,
        )
        self.moving_mean, self.moving_var = updated_mean, updated_var
        return Y
        

In [0]:
# LeNet variant using the hand-written BatchNorm layer: every convolution and hidden dense
# layer is followed by BatchNorm and then a sigmoid activation.
net = nn.Sequential()
net.add(nn.Conv2D(6, kernel_size=5), 
    BatchNorm(6, num_dims=4), 
    nn.Activation('sigmoid'), 
    nn.MaxPool2D(pool_size=2, strides=2), 
    nn.Conv2D(16, kernel_size=5), 
    BatchNorm(16, num_dims=4), 
    nn.Activation('sigmoid'), 
    nn.MaxPool2D(pool_size=2, strides=2), 
    nn.Dense(120), 
    BatchNorm(120, num_dims=2), 
    nn.Activation('sigmoid'), 
    nn.Dense(84), 
    BatchNorm(84, num_dims=2), 
    nn.Activation('sigmoid'), 
    nn.Dense(10))

In [0]:
# Initialize with Xavier on the selected context, load Fashion-MNIST
# without resizing, and train with plain SGD.
lr, num_epochs, batch_size, ctx = 1.0, 5, 256, d2l.try_gpu()
net.initialize(ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reini

training on gpu(0)
epoch 1, loss 0.2906, train acc 0.895, test acc 0.872, time 10.8 sec
epoch 2, loss 0.2801, train acc 0.898, test acc 0.889, time 10.5 sec
epoch 3, loss 0.2707, train acc 0.901, test acc 0.770, time 10.5 sec
epoch 4, loss 0.2610, train acc 0.905, test acc 0.880, time 10.8 sec
epoch 5, loss 0.2483, train acc 0.909, test acc 0.865, time 10.5 sec


In [0]:
# Show the gamma and beta of the first BatchNorm layer (net[1]), each flattened to 1-D.
net[1].gamma.data().reshape((-1, )), net[1].beta.data().reshape((-1, ))

(
 [1.8463953 2.1018672 1.6952718 1.8522757 0.9805336 1.3830012]
 <NDArray 6 @gpu(0)>, 
 [-0.20405497 -2.2198381   0.2428574   1.4535147  -0.20604345  0.7259122 ]
 <NDArray 6 @gpu(0)>)

In [0]:
# Same LeNet variant, but with Gluon's built-in nn.BatchNorm, which is constructed
# here without any feature-count or dimension arguments.
net = nn.Sequential()
net.add(nn.Conv2D(6, kernel_size=5), 
    nn.BatchNorm(), 
    nn.Activation('sigmoid'), 
    nn.MaxPool2D(pool_size=2, strides=2), 
    nn.Conv2D(16, kernel_size=5), 
    nn.BatchNorm(), 
    nn.Activation('sigmoid'), 
    nn.MaxPool2D(pool_size=2, strides=2), 
    nn.Dense(120), 
    nn.BatchNorm(), 
    nn.Activation('sigmoid'), 
    nn.Dense(84), 
    nn.BatchNorm(), 
    nn.Activation('sigmoid'), 
    nn.Dense(10))

In [0]:
# Train the built-in BatchNorm variant. This cell reuses lr, num_epochs, batch_size, ctx,
# train_iter and test_iter from the earlier training cell of this section.
net.initialize(ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reinit)
  v.initialize(None, ctx, init, force_reinit=force_reini

training on gpu(0)
epoch 1, loss 0.2931, train acc 0.892, test acc 0.863, time 8.2 sec
epoch 2, loss 0.2828, train acc 0.897, test acc 0.875, time 8.2 sec
epoch 3, loss 0.2719, train acc 0.900, test acc 0.891, time 8.5 sec
epoch 4, loss 0.2593, train acc 0.905, test acc 0.888, time 8.2 sec
epoch 5, loss 0.2539, train acc 0.906, test acc 0.899, time 8.3 sec


## 5.11 残差网络 ( ResNet )

In [0]:
import d2lzh as d2l 
from mxnet import gluon, init, nd 
from mxnet.gluon import nn  

class Residual(nn.Block):
    """Residual block: two 3x3 convolutions with batch norm plus a shortcut connection.

    Args:
        num_channels: Output channels of every convolution in the block.
        use_1x1conv: When true, the shortcut passes through a 1x1 convolution
            (using `strides`); otherwise the input is added unchanged.
        strides: Stride of the first 3x3 convolution and of the optional 1x1 convolution.
        **kwargs: Forwarded to `nn.Block.__init__`.
    """

    def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs):
        super().__init__(**kwargs)
        self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1, strides=strides)
        self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1)
        # Optional projection applied to the shortcut branch.
        self.conv3 = (
            nn.Conv2D(num_channels, kernel_size=1, strides=strides)
            if use_1x1conv else None
        )
        self.bn1 = nn.BatchNorm()
        self.bn2 = nn.BatchNorm()

    def forward(self, X):
        """Return `relu(main_path(X) + shortcut(X))`."""
        main = nd.relu(self.bn1(self.conv1(X)))
        main = self.bn2(self.conv2(main))
        shortcut = self.conv3(X) if self.conv3 else X
        return nd.relu(main + shortcut)

In [0]:
# A default Residual block (no 1x1 convolution, stride 1) applied to a random (4, 3, 6, 6) input.
blk = Residual(3)
blk.initialize()
X = nd.random.uniform(shape=(4, 3, 6, 6))
blk(X).shape

(4, 3, 6, 6)

In [0]:
# A Residual block with 6 channels, the 1x1 shortcut convolution and stride 2,
# applied to the `X` created in the previous cell.
blk = Residual(6, use_1x1conv=True, strides=2)
blk.initialize()
blk(X).shape

(4, 6, 3, 3)

In [0]:
net = nn.Sequential()
net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=2), 
    nn.BatchNorm(), 
    nn.Activation('relu'), 
    nn.MaxPool2D(pool_size=3, strides=2, padding=1))

In [0]:
def resnet_block(num_channels, num_residuals, first_block=False):
    """Build one ResNet stage consisting of `num_residuals` Residual blocks.

    Args:
        num_channels: Channel count passed to every Residual block.
        num_residuals: Number of Residual blocks in the stage.
        first_block: When false, the first Residual block uses the 1x1
            shortcut convolution with stride 2; when true, all blocks use
            the default configuration.

    Returns:
        An `nn.Sequential` containing the Residual blocks.
    """
    stage = nn.Sequential()
    for index in range(num_residuals):
        # Only the leading block of a non-first stage gets stride 2 and the 1x1 shortcut.
        downsample = index == 0 and not first_block
        if downsample:
            unit = Residual(num_channels, use_1x1conv=True, strides=2)
        else:
            unit = Residual(num_channels)
        stage.add(unit)
    return stage

In [0]:
# Append four stages of two Residual blocks each; only the first stage is
# flagged as first_block, so the later three start with a stride-2 block.
net.add(resnet_block(64, 2, first_block=True), 
    resnet_block(128, 2), 
    resnet_block(256, 2), 
    resnet_block(512, 2))

In [0]:
# Head: global average pooling followed by a 10-unit dense output layer.
net.add(nn.GlobalAvgPool2D(), nn.Dense(10))

In [0]:
# Push one random (1, 1, 224, 224) input through the net layer by layer,
# printing each layer's name and the shape of its output.
X = nd.random.uniform(shape=(1, 1, 224, 224))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

conv5 output shape:	 (1, 64, 111, 111)
batchnorm4 output shape:	 (1, 64, 111, 111)
relu0 output shape:	 (1, 64, 111, 111)
pool0 output shape:	 (1, 64, 56, 56)
sequential1 output shape:	 (1, 64, 56, 56)
sequential2 output shape:	 (1, 128, 28, 28)
sequential3 output shape:	 (1, 256, 14, 14)
sequential4 output shape:	 (1, 512, 7, 7)
pool1 output shape:	 (1, 512, 1, 1)
dense0 output shape:	 (1, 10)


In [0]:
# Re-initialize with Xavier on the selected context, load Fashion-MNIST
# with a resize of 96, and train with plain SGD.
lr, num_epochs, batch_size, ctx = 0.05, 5, 256, d2l.try_gpu()
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

Downloading /root/.mxnet/datasets/fashion-mnist/train-images-idx3-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-images-idx3-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/train-labels-idx1-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-labels-idx1-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/t10k-images-idx3-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/t10k-images-idx3-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/t10k-labels-idx1-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/t10k-labels-idx1-ubyte.gz...
training on gpu(0)
epoch 1, loss 0.4853, train acc 0.831, test acc 0.886, time 95.1 sec
epoch 2, loss 0.2538, train acc 0.906, test acc 0.904, time 87.4 sec
epoch 3, loss 0.1907, train acc 0.930, test acc 0.908, time

## 5.12 稠密连接网络 ( DenseNet )

In [0]:
import d2lzh as d2l 
from mxnet import gluon, init, nd 
from mxnet.gluon import nn 

def conv_block(num_channels):
    """Build a batch norm -> ReLU -> 3x3 convolution (padding 1) unit.

    Args:
        num_channels: Output channel count of the convolution.

    Returns:
        An `nn.Sequential` holding the three layers in that order.
    """
    unit = nn.Sequential()
    unit.add(nn.BatchNorm())
    unit.add(nn.Activation('relu'))
    unit.add(nn.Conv2D(num_channels, kernel_size=3, padding=1))
    return unit

In [0]:
class DenseBlock(nn.Block):
    """Dense block: each conv_block's output is concatenated onto its input along dim 1.

    Args:
        num_convs: Number of `conv_block` units in the block.
        num_channels: Output channels of each unit.
        **kwargs: Forwarded to `nn.Block.__init__`.
    """

    def __init__(self, num_convs, num_channels, **kwargs):
        super().__init__(**kwargs)
        self.net = nn.Sequential()
        self.net.add(*(conv_block(num_channels) for _ in range(num_convs)))

    def forward(self, X):
        """Return `X` with the output of every unit concatenated along dim 1."""
        features = X
        for unit in self.net:
            # Each unit sees the input plus the outputs of all earlier units.
            features = nd.concat(features, unit(features), dim=1)
        return features

In [0]:
# A DenseBlock with 2 units of 10 channels applied to a random (4, 3, 8, 8) input;
# the cell displays the shape of the result.
blk = DenseBlock(2, 10)
blk.initialize()
X = nd.random.uniform(shape=(4, 3, 8, 8))
Y = blk(X)
Y.shape

(4, 23, 8, 8)

In [0]:
def transition_block(num_channels):
    """Build a transition layer: batch norm, ReLU, 1x1 convolution, 2x2 stride-2 average pool.

    Args:
        num_channels: Output channel count of the 1x1 convolution.

    Returns:
        An `nn.Sequential` holding the four layers in that order.
    """
    transition = nn.Sequential()
    transition.add(nn.BatchNorm())
    transition.add(nn.Activation('relu'))
    transition.add(nn.Conv2D(num_channels, kernel_size=1))
    transition.add(nn.AvgPool2D(pool_size=2, strides=2))
    return transition

In [0]:
# Apply a 10-channel transition block to the dense-block output `Y` from the earlier demo cell.
blk = transition_block(10)
blk.initialize()
blk(Y).shape

(4, 10, 4, 4)

In [0]:
# DenseNet stem: 7x7 stride-2 convolution, batch norm, ReLU, then a 3x3 stride-2 max-pool.
net = nn.Sequential()
net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3), 
    nn.BatchNorm(), nn.Activation('relu'), 
    nn.MaxPool2D(pool_size=3, strides=2, padding=1))

In [0]:
# Append four dense blocks, with a transition layer between consecutive blocks.
num_channels, growth_rate = 64, 32  # num_channels is the current number of channels
num_convs_in_dense_blocks = [4, 4, 4, 4]

for i, num_convs in enumerate(num_convs_in_dense_blocks):
    net.add(DenseBlock(num_convs, growth_rate))
    # Number of output channels of the dense block just added
    num_channels += num_convs * growth_rate
    # Between dense blocks, insert a transition layer that halves the channel count
    if i != len(num_convs_in_dense_blocks) - 1:
        num_channels //= 2
        net.add(transition_block(num_channels))

In [0]:
# Head: batch norm, ReLU, global average pooling, then a 10-unit dense output layer.
net.add(nn.BatchNorm(), nn.Activation('relu'), nn.GlobalAvgPool2D(),
        nn.Dense(10))

In [0]:
# Initialize with Xavier on the selected context, load Fashion-MNIST
# with a resize of 96, and train with plain SGD.
lr, num_epochs, batch_size, ctx = 0.1, 5, 256, d2l.try_gpu()
net.initialize(ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs)

training on gpu(0)
epoch 1, loss 0.5292, train acc 0.816, test acc 0.856, time 75.6 sec
epoch 2, loss 0.3101, train acc 0.888, test acc 0.888, time 69.3 sec
epoch 3, loss 0.2607, train acc 0.905, test acc 0.797, time 69.3 sec
epoch 4, loss 0.2323, train acc 0.916, test acc 0.910, time 69.2 sec
epoch 5, loss 0.2125, train acc 0.923, test acc 0.907, time 69.4 sec
