# 获取数据

In [116]:
from mxnet import gluon, nd, autograd
import os

# Directory where the Fashion-MNIST files are stored. The default is the
# original author's local path; set the FASHION_MNIST_ROOT environment
# variable to run the notebook on another machine without editing this cell.
root = os.environ.get('FASHION_MNIST_ROOT', 'E:/Data/MXNet/fashion-mnist')

def transform(data, label):
    """Convert one sample to `float32` and reorder the image axes.

    The image is cast to float32, its axes are permuted with (2, 0, 1) and
    its values are divided by 255; the label is cast to float32.
    Presumably the input image is height x width x channel, which makes the
    result channel-first -- confirm against the dataset.

    Returns:
        A ``(image, label)`` tuple.
    """
    image = nd.transpose(data.astype('float32'), axes=(2, 0, 1))
    return image / 255, label.astype('float32')
    
# Mini-batch size shared by both loaders.
batch_size = 256

# Training and test splits; `transform` is applied to every sample.
mnist_train = gluon.data.vision.FashionMNIST(root=root, train=True, transform=transform)
mnist_test = gluon.data.vision.FashionMNIST(root=root, train=False, transform=transform)

# Only the training loader shuffles.
train_data = gluon.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True)
test_data = gluon.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False)

  label = np.fromstring(fin.read(), dtype=np.uint8).astype(np.int32)
  data = np.fromstring(fin.read(), dtype=np.uint8)


In [117]:
# Peek at a single mini-batch to confirm the layout coming out of the loader.
for data, label in train_data:
    # `transform` has already permuted the image axes, so nothing is changed
    # here; the batch is batch x channel x height x width (see the printed shape).
    print('data.shape: {} \nlabel.shape: {}'.format(data.shape, label.shape))
    break

data.shape: (256, 1, 28, 28) 
label.shape: (256,)


# 定义模型

因为卷积网络的计算比全连接网络复杂，这里我们默认使用 GPU 来计算；如果 GPU 不可用，则退回使用 CPU。（下面这段代码会保存在 `utils.py` 里，方便下次重复使用。）

In [118]:
import mxnet as mx

# Prefer the GPU. Allocating a tiny array on it is the actual availability
# probe: MXNet raises MXNetError if the device cannot be used, in which case
# we fall back to the CPU. Only that error is caught, so a KeyboardInterrupt
# or an unrelated bug is not silently turned into "use the CPU".
try:
    ctx = mx.gpu()
    _ = nd.zeros((1,), ctx=ctx)
except mx.base.MXNetError:
    ctx = mx.cpu()
ctx

gpu(0)

In [119]:
def init_params(weight_scale):
    """Create the network parameters on the notebook-level ``ctx``.

    Weights are drawn with ``nd.random_normal`` using ``weight_scale`` as the
    scale; biases start at zero. Every parameter has a gradient buffer
    attached before the list is returned.

    Args:
        weight_scale: scale passed to ``nd.random_normal`` for every weight.

    Returns:
        ``[W1, b1, W2, b2, W3, b3, W4, b4]``.
    """
    def gaussian(*shape):
        # Random-normal weight tensor of the given shape on `ctx`.
        return nd.random_normal(shape=shape, scale=weight_scale, ctx=ctx)

    # Weights are drawn in the order W1, W2, W3, W4.
    conv1_w = gaussian(20, 1, 5, 5)        # output channels = 20, kernel = (5,5)
    conv2_w = gaussian(50, 20, 3, 3)       # output channels = 50, kernel = (3,3)
    dense1_w = gaussian(1250, 128)         # output dim = 128
    dense2_w = gaussian(dense1_w.shape[1], 10)  # output dim = 10

    params = [
        # conv biases: one entry per output channel (axis 0 of the weight)
        conv1_w, nd.zeros(conv1_w.shape[0], ctx=ctx),
        conv2_w, nd.zeros(conv2_w.shape[0], ctx=ctx),
        # dense biases: one entry per output unit (axis 1 of the weight)
        dense1_w, nd.zeros(dense1_w.shape[1], ctx=ctx),
        dense2_w, nd.zeros(dense2_w.shape[1], ctx=ctx),
    ]
    for tensor in params:
        tensor.attach_grad()
    return params

def net(X, params, verbose=False):
    """Forward pass: two conv blocks followed by two dense layers.

    Args:
        X: input batch fed to the first convolution.
        params: the eight tensors ``[W1, b1, W2, b2, W3, b3, W4, b4]``.
        verbose: when true, print the shape after each stage and the output.

    Returns:
        The output of the last dense layer (no softmax applied).
    """
    W1, b1, W2, b2, W3, b3, W4, b4 = params

    def conv_block(inputs, weight, bias):
        # convolution -> ReLU -> 2x2 max pooling with stride 2
        conv = nd.Convolution(
            data=inputs, weight=weight, bias=bias,
            kernel=weight.shape[2:], num_filter=weight.shape[0])
        return nd.Pooling(
            data=nd.relu(conv), pool_type="max", kernel=(2,2), stride=(2,2))

    # first conv block
    h1 = conv_block(X, W1, b1)
    # second conv block, flattened for the dense layers
    h2 = nd.flatten(conv_block(h1, W2, b2))
    # first dense layer
    h3 = nd.relu(nd.dot(h2, W3) + b3)
    # second dense layer
    h4_linear = nd.dot(h3, W4) + b4

    if verbose:
        stages = (
            ('1st conv block:', h1),
            ('2nd conv block:', h2),
            ('1st dense:', h3),
            ('2nd dense:', h4_linear),
        )
        for tag, value in stages:
            print(tag, value.shape)
        print('output:', h4_linear)
    return h4_linear

In [120]:
# Smoke test: push one mini-batch through a freshly initialised network and
# print the intermediate shapes plus the (untrained) accuracy.
# `weight_scale` is defined here so the cell runs on a fresh kernel; the
# training cell assigns the same value again.
weight_scale = .1
for data, label in train_data:
    data = data.as_in_context(ctx)
    label = label.as_in_context(ctx)
    params = init_params(weight_scale)
    output = net(data, params, verbose=True)
    acc = nd.mean(output.argmax(axis= 1)==label).asscalar()
    print(acc)
    break

1st conv block: (256, 20, 12, 12)
2nd conv block: (256, 1250)
1st dense: (256, 128)
2nd dense: (256, 10)
output: 
[[ 0.4162476   0.26670948  0.11270595 ... -0.04386711  0.06388557
  -0.74774057]
 [ 0.54506636  0.33793062  0.840812   ...  0.5353344   0.19240683
  -0.940362  ]
 [ 0.15876496  0.3288771   0.30370203 ... -0.10934281  0.11002457
  -0.7485206 ]
 ...
 [ 0.56907856 -0.20700485  0.41588512 ...  0.53575045 -0.18031669
  -0.96532357]
 [ 0.24785687  0.20551816  0.12323917 ...  0.20857449 -0.067404
  -0.55670625]
 [ 0.4602481  -0.09172845  0.42637235 ...  0.7300366  -0.43264282
  -0.8195547 ]]
<NDArray 256x10 @gpu(0)>
0.16015625


# 定义训练和测试

In [121]:
import numpy as np
from mxnet import nd

def softmax(output):
    """Row-wise softmax of a 2-D score array.

    Args:
        output: array of scores; softmax is taken along axis 1.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting each row's maximum does not change the result
    # mathematically, but keeps exp() from overflowing to inf (and the
    # division from producing NaN) when the scores are large.
    shifted = output - output.max(axis=1, keepdims=True)
    exp = nd.exp(shifted)
    return exp/exp.sum(axis=1, keepdims=True)

def cross_entropy(yhat, y):
    """Negative log of the entry of ``yhat`` selected by ``y`` in each row.

    Picking by index has the same effect as multiplying by a one-hot
    encoding of ``y``.
    """
    log_probs = nd.log(yhat)
    return -nd.pick(log_probs, y)

def softmax_cross_entropy(yhat, y):
    """Apply ``softmax`` to the raw scores, then ``cross_entropy`` against ``y``."""
    probabilities = softmax(yhat)
    return cross_entropy(probabilities, y)

def SGD(params, lr):
    """One gradient-descent step: update every parameter in place.

    Args:
        params: iterable of arrays, each exposing its gradient as ``.grad``.
        lr: step size multiplied into each gradient.
    """
    for weights in params:
        step = lr * weights.grad
        # Slice assignment writes into the existing array instead of rebinding.
        weights[:] = weights - step
        
def accuracy(output, label):
    """Fraction of rows whose arg-max along axis 1 equals the label.

    Returns:
        The mean of the element-wise match indicator, as a Python scalar.
    """
    predicted = output.argmax(axis=1)
    hits = predicted == label
    return nd.mean(hits).asscalar()

def evaluate_accuracy(data_iterator, net, ctx, params):
    """Accuracy of ``net`` over everything ``data_iterator`` yields.

    Args:
        data_iterator: iterable of ``(data, label)`` batches; an
            ``mx.io.MXDataIter`` is reset before use.
        net: callable invoked as ``net(data, params)``.
        ctx: context the batches and the running count are placed on.
        params: parameters forwarded to ``net``.

    Returns:
        Number of correct predictions divided by the number of labels seen.
    """
    if isinstance(data_iterator, mx.io.MXDataIter):
        data_iterator.reset()

    correct = nd.array([0.], ctx=ctx)
    seen = 0.
    for data, label in data_iterator:
        data, label = data.as_in_context(ctx), label.as_in_context(ctx)
        predictions = net(data, params).argmax(axis=1)
        correct += nd.sum(predictions == label)
        seen += len(label)
        # wait each batch so too many operators are not pushed into the backend
        correct.wait_to_read()
    return correct.asscalar() / seen

In [143]:
from time import time
# Hyper-parameters and a fresh set of parameters for this run.
weight_scale = .1
params = init_params(weight_scale)
learning_rate = .2

# Length of the training loader; the per-batch loss/accuracy sums are divided
# by it when reporting. It does not change between epochs, so compute it once.
m = len(train_data)

for epoch in range(20):
    train_loss = 0.
    train_acc = 0.

    start = time()
    for data, label in train_data:
        label = label.as_in_context(ctx)
        data = data.as_in_context(ctx)
        with autograd.record():
            output = net(data, params)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        # Average the gradient over the batch (by scaling the learning rate)
        # so the learning rate is less sensitive to the batch size.
        SGD(params, learning_rate/batch_size)

        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, label)

    # The reported time covers both the training pass and this evaluation.
    test_acc = evaluate_accuracy(test_data, net, ctx, params)
    print(("Epoch %d. Loss: %g, Train acc %g, Test acc %g, Time %g sec" % (
        epoch, train_loss/m, train_acc/m, test_acc, time() - start)))

Epoch 0. Loss: 0.704173, Train acc 0.744891, Test acc 0.8393, Time 21.2681 sec
Epoch 1. Loss: 0.432081, Train acc 0.842991, Test acc 0.8659, Time 21.2816 sec
Epoch 2. Loss: 0.374029, Train acc 0.86301, Test acc 0.8789, Time 21.4811 sec
Epoch 3. Loss: 0.338713, Train acc 0.876219, Test acc 0.8869, Time 21.3909 sec
Epoch 4. Loss: 0.313365, Train acc 0.885167, Test acc 0.8884, Time 21.2565 sec
Epoch 5. Loss: 0.297956, Train acc 0.890758, Test acc 0.8944, Time 21.5322 sec
Epoch 6. Loss: 0.280058, Train acc 0.897706, Test acc 0.899, Time 22.2492 sec
Epoch 7. Loss: 0.268042, Train acc 0.900848, Test acc 0.8967, Time 23.305 sec
Epoch 8. Loss: 0.258463, Train acc 0.904277, Test acc 0.9009, Time 21.2718 sec
Epoch 9. Loss: 0.248514, Train acc 0.908289, Test acc 0.9027, Time 21.1091 sec
Epoch 10. Loss: 0.240115, Train acc 0.910444, Test acc 0.9065, Time 21.6842 sec
Epoch 11. Loss: 0.229878, Train acc 0.915221, Test acc 0.9039, Time 21.4453 sec
Epoch 12. Loss: 0.22257, Train acc 0.91701, Test acc 

In [139]:
# `time` is the function bound by `from time import time` in the training
# cell, so it is called directly (`time.time()` would raise AttributeError).
time()

1517658711.1765313

In [124]:
# Length of the training loader; the training cell divides its per-batch
# loss/accuracy sums by this value.
len(train_data)

235

In [131]:
# Count the test samples by summing the length of every label batch over one
# pass of the test loader.
n = 0.
for x, y in test_data:
    n += len(y)

In [132]:
# Display the total accumulated by the counting loop.
n

10000.0