# 获取数据

In [1]:
from mxnet import gluon, nd, autograd
# Passed as the dataset root directory to `FashionMNIST` below.
# NOTE(review): absolute Windows path; adjust it for the machine running this notebook.
root= 'E:/Data/MXNet/fashion-mnist'

def transform(data, label):
    """Convert one sample to `float32` and put the image channel axis first.

    The image axes are permuted with ``(2, 0, 1)`` and the values divided by
    255; the label is only cast to `float32`.

    Returns
    -------
    tuple
        ``(image, label)`` after the conversion described above.
    """
    image = data.astype('float32')
    image = nd.transpose(image, (2, 0, 1)) / 255
    target = label.astype('float32')
    return image, target
    
# Training and test splits of Fashion-MNIST; `transform` is handed to the
# dataset as its per-sample transform.
mnist_train = gluon.data.vision.FashionMNIST(root, train= True, transform= transform)
mnist_test = gluon.data.vision.FashionMNIST(root, train= False, transform= transform)

# Number of samples per mini-batch, shared by both loaders.
batch_size = 256

# Only the training loader shuffles; the test loader keeps a fixed order.
train_data = gluon.data.DataLoader(mnist_train, batch_size, shuffle= True)
test_data = gluon.data.DataLoader(mnist_test, batch_size, shuffle= False)

  from ._conv import register_converters as _register_converters
  import OpenSSL.SSL
  label = np.fromstring(fin.read(), dtype=np.uint8).astype(np.int32)
  data = np.fromstring(fin.read(), dtype=np.uint8)


In [2]:
# Look at the first mini-batch only (hence the `break`) to check its shapes.
for data, label in train_data:
    # Nothing is reshaped here: `transform` already permuted each image's axes
    # with (2, 0, 1), so a batch arrives as batch x channel x height x width.
    print('data.shape: {} \nlabel.shape: {}'.format(data.shape, label.shape))
    break

data.shape: (256, 1, 28, 28) 
label.shape: (256,)


# 定义模型

因为卷积网络的计算比全连接网络复杂，这里我们优先使用 GPU 来计算；如果 GPU 不可用，则退回到 CPU。（下面这段代码会保存在 `utils.py` 里，方便下次重复使用。）

In [5]:
import mxnet as mx
from mxnet.gluon import nn

# Prefer the GPU. Allocating a tiny array makes MXNet actually touch the
# device, so an unusable GPU fails here instead of in the middle of training.
try:
    ctx = mx.gpu()
    _ = nd.zeros((1,), ctx=ctx)
except mx.base.MXNetError:
    # Catch only MXNet's own error: a bare `except` would also swallow
    # KeyboardInterrupt and unrelated bugs and silently fall back to the CPU.
    ctx = mx.cpu()
ctx

gpu(0)

In [6]:
net = nn.Sequential()
# Dropout probabilities used after the first and the second convolution.
drop_prob1 = 0.2
drop_prob2 = 0.5

# Three conv -> max-pool stages followed by a two-layer dense classifier.
# Shape notes below assume the 1x28x28 inputs printed earlier and that the
# convolutions use no padding (TODO confirm against the Gluon defaults).
with net.name_scope():
    net.add(
        # 5x5 convolution, 20 output channels (28x28 -> 24x24 under the assumption above).
        nn.Conv2D(channels= 20, kernel_size= 5, activation= 'relu'),
        nn.Dropout(drop_prob1),
        nn.MaxPool2D(pool_size= 2, strides= 2),
        # 3x3 convolution, 50 output channels.
        nn.Conv2D(channels= 50, kernel_size= 3, activation= 'relu'),
        nn.Dropout(drop_prob2),
        nn.MaxPool2D(pool_size= 2, strides= 2),
        # 1x1 convolution: mixes channels only, no dropout after this stage.
        nn.Conv2D(channels= 50, kernel_size= 1, activation= 'relu'),
        nn.MaxPool2D(pool_size= 2, strides= 2),
        nn.Flatten(),
        nn.Dense(128, activation= 'relu'),
        # 10 raw output scores with no activation; softmax is applied by the loss.
        nn.Dense(10)
    )
    
# Bare expression so the notebook displays the layer summary.
net

Sequential(
  (0): Conv2D(None -> 20, kernel_size=(5, 5), stride=(1, 1))
  (1): Dropout(p = 0.2)
  (2): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
  (3): Conv2D(None -> 50, kernel_size=(3, 3), stride=(1, 1))
  (4): Dropout(p = 0.5)
  (5): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
  (6): Conv2D(None -> 50, kernel_size=(1, 1), stride=(1, 1))
  (7): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
  (8): Flatten
  (9): Dense(None -> 128, Activation(relu))
  (10): Dense(None -> 10, linear)
)

# 优化与评估

In [85]:
def softmax(output):
    """Softmax along axis 1, computed in a numerically stable way.

    Parameters
    ----------
    output : array of raw scores with the class scores along axis 1.

    Returns
    -------
    Array of the same shape whose entries along axis 1 are non-negative and
    sum to 1.
    """
    # Subtracting each row's maximum leaves the result mathematically unchanged
    # but keeps exp() from overflowing to inf, which would turn the ratio into nan.
    shifted = output - output.max(axis=1, keepdims=True)
    exp = nd.exp(shifted)
    return exp / exp.sum(axis=1, keepdims=True)

def cross_entropy(yhat, y):
    """Negative log of the entry of `yhat` picked by `y`.

    Gives the same result as multiplying by a one-hot encoded `y`, without
    building the one-hot array.
    """
    log_probs = nd.log(yhat)
    picked = nd.pick(log_probs, y)
    return -picked

def SGD(params, lr):
    """Apply one plain gradient-descent step to every parameter, in place.

    Parameters
    ----------
    params : iterable of arrays, each exposing its gradient as `.grad`.
    lr : step size multiplied with the gradient.
    """
    for weight in params:
        step = lr * weight.grad
        # Slice assignment writes into the existing array instead of rebinding the name.
        weight[:] = weight - step
        
def accuracy(output, label):
    """Fraction of rows whose arg-max along axis 1 equals `label`, as a scalar."""
    predicted = output.argmax(axis=1)
    hits = predicted == label
    return nd.mean(hits).asscalar()

def evaluate_accuracy(data_iterator, net, ctx):
    """Accuracy of `net` over every batch produced by `data_iterator`.

    Parameters
    ----------
    data_iterator : iterable of ``(data, label)`` batches; an
        `mx.io.MXDataIter` is reset before use.
    net : callable returning scores with the classes along axis 1.
    ctx : context that each batch and the running counter are placed on.

    Returns
    -------
    Number of correct predictions divided by the number of labels seen.
    """
    if isinstance(data_iterator, mx.io.MXDataIter):
        data_iterator.reset()

    correct = nd.array([0.], ctx=ctx)
    seen = 0.
    for batch, targets in data_iterator:
        targets = targets.as_in_context(ctx)
        batch = batch.as_in_context(ctx)
        predictions = net(batch).argmax(axis=1)
        correct += nd.sum(predictions == targets)
        seen += len(targets)
        # Block on the running total so too many operators are not queued in the backend.
        correct.wait_to_read()
    return correct.asscalar() / seen

# 训练

In [94]:
from time import time

weight_decay = 0.001    # regularization coefficient, passed to the trainer as 'wd'
lr = 0.2              # learning rate

# Initialize the parameters on the chosen device, then set up the loss and an
# SGD trainer over all of the network's parameters.
net.initialize(ctx= ctx)
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'SGD', {'learning_rate': lr, 'wd': weight_decay})

epochs = 20
for epoch in range(epochs):
    # Running sums of the per-batch mean loss and per-batch accuracy.
    train_loss = 0.
    train_acc = 0.
    # Number of batches per epoch; used to average the two sums above.
    m = len(train_data)
    
    start = time()
    for data, label in train_data:
        label = label.as_in_context(ctx)
        data = data.as_in_context(ctx)
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        # Average the gradient over the batch so that the learning rate is
        # less sensitive to the batch size.
        trainer.step(batch_size)
        
        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, label)

    # The reported time covers the training pass and this evaluation.
    test_acc = evaluate_accuracy(test_data, net, ctx)
    print(("Epoch %d. Loss: %g, Train acc %g, Test acc %g, Time %g sec" % (
            epoch, train_loss/m, train_acc/m, test_acc, time() - start)))

Epoch 0. Loss: 1.77069, Train acc 0.320296, Test acc 0.5274, Time 21.5499 sec
Epoch 1. Loss: 0.833003, Train acc 0.671094, Test acc 0.7191, Time 24.0705 sec
Epoch 2. Loss: 0.642285, Train acc 0.757064, Test acc 0.7938, Time 24.0359 sec
Epoch 3. Loss: 0.568683, Train acc 0.78212, Test acc 0.8037, Time 24.067 sec
Epoch 4. Loss: 0.533042, Train acc 0.798327, Test acc 0.8082, Time 24.2194 sec
Epoch 5. Loss: 0.50936, Train acc 0.806521, Test acc 0.823, Time 24.076 sec
Epoch 6. Loss: 0.48836, Train acc 0.81486, Test acc 0.8331, Time 24.2705 sec
Epoch 7. Loss: 0.473143, Train acc 0.82223, Test acc 0.8188, Time 24.8423 sec
Epoch 8. Loss: 0.454439, Train acc 0.828618, Test acc 0.8415, Time 23.9647 sec
Epoch 9. Loss: 0.447066, Train acc 0.83176, Test acc 0.8309, Time 24.3798 sec
Epoch 10. Loss: 0.434089, Train acc 0.836032, Test acc 0.835, Time 24.4589 sec
Epoch 11. Loss: 0.42686, Train acc 0.838752, Test acc 0.8349, Time 24.3999 sec
Epoch 12. Loss: 0.419692, Train acc 0.843323, Test acc 0.8522,

# 保存训练期间最高准确率的模型

```py
best_acc = 0
epochs = 10
for epoch in range(epochs):
    # training
    # validation
    val_acc = acc_function
    if val_acc > best_acc:
        best_acc = val_acc
        # save model
        model.save_params()
```

In [13]:
def train(train_data, test_data, net, loss, trainer, ctx, num_epochs, print_batches=None):
    """Train `net` for `num_epochs` epochs, printing metrics after each epoch.

    Parameters
    ----------
    train_data, test_data : iterables of ``(data, label)`` batches; an
        `mx.io.MXDataIter` training iterator is reset at the start of each epoch.
    net : callable model returning scores with the classes along axis 1.
    loss : callable ``loss(output, label)`` returning the per-sample loss.
    trainer : object whose ``step(batch_size)`` applies one parameter update.
    ctx : a single context, or a list/tuple of contexts of which only the
        first is used (training here runs on one device).
    num_epochs : number of passes over `train_data`.
    print_batches : accepted for interface compatibility; currently unused.
    """
    print("Start training on", ctx)
    # `as_in_context` and `evaluate_accuracy` need one context, not a list.
    if isinstance(ctx, (list, tuple)):
        ctx = ctx[0]
    for epoch in range(num_epochs):
        if isinstance(train_data, mx.io.MXDataIter):
            train_data.reset()
        train_loss = 0.
        train_acc = 0.
        # Counted while iterating so iterators without `len()` also work.
        num_batches = 0

        start = time()
        for data, label in train_data:
            label = label.as_in_context(ctx)
            data = data.as_in_context(ctx)
            with autograd.record():
                output = net(data)
                # Use the loss passed in by the caller, not a notebook global.
                batch_loss = loss(output, label)
            batch_loss.backward()
            # Normalize the gradient by the actual size of this batch (the last
            # one may be smaller), so the learning rate is less sensitive to
            # the batch size.
            trainer.step(data.shape[0])
            train_loss += nd.mean(batch_loss).asscalar()
            train_acc += accuracy(output, label)
            num_batches += 1

        # Both sums hold per-batch means, so both are averaged over the batch
        # count; max(..., 1) avoids dividing by zero on an empty iterator.
        denom = max(num_batches, 1)
        test_acc = evaluate_accuracy(test_data, net, ctx)
        print(("Epoch %d. Loss: %g, Train acc %g, Test acc %g, Time %g sec" % (
                epoch, train_loss/denom, train_acc/denom, test_acc, time() - start)))