# 数据处理

In [3]:
import mxnet as mx
from mxnet import gluon, nd
from mxnet.gluon.nn import Sequential, Dense, Flatten

root= 'E:/Data/MXNet/mnist'

def transform(data, label):
    '''
    将数据转换为 类型为：`float32`
    '''
    return data.astype('float32')/255, label.astype('float32')

mnist_train = gluon.data.vision.MNIST(root= root, train=True, transform= transform)
mnist_test = gluon.data.vision.MNIST(root= root, train=False, transform= transform)
batch_size = 256

  label = np.fromstring(fin.read(), dtype=np.uint8).astype(np.int32)
  data = np.fromstring(fin.read(), dtype=np.uint8)


## 普通方法

In [4]:
import numpy as np
def data_iter(dataset, batch_size, shuffle):
        data = dataset[:]
        x, y = data
        n =len(x)
        x = nd.array(x)
        y = nd.array(y)
        idx = np.arange(n)
        if shuffle:
            np.random.shuffle(idx)

        for i in range(0, n, batch_size):
            j = nd.array(idx[i: min(i + batch_size, n)])
            yield nd.take(x, j), nd.take(y, j)
train_data = data_iter(mnist_train, batch_size, True)
test_data = data_iter(mnist_test, batch_size, False)

for data, label in train_data:
    # 存储格式为：`channel_first`
    data = nd.transpose(data, (0, 3, 1, 2))
    print(data.shape, label.shape, label[0])
    break

(256, 1, 28, 28) (256,) 
[8.]
<NDArray 1 @cpu(0)>


## Gluon 方法

In [5]:
batch_size = 256
train_data = gluon.data.DataLoader(mnist_train, batch_size, shuffle= True)
test_data = gluon.data.DataLoader(mnist_test, batch_size, shuffle= False)

In [6]:
for x, y in train_data:
    x = nd.transpose(x, (0, 3, 1, 2))
    print(x.shape, y.shape)
    break

(256, 1, 28, 28) (256,)


In [7]:
for x, y in train_data:
    x = nd.transpose(x, (0, 3, 1, 2))
    in_nuts = x.shape[1]
    out_nuts = 10
    w = nd.random_normal(shape= (out_nuts, in_nuts, 2, 2))
    b = nd.zeros(out_nuts)
    out = nd.Convolution(x, w, b, kernel= w.shape[2:], num_filter= out_nuts, pad= (2, 2), stride= (2, 2))
    max_pool = nd.Pooling(out, kernel= (2, 2), pool_type= 'max')
    print(out.shape)
    print(max_pool.shape)
    break

(256, 10, 16, 16)
(256, 10, 15, 15)


# 模型构造

In [118]:
weight_scale = .01
ctx = mx.gpu(0)

# output channels = 20, kernel = (5,5)
W1 = nd.random_normal(shape=(20, 1, 5, 5), scale= weight_scale, ctx= ctx)
b1 = nd.zeros(W1.shape[0], ctx= ctx)

# output channels = 50, kernel = (3,3)
W2 = nd.random_normal(shape=(50, 20, 3, 3), scale=weight_scale, ctx= ctx)
b2 = nd.zeros(W2.shape[0], ctx= ctx)

# output dim = 128
W3 = nd.random_normal(shape=(1250, 128), scale=weight_scale, ctx= ctx)
b3 = nd.zeros(W3.shape[1], ctx= ctx)

# output dim = 10
W4 = nd.random_normal(shape=(W3.shape[1], 10), scale=weight_scale, ctx= ctx)
b4 = nd.zeros(W4.shape[1], ctx= ctx)

params = [W1, b1, W2, b2, W3, b3, W4, b4]
for param in params:
    param.attach_grad()

In [122]:
def net(X, verbose= False): 
    X = nd.transpose(X.as_in_context(W1.context), (0, 3, 1, 2))
    # 第一层卷积
    h1_conv = nd.Convolution(
        data=X, weight=W1, bias=b1, kernel=W1.shape[2:], num_filter=W1.shape[0])
    h1_activation = nd.relu(h1_conv)
    h1 = nd.Pooling(
        data=h1_activation, pool_type="max", kernel=(2,2), stride=(2,2))
    
    # 第二层卷积
    h2_conv = nd.Convolution(
        data=h1, weight=W2, bias=b2, kernel=W2.shape[2:], num_filter=W2.shape[0])
    h2_activation = nd.relu(h2_conv)
    h2 = nd.Pooling(data=h2_activation, pool_type="max", kernel=(2,2), stride=(2,2))
    h2 = nd.flatten(h2)
    
    # 第一层全连接
    h3_linear = nd.dot(h2, W3) + b3
    h3 = nd.relu(h3_linear)
    
    # 第二层全连接
    h4_linear = nd.dot(h3, W4) + b4
    if verbose:
        print('1st conv block:', h1.shape)
        print('2nd conv block:', h2.shape)
        print('1st dense:', h3.shape)
        print('2nd dense:', h4_linear.shape)
        print('output:', h4_linear)
    return h4_linear

In [123]:
for data, label in train_data:
    label = label.as_in_context(ctx)
    with mx.autograd.record():
        y = net(data, True)
    y.backward()
    print(y.shape)
    acc = accuracy(y, label)
    print(acc)
    break

1st conv block: (256, 20, 12, 12)
2nd conv block: (256, 1250)
1st dense: (256, 128)
2nd dense: (256, 10)
output: 
[[ 6.3262181e-05  2.6563872e-05 -4.8889837e-05 ... -3.9635808e-05
  -5.5221422e-05  3.9352904e-05]
 [ 7.0209302e-05  4.5664907e-05 -6.5646003e-05 ... -1.1263900e-04
  -1.0046176e-04 -7.3953888e-06]
 [-9.4907518e-06  2.7982145e-05 -3.5382713e-05 ... -2.9499155e-05
  -3.6428239e-05  3.7235630e-05]
 ...
 [ 2.6450361e-06  7.4155934e-05 -4.5578709e-05 ... -7.9264239e-05
  -5.8000955e-05  1.4313950e-05]
 [ 4.7803624e-05  7.2226292e-05 -6.0026072e-05 ... -8.4549807e-05
  -7.6076132e-05  1.4584111e-06]
 [ 7.0422859e-05  7.3348070e-05 -1.2362945e-04 ... -7.3199932e-05
  -9.8516175e-05 -9.7410411e-06]]
<NDArray 256x10 @gpu(0)>
(256, 10)
0.12109375


## Gluon 方法

In [10]:
from mxnet.gluon.nn import Sequential, Dense, Flatten, Dropout, Conv2D, MaxPool2D, AvgPool2D

In [9]:
net = Sequential()

In [14]:
with net.name_scope():
    net.add(
        Conv2D(channels= 20, kernel_size= 5, activation= 'relu'),
        MaxPool2D(pool_size= 2, strides= 2),
        Conv2D(channels= 50, kernel_size= 3, activation='relu'),
        MaxPool2D(pool_size= 2, strides= 2),
        Flatten(),
        Dense(128, activation="relu"),
        Dense(10)
    )

In [15]:
def try_gpu():
    try:
        ctx = mx.gpu()
        _ = nd.zeros((1,), ctx=ctx)
    except:
        ctx = mx.cpu()

# 初始化
ctx = try_gpu()
net.initialize(ctx= ctx)
print('initialize weight on', ctx)

initialize weight on None


In [18]:
net

Sequential(
  (0): Conv2D(None -> 20, kernel_size=(5, 5), stride=(1, 1))
  (1): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
  (2): Conv2D(None -> 50, kernel_size=(3, 3), stride=(1, 1))
  (3): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
  (4): Flatten
  (5): Dense(None -> 128, Activation(relu))
  (6): Dense(None -> 10, linear)
)

# 优化

In [20]:
weight_scale = .01
ctx = mx.gpu(0)

# output channels = 20, kernel = (5,5)
W1 = nd.random_normal(shape=(20, 1, 5, 5), scale= weight_scale, ctx= ctx)
b1 = nd.zeros(W1.shape[0], ctx= ctx)

# output channels = 50, kernel = (3,3)
W2 = nd.random_normal(shape=(50, 20, 3, 3), scale=weight_scale, ctx= ctx)
b2 = nd.zeros(W2.shape[0], ctx= ctx)

# output dim = 128
W3 = nd.random_normal(shape=(1250, 128), scale=weight_scale, ctx= ctx)
b3 = nd.zeros(W3.shape[1], ctx= ctx)

# output dim = 10
W4 = nd.random_normal(shape=(W3.shape[1], 10), scale=weight_scale, ctx= ctx)
b4 = nd.zeros(W4.shape[1], ctx= ctx)

params = [W1, b1, W2, b2, W3, b3, W4, b4]
for param in params:
    param.attach_grad()

In [21]:
def SGD(params, lr):
    for param in params:
        param -= lr * param.grad
        
def accuracy(output, label):
    return nd.mean(output.argmax(axis= 1)==label).asscalar()

def _get_batch(batch, ctx):
    """return data and label on ctx"""
    if isinstance(batch, mx.io.DataBatch):
        data = batch.data[0]
        label = batch.label[0]
    else:
        data, label = batch
    return (gluon.utils.split_and_load(data, ctx),
            gluon.utils.split_and_load(label, ctx),
            data.shape[0])

def evaluate_accuracy(data_iterator, net, ctx=[mx.cpu()]):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    acc = nd.array([0])
    n = 0.
    if isinstance(data_iterator, mx.io.MXDataIter):
        data_iterator.reset()
    for batch in data_iterator:
        data, label, batch_size = _get_batch(batch, ctx)
        for X, y in zip(data, label):
            acc += nd.sum(net(X).argmax(axis=1)==y).copyto(mx.cpu())
            n += y.size
        acc.wait_to_read() # don't push too many operators into backend
    return acc.asscalar() / n