In [1]:
%matplotlib inline 
from matplotlib import pyplot as plt

from mxnet import nd, autograd, gluon, init

In [5]:
def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Each element is zeroed independently with probability ``drop_prob``;
    the surviving elements are divided by ``1 - drop_prob``.

    Args:
        X: input array; must provide ``shape`` and ``zeros_like()``.
        drop_prob: probability of dropping an element, in ``[0, 1]``.

    Returns:
        An array shaped like ``X``. When ``drop_prob`` is 1 an all-zero
        array is returned directly.
    """
    assert 0 <= drop_prob <= 1
    retain = 1 - drop_prob
    # Dropping everything: return zeros early so we never divide by zero.
    if retain == 0:
        return X.zeros_like()
    noise = nd.random.uniform(0, 1, X.shape)
    # Entries whose noise sample falls below `retain` are kept.
    kept = noise < retain
    return kept * X / retain

# Sanity-check dropout on a small 2x8 array: keep everything, drop about
# half, and drop everything.
X = nd.arange(16).reshape((2, 8))
for p in (0, 0.5, 1):
    print(dropout(X, p))


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

[[ 0.  0.  4.  0.  0. 10. 12.  0.]
 [ 0.  0.  0. 22. 24. 26. 28.  0.]]
<NDArray 2x8 @cpu(0)>

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>


In [11]:
# Layer widths: flattened 28*28 input, two hidden layers, 10 outputs.
num_inputs, num_outputs = 28*28, 10
num_hiddens1, num_hiddens2 = 256, 256

# Weights are drawn via nd.random.normal(0, 0.01, shape), in the order
# W1, W2, W3; biases start at zero.
W1, W2, W3 = (
    nd.random.normal(0, 0.01, shape)
    for shape in (
        (num_inputs, num_hiddens1),
        (num_hiddens1, num_hiddens2),
        (num_hiddens2, num_outputs),
    )
)
b1, b2, b3 = (
    nd.zeros(width) for width in (num_hiddens1, num_hiddens2, num_outputs)
)

# Every parameter needs gradient storage attached before training.
params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

# Dropout probabilities applied after the first and second hidden layer.
drop_prob1, drop_prob2 = 0.2, 0.5

def net(X):
    """Forward pass of the hand-written MLP with two hidden layers.

    ``X`` is reshaped to ``(-1, num_inputs)``, passed through two
    ReLU-activated dense layers (``W1``/``b1`` and ``W2``/``b2``) and a final
    linear layer (``W3``/``b3``). Dropout with ``drop_prob1`` / ``drop_prob2``
    follows each hidden layer, but only while ``autograd.is_training()`` is
    true.

    Returns:
        The output of the final linear layer (no softmax is applied here).
    """
    training = autograd.is_training()
    hidden = X.reshape((-1, num_inputs))
    hidden_layers = ((W1, b1, drop_prob1), (W2, b2, drop_prob2))
    for weight, bias, drop_prob in hidden_layers:
        hidden = (nd.dot(hidden, weight) + bias).relu()
        if training:
            hidden = dropout(hidden, drop_prob)
    return nd.dot(hidden, W3) + b3

def load_data_fashion_mnist(batch_size, num_workers=4):
    """Build Fashion-MNIST train and test data loaders.

    Prints the sizes of both splits and the shapes of the first training
    example (feature and label) as a quick sanity check.

    Args:
        batch_size: number of examples per mini-batch for both loaders.
        num_workers: number of worker processes each ``DataLoader`` uses
            (defaults to 4, the value that used to be hard-coded).

    Returns:
        A ``(train_iter, test_iter)`` tuple of ``gluon.data.DataLoader``
        objects. Both apply ``ToTensor`` to the image; only the training
        loader is shuffled.
    """
    mnist_train = gluon.data.vision.FashionMNIST(train=True)
    mnist_test = gluon.data.vision.FashionMNIST(train=False)
    print(len(mnist_train), len(mnist_test))
    # Index the first example once and reuse it for the shape report.
    feature, label = mnist_train[0]
    print(feature.shape, label.shape)

    transformer = gluon.data.vision.transforms.ToTensor()

    train_iter = gluon.data.DataLoader(
        mnist_train.transform_first(transformer),
        batch_size,
        shuffle=True,
        num_workers=num_workers
    )
    # Evaluation does not depend on example order, so the test split is not
    # shuffled; this keeps evaluation passes deterministic.
    test_iter = gluon.data.DataLoader(
        mnist_test.transform_first(transformer),
        batch_size,
        shuffle=False,
        num_workers=num_workers
    )
    return train_iter, test_iter

def sgd(params, lr, batch_size):
    """Perform one mini-batch SGD step on every parameter, in place.

    Args:
        params: iterable of arrays, each exposing its gradient as ``.grad``.
        lr: learning rate.
        batch_size: divisor applied to the gradient (the caller in this
            notebook back-propagates a loss summed over the batch).
    """
    for param in params:
        step = lr * param.grad / batch_size
        # Slice assignment writes into the existing array instead of
        # rebinding the local name, so the caller's parameter is updated.
        param[:] = param - step

def evaluate_accuracy(data_iter, net):
    """Return the fraction of examples in ``data_iter`` that ``net`` classifies correctly.

    Args:
        data_iter: iterable yielding ``(X, y)`` batches.
        net: callable mapping a batch ``X`` to per-class scores; the
            prediction is the argmax along axis 1.

    Returns:
        Number of correct predictions divided by the number of labels seen.
    """
    correct, seen = 0.0, 0
    for features, labels in data_iter:
        # Labels are cast to float32 before the comparison (presumably to
        # match the dtype of the argmax output -- confirm against MXNet docs).
        labels = labels.astype("float32")
        predicted = net(features).argmax(axis=1)
        hits = (predicted == labels).sum()
        correct += hits.asscalar()
        seen += labels.size
    return correct / seen


def train(net, train_iter, test_iter, loss, num_epochs, batch_size, params=None, lr=None, trainer=None):
    """Train ``net`` for ``num_epochs`` epochs, printing metrics after each one.

    Parameters are updated by ``trainer.step(batch_size)`` when ``trainer``
    is given; otherwise by the hand-written ``sgd(params, lr, batch_size)``.

    Args:
        net: callable mapping a batch ``X`` to per-class scores.
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable of ``(X, y)`` batches passed to
            ``evaluate_accuracy`` at the end of every epoch.
        loss: callable ``loss(y_hat, y)``; its result is summed over the batch.
        num_epochs: number of passes over ``train_iter``.
        batch_size: value forwarded to ``sgd`` / ``trainer.step``.
        params: parameters for the manual ``sgd`` path (needed when
            ``trainer`` is None).
        lr: learning rate for the manual ``sgd`` path (needed when
            ``trainer`` is None).
        trainer: optional object with a ``step(batch_size)`` method.

    Raises:
        ValueError: if ``trainer`` is None and ``params`` or ``lr`` is missing.
    """
    # Fail fast on a misconfigured call. Without this check the error only
    # surfaces inside sgd(), after the first forward/backward pass has
    # already run, as an opaque TypeError.
    if trainer is None and (params is None or lr is None):
        raise ValueError(
            'train() requires either `trainer` or both `params` and `lr`')
    for ep in range(1, num_epochs+1):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            # The forward pass is recorded so that backward() can compute
            # gradients of the summed batch loss.
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            # Training accuracy reuses y_hat from the recorded forward pass
            # rather than running a separate evaluation pass.
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch {}, loss {}, train acc {}, test acc {}'.format(ep, train_l_sum / n, train_acc_sum / n, test_acc))

# Hyperparameters shared by both training runs in this notebook.
num_epochs, lr, batch_size = 5, 0.5, 256

loss = gluon.loss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = load_data_fashion_mnist(batch_size)

# From-scratch model: no trainer is passed, so train() uses the manual sgd().
train(net, train_iter, test_iter, loss, num_epochs, batch_size, params=params, lr=lr)

60000 10000
(28, 28, 1) ()
epoch 1, loss 1.1891664789835612, train acc 0.5396833333333333, test acc 0.7837
epoch 2, loss 0.5828975231806437, train acc 0.7824166666666666, test acc 0.8358
epoch 3, loss 0.4904190071105957, train acc 0.8215333333333333, test acc 0.8519
epoch 4, loss 0.45213590927124025, train acc 0.8354333333333334, test acc 0.8571
epoch 5, loss 0.420484969774882, train acc 0.84715, test acc 0.8608


In [12]:
# Same architecture built from Gluon layers.
# NOTE: this rebinds `net`, shadowing the hand-written net() function
# defined earlier in the notebook.
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(256, activation='relu'))
net.add(gluon.nn.Dropout(drop_prob1))
net.add(gluon.nn.Dense(256, activation='relu'))
net.add(gluon.nn.Dropout(drop_prob2))
net.add(gluon.nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# Updates are delegated to a Gluon Trainer, so params and lr are not passed
# to train().
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train(net, train_iter, test_iter, loss, num_epochs, batch_size, trainer=trainer)

epoch 1, loss 1.1688278127034506, train acc 0.55075, test acc 0.7641
epoch 2, loss 0.5909847119649251, train acc 0.7771833333333333, test acc 0.8248
epoch 3, loss 0.5018488661766052, train acc 0.8161833333333334, test acc 0.8521
epoch 4, loss 0.44974132232666014, train acc 0.8361166666666666, test acc 0.8596
epoch 5, loss 0.42387184727986654, train acc 0.84695, test acc 0.8616
