<a href="https://colab.research.google.com/github/zeroam/jupyter-notebook/blob/master/dive_into_deep_learning/3.13.dropout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install the third-party packages imported below (mxnet and d2l).
# NOTE(review): mxnet is pinned to a dated pre-release build while d2l is
# unpinned, so a newer d2l release could change the helpers used later
# (d2l.sgd, d2l.load_data_fashion_mnist) -- consider pinning it as well.
!pip install mxnet==1.6.0b20190915
!pip install d2l

In [0]:
import d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

def dropout(X, drop_prob):
  """Apply inverted dropout to ``X``.

  Every element of ``X`` is zeroed independently with probability
  ``drop_prob``; the surviving elements are divided by ``1 - drop_prob`` so
  that the expected value of each element is unchanged.

  Args:
    X: Input array (an MXNet NDArray in this notebook).
    drop_prob: Probability of dropping an element, in ``[0, 1]``.

  Returns:
    An array with the same shape as ``X``.

  Raises:
    AssertionError: If ``drop_prob`` is outside ``[0, 1]``.
  """
  assert 0 <= drop_prob <= 1
  # Everything is dropped: return early so the rescaling below never divides
  # by zero.
  if drop_prob == 1:
    return X.zeros_like()
  # Sample the mask on the same device as X. Without ``ctx`` the mask is
  # created on the default context and ``mask * X`` fails for inputs that
  # live elsewhere (e.g. on a GPU).
  mask = nd.random.uniform(0, 1, X.shape, ctx=X.context) > drop_prob
  return mask * X / (1.0 - drop_prob)

In [3]:
X = nd.arange(16).reshape((2, 8))
# Try the two boundary probabilities (keep everything / drop everything)
# and one value in between.
for rate in (0, 0.5, 1):
  print(dropout(X, rate))


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

[[ 0.  0.  0.  0.  8. 10. 12.  0.]
 [16.  0. 20. 22.  0.  0.  0. 30.]]
<NDArray 2x8 @cpu(0)>

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>


In [0]:
# Layer widths: input features, two hidden layers, output classes
num_inputs, num_outputs = 784, 10
num_hiddens1, num_hiddens2 = 256, 256

# Weights are drawn from a zero-mean normal distribution with a small
# standard deviation
W1 = nd.random.normal(loc=0, scale=0.01, shape=(num_inputs, num_hiddens1))
W2 = nd.random.normal(loc=0, scale=0.01, shape=(num_hiddens1, num_hiddens2))
W3 = nd.random.normal(loc=0, scale=0.01, shape=(num_hiddens2, num_outputs))

# Biases start at zero
b1 = nd.zeros(num_hiddens1)
b2 = nd.zeros(num_hiddens2)
b3 = nd.zeros(num_outputs)

params = [W1, b1, W2, b2, W3, b3]
# Allocate gradient buffers so autograd can record gradients for each parameter
for param in params:
  param.attach_grad()

In [0]:
# Drop probabilities used after the first and second hidden layers
drop_prob1 = 0.2
drop_prob2 = 0.5

def net(X):
  """Forward pass of the two-hidden-layer MLP built from W1..W3 / b1..b3.

  The input is flattened to ``(-1, num_inputs)``. Dropout is applied after
  each hidden layer only while autograd is in training mode.
  """
  training = autograd.is_training()
  flat = X.reshape((-1, num_inputs))

  # First fully connected layer, followed by dropout when training
  hidden1 = (nd.dot(flat, W1) + b1).relu()
  if training:
    hidden1 = dropout(hidden1, drop_prob1)

  # Second fully connected layer, followed by dropout when training
  hidden2 = (nd.dot(hidden1, W2) + b2).relu()
  if training:
    hidden2 = dropout(hidden2, drop_prob2)

  # Output layer
  return nd.dot(hidden2, W3) + b3

In [0]:

def evaluate_accuracy(data_iter, net):
    """Return the fraction of examples in ``data_iter`` that ``net`` classifies correctly.

    Args:
        data_iter: Iterable yielding ``(features, labels)`` batches.
        net: Callable mapping a feature batch to per-class scores.

    Returns:
        Number of correct predictions divided by the number of labels seen.
    """
    correct, total = 0.0, 0
    for features, labels in data_iter:
        # Cast the labels so they can be compared with the argmax output
        labels = labels.astype('float32')
        predictions = net(features).argmax(axis=1)
        correct += (predictions == labels).sum().asscalar()
        total += labels.size
    return correct / total

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    """Train ``net`` for ``num_epochs`` epochs and print per-epoch metrics.

    Args:
        net: Callable model mapping a feature batch to per-class scores.
        train_iter: Iterable of ``(features, labels)`` training batches.
        test_iter: Iterable of ``(features, labels)`` batches passed to
            ``evaluate_accuracy`` after every epoch.
        loss: Callable returning per-example losses for ``(y_hat, y)``.
        num_epochs: Number of passes over ``train_iter``.
        batch_size: Forwarded to the update step.
        params: Parameters handed to ``d2l.sgd`` when ``trainer`` is None.
        lr: Learning rate handed to ``d2l.sgd`` when ``trainer`` is None.
        trainer: Optional object with a ``step(batch_size)`` method; when
            given, it performs the update instead of ``d2l.sgd``.
    """
    for epoch_idx in range(num_epochs):
        loss_total, correct_total, sample_count = 0.0, 0.0, 0
        for features, labels in train_iter:
            # Record the forward pass so backward() can compute gradients
            with autograd.record():
                outputs = net(features)
                batch_loss = loss(outputs, labels).sum()
            batch_loss.backward()

            # Update the parameters via the trainer if one was supplied,
            # otherwise via the d2l.sgd helper
            if trainer is not None:
                trainer.step(batch_size)
            else:
                d2l.sgd(params, lr, batch_size)

            # Accumulate the running loss and accuracy statistics
            labels = labels.astype('float32')
            loss_total += batch_loss.asscalar()
            hits = outputs.argmax(axis=1) == labels
            correct_total += hits.sum().asscalar()
            sample_count += labels.size

        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch_idx + 1, loss_total / sample_count,
                 correct_total / sample_count, test_acc))

In [7]:
# Training hyperparameters
num_epochs = 10
lr = 0.5
batch_size = 256

loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Train the from-scratch model; no trainer is passed, so train_ch3 falls
# back to d2l.sgd with the given params and lr
train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
          params=params, lr=lr)

Downloading /root/.mxnet/datasets/fashion-mnist/train-images-idx3-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-images-idx3-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/train-labels-idx1-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-labels-idx1-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/t10k-images-idx3-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/t10k-images-idx3-ubyte.gz...
Downloading /root/.mxnet/datasets/fashion-mnist/t10k-labels-idx1-ubyte.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/t10k-labels-idx1-ubyte.gz...
epoch 1, loss 1.1768, train acc 0.541, test acc 0.777
epoch 2, loss 0.5997, train acc 0.779, test acc 0.835
epoch 3, loss 0.5030, train acc 0.816, test acc 0.844
epoch 4, loss 0.4543, train acc 0.834, test acc 0.862


In [0]:
# Same architecture as the from-scratch model, built from Gluon layers
net = nn.Sequential()
# First fully connected layer, followed by a dropout layer
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dropout(drop_prob1))
# Second fully connected layer, followed by a dropout layer
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dropout(drop_prob2))
# Output layer
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

In [9]:
# Let a Gluon Trainer perform the SGD updates for the Gluon model
trainer = gluon.Trainer(params=net.collect_params(),
                        optimizer='sgd',
                        optimizer_params={'learning_rate': lr})
# params and lr keep their None defaults because the trainer does the update
train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
          trainer=trainer)

epoch 1, loss 1.2351, train acc 0.526, test acc 0.786
epoch 2, loss 0.5943, train acc 0.778, test acc 0.833
epoch 3, loss 0.5116, train acc 0.814, test acc 0.833
epoch 4, loss 0.4565, train acc 0.833, test acc 0.862
epoch 5, loss 0.4280, train acc 0.842, test acc 0.864
epoch 6, loss 0.4087, train acc 0.852, test acc 0.868
epoch 7, loss 0.3959, train acc 0.855, test acc 0.868
epoch 8, loss 0.3809, train acc 0.861, test acc 0.867
epoch 9, loss 0.3648, train acc 0.867, test acc 0.876
epoch 10, loss 0.3581, train acc 0.869, test acc 0.866
