In [1]:
# Author-Vishal Burman

## Implementation of Dropout from Scratch

In [2]:
%matplotlib inline
import d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import nn

In [3]:
# To implement the dropout function we draw one sample per element from a uniform distribution on [0, 1)
# We keep the nodes whose sample is greater than drop_prob and drop (set to 0) all the others

In [4]:
def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Every element of ``X`` is zeroed independently with probability
    ``drop_prob``; the surviving elements are divided by ``1 - drop_prob``.

    Parameters
    ----------
    X : NDArray
        Activations to apply dropout to.
    drop_prob : float
        Probability of dropping an element; must lie in ``[0, 1]``.

    Returns
    -------
    NDArray
        Array with the same shape as ``X``.

    Raises
    ------
    AssertionError
        If ``drop_prob`` is outside ``[0, 1]``.
    """
    assert 0 <= drop_prob <= 1

    # If drop_prob is 1 all elements are dropped; returning early also
    # avoids the division by zero in the rescaling below.
    if drop_prob == 1:
        return X.zeros_like()

    # Sample the mask on the same context (device) as X so that the
    # element-wise product below does not mix arrays from different contexts.
    # The comparison yields 1 for kept elements and 0 for dropped ones.
    mask = nd.random.uniform(0, 1, X.shape, ctx=X.context) > drop_prob

    # Return the debiased survivors.
    return mask * X / (1 - drop_prob)

In [5]:
# We can test out our dropout function
X = nd.arange(16).reshape((2, 8))
# Keep everything, drop about half, then drop everything.
for prob in (0, 0.5, 1):
    print(dropout(X, prob))


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

[[ 0.  0.  0.  0.  8. 10. 12.  0.]
 [16.  0. 20. 22.  0.  0.  0. 30.]]
<NDArray 2x8 @cpu(0)>

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>


## Defining Model Parameters

In [13]:
# We have our model net with two hidden layers
# We call attach_grad on every parameter so that autograd can store its gradient

In [6]:
# Layer sizes: flattened input, number of output classes, two hidden layers.
num_inputs = 784
num_outputs = 10
num_hidden1 = 256
num_hidden2 = 256

# Weights are drawn from a zero-mean normal with a small scale; biases start at zero.
# First hidden layer
W1 = nd.random.normal(shape=(num_inputs, num_hidden1), scale=0.01)
b1 = nd.zeros(num_hidden1)

# Second hidden layer
W2 = nd.random.normal(shape=(num_hidden1, num_hidden2), scale=0.01)
b2 = nd.zeros(num_hidden2)

# Output layer
W3 = nd.random.normal(shape=(num_hidden2, num_outputs), scale=0.01)
b3 = nd.zeros(num_outputs)

# Attach a gradient buffer to every trainable array.
params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

## Define the Model

In [7]:
# Dropout probabilities applied after the first and second hidden layers.
drop_prob1, drop_prob2 = 0.2, 0.5
# Alias for the original (misspelled) name, kept so existing references still resolve.
drop_prop2 = drop_prob2


def net(X):
    """Forward pass of the two-hidden-layer perceptron with dropout.

    ``X`` is reshaped to ``(-1, num_inputs)`` and passed through two
    ReLU-activated fully connected layers followed by a linear output layer.
    Dropout is applied after each hidden layer only while
    ``autograd.is_training()`` is true.

    Parameters
    ----------
    X : NDArray
        Input batch; must be reshapeable to ``(-1, num_inputs)``.

    Returns
    -------
    NDArray
        Output of the final linear layer (``num_outputs`` columns).
    """
    X = X.reshape((-1, num_inputs))
    H1 = (nd.dot(X, W1) + b1).relu()

    # Use dropout only when training the model
    if autograd.is_training():
        # Dropout layer just after the first fully connected layer
        H1 = dropout(H1, drop_prob1)

    H2 = (nd.dot(H1, W2) + b2).relu()
    if autograd.is_training():
        # Dropout layer after the second fully connected layer
        H2 = dropout(H2, drop_prob2)

    return nd.dot(H2, W3) + b3

## Training and Testing

In [14]:
# Defining the parameters of training

In [8]:
# Hyper-parameters of the training run.
num_epochs = 10
lr = 0.5
batch_size = 256

# Loss function and data iterators.
loss = gluon.loss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)


def updater(batch_size):
    """Run one SGD step on the global ``params`` with learning rate ``lr``."""
    return sgd(params, lr, batch_size)

In [9]:
# Writing the Training loop

In [15]:
# Our optimization algorithm

In [10]:
def sgd(param, lr, batch_size):
    """Perform one minibatch stochastic gradient descent step in place.

    Parameters
    ----------
    param : iterable
        The parameters to update. Each item must support slice assignment
        and expose its gradient as ``.grad``. (The singular name is kept for
        backward compatibility; it is a collection of parameters.)
    lr : float
        Learning rate.
    batch_size : int
        Number of examples the gradients were accumulated over; the
        gradient is divided by it before the update.
    """
    # Iterate over the argument itself rather than a global list, so the
    # function updates exactly the parameters it was given.
    for p in param:
        # Slice assignment writes the new values into the existing array.
        p[:] = p - lr * p.grad / batch_size

In [11]:
import mxnet as mx
def evaluate_accuracy(data_iter, net):
    """Return the accuracy of ``net`` over all batches of ``data_iter``.

    For every ``(data, label)`` batch the predicted class is the row-wise
    argmax of ``net(data)``; predictions are accumulated in an
    ``mx.metric.Accuracy`` metric whose final value is returned.
    """
    metric = mx.metric.Accuracy()
    for data, label in data_iter:
        # Row-wise argmax picks the highest-scoring class per example.
        predicted = net(data).argmax(axis=1)
        metric.update(labels=label, preds=predicted)
    # get() returns (name, value); only the value is of interest.
    _, accuracy = metric.get()
    return accuracy

In [12]:
# Train for num_epochs epochs, reporting the mean training loss and the
# train/test accuracy after each epoch.
for epoch in range(1, num_epochs + 1):
    cumulative_loss = 0
    # Count the examples actually seen instead of hard-coding the dataset size.
    num_samples = 0
    for X, y in train_iter:
        # Record the forward pass so that backward() can compute gradients.
        with autograd.record():
            l = loss(net(X), y)
        l.backward()
        # The gradients are summed over the batch; the updater divides by its size.
        updater(X.shape[0])
        cumulative_loss += nd.sum(l).asscalar()
        num_samples += X.shape[0]
    test_acc = evaluate_accuracy(test_iter, net)
    train_acc = evaluate_accuracy(train_iter, net)
    # max(..., 1) avoids a division by zero if the iterator yielded no batches.
    mean_loss = cumulative_loss / max(num_samples, 1)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (epoch, mean_loss, train_acc, test_acc))

Epoch 1. Loss: 1.124804504966736, Train_acc 0.7626166666666667, Test_acc 0.7601
Epoch 2. Loss: 0.5752545382817587, Train_acc 0.83, Test_acc 0.8264
Epoch 3. Loss: 0.4870275349299113, Train_acc 0.8236333333333333, Test_acc 0.8205
Epoch 4. Loss: 0.44764681158065794, Train_acc 0.8426333333333333, Test_acc 0.8412
Epoch 5. Loss: 0.4183112699508667, Train_acc 0.8388666666666666, Test_acc 0.8388
Epoch 6. Loss: 0.40122329552968344, Train_acc 0.7424166666666666, Test_acc 0.7396
Epoch 7. Loss: 0.4786258300145467, Train_acc 0.8665833333333334, Test_acc 0.8612
Epoch 8. Loss: 0.3842657153447469, Train_acc 0.84985, Test_acc 0.8423
Epoch 9. Loss: 0.36690672982533773, Train_acc 0.83245, Test_acc 0.8226
Epoch 10. Loss: 0.3573777543385824, Train_acc 0.8566666666666667, Test_acc 0.8472
