In [1]:
#Author-Vishal Burman

# Implementation of a Multi-layer Perceptron from Scratch

In [2]:
from mxnet import nd, gluon, autograd

In [8]:
batch_size = 256


def data_load_fashion_mnist(batch_size, resize=None):
    """Build Fashion-MNIST data loaders for training and testing.

    Args:
        batch_size: Number of samples per mini-batch, used for both loaders.
        resize: Optional size; when truthy, a ``Resize`` transform is applied
            before ``ToTensor``.

    Returns:
        A ``(train_loader, test_loader)`` tuple of ``gluon.data.DataLoader``
        objects. Only the training loader shuffles its samples.
    """
    vision = gluon.data.vision
    transforms = vision.transforms

    # Assemble the per-sample transform pipeline: optional resize, then ToTensor.
    steps = []
    if resize:
        steps.append(transforms.Resize(resize))
    steps.append(transforms.ToTensor())
    pipeline = transforms.Compose(steps)

    # The transform is applied to the first element of each sample only.
    train_set = vision.FashionMNIST(train=True).transform_first(pipeline)
    test_set = vision.FashionMNIST(train=False).transform_first(pipeline)

    train_loader = gluon.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = gluon.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader

In [9]:
# Build the shuffled training loader and the unshuffled test loader.
train_iter, test_iter = data_load_fashion_mnist(batch_size=batch_size)

## Initialize Model Parameters

In [10]:
# Layer widths of the network: flattened input features, hidden units, outputs.
num_inputs = 784
num_hidden = 256
num_outputs = 10

In [11]:
# First layer (input -> hidden): weights drawn from a normal distribution with
# scale 0.01, bias initialised to zero.
W1 = nd.random.normal(shape=(num_inputs, num_hidden), scale=0.01)
b1 = nd.zeros(shape=num_hidden)

# Second layer (hidden -> output): same initialisation scheme.
W2 = nd.random.normal(shape=(num_hidden, num_outputs), scale=0.01)
b2 = nd.zeros(shape=num_outputs)

# Every trainable array, in the order the optimiser walks over them.
params = [W1, b1, W2, b2]

# Attach a gradient to each parameter so that `param.grad` is available
# to the optimiser after a backward pass.
for param in params:
    param.attach_grad()

## Activation Function

In [12]:
# We will use the maximum function directly to see how ReLU works

In [13]:
def relu(x):
    """Rectified linear unit: the element-wise maximum of ``x`` and 0."""
    rectified = nd.maximum(x, 0)
    return rectified

## The model

In [15]:
def net(X):
    """Forward pass of the two-layer perceptron.

    The input is reshaped to rows of ``num_inputs`` values, passed through
    the hidden layer (``W1``, ``b1``) with a ReLU activation, and then through
    the output layer (``W2``, ``b2``). No softmax is applied here; the raw
    output-layer values are returned.
    """
    flat = X.reshape((-1, num_inputs))
    hidden_pre_activation = nd.dot(flat, W1) + b1
    hidden = relu(hidden_pre_activation)
    scores = nd.dot(hidden, W2) + b2
    return scores

## The Loss Function

In [16]:
# Loss object, called as loss(y_hat, y) in the training loop. `net` applies no
# softmax itself, so normalisation of its raw outputs is left to this loss
# (Gluon's documented default is from_logits=False -- confirm against the
# installed MXNet version).
loss=gluon.loss.SoftmaxCrossEntropyLoss()

## Optimization Algorithm

In [21]:
def sgd(params, lr, batch_size):
    """Apply one mini-batch stochastic gradient descent step in place.

    Args:
        params: Iterable of arrays, each exposing its gradient as ``.grad``.
        lr: Learning rate multiplying the gradient.
        batch_size: Divisor applied to the scaled gradient.

    Returns:
        None. Each array in ``params`` is overwritten with its updated value.
    """
    for p in params:
        step = lr*p.grad/batch_size
        # Slice assignment writes into the existing array instead of rebinding
        # the loop variable, so the caller's parameter objects are updated.
        p[:] = p - step

## Training

In [17]:
import mxnet as mx

In [18]:
def evaluate_accuracy(data_iterator, net):
    """Compute the accuracy of ``net`` over every batch of ``data_iterator``.

    Args:
        data_iterator: Iterable yielding ``(data, label)`` batches.
        net: Callable mapping a data batch to per-class scores.

    Returns:
        The value component of ``mx.metric.Accuracy().get()`` after all
        batches have been accumulated.
    """
    metric = mx.metric.Accuracy()
    for data, label in data_iterator:
        scores = net(data)
        # Predicted class = index of the largest score along axis 1 (per row).
        predicted = nd.argmax(scores, axis=1)
        metric.update(preds=predicted, labels=label)
    name_and_value = metric.get()
    return name_and_value[1]

In [24]:
# Training hyper-parameters: number of passes over the data and learning rate.
num_epochs = 10
lr = 0.5


def updater(batch_size):
    """Run one SGD step on the global ``params`` using the global ``lr``."""
    return sgd(params, lr, batch_size)

In [19]:
# The Training loop

In [27]:
# Train for `num_epochs` passes over `train_iter`, reporting the mean training
# loss plus train/test accuracy after every epoch.
for epoch in range(1, num_epochs+1):
    cumulative_loss=0
    # Count the samples actually processed instead of assuming a fixed dataset
    # size, so the reported mean stays correct for any dataset or subset.
    num_samples=0
    for X, y in train_iter:
        # Record the forward pass so autograd can differentiate the loss.
        with autograd.record():
            y_hat=net(X)
            l=loss(y_hat, y)
        l.backward()
        # The last batch may be smaller than `batch_size`, so pass the real
        # batch length; `sgd` divides the gradient by this value.
        updater(X.shape[0])
        cumulative_loss+=nd.sum(l).asscalar()
        num_samples+=X.shape[0]
    test_accuracy=evaluate_accuracy(test_iter, net)
    train_accuracy=evaluate_accuracy(train_iter, net)
    # Guard against an empty iterator to avoid a division by zero.
    mean_loss=cumulative_loss/num_samples if num_samples else 0.0
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (epoch, mean_loss, train_accuracy, test_accuracy))

Epoch 1. Loss: 0.7927206073125204, Train_acc 0.7898833333333334, Test_acc 0.7879
Epoch 2. Loss: 0.4943413875579834, Train_acc 0.7704, Test_acc 0.7736
Epoch 3. Loss: 0.4252540446917216, Train_acc 0.8489833333333333, Test_acc 0.8459
Epoch 4. Loss: 0.3935036236445109, Train_acc 0.8423166666666667, Test_acc 0.8389
Epoch 5. Loss: 0.3705659663836161, Train_acc 0.86105, Test_acc 0.858
Epoch 6. Loss: 0.3490981467247009, Train_acc 0.8602166666666666, Test_acc 0.8515
Epoch 7. Loss: 0.33836647119522095, Train_acc 0.8621666666666666, Test_acc 0.8556
Epoch 8. Loss: 0.3278519490559896, Train_acc 0.8584, Test_acc 0.8492
Epoch 9. Loss: 0.31651445353825886, Train_acc 0.86685, Test_acc 0.8587
Epoch 10. Loss: 0.3083568675041199, Train_acc 0.8804, Test_acc 0.8684
