## Concise Implementation of Softmax Regression 

In [1]:
from mxnet import gluon, init

In [2]:
from mxnet.gluon import nn

In [3]:
from mxnet import autograd

In [None]:
# We stick with the Fashion-MNIST dataset and use a batch size of 256 for the data iterators

In [4]:
# Number of examples per mini-batch.
batch_size = 256

In [5]:
def load_data_fashion_mnist(batch_size, resize=None):
    """Build the Fashion-MNIST training and test data loaders.

    Parameters
    ----------
    batch_size : int
        Number of examples per mini-batch, passed to both loaders.
    resize : optional
        When truthy, a ``Resize(resize)`` transform is applied before
        ``ToTensor``; when falsy (the default) no resizing is done.

    Returns
    -------
    tuple
        ``(train_loader, test_loader)``. The training loader shuffles its
        examples; the test loader does not.
    """
    vision = gluon.data.vision

    # Assemble the per-image transform pipeline: optional resize, then ToTensor.
    steps = []
    if resize:
        steps.append(vision.transforms.Resize(resize))
    steps.append(vision.transforms.ToTensor())
    pipeline = vision.transforms.Compose(steps)

    # transform_first applies the pipeline to the first element of each sample
    # (presumably the image, leaving the label untouched).
    train_set = vision.FashionMNIST(train=True).transform_first(pipeline)
    test_set = vision.FashionMNIST(train=False).transform_first(pipeline)

    train_loader = gluon.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = gluon.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader

In [6]:
# Mini-batch iterators over the training and test splits.
train_iter, test_iter = load_data_fashion_mnist(batch_size=batch_size)

## Initialize Model Parameters

In [13]:
# We initialize the model weights from a Gaussian distribution with zero mean and standard deviation 0.01
# The output layer consists of 10 units, one per class

In [7]:
# A single fully-connected layer with 10 outputs.
net = nn.Sequential()
net.add(nn.Dense(units=10))
# Draw the initial parameters from a normal distribution with sigma = 0.01.
net.initialize(init.Normal(sigma=0.01))

## The Softmax

In [14]:
# Combining the softmax and cross-entropy operators into one loss avoids numerical instability during the forward pass and backpropagation
# log(y_hat_j) = log(e^{z_j} / sum_i e^{z_i})
#              = log(e^{z_j}) - log(sum_i e^{z_i})
#              = z_j - log(sum_i e^{z_i})

In [8]:
# Softmax and cross-entropy fused into a single loss object.
loss = gluon.loss.SoftmaxCrossEntropyLoss()

## Optimization Algorithm

In [15]:
# We use mini-batch stochastic gradient descent with a learning rate of 0.1
# This is the same optimizer we used for linear regression, which shows its general applicability

In [9]:
# SGD over all parameters of the network, learning rate 0.1.
trainer = gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.1},
)

## Training

In [10]:
import mxnet as mx
from mxnet import nd

In [None]:
# Here we use the Accuracy metric from MXNet's metric module
# acc.get() returns a (name, value) pair; we return the value, i.e. the accuracy

In [11]:
def evaluate_acc(data_iterator, net):
    """Measure the classification accuracy of ``net`` over ``data_iterator``.

    Parameters
    ----------
    data_iterator : iterable
        Yields ``(data, label)`` pairs, one per mini-batch.
    net : callable
        Model applied to each ``data`` batch; its output is reduced with
        ``argmax`` along axis 1 to obtain the predicted class.

    Returns
    -------
    The value part of ``mx.metric.Accuracy().get()`` after every batch
    has been fed to the metric.
    """
    metric = mx.metric.Accuracy()
    for features, targets in data_iterator:
        scores = net(features)
        # Predicted class = index of the largest score in each row.
        predicted = nd.argmax(scores, axis=1)
        metric.update(preds=predicted, labels=targets)
    # get() yields (name, value); only the value is of interest here.
    _, accuracy = metric.get()
    return accuracy

In [16]:
# The training loop
# We run for 10 epochs

In [12]:
# Train for a fixed number of epochs, reporting the mean training loss and
# the train/test accuracy after each one.
num_epochs = 10
for epoch in range(1, num_epochs + 1):
    cumulative_loss = 0
    num_examples = 0  # examples actually seen this epoch
    for X, y in train_iter:
        # Record the forward pass so gradients can be computed.
        with autograd.record():
            y_hat = net(X)
            l = loss(y_hat, y)
        l.backward()
        # Normalise the gradient by the real size of this batch: the last
        # batch of an epoch can be smaller than the nominal batch size.
        trainer.step(X.shape[0])
        cumulative_loss += nd.sum(l).asscalar()
        num_examples += X.shape[0]
    test_accuracy = evaluate_acc(test_iter, net)
    train_accuracy = evaluate_acc(train_iter, net)
    # Average over the examples counted above rather than a hard-coded
    # dataset size, so the figure stays correct if the data changes.
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (epoch, cumulative_loss / num_examples, train_accuracy, test_accuracy))

Epoch 1. Loss: 0.790034747950236, Train_acc 0.7968666666666666, Test_acc 0.7964
Epoch 2. Loss: 0.5738830060323079, Train_acc 0.8209166666666666, Test_acc 0.823
Epoch 3. Loss: 0.5296480720520019, Train_acc 0.8305166666666667, Test_acc 0.8324
Epoch 4. Loss: 0.506044314066569, Train_acc 0.83305, Test_acc 0.8357
Epoch 5. Loss: 0.489495499420166, Train_acc 0.83585, Test_acc 0.8378
Epoch 6. Loss: 0.47769207458496094, Train_acc 0.8423833333333334, Test_acc 0.845
Epoch 7. Loss: 0.46946451168060305, Train_acc 0.8443333333333334, Test_acc 0.8448
Epoch 8. Loss: 0.46226937624613446, Train_acc 0.8444333333333334, Test_acc 0.845
Epoch 9. Loss: 0.45660617497762046, Train_acc 0.8466666666666667, Test_acc 0.8466
Epoch 10. Loss: 0.4507445888519287, Train_acc 0.8469, Test_acc 0.8486
