# Convolutional Neural Networks in ``gluon``

Now let's see how succinctly we can express a convolutional neural network using ``gluon``. You might be relieved to find out that this too requires hardly any more code than a logistic regression. 

In [1]:
from __future__ import print_function
import mxnet as mx
from mxnet import nd, autograd
from mxnet import gluon
import numpy as np

## Set the context

In [2]:
ctx = mx.cpu()

## Grab the MNIST dataset

In [9]:
mnist = mx.test_utils.get_mnist()
batch_size = 64
train_data = mx.io.NDArrayIter(mnist["train_data"], mnist["train_label"], batch_size, shuffle=True)
test_data = mx.io.NDArrayIter(mnist["test_data"], mnist["test_label"], batch_size, shuffle=True)

## Define a convolutional neural network

Again, a few lines here is all we need in order to change the model. Let's add a couple convolutional layers using ``gluon.nn``.

In [16]:
#########################
#   Can do it with sequential once nn.Faltten() gets merged 
#########################

# net = gluon.nn.Sequential()
# with net.name_scope():
#     net.add(gluon.nn.Conv())
#     net.add(gluon.nn.Conv())
#     net.add(gluon.nn.Flatten())
#     net.add(gluon.nn.Dense(128, activation="relu"))
#     net.add(gluon.nn.Dense(10))

import mxnet.ndarray as F

class Net(gluon.Block):
    def __init__(self, **kwargs):
        super(Net, self).__init__(**kwargs)
        with self.name_scope():
            # layers created in name_scope will inherit name space
            # from parent layer.
            self.conv1 = nn.Conv2D(20, kernel_size=(5,5))
            self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2))
            self.conv2 = nn.Conv2D(50, kernel_size=(5,5))
            self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2))
            self.fc1 = nn.Dense(500)
            self.fc2 = nn.Dense(10)

    def forward(self, x):
        x = self.pool1(F.tanh(self.conv1(x)))
        x = self.pool2(F.tanh(self.conv2(x)))
        # 0 means copy over size from corresponding dimension.
        # -1 means infer size from the rest of dimensions.
        x = x.reshape((0, -1))
        x = F.tanh(self.fc1(x))
        x = F.tanh(self.fc2(x))
        return x

## Parameter initialization


In [17]:
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)

## Softmax cross-entropy Loss

In [18]:
loss = gluon.loss.SoftmaxCrossEntropyLoss()

## Optimizer

In [19]:
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .1})

## Write evaluation loop to calculate accuracy

In [14]:
metric = mx.metric.Accuracy()

def evaluate_accuracy(data_iterator, net):
    numerator = 0.
    denominator = 0.
    
    data_iterator.reset()
    for i, batch in enumerate(data_iterator):
        with autograd.record():
            data = batch.data[0].as_in_context(ctx).reshape((-1,784))
            label = batch.label[0].as_in_context(ctx)
            label_one_hot = nd.one_hot(label, 10)
            output = net(data)
        
        metric.update([label], [output])
    return metric.get()[1]

## Training Loop

In [15]:
epochs = 10
moving_loss = 0.

for e in range(epochs):
    train_data.reset()
    for i, batch in enumerate(train_data):
        data = batch.data[0].as_in_context(ctx).reshape((-1,784))
        label = batch.label[0].as_in_context(ctx)
        with autograd.record():
            output = net(data)
            cross_entropy = loss(output, label)
            cross_entropy.backward()
        trainer.step(data.shape[0])
        
        ##########################
        #  Keep a moving average of the losses
        ##########################
        if i == 0:
            moving_loss = np.mean(cross_entropy.asnumpy()[0])
        else:
            moving_loss = .99 * moving_loss + .01 * np.mean(cross_entropy.asnumpy()[0])
            
    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (e, moving_loss, train_accuracy, test_accuracy))    
    

Epoch 0. Loss: 0.284056464863, Train_acc 0.944820205479, Test_acc 0.943869426752
Epoch 1. Loss: 0.193137519472, Train_acc 0.953674372146, Test_acc 0.946647863419
Epoch 2. Loss: 0.149920152076, Train_acc 0.959408295282, Test_acc 0.954509746485
Epoch 3. Loss: 0.122780954208, Train_acc 0.96372716895, Test_acc 0.95989341226
Epoch 4. Loss: 0.106958471014, Train_acc 0.967063356164, Test_acc 0.964021517523
Epoch 5. Loss: 0.0927880979558, Train_acc 0.969810692542, Test_acc 0.967282381925
Epoch 6. Loss: 0.0757419895794, Train_acc 0.972078848663, Test_acc 0.969962464695
Epoch 7. Loss: 0.0602366979909, Train_acc 0.973981521119, Test_acc 0.972165846331
Epoch 8. Loss: 0.0502875866695, Train_acc 0.975597729579, Test_acc 0.974017326455
Epoch 9. Loss: 0.0383648255988, Train_acc 0.976990582192, Test_acc 0.975612078006


## Conclusion

You might notice that by using ``gluon``, we get code that runs much faster whether on CPU or GPU. That's largely because ``gluon`` can call down to highly optimized layers that have been written in C++. 