In [12]:
"""In this assignment, you should train your own net on cifar10 classification with deep learning framework MXNet.
   With MXNet, you only need to define the nets with symbol connection, then set hyperparameters to train the 
   network. You can also save your model and load pretrained model to finetune the network. Make sure using GPU 
   mode. You should achieve at least 80% on the validation set."""

"""vist http://mxnet.io/get_started/index.html to get familar with mxnet!"""
#This is yaodi's notebook!   
import sys
import os
import numpy as np
import mxnet as mx
import logging

# download data if necessary
def _download(data_dir):
    if not os.path.isdir(data_dir):
        os.system("mkdir " + data_dir)
    os.chdir(data_dir)
    if (not os.path.exists('train.rec')) or \
       (not os.path.exists('test.rec')) :
        os.system("wget http://data.dmlc.ml/mxnet/data/cifar10.zip")
        os.system("unzip -u cifar10.zip")
        os.system("mv cifar/* .; rm -rf cifar; rm cifar10.zip")
    os.chdir("..")


# data
def get_iterator(data_shape=(3, 28, 28)):
    if '://' not in data_dir:
        _download(data_dir)

    train = mx.io.ImageRecordIter(
        path_imgrec = os.path.join(data_dir, "train.rec"),
        mean_img    = os.path.join(data_dir, "mean.bin"),
        data_shape  = data_shape,
        batch_size  = batch_size,
        rand_crop   = True,
        rand_mirror = True)

    val = mx.io.ImageRecordIter(
        path_imgrec = os.path.join(data_dir, "test.rec"),
        mean_img    = os.path.join(data_dir, "mean.bin"),
        rand_crop   = False,
        rand_mirror = False,
        data_shape  = data_shape,
        batch_size  = batch_size)

    return (train, val)


def get_net(num_classes=10):
    #####################################################################################
    # TODO: define your net                                                             #
    # Define symbols that using convolution and max pooling to extract better features  #
    # from input image.                                                                 #
    #####################################################################################
    data = mx.symbol.Variable(name="data")

    conv1 = mx.symbol.Convolution(data=data, kernel=(3,3), pad=(1,1), num_filter=128)
    bn1 = mx.symbol.BatchNorm(data=conv1)
    relu1 = mx.symbol.Activation(data=bn1, act_type="relu")
  
    conv2 = mx.symbol.Convolution(data=relu1, kernel=(3,3), pad=(1,1), num_filter=128)
    bn2 = mx.symbol.BatchNorm(data=conv2)
    relu2 = mx.symbol.Activation(data=bn2, act_type="relu")
    
    pool1 = mx.symbol.Pooling(data=relu2, pool_type="max", kernel=(2,2), stride=(2,2))
    
    conv3 = mx.symbol.Convolution(data=pool1, kernel=(3,3), pad=(1,1), num_filter=128)
    bn3 = mx.symbol.BatchNorm(data=conv3)
    relu3 = mx.symbol.Activation(data=bn3, act_type="relu")
    
    conv4 = mx.symbol.Convolution(data=relu3, kernel=(3,3), pad=(1,1), num_filter=128)
    bn4 = mx.symbol.BatchNorm(data=conv4)
    relu4 = mx.symbol.Activation(data=bn4, act_type="relu")
    
    pool2 = mx.symbol.Pooling(data=relu4, pool_type="max", kernel=(2,2), stride=(2,2))
    
    fc = mx.symbol.FullyConnected(data=pool2, num_hidden=1024)
    softmax = mx.symbol.SoftmaxOutput(data=fc, name="softmax")
   
    #####################################################################################
    #                              END OF YOUR CODE                                     #
    #####################################################################################
    return softmax

In [13]:
network = get_net()

################################################################################
# TODO: this is similar as solver                                              #
################################################################################

############################ set hyperparameters ###############################
batch_size = 128
weight_decay = 1e-3   # same as weight reg
num_epoch = 30
learning_rate = 5e-3 
devs=mx.gpu(3)     # set device id

################################  path #########################################
data_dir = 'cifar10/'
chk_dir = 'model/'
chk_prefix = chk_dir +'net1'
load_model = False   ## set true if you want to load a pretrained model and finetune with lower learning rate

if not os.path.isdir(chk_dir):
     os.system("mkdir " + chk_dir)

reload(logging)
head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)

eval_metrics = ['accuracy']

## TopKAccuracy only allows top_k > 1
#eval_metrics.append(mx.metric.create('top_k_accuracy', top_k = 5))

if load_model:
    model_prefix = 'model/net1'
    model_iter = 30  # which model to load

    _, arg_params,__ = mx.model.load_checkpoint(model_prefix, model_iter)
else:
    arg_params = None
    model_iter = 0

model=mx.model.FeedForward(
       ctx      = devs,
       symbol   = network,
       arg_params = arg_params,
       begin_epoch = model_iter,
       num_epoch  = num_epoch,
       learning_rate = learning_rate,
       momentum      = 0.9,
       wd            = weight_decay,
      initializer   = mx.init.Xavier(factor_type='in', magnitude=2.34)    ## weight initialization
       )

train_ite, val_ite = get_iterator()
model.fit(
        X          = train_ite,
        eval_data  = val_ite,
        eval_metric = eval_metrics,
        batch_end_callback = mx.callback.Speedometer(batch_size, 50), 
        epoch_end_callback=mx.callback.do_checkpoint(chk_prefix, 10)   ## save your model after each 10 epochs
        )

################################################################################
#                              END OF YOUR CODE                                #
################################################################################

2016-10-31 00:14:00,540 Start training with [gpu(3)]
2016-10-31 00:14:02,705 Epoch[0] Batch [50]	Speed: 3582.84 samples/sec	Train-accuracy=0.291406
2016-10-31 00:14:04,489 Epoch[0] Batch [100]	Speed: 3590.05 samples/sec	Train-accuracy=0.427344
2016-10-31 00:14:06,275 Epoch[0] Batch [150]	Speed: 3587.45 samples/sec	Train-accuracy=0.479219
2016-10-31 00:14:08,068 Epoch[0] Batch [200]	Speed: 3572.67 samples/sec	Train-accuracy=0.482656
2016-10-31 00:14:09,859 Epoch[0] Batch [250]	Speed: 3575.26 samples/sec	Train-accuracy=0.517344
2016-10-31 00:14:11,645 Epoch[0] Batch [300]	Speed: 3586.10 samples/sec	Train-accuracy=0.541875
2016-10-31 00:14:13,440 Epoch[0] Batch [350]	Speed: 3568.62 samples/sec	Train-accuracy=0.568750
2016-10-31 00:14:14,915 Epoch[0] Resetting Data Iterator
2016-10-31 00:14:14,917 Epoch[0] Time cost=14.222
2016-10-31 00:14:16,034 Epoch[0] Validation-accuracy=0.600969
2016-10-31 00:14:17,805 Epoch[1] Batch [50]	Speed: 3651.35 samples/sec	Train-accuracy=0.585625
2016-10-31 0