# System cmd

In [None]:
# WARNING: `kill -9 -1` sends SIGKILL to every process this user is allowed to
# signal, so the runtime (including this kernel) is terminated.
!kill -9 -1 # Kill current runtime

In [None]:
# Runs in a separate python3 process, so the notebook kernel itself does not import TensorFlow.
!python3 -c 'import tensorflow as tf; print(tf.__version__)'  # Print tensorflow version: 1.10.1

In [None]:
# Prints every file in /etc whose name ends in "release".
!cat /etc/*release  # Check ubuntu version: 18.04

# Link folder to google drive

In [None]:
# Check cuda version
# GPU / driver status as reported by the NVIDIA tool.
!nvidia-smi
# Entries in the dynamic linker cache whose line contains "cuda".
!ldconfig -p | grep cuda

In [None]:
# Download the install script from the Mxnet_colab repository and run it with bash.
# NOTE(review): judging by the file name this installs the CUDA 9.2 + MKL build of
# MXNet — confirm against the script itself.
!wget https://raw.githubusercontent.com/thanhhvnqb/Mxnet_colab/master/install_mxnet_cu92mkl.sh -O install_mxnet_cu92mkl.sh 
# Alternative (cu90) script; note it is saved under the same local file name as above.
# !wget https://raw.githubusercontent.com/thanhhvnqb/Mxnet_colab/master/install_mxnet_cu90mkl.sh -O install_mxnet_cu92mkl.sh 
!bash install_mxnet_cu92mkl.sh
# # Fix error in installing google-drive-ocamlfuse
# !wget https://launchpad.net/~alessandro-strada/+archive/ubuntu/google-drive-ocamlfuse-beta/+build/15331130/+files/google-drive-ocamlfuse_0.7.0-0ubuntu1_amd64.deb
# !dpkg -i google-drive-ocamlfuse_0.7.0-0ubuntu1_amd64.deb
# !apt-get install -f
# !apt-get -y install -qq fuse

In [None]:
# Authenticate this Colab session, then authorize google-drive-ocamlfuse with the
# application-default credentials.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First headless run: stdin is /dev/null and only output lines containing "URL" are shown.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it to the notebook output.
vcode = getpass.getpass()
# Second headless run: the code is piped to the tool on stdin.
# NOTE(review): the client secret and the verification code are interpolated into
# shell command lines here; treat the notebook output and process list as sensitive.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [None]:
# Create the mount point (no error if it already exists) and mount Google Drive on it.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [None]:
# Inspect the mounted Drive folder, then unpack the dataset archive into the
# current working directory (the `ls` calls before/after show what was extracted).
# NOTE(review): the archive is named cifar10 while the data loaders below read
# CIFAR100 from root='cifar100' — confirm this cell is still required.
!ls drive/Working
!ls drive/Working/cifar10
# !mkdir -p /root/.mxnet/datasets/
# !cp drive/Working/cifar10/cifar10.tar.gz cifar10.tar.gz
!ls
!tar -xzvf drive/Working/cifar10/cifar10.tar.gz 
!ls

# Train Cifar100

## Initialize

In [None]:
from __future__ import division

import sys, argparse, time, logging, random, math

import numpy as np
import mxnet as mx

from mxnet import gluon, nd
from mxnet import autograd as ag
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms
from mxboard import SummaryWriter

from gluoncv.model_zoo import get_model
from gluoncv.utils import makedirs, TrainingHistory
from gluoncv.data import transforms as gcv_transforms

from mxnet_utils import *

In [None]:
################################################################
# Data Augmentation and Data Loader
# ---------------------------------
#
# Data augmentation is a common technique used for training. It is
# based on the assumption that, for the same object, photos under different
# composition, lighting conditions, or color should all yield the same prediction.
#
# Here are photos of the Golden Gate Bridge, taken by many people,
# at different times and from different angles.
# We can easily tell that they are photos of the same thing.
#
# |image-golden-bridge|
#
# We want to teach this invariance to our model by "augmenting" the
# input images. Our augmentation transforms each image with
# resizing, cropping, flipping and other techniques.
#
# With ``Gluon``, we can create our transform function as following:

# Training-time pipeline: random crop + random horizontal flip, then tensor
# conversion and per-channel normalization.
transform_train = transforms.Compose([
    # Take a random 32x32 crop; pad=4 is passed to gluoncv's RandomCrop
    # (presumably padding the image before cropping — confirm in the GluonCV docs)
    gcv_transforms.RandomCrop(32, pad=4),
    # Randomly flip the image horizontally
    transforms.RandomFlipLeftRight(),
    # (disabled) Randomly jitter the brightness, contrast and saturation of the image
#     transforms.RandomColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    # (disabled) Randomly add lighting noise to the image
#     transforms.RandomLighting(0.1),
    # Transpose the image from height*width*num_channels to num_channels*height*width
    # and map values from [0, 255] to [0,1]
    transforms.ToTensor(),
    # Normalize each channel with a fixed mean and standard deviation.
    # NOTE(review): these constants are the ones commonly quoted for CIFAR-10;
    # confirm they are intended for the CIFAR-100 data loaded below.
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
])

################################################################
# You may have noticed that most of the operations are randomized. This in effect
# increases the number of different images the model sees during training.
# The more data we have, the better our model generalizes over
# unseen images.
#
# On the other hand, when making prediction, we would like to remove all
# random operations in order to get a deterministic result. The transform
# function for prediction is:

# Evaluation-time pipeline: no random operations, only tensor conversion and the
# same normalization constants as transform_train.
transform_test = transforms.Compose([
#     transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
])

def label_transform(label, classes):
    """One-hot encode a batch of class indices.

    Parameters
    ----------
    label : NDArray
        Class indices; cast to 'int' before being used as column indices.
        Its first dimension is taken as the batch size.
    classes : int
        Number of columns in the result.

    Returns
    -------
    NDArray
        Array of shape ``(label.shape[0], classes)`` on ``label.context``,
        zero everywhere except a 1 at ``[i, label[i]]`` for every row ``i``.
    """
    columns = label.astype('int')
    num_rows = columns.shape[0]
    device = label.context
    one_hot = nd.zeros((num_rows, classes), ctx = device)
    rows = nd.arange(num_rows, ctx = device)
    # Advanced indexing: set one entry per row.
    one_hot[rows, columns] = 1
    return one_hot

## Init Train

In [None]:
################################################################
# Note that it is important to keep the normalization step, since the
# model only works well on inputs from the same distribution.
#
# With the transform functions, we can define data loaders for our
# training and validation datasets.

# Worker processes used by each DataLoader
num_workers = 8
# Number of samples per mini-batch
batch_size = 128
# Number of target classes (width of the one-hot labels and of the classifier)
classes = 100

# Training split: augmented with transform_train, shuffled, and the incomplete
# last batch is discarded so every batch has exactly batch_size samples.
train_set = gluon.data.vision.CIFAR100(root='cifar100', train=True)
train_data = gluon.data.DataLoader(
    train_set.transform_first(transform_train),
    batch_size=batch_size,
    shuffle=True,
    last_batch='discard',
    num_workers=num_workers)

# Validation split: deterministic transform_test, original order kept.
val_set = gluon.data.vision.CIFAR100(root='cifar100', train=False)
val_data = gluon.data.DataLoader(
    val_set.transform_first(transform_test),
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers)

# Number of training batches per epoch
max_step = len(train_data)
# !ls
# !tar -czvf cifar10.tar.gz cifar10
# !ls
# !cp cifar10.tar.gz drive/Working/cifar10

In [None]:
# sparse_label=False: the loss receives dense label vectors (the training loop
# feeds it mixed one-hot rows) instead of class indices.
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)
# Module-level accuracy metric; test() below creates its own local instance.
metric = mx.metric.Accuracy()
# Training-side metric: RMSE between the dense labels and the softmax outputs
# (the training loop later reports 1 - RMSE as "train_acc").
train_metric = mx.metric.RMSE()
def test(ctx, val_data):
    """Evaluate the module-level ``net`` on ``val_data`` and return its accuracy.

    Parameters
    ----------
    ctx : list of Context
        Devices the batches are split across.
    val_data : DataLoader
        Yields batches whose element 0 is the data and element 1 the labels.

    Returns
    -------
    The ``(name, value)`` result of ``mx.metric.Accuracy().get()``.
    """
    accuracy = mx.metric.Accuracy()
    bar = Progbar(target=len(val_data), prefix='Val   - ')
    for done, batch in enumerate(val_data, 1):
        inputs = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        # One forward pass per device slice.
        predictions = [net(part) for part in inputs]
        accuracy.update(targets, predictions)
        bar.update(done)
    return accuracy.get()

# number of GPUs to use
# NOTE: ctx below is hard-coded to GPU 0, so changing num_gpus alone has no
# effect unless the commented-out list comprehension is restored.
num_gpus = 1
# ctx = [mx.gpu(i) for i in range(num_gpus)]
ctx = [mx.gpu(0)]

## Setup Logger

In [None]:
# Setup logs and saved params directory
# net_name only labels the output directory and the printed header; it does not
# select which network is built.
net_name = 'pydmobilenet56_0.75_concat'
work_dir = 'out/cifar100/%s/' % net_name
saved_dir = work_dir + '/params/'
# work_dir = 'msresnet'
# TensorBoard event files go to <work_dir>/logs/
logsw = SummaryWriter(logdir=work_dir + "/logs/")
# Create the checkpoint directory together with any missing parents. The previous
# shell `mkdir` (without -p, via system_raw) failed silently when a parent was
# missing, which only surfaced later when the first checkpoint was saved.
makedirs(saved_dir)

## Create Network

In [None]:
from mxnet.context import cpu
from mxnet.gluon import nn, Parameter, HybridBlock
from mxnet.gluon.contrib.nn import HybridConcurrent, Identity


class ResBlock(HybridBlock):
    """Residual block built from three parallel depthwise convolutions.

    Data flow (see ``hybrid_forward``)::

        x -> btl_in -> [conv3 | conv5 | conv7] -> concat(dim=1) -> btl_out -> + shortcut

    * ``btl_in``:  BN -> ReLU -> 1x1 conv to ``dw_channels`` -> (dropout) -> BN -> ReLU
    * ``conv3`` / ``conv5`` / ``conv7``: convolutions with ``groups == dw_channels``
      and kernel sizes 3, 5 and 7, each optionally followed by dropout
    * ``btl_out``: BN -> ReLU -> 1x1 conv to ``num_features`` -> (dropout) -> BN
    * shortcut: the input itself when ``stride == 1``; otherwise a strided
      1x1 conv to ``num_features`` followed by BN

    Parameters
    ----------
    multiplier : float
        Width multiplier; the branches use ``int(num_features * multiplier)`` channels.
    stage : int
        Stage index, used only to build the block prefix ``stg<stage>/blk<ordinal>/``.
    ordinal : int
        Index of the block inside its stage, used only in the prefix.
    num_features : int
        Number of output channels of the block.
    stride : int
        Stride of the three branch convolutions and of the shortcut projection.
    dropout : float
        Dropout rate; a falsy value adds no dropout layers.

    Notes
    -----
    With ``stride == 1`` the shortcut is the unmodified input, so the caller must
    ensure the input already has ``num_features`` channels for the addition to work.
    """

    def __init__(self, multiplier, stage, ordinal, num_features, stride, dropout, **kwargs):
        super(ResBlock, self).__init__(prefix='stg%d/blk%d/' % (stage, ordinal), **kwargs)
        with self.name_scope():
            use_bias = False
            # Built-in int() truncates exactly like the np.int alias did; np.int
            # itself was removed from recent NumPy releases.
            dw_channels = int(num_features * multiplier)

            self.btl_in = nn.HybridSequential(prefix='btl_in/')
            with self.btl_in.name_scope():
                self.btl_in.add(nn.BatchNorm())
                self.btl_in.add(nn.Activation('relu'))
                self.btl_in.add(nn.Conv2D(dw_channels, kernel_size=1, use_bias=use_bias))
                if dropout:
                    self.btl_in.add(nn.Dropout(dropout))
                self.btl_in.add(nn.BatchNorm())
                self.btl_in.add(nn.Activation('relu'))

            # Three branches over the same input; padding = kernel // 2 (1, 2, 3)
            # keeps their spatial output sizes equal so they can be concatenated.
            self.conv3 = self._depthwise_branch('conv3/', dw_channels, 3, stride, dropout, use_bias)
            self.conv5 = self._depthwise_branch('conv5/', dw_channels, 5, stride, dropout, use_bias)
            self.conv7 = self._depthwise_branch('conv7/', dw_channels, 7, stride, dropout, use_bias)

            self.btl_out = nn.HybridSequential(prefix='btl_out/')
            with self.btl_out.name_scope():
                self.btl_out.add(nn.BatchNorm())
                self.btl_out.add(nn.Activation('relu'))
                self.btl_out.add(nn.Conv2D(num_features, kernel_size=1, use_bias=use_bias))
                if dropout:
                    self.btl_out.add(nn.Dropout(dropout))
                self.btl_out.add(nn.BatchNorm())

            if stride > 1:
                # Projection shortcut so the residual matches the strided main path.
                self.downsample = nn.HybridSequential(prefix='')
                self.downsample.add(nn.Conv2D(num_features, kernel_size=1, strides=stride, padding=0,
                                              use_bias=use_bias))
                self.downsample.add(nn.BatchNorm())
            else:
                self.downsample = None

    @staticmethod
    def _depthwise_branch(name, channels, kernel_size, stride, dropout, use_bias):
        """Build one branch: a grouped conv (groups == channels) plus optional dropout."""
        branch = nn.HybridSequential(prefix=name)
        with branch.name_scope():
            branch.add(nn.Conv2D(channels, kernel_size=kernel_size, strides=stride,
                                 padding=kernel_size // 2, groups=channels, use_bias=use_bias))
            if dropout:
                branch.add(nn.Dropout(dropout))
        return branch

    def hybrid_forward(self, F, x):
        """Run the block: bottleneck-in, three branches, concat, bottleneck-out, add shortcut."""
        # Explicit None check: the shortcut choice should not depend on the
        # truthiness (length) of the container.
        if self.downsample is not None:
            residual = self.downsample(x)
        else:
            residual = x
        x = self.btl_in(x)
        conv3 = self.conv3(x)
        conv5 = self.conv5(x)
        conv7 = self.conv7(x)
        # Concatenate the branches along the channel axis.
        out = F.concat(*[conv3, conv5, conv7], dim=1) # If concatenate
        # out = conv3 + conv5 + conv7 # If plus
        out = self.btl_out(out)
        out = out + residual
        return out


class ResNet(HybridBlock):
    r"""Stack of ``ResBlock`` stages with a convolutional stem and a linear classifier.

    Layout: ``conv0`` (BatchNorm without scale/shift on the input, then a 3x3
    convolution) -> ``cells`` (all ResBlocks, stage after stage) -> ``output``
    (global average pooling followed by a Dense layer).

    Parameters
    ----------
    prefix : str
        Name prefix of the network.
    multiplier : float
        Width multiplier forwarded to every ``ResBlock``.
    num_init_features : int
        Number of filters of the stem convolution.
    cells : list of int
        Number of ResBlocks in each stage.
    channels : list of int
        Output channels of the ResBlocks in each stage; must have the same
        length as ``cells``.
    dropout : float, default 0
        Dropout rate forwarded to every ``ResBlock``.
    classes : int, default 10
        Number of units of the final Dense layer.
    """
    def __init__(self, prefix, multiplier, num_init_features, cells, channels, dropout=0, classes=10, **kwargs):

        super(ResNet, self).__init__(prefix=prefix, **kwargs)
        with self.name_scope():
            assert len(cells) == len(channels), "Cells and channels should be same"
            # Stem: parameter-free input normalization followed by a 3x3 convolution.
            self.conv0 = nn.HybridSequential('conv0/')
            with self.conv0.name_scope():
                self.conv0.add(nn.BatchNorm(scale=False, center=False))
                self.conv0.add(nn.Conv2D(num_init_features, kernel_size=3,
                                            strides=1, padding=1, use_bias=False))
            # All residual blocks live in one sequential container.
            self.cells = nn.HybridSequential()
            for stage in range(len(cells)):                
                for cell in range(cells[stage]):
                    # The first block of every stage except the first uses stride 2.
                    stride = 2 if cell == 0 and stage > 0 else 1
                    self.cells.add(ResBlock(multiplier, stage, cell, channels[stage], stride, dropout))
                    
            # Classifier head.
            self.output = nn.HybridSequential('classifier/')
            with self.output.name_scope():
                self.output.add(nn.GlobalAvgPool2D())
#                 self.output.add(nn.Flatten())
                self.output.add(nn.Dense(classes))

    def hybrid_forward(self, F, x):
        """Apply stem, residual stages and classifier in sequence."""
        x = self.conv0(x)
        x = self.cells(x)
        x = self.output(x)
        return x

# Specification
# Maps num_layers -> (num_init_features, cells per stage, channels per stage),
# unpacked in exactly that order by get_pydmobilenet.
msmobilenet_spec = {29: (32, [3, 3, 3], [32, 64, 128]),
              56: (32, [6, 6, 6], [32, 64, 128])
             } # 20: 3; 56: 6; 110: 18
# Module-level dropout rate that get_pydmobilenet passes to ResNet.
dropout = 0

def get_pydmobilenet(prefix, num_layers, multiplier, classes=10, pretrained=False, ctx=cpu(),
                 root='~/.mxnet/models', **kwargs):
    """Build a network from the ``msmobilenet_spec`` entry for ``num_layers``.

    Parameters
    ----------
    prefix : str
        Name prefix given to the network.
    num_layers : int
        Key into ``msmobilenet_spec``.
    multiplier : float
        Width multiplier forwarded to the network.
    classes : int, default 10
        Number of output classes.
    pretrained : bool, default False
        When true, parameters are loaded after construction.
        NOTE(review): this path calls ``get_model_file`` with a ``densenet<N>``
        name; ``get_model_file`` is not defined in the visible source — confirm
        it is available and that the file name is intended.
    ctx : Context, default cpu()
        Device used when loading pretrained parameters.
    root : str
        Directory passed to ``get_model_file``.

    The module-level ``dropout`` value is used as the dropout rate.
    """
    stem_width, blocks_per_stage, stage_widths = msmobilenet_spec[num_layers]
    model = ResNet(prefix, multiplier, stem_width, blocks_per_stage, stage_widths,
                   dropout, classes, **kwargs)
    if not pretrained:
        return model
    weights_file = get_model_file('densenet%d'%(num_layers), root=root)
    model.load_parameters(weights_file, ctx=ctx)
    return model

def pydmobilenet29_0_25(classes=100, **kwargs):
    """Return the num_layers=29 network with width multiplier 0.25."""
    prefix, depth, width = 'pydmobilenet29_0_25/', 29, 0.25
    return get_pydmobilenet(prefix, depth, width, classes, **kwargs)

def pydmobilenet29_0_5(classes=100, **kwargs):
    """Return the num_layers=29 network with width multiplier 0.5.

    Uses ``get_pydmobilenet`` like the sibling factories; the previous call
    target ``get_msmobilenet`` is not defined in this file.
    """
    return get_pydmobilenet('pydmobilenet29_0_5/', 29, 0.5, classes, **kwargs)

def pydmobilenet29_0_75(classes=100, **kwargs):
    """Return the num_layers=29 network with width multiplier 0.75."""
    prefix, depth, width = 'pydmobilenet29_0_75/', 29, 0.75
    return get_pydmobilenet(prefix, depth, width, classes, **kwargs)

def pydmobilenet29_1(classes=100, **kwargs):
    """Return the num_layers=29 network with width multiplier 1.

    Uses ``get_pydmobilenet`` like the sibling factories; the previous call
    target ``get_msmobilenet`` is not defined in this file.
    """
    return get_pydmobilenet('pydmobilenet29_1/', 29, 1, classes, **kwargs)

def pydmobilenet29_1_25(classes=100, **kwargs):
    """Return the num_layers=29 network with width multiplier 1.25."""
    prefix, depth, width = 'pydmobilenet29_1_25/', 29, 1.25
    return get_pydmobilenet(prefix, depth, width, classes, **kwargs)

def pydmobilenet56_0_25(classes=100, **kwargs):
    """Return the num_layers=56 network with width multiplier 0.25."""
    prefix, depth, width = 'pydmobilenet56_0_25/', 56, 0.25
    return get_pydmobilenet(prefix, depth, width, classes, **kwargs)

def pydmobilenet56_0_5(classes=100, **kwargs):
    """Return the num_layers=56 network with width multiplier 0.5."""
    prefix, depth, width = 'pydmobilenet56_0_5/', 56, 0.5
    return get_pydmobilenet(prefix, depth, width, classes, **kwargs)

def pydmobilenet56_0_75(classes=100, **kwargs):
    """Return the num_layers=56 network with width multiplier 0.75."""
    prefix, depth, width = 'pydmobilenet56_0_75/', 56, 0.75
    return get_pydmobilenet(prefix, depth, width, classes, **kwargs)

def pydmobilenet56_1(classes=100, **kwargs):
    """Return the num_layers=56 network with width multiplier 1.

    This factory was previously declared under the name ``pydmobilenet56_0_75``,
    which silently replaced the real 0.75 variant and left ``pydmobilenet56_1``
    (the name the notebook actually calls) undefined.
    """
    return get_pydmobilenet('pydmobilenet56_1/', 56, 1, classes, **kwargs)


# Build the network, initialize it on ctx and run one dummy batch through it
# before counting parameters and exporting the graph.
# NOTE(review): net_name set in the logger cell says "56_0.75" while the
# multiplier-1 factory is called here — confirm which variant is intended.
net = pydmobilenet56_1(classes)
net.hybridize()

net.initialize(mx.init.Xavier(), ctx=ctx)
# Single forward pass on a 1x3x32x32 tensor of ones; presumably needed so that
# deferred parameter shapes are known before counting — TODO confirm.
net.forward(mx.nd.ones((1,3, 32, 32), ctx=ctx[0]))
# count_param_gluon is not defined in the visible source (presumably provided by
# the mxnet_utils star import).
total_param = count_param_gluon(net)
logsw.add_graph(net)

## Run train

In [None]:
################################################################
# Optimizer, Loss and Metric
# --------------------------
#
# Optimizer improves the model during training. Here we use the popular
# Nesterov accelerated gradient descent algorithm.

# Learning rate decay factor
lr_decay = 0.1
# Total number of training epochs
epochs = 320
# Optimizer selector: 1 = Nesterov accelerated gradient, 2 = Adam
opt_type = 1
if opt_type == 1: # Nesterov accelerated gradient descent
    # Epochs where learning rate decays; the trailing np.inf is a sentinel so the
    # decay loop can keep indexing this list after the last real milestone.
    lr_decay_epoch = [150, 225, np.inf]
    optimizer = 'nag'
    optimizer_params = {'learning_rate': 0.1, 'wd': 0.0001, 'momentum': 0.9}
elif opt_type == 2: # Adam
    lr_decay_epoch = [150, 225, np.inf]
    optimizer = 'adam'
    optimizer_params = {'learning_rate': 0.001}
else:
    # Fail here rather than with a NameError when the trainer is created.
    raise ValueError('Unsupported opt_type: %r (expected 1 or 2)' % (opt_type,))

In [None]:
# net = msmobilenet29_0_75(classes)
# net.hybridize()
# Training driver: trains `net` from begin_epoch to epochs with batch-level
# input/label mixing, evaluates after every epoch, logs to the SummaryWriter
# and saves a checkpoint per epoch.
print(net_name)
print(total_param)

train_history = TrainingHistory(['training-error', 'validation-error'])
# Global batch counter, used as the x-axis of the per-batch loss curve.
gstep = 0
# Index of the next entry of lr_decay_epoch to apply.
lr_decay_count = 0
# Set begin_epoch > 1 to resume from the checkpoint of the previous epoch.
begin_epoch = 1
if begin_epoch > 1:
    net.load_parameters(saved_dir + '/pydmobilenet-%04d.params' % (begin_epoch - 1), ctx=ctx)
    gstep = (begin_epoch - 1) * max_step
# else:
#     net.initialize(mx.init.Xavier(), ctx=ctx)

# Define our trainer for net
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
best_acc = 0
best_ep = -1

for epoch in range(begin_epoch, epochs + 1):
    print('[Epoch %d] ' % epoch)
    progbar = Progbar(target=max_step, prefix='Train - ', stateful_metrics=['loss'])
    tic = time.time()
    train_metric.reset()
    train_loss = 0
    # Both Beta parameters are 1, i.e. the mixing weight is drawn uniformly from [0, 1].
    alpha = 1

    # Learning rate decay
    # A `while` (not `if`) so a resumed run applies every milestone already passed;
    # the np.inf sentinel at the end of lr_decay_epoch stops the loop.
    while epoch >= lr_decay_epoch[lr_decay_count]:
        trainer.set_learning_rate(trainer.learning_rate*lr_decay)
        lr_decay_count += 1

    # Loop through each batch of training data
    for i, batch in enumerate(train_data):
        # Mixing weight for this batch.
        lam = np.random.beta(alpha, alpha)
        # From epoch `epochs - 20` onward mixing is switched off (lam = 1 keeps
        # the original samples and labels unchanged).
        if epoch >= epochs - 20:
            lam = 1
        gstep += 1
#         !nvidia-smi
        # Extract data and label
        data_1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label_1 = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

        # Blend every sample with the sample at the mirrored position of the same
        # slice (X[::-1] reverses along the first axis).
        data = [lam*X + (1-lam)*X[::-1] for X in data_1]
        label = []
        for Y in label_1:
            # Blend the one-hot labels with the same weight as the inputs.
            y1 = label_transform(Y, classes)
            y2 = label_transform(Y[::-1], classes)
            label.append(lam*y1 + (1-lam)*y2)

        # AutoGrad
        with ag.record():
            output = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

        # Backpropagation
        for l in loss:
            l.backward()

        # Optimize
        trainer.step(batch_size)

        # Update metrics
        batch_loss = sum([l.sum().asscalar() for l in loss])
        # NOTE(review): the summed loss is divided by len(loss) * batch_size; with a
        # single device this is the per-sample mean, with several devices the
        # divisor looks too large — confirm before enabling multi-GPU.
        batch_loss /= (len(loss) * batch_size)
        train_loss += batch_loss
        output_softmax = [nd.SoftmaxActivation(out) for out in output]
        # train_metric is an RMSE between the blended labels and the softmax outputs.
        train_metric.update(label, output_softmax)
        logsw.add_scalar('loss_batch', {'pydmobilenet_train_loss': batch_loss}, global_step=gstep)
        progbar.update(i + 1, [['loss', batch_loss]])
        
    train_loss /= max_step
    name, train_acc = train_metric.get()
    # "train_acc" is 1 - RMSE, a proxy score rather than a classification accuracy.
    train_acc = 1 - train_acc
    # Evaluate on Validation data
    name, val_acc = test(ctx, val_data)

    # Update history and print metrics
    # 1 - train_acc is the RMSE again; 1 - val_acc is the validation error rate.
    train_history.update([1 - train_acc, 1 - val_acc])
    logsw.add_scalar('loss_ep', {'pydmobilenet_train_loss': train_loss}, global_step=epoch)
    logsw.add_scalar('accuracy', {'pydmobilenet_train_acc': train_acc}, global_step=epoch)
    logsw.add_scalar('accuracy', {'pydmobilenet_val_acc': val_acc}, global_step=epoch)
    # One checkpoint per epoch; the resume branch above reads the same file pattern.
    net.save_parameters(saved_dir + '/pydmobilenet-%04d.params' % epoch) 
    print('Sum   - train_acc: %.2f | val_acc: %.2f | Time=%ds' %
                (train_acc * 100, val_acc * 100, time.time() - tic))
    # Track the best validation accuracy and the epoch where it occurred.
    if val_acc > best_acc:
        best_acc = val_acc
        best_ep = epoch
print("Validation: Best acc:", best_acc * 100, "at epoch:", best_ep)
# net.save_parameters(saved_dir + '/msmobilenet-0000.params')

In [None]:
# Plot the 'training-error' / 'validation-error' curves recorded during training.
train_history.plot()

In [None]:
# Stand-alone evaluation: rebuild a network, load a saved parameter file and
# measure validation accuracy.
# NOTE(review): `msmobilenet56_1` is not defined in the visible source (the
# factories above are named pydmobilenet*) — confirm where it comes from.
net = msmobilenet56_1(100)
net.hybridize()
ctx = [mx.gpu(0)]
net.initialize(mx.init.Xavier(), ctx=ctx)
net.forward(mx.nd.ones((1,3, 32, 32), ctx=ctx[0]))
# count_param_gluon(net)
# NOTE(review): net_name / work_dir / saved_dir mention msmobilenet29_0_5 and a
# cifar10 folder, and work_dir / saved_dir are not used below — confirm they are current.
net_name = 'msmobilenet29_0_5_concat'
work_dir = 'drive/Working/cifar10/%s/' % net_name
saved_dir = work_dir + '/params/'
epochs = 1
best_acc = 0
best_ep = -1
val_accs = list()
for epoch in range(1, epochs + 1):
    # The checkpoint index is fixed at 300, so every iteration loads the same file.
    net.load_parameters('models/msmobilenet-%04d.params' % 300, ctx=ctx)
    name, val_acc = test( ctx, val_data)
    (best_acc, best_ep) = (val_acc, epoch) if val_acc > best_acc else (best_acc, best_ep)
    val_accs.append(val_acc)
# Persist the accuracies under val_acc/; np.save does not create that directory.
np.save('val_acc/cifar100_%s' % net_name, val_accs)
print("Acc:", best_acc, 'at epoch:',best_ep)