## CIFAR 10 - Borrowing from Darknet Architecture

In [1]:
# render matplotlib figures inline in the notebook output
%matplotlib inline
# (re)load the autoreload extension; mode 2 re-imports all modules before each cell executes
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
# add the parent directory to the import path
# (presumably where the fastai package lives — confirm)
sys.path.append('../')
import torch
# star import: names used below without an explicit import (Path, os, np, nn,
# tfms_from_stats, ConvLearner, ...) are expected to come from here
from fastai.conv_learner import *
# root folder for the CIFAR-10 data; created if it does not exist yet
# NOTE(review): PATH is rebound to a plain string ('data/cifar10/cifar/') in a later cell
PATH = Path("data/cifar10/")
os.makedirs(PATH,exist_ok=True)

In [3]:
# report the PyTorch version and, when a GPU is usable, the active CUDA device
print(torch.__version__)
# guard the CUDA queries: on a machine without CUDA they raise instead of printing
if torch.cuda.is_available():
    current = torch.cuda.current_device()
    print(current)
    # name of the device that is actually current, not unconditionally device 0
    print(torch.cuda.get_device_name(current))
else:
    print('CUDA is not available')

0.3.1.post2
0
Quadro P5000


### Download the data 

In [4]:
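# NOTE: `-U` sets wget's user-agent string (the download-directory flag is `-P`), so the
# archive is saved to the current directory; a later cell moves it into data/cifar10/.
# !wget http://pjreddie.com/media/files/cifar.tgz -U data/cifar10/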

In [5]:
import shutil
# the ten class names; also used as the per-class folder names under train/ and val/
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
# two 3-element arrays handed to tfms_from_stats further down;
# presumably the per-channel (mean, std) used for normalisation — confirm
stats = (np.array([ 0.4914 ,  0.48216,  0.44653]), np.array([ 0.24703,  0.24349,  0.26159]))

In [6]:
# ! mv cifar.tgz data/cifar10/
# ! tar -xf data/cifar10/cifar.tgz -C data/cifar10/
# ! ls data/cifar10/

### Setup some basic folder structure to store all the classes

In [7]:
# folder holding the extracted archive (this rebinds the earlier Path-typed PATH to a plain string)
PATH = 'data/cifar10/cifar/'
# root under which the per-class train/ and val/ folders are created and later read
OUTPATH = 'data/cifar10/'
# for x in classes:
#     os.makedirs(OUTPATH+'train/'+x,exist_ok=True)
#     os.makedirs(OUTPATH+'val/'+x,exist_ok=True)
    
# # check how many files are in the original directory
# # note that all these files are in a single directory
# filenames = os.listdir(PATH +'train/')
# counts = {x:0 for x in classes}
# print(len(filenames))

In [8]:
# # this part of the code will cycle through each class
# # and make a sub dir in train and val
# # EXAMPLE:
# # train/car/ ... images
# # val/car/ ... images

# valset_size = len(filenames) / 10 * .2
# for file_n in filenames:
#     for x in classes:
#         if x in file_n:
#             counts[x] = counts[x] +1
#             if counts[x] < valset_size:
#                 shutil.copyfile(PATH+'train/'+file_n, OUTPATH+'val/'+x+'/'+file_n)
#             else:
#                 shutil.copyfile(PATH+'train/'+file_n, OUTPATH+'train/'+x+'/'+file_n)
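#     # files named 'automobile' belong to the 'car' class: handle them once per
#     # file (outside the per-class loop) and compare against the 'car' counter
#     if 'automobile' in file_n:
#         counts['car'] = counts['car'] +1
#         if counts['car'] < valset_size:
#             shutil.copyfile(PATH+'train/'+file_n, OUTPATH+'val/car/'+file_n)
#         else:
#             shutil.copyfile(PATH+'train/'+file_n, OUTPATH+'train/car/'+file_n)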

# Set up the Model

In [9]:
# initialize some parameters
# number of data-loading workers: half of the available CPUs
num_workers = num_cpus()//2

# batch size
bs=256

# image size
sz=32

# define which transformations we will be using
# we will only be using flips (will skip rotations since images are small)
# random flips and 4 pixels of padding on each side (sz//8 == 4 for 32px images).
# Doesn't add black padding
# FASTAI - takes the last few pixels and reflect it for a border
tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomFlip()], pad=sz//8)

# create our image data
# num_workers is passed explicitly; previously it was computed above but never
# used, so the loader silently fell back to its own default worker count
data = ImageClassifierData.from_paths(OUTPATH, val_name='val', tfms=tfms, bs=bs,
                                      num_workers=num_workers)

## Architecture

Further research into the exploration of 2 / 3 conv_block layers. What size should the channels be? What happens under different size configurations? Check out this paper:

https://arxiv.org/abs/1605.07146

- **It may not be good to decrease and then increase the number of channels**, even though it permits more layers
- paper reviews best practices for choosing GPUs and configurations.

In [10]:
# ===============================================
# Architecture
# ===============================================
'''
Darknet
| - ResLayer
      | - Conv_layer
'''

def conv_layer(ni, nf, ks=3, stride=1):
    """
    Build one convolution block: Conv2d -> BatchNorm2d -> LeakyReLU.

    The three standard modules are simply chained in an nn.Sequential,
    so no custom nn.Module subclass is needed.

    ni:     number of input channels
    nf:     number of filters (output channels)
    ks:     kernel size; padding is ks//2
    stride: stride of the convolution

    Returns the nn.Sequential holding the three modules, in that order.
    """
    # no bias on the convolution; the batch-norm that follows has its own shift
    conv = nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=ks//2, bias=False)
    norm = nn.BatchNorm2d(nf, momentum=0.01)
    # in-place activation: overwrites the batch-norm output instead of allocating
    act = nn.LeakyReLU(negative_slope=0.1, inplace=True)
    return nn.Sequential(conv, norm, act)


class ResLayer(nn.Module):
    """
    This is the standard res layer (resnet). The key feature
    is adding the input to the convolved input. This
    is found in the forward()
    
    OUTPUT = INPUT + CONV_BLOCK2(CONV_BLOCK1(INPUT))
    
    ni : number of input channels (the output has the same number)
    
    Number of channels: conv1 (1x1 kernel) squashes to ni//2,
    conv2 (3x3 kernel) restores ni
    
    EXAMPLE:
    --------
        64 input => conv1 => 32 => conv2 => 64
    """
    def __init__(self, ni):
        super().__init__()
        self.conv1=conv_layer(ni, ni//2, ks=1)
        self.conv2=conv_layer(ni//2, ni, ks=3)
        
    def forward(self, x):
        """Return x + conv2(conv1(x)) as a new tensor; x itself is left untouched."""
        # Out-of-place add on purpose: x.add_() would overwrite the caller's
        # tensor, which conv1 (and the layer that produced x) keep for the
        # backward pass; autograd then fails with "modified by an inplace operation".
        return x + self.conv2(self.conv1(x))

    
class Darknet(nn.Module):
    """
    Stack of conv_layer / ResLayer groups followed by global average
    pooling, a flatten and one linear classification layer.
    """
    def make_group_layer(self, ch_in, num_blocks, stride=1):
        """
        Build the list of modules for one group.

        ch_in:      channels coming into the group
        num_blocks: how many ResLayers follow the leading conv_layer
        stride:     stride of the leading conv_layer

        A. conv_layer:
        1. doubles the number of channels (ch_in -> ch_in*2)
        2. halves the grid size when stride=2

        B. collection of ResLayers
        3. num_blocks ResLayers at a constant ch_in*2 channels

        Returns a plain Python list (not an nn.Sequential).
        """
        ch_out = ch_in*2
        group = [conv_layer(ch_in, ch_out, stride=stride)]
        group.extend(ResLayer(ch_out) for _ in range(num_blocks))
        return group

    def __init__(self, num_blocks, num_classes, nf=32):
        """
        Expects a creation such as

        model = Darknet([1,2,4,6,3], num_classes=10, nf=32)

        which builds

        -> Initial conv_layer (3 -> 32 channels)
        -> group [ conv_layer + 1 x ResLayer ]  32 ->   64 channels
        -> group [ conv_layer + 2 x ResLayer ]  64 ->  128 channels
        -> group [ conv_layer + 4 x ResLayer ] 128 ->  256 channels
        -> group [ conv_layer + 6 x ResLayer ] 256 ->  512 channels
        -> group [ conv_layer + 3 x ResLayer ] 512 -> 1024 channels
        -> AdaptiveAvgPool
        -> Flatten
        -> Fully Connected Linear Layer

        num_blocks:  number of ResLayers in each group, one entry per group
        num_classes: size of the final linear output
        nf:          channels produced by the initial conv_layer
        """
        super().__init__()
        modules = [conv_layer(3, nf, ks=3, stride=1)]
        width = nf
        for idx, n_res in enumerate(num_blocks):
            # the group at index 1 (the second group) keeps stride 1; every
            # other group, including the first, uses stride 2
            # NOTE(review): if the intent was to spare the *first* group so
            # small images are not halved right away, this should test idx == 0 — confirm
            group_stride = 1 if idx == 1 else 2
            modules.extend(self.make_group_layer(width, n_res, stride=group_stride))
            # each group doubles the channel count
            width = width*2
        modules.extend([nn.AdaptiveAvgPool2d(1), Flatten(), nn.Linear(width, num_classes)])
        self.layers = nn.Sequential(*modules)

    def forward(self, x): return self.layers(x)

In [11]:
# create an initial model
m = Darknet([1, 2, 4, 6, 3], num_classes=10, nf=32)

# spread batches over GPUs 1-3, but only over the ones that actually exist:
# asking DataParallel for device ids the machine does not have fails, so on a
# box with fewer GPUs the model is left unwrapped and runs on the default device
gpu_ids = [i for i in (1, 2, 3) if i < torch.cuda.device_count()]
if len(gpu_ids) > 1: m = nn.DataParallel(m, gpu_ids)

# set the learning rate
lr = 1.3

# wrap the model and the data in a fastai learner
learn = ConvLearner.from_model_data(m, data)
learn.crit = nn.CrossEntropyLoss()
learn.metrics = [accuracy]
# weight decay, handed to learn.fit through its wds argument
wd=1e-4

In [15]:
# train the model; %time reports how long the call takes
# fit arguments: learning rate lr, 1 cycle, weight decay wd, cycle_len=30
# (presumably 30 epochs in the single cycle — confirm)
# NOTE(review): use_clr_beta presumably configures fastai's one-cycle schedule as
# (lr divisor, % of the cycle spent on the final annealing, max momentum, min momentum)
# — confirm against the fastai source
%time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(20, 20, 0.95, 0.85))

### Key Features used

- Tesla GPUs on an AWS P3 instance (8 GPUs)
- 16-bit floating point (half precision), using Volta's half-precision capabilities
- single-cycle learning rate


#### Reminder of one cycle learning

- Creates an upward path as long as the downward path
- Choose a ratio between two numbers, x/y
- Choose what % of your epochs is spent going from the low part of the triangle down to zero
- Momentum is also cycled, and it moves inversely to the learning rate.
<img src='https://snag.gy/hxUyw1.jpg' style='width:600px'>
