## Example: using ClassifierTrainer to train a model on the CIFAR dataset


### Imports
We import the `ClassifierTrainer` class, as well as the learning rate scheduler functions, from `PyTorchTrainer`.

In [11]:
from PyTorchTrainer import (
    ClassifierTrainer,
    get_cosine_lr_scheduler,
    get_multiplicative_lr_scheduler
)
import os, sys, getopt, pickle
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn as nn

### Model architecture
Below we define a CNN architecture.

Note that we also implement the `get_parameters()` method, which returns two lists: the first list contains the parameters (such as BN parameters) to which we don't want to apply weight decay regularization during training, and the second list contains the parameters to which we do want to apply weight decay regularization.

Note that if `get_parameters()` is not implemented and the weight decay coefficient is not 0, then weight decay will be applied to all parameters during training when using the `ClassifierTrainer`.

In [12]:
class AllCNN(nn.Module):
    """All-convolutional image classifier with batch normalization.

    The network consists of three convolutional blocks followed by a 1x1
    convolutional classifier head. The head's output is averaged over its
    two trailing dimensions, giving one score per class.

    Args:
        n_class: number of output channels of the classifier head, i.e. the
            number of classes to predict.
    """

    def __init__(self, n_class):
        super(AllCNN, self).__init__()

        # Block 1: 3 -> 36 -> 96 -> 96 channels; the last conv uses stride 2.
        self.block1 = nn.ModuleList([
            nn.Conv2d(in_channels=3, out_channels=36, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(36),
            nn.ReLU(),
            nn.Conv2d(in_channels=36, out_channels=96, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(96),
            nn.ReLU(),
            nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(96),
            nn.ReLU(),
            nn.Dropout(0.2),
        ])

        # Block 2: 96 -> 192 -> 192 -> 192 channels; the last conv uses stride 2.
        self.block2 = nn.ModuleList([
            nn.Conv2d(in_channels=96, out_channels=192, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(192),
            nn.ReLU(),
            nn.Conv2d(in_channels=192, out_channels=192, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(192),
            nn.ReLU(),
            nn.Conv2d(in_channels=192, out_channels=192, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(192),
            nn.ReLU(),
            nn.Dropout(0.2),
        ])

        # Block 3: a 3x3 conv followed by a 1x1 conv, both with 192 channels.
        self.block3 = nn.ModuleList([
            nn.Conv2d(in_channels=192, out_channels=192, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(192),
            nn.ReLU(),
            nn.Conv2d(in_channels=192, out_channels=192, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(192),
            nn.ReLU(),
        ])

        # Classifier head: a 1x1 conv producing one channel per class.
        # NOTE(review): the head ends with BatchNorm + ReLU, so the scores
        # returned by forward() are never negative -- confirm this is intended.
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=192, out_channels=n_class, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(n_class),
            nn.ReLU(),
        ])

    def forward(self, x):
        """Apply every layer in order, then average over the last two dims.

        Args:
            x: input tensor; the first convolution expects 3 input channels.

        Returns:
            Tensor with the two trailing dimensions averaged away.
        """
        for block in (self.block1, self.block2, self.block3, self.classifier):
            for layer in block:
                x = layer(x)
        # Global average pooling over the two trailing (spatial) dimensions.
        x = x.mean(dim=-1).mean(dim=-1)
        return x

    def initialize(self,):
        """Re-initialize the weights of all conv, linear and batch-norm layers.

        Conv2d/Linear weights get Kaiming-normal initialization and zero
        biases; BatchNorm2d layers get weight 1 and bias 0. Parameters that
        are absent (e.g. a layer built with ``bias=False``) are skipped.
        """
        for layer in self.modules():
            if isinstance(layer, (nn.Conv2d, nn.Linear)):
                nn.init.kaiming_normal_(layer.weight)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0.0)
            elif isinstance(layer, nn.BatchNorm2d):
                # weight/bias are None when the layer is built with affine=False
                if layer.weight is not None:
                    nn.init.constant_(layer.weight, 1)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0.0)

    def get_parameters(self,):
        """Split the parameters into batch-norm and non-batch-norm groups.

        The groups are derived from the module types rather than from
        hard-coded layer indices, so they stay correct if the architecture
        is edited.

        Returns:
            Tuple ``(bn_params, other_params)``: the first list holds the
            parameters owned by BatchNorm2d layers, the second list holds
            every other parameter, both in module registration order.
        """
        bn_params, other_params = [], []
        for module in self.modules():
            is_bn = isinstance(module, nn.BatchNorm2d)
            group = bn_params if is_bn else other_params
            # recurse=False: take only the parameters this module owns directly,
            # so container modules do not add their children's parameters again.
            group.extend(module.parameters(recurse=False))

        return bn_params, other_params

### Dataset loader objects

Below we implement the function that returns the dataloaders for the CIFAR dataset.

In [13]:
def get_dataset(dataset):
    """Build the train and test data loaders for a CIFAR dataset.

    Args:
        dataset: ``'cifar10'`` selects CIFAR-10; any other value selects
            CIFAR-100.

    Returns:
        Tuple ``(train_loader, test_loader, n_class)`` where ``n_class`` is
        10 for CIFAR-10 and 100 otherwise.
    """
    # Per-channel statistics given on the 0-255 scale, rescaled by 1/255.
    # NOTE(review): the same statistics are used for both datasets -- confirm
    # this is intended for CIFAR-100.
    channel_mean = [value / 255 for value in [125.3, 123.0, 113.9]]
    channel_std = [value / 255 for value in [63.0, 62.1, 66.7]]

    # Training images are randomly flipped and cropped before normalization.
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

    # Test images are only converted to tensors and normalized.
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

    # Pick the dataset class and its class count once, then build both splits.
    if dataset == 'cifar10':
        dataset_cls, n_class = torchvision.datasets.CIFAR10, 10
    else:
        dataset_cls, n_class = torchvision.datasets.CIFAR100, 100

    trainset = dataset_cls(root='.', train=True,
                           download=True, transform=train_transform)
    testset = dataset_cls(root='.', train=False,
                          download=True, transform=test_transform)

    # Only the training loader shuffles its samples.
    train_loader = DataLoader(trainset, batch_size=64, pin_memory=True, shuffle=True)
    test_loader = DataLoader(testset, batch_size=64, pin_memory=True, shuffle=False)

    return train_loader, test_loader, n_class

### Main code

For the main code, we will:
- get dataloaders
- create a model instance
- create a trainer and train the model using the dataloaders

In [14]:
# Build the CIFAR-10 data loaders; n_class is the number of classes.
train_loader, test_loader, n_class = get_dataset(dataset='cifar10')

# Create the network with one output per class.
model = AllCNN(n_class=n_class)


Files already downloaded and verified
Files already downloaded and verified


### trainer options

The `ClassifierTrainer` class comes with many options. The only required argument is `n_epoch`, which specifies the number of epochs to train the model. By default, `ClassifierTrainer` uses a learning rate scheduler that gradually reduces the learning rate following a cosine function. The default starting learning rate is `1e-3` and the default final learning rate is `1e-5`. To specify different starting and ending learning rates for the cosine learning rate scheduler, we can use the convenience function `get_cosine_lr_scheduler(initial_lr, final_lr)`.

For example, we will create a trainer object that trains for 200 epochs with a cosine learning rate scheduler going from 0.01 down to 0.0001.

In [15]:
# Trainer with a cosine learning rate schedule going from 1e-2 down to 1e-4.
trainer = ClassifierTrainer(
    n_epoch=200,
    lr_scheduler=get_cosine_lr_scheduler(1e-2, 1e-4),
)

Alternatively, we can specify a multiplicative learning rate schedule with an initial learning rate, the epoch indices at which the learning rate drops, and the multiplication factor, using the convenience function `get_multiplicative_lr_scheduler(init_lr, drop_at, multiplicative_factor)`.

For example, below we specify a learning rate scheduler that starts with a learning rate of `1e-2` and drops it by a factor of `0.1` at epochs 30, 60, and 90.

In [16]:
# Trainer with a multiplicative schedule: start at 1e-2 and multiply the
# learning rate by 0.1 at epochs 30, 60 and 90.
trainer = ClassifierTrainer(
    n_epoch=200,
    lr_scheduler=get_multiplicative_lr_scheduler(1e-2, [30, 60, 90], 0.1),
)

The `ClassifierTrainer` class has other options, such as saving checkpoints at a specified frequency `checkpoint_freq` (provided that `temp_dir` is specified) and training from a particular checkpoint `epoch_idx` if it exists. Please consult the complete description of the interface to learn more.

After the trainer is constructed, we can train the model using `fit()`. This function returns a dictionary that contains the following keys:

`train_acc`, `train_cross_entropy`, 

`val_acc`, `val_cross_entropy` if `val_loader` is not None

`test_acc`, `test_cross_entropy` if `test_loader` is not None

Each key maps to a list of the corresponding metric measured at each epoch. For example, `performance['train_acc']` is the training accuracy curve recorded during training.


In [None]:
# Train on the GPU when CUDA is available; otherwise fall back to the CPU so
# that the example still runs on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# No validation loader is given, so only the train and test metrics are
# expected in the returned dictionary.
performance = trainer.fit(model=model,
                          train_loader=train_loader,
                          val_loader=None,
                          test_loader=test_loader,
                          device=device)