# Assignment 3 Part 1: Developing Your Own Classifier

In [None]:
# Mount Google Drive inside the Colab VM; the Drive contents become reachable
# under /content/gdrive for the cells that follow.
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
import os

# Anchor the project path to the parent of the Drive mount point
# ("/content/gdrive", see drive.mount) rather than to os.getcwd(). This cell
# changes the working directory itself, so deriving the path from the current
# directory made a second execution build a nested, non-existent path and fail.
HOME_DIR = "/content"
PWD = os.path.join(HOME_DIR, 'gdrive', 'MyDrive', 'Colab Notebooks', 'CS498', 'assignment3_part1')
print(PWD)
# Relative paths used later (tar archives, classifier.py, saved weights) resolve against this folder.
os.chdir(PWD)

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torchvision

from torchvision import transforms
from sklearn.metrics import average_precision_score
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from kaggle_submission import output_submission_csv
from voc_dataloader import VocDataset, VOC_CLASSES

%matplotlib inline
%reload_ext autoreload
%autoreload 1

%aimport classifier

In [None]:
import shutil
# Copy the train/val archive from the working directory to the VM's local disk and unpack it there.
shutil.copyfile("VOCtrainval_06-Nov-2007.tar", "/content/VOCtrainval_06-Nov-2007.tar")
!tar -xf "/content/VOCtrainval_06-Nov-2007.tar" -C "/content/" 
# Rename the extracted tree so the test archive (unpacked next into /content/ as well) does not collide with it.
shutil.move("/content/VOCdevkit/", "/content/VOCdevkit_2007")

# Same procedure for the test archive.
shutil.copyfile("VOCtest_06-Nov-2007.tar", "/content/VOCtest_06-Nov-2007.tar")
!tar -xf "/content/VOCtest_06-Nov-2007.tar" -C "/content/" 
# Place the test split next to the train/val split as VOCdevkit_2007/VOC2007test.
# NOTE(review): the moves assume the destinations do not exist yet, so this cell
# looks intended to run once per fresh VM -- confirm before re-running.
shutil.move("/content/VOCdevkit/VOC2007", "/content/VOCdevkit_2007/VOC2007test")

# Part 1B: Design your own network

In this notebook, your task is to create and train your own model for multi-label classification on VOC Pascal.

## What to do
1. You will make changes to the network architecture in ```classifier.py```.
2. You may also want to change other hyperparameters to assist your training and get better performance. Hints are given in the instructions below.

## What to submit
Check the submission template for details on what to submit. 

## Utilities

Below, we only show the utilities that were not provided in the starter code.

In [None]:
def train_classifier(train_loader, classifier, criterion, optimizer):
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss.

    Args:
        train_loader: Iterable yielding ``(images, labels)`` tensor batches.
        classifier: Module being trained; it is switched to training mode.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        optimizer: Optimizer that updates ``classifier``'s parameters.

    Returns:
        float: Unweighted mean of the per-batch loss values seen in this pass.

    Note:
        Every batch is moved to the notebook-level ``device`` before the
        forward pass.
    """
    classifier.train()
    batch_losses = []
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = classifier(images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        # Keep only a detached copy. Storing `loss` itself would keep each
        # batch's autograd history referenced for the whole epoch and make the
        # final mean below a tracked operation as well.
        batch_losses.append(loss.detach())
    return torch.stack(batch_losses).mean().item()

In [None]:
def test_classifier(test_loader, classifier, criterion, print_ind_classes=True, print_total=True):
    classifier.eval()
    losses = []
    with torch.no_grad():
        y_true = np.zeros((0,21))
        y_score = np.zeros((0,21))
        for i, (images, labels) in enumerate(test_loader):
            images, labels = images.to(device), labels.to(device)
            logits = classifier(images)
            y_true = np.concatenate((y_true, labels.cpu().numpy()), axis=0)
            y_score = np.concatenate((y_score, logits.cpu().numpy()), axis=0)
            loss = criterion(logits, labels)
            losses.append(loss.item())
        aps = []
        # ignore first class which is background
        for i in range(1, y_true.shape[1]):
            ap = average_precision_score(y_true[:, i], y_score[:, i])
            if print_ind_classes:
                print('-------  Class: {:<12}     AP: {:>8.4f}  -------'.format(VOC_CLASSES[i], ap))
            aps.append(ap)
        
        mAP = np.mean(aps)
        test_loss = np.mean(losses)
        if print_total:
            print('mAP: {0:.4f}'.format(mAP))
            print('Avg loss: {}'.format(test_loss))
        
    return mAP, test_loss, aps

In [None]:
def plot_losses(train, val, test_frequency, num_epochs):
    """Draw the training-loss curve with the validation-loss curve overlaid.

    Args:
        train: One training-loss value per epoch.
        val: One validation-loss value per evaluated epoch.
        test_frequency: Evaluation period in epochs.
        num_epochs: Total number of epochs that were run.

    The training curve is plotted against its implicit 0-based index. The
    validation curve is plotted at the 0-based epochs where evaluation ran:
    the first epoch and every ``test_frequency``-th one.
    """
    plt.plot(train, label="train")

    def _was_evaluated(epoch):
        return (epoch + 1) % test_frequency == 0 or epoch == 0

    eval_epochs = list(filter(_was_evaluated, range(num_epochs)))
    plt.plot(eval_epochs, val, label="val")

    plt.title("Loss Plot")
    plt.ylabel("Loss")
    plt.xlabel("Epoch")
    plt.legend()
    plt.show()
    
def plot_mAP(train, val, test_frequency, num_epochs):
    """Draw the training and validation mAP curves on shared axes.

    Args:
        train: One training mAP value per evaluated epoch.
        val: One validation mAP value per evaluated epoch.
        test_frequency: Evaluation period in epochs.
        num_epochs: Total number of epochs that were run.

    Both curves are plotted at the 0-based epochs where evaluation ran: the
    first epoch and every ``test_frequency``-th one.
    """
    eval_epochs = []
    for epoch in range(num_epochs):
        if (epoch + 1) % test_frequency == 0 or epoch == 0:
            eval_epochs.append(epoch)

    for series, name in ((train, "train"), (val, "val")):
        plt.plot(eval_epochs, series, label=name)

    plt.title("mAP Plot")
    plt.ylabel("mAP")
    plt.xlabel("Epoch")
    plt.legend()
    plt.show()
    

In [None]:
def train(classifier, num_epochs, train_loader, val_loader, criterion, optimizer, test_frequency=5):
    """Train ``classifier`` for ``num_epochs`` epochs with periodic evaluation.

    Args:
        classifier: Model to train.
        num_epochs: Number of passes over ``train_loader``.
        train_loader: Loader used for training and for the training-set mAP.
        val_loader: Loader used for the validation mAP and loss.
        criterion: Loss callable handed to the train/evaluate helpers.
        optimizer: Optimizer handed to the training helper.
        test_frequency: Evaluate on epoch 1 and on every epoch divisible by
            this value.

    Returns:
        tuple: ``(classifier, train_losses, val_losses, train_mAPs, val_mAPs)``.
        ``train_losses`` has one entry per epoch; the other three lists have
        one entry per evaluated epoch.
    """
    train_losses = []
    train_mAPs = []
    val_losses = []
    val_mAPs = []

    for epoch in range(1, num_epochs + 1):
        print("Starting epoch number " + str(epoch))
        train_loss = train_classifier(train_loader, classifier, criterion, optimizer)
        train_losses.append(train_loss)
        print("Loss for Training on Epoch " +str(epoch) + " is "+ str(train_loss))
        if epoch % test_frequency == 0 or epoch == 1:
            # Announce the evaluation before it starts, so the message precedes
            # the per-class report printed by the validation pass.
            print('Evaluating classifier')
            # Training-set pass is silent (both print flags off); only its mAP is kept.
            mAP_train, _, _ = test_classifier(train_loader, classifier, criterion, False, False)
            train_mAPs.append(mAP_train)
            mAP_val, val_loss, _ = test_classifier(val_loader, classifier, criterion)
            print("Mean Precision Score for Testing on Epoch " +str(epoch) + " is "+ str(mAP_val))
            val_losses.append(val_loss)
            val_mAPs.append(mAP_val)

    return classifier, train_losses, val_losses, train_mAPs, val_mAPs

In [None]:
def summary(net, input_size=(1, 3, 227, 227)):
    """Print the output shape after every layer of ``net`` and return ``net``.

    A random tensor is pushed through ``net.features`` layer by layer, then
    through ``net.bridge``, then through ``net.classifier`` layer by layer,
    printing the shape after each step.

    Args:
        net: Module exposing iterable ``features`` and ``classifier``
            containers and a callable ``bridge``.
        input_size: Shape of the random probe tensor. The default matches the
            227x227 RGB crops produced by the notebook's transforms.

    Returns:
        The same ``net`` object, so the call can be chained.

    Note:
        The probe runs in evaluation mode with autograd disabled, so the
        random input cannot update normalisation statistics or trigger
        dropout. Each module's original training flag is restored afterwards.
    """
    def _report(layer, tensor):
        print("{:<15}".format(layer.__class__.__name__), 'output shape: \t', tensor.shape)

    # Record the mode of every sub-module so it can be restored exactly.
    saved_modes = [(module, module.training) for module in net.modules()]
    net.eval()
    try:
        with torch.no_grad():
            X = torch.rand(size=input_size, dtype=torch.float32)
            pipeline = "Model summary"
            print("=" * len(pipeline))
            print(pipeline)
            print("=" * len(pipeline))
            print()

            pipeline = "Feature pipeline"
            print(pipeline)
            print('-' * len(pipeline))
            print()
            for layer in net.features:
                X = layer(X)
                _report(layer, X)

            layer = net.bridge
            X = layer(X)
            _report(layer, X)
            print()

            pipeline = "Classification pipeline"
            print(pipeline)
            print('-' * len(pipeline))
            print()
            for layer in net.classifier:
                X = layer(X)
                _report(layer, X)
    finally:
        for module, was_training in saved_modes:
            module.training = was_training

    return net

# Developing Your Own Model


### Goal
To meet the benchmark for this assignment you will need to improve the network. You should have noticed that pretrained AlexNet performs really well, but AlexNet trained from scratch performs much worse. We hope you can design an architecture that, when trained from scratch, outperforms both the simple classifier and AlexNet.

### How to start
You may take inspiration from other published architectures and architectures discussed in lecture. However, you are NOT allowed to use predefined models (e.g. models from torchvision) or use pretrained weights. Training must be done from scratch with your own custom model.

#### Some hints
There are a variety of different approaches you should try to improve performance from the simple classifier:

* Network architecture changes
    * Number of layers: try adding layers to make your network deeper
    * Batch normalization: adding batch norm between layers will likely give you a significant performance increase
    * Residual connections: as you increase the depth of your network, you will find that having residual connections like those in ResNet architectures will be helpful
* Optimizer: Instead of plain SGD, you may want to add a learning rate schedule, add momentum, or use one of the other optimizers you have learned about like Adam. Check the `torch.optim` package for other optimizers
* Data augmentation: You should use the `torchvision.transforms` module to try adding random resized crops and horizontal flips of the input data. Check `transforms.RandomResizedCrop` and `transforms.RandomHorizontalFlip` for this. Feel free to apply more [transforms](https://pytorch.org/docs/stable/torchvision/transforms.html) for data augmentation which can lead to better performance. 
* Epochs: Once you have found a generally good hyperparameter setting try training for more epochs
* Loss function: You might want to add weighting to the `MultiLabelSoftMarginLoss` for classes that are less well represented or experiment with a different loss function



#### Note
We will soon be providing some initial expectations of mAP values as a function of epoch so you can get an early idea whether your implementation works without waiting a long time for training to converge.

### What to submit 
Submit your best model to Kaggle and save all plots for the writeup.


## Data augmentation

To better model the data, we augment it with the transformations shown below, drawing inspiration from [this](https://towardsdatascience.com/improves-cnn-performance-by-applying-data-transformation-bf86b3f4cef4) Medium post. In the color jitter I didn't change the relative concentration of color channels (i.e. saturation or hue) because the `normalize` transformation might need to change if that's the case.

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Per-channel normalisation shared by the training and evaluation pipelines.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training pipeline with random augmentation.
# Inspired from https://towardsdatascience.com/improves-cnn-performance-by-applying-data-transformation-bf86b3f4cef4
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(227),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomHorizontalFlip(p=0.5),
    # Only brightness and contrast are jittered; saturation and hue are left alone.
    transforms.ColorJitter(brightness=0.5, contrast=0.5),
    transforms.ToTensor(),
    normalize,
])

# Deterministic pipeline for validation and test images.
test_transform = transforms.Compose([
    transforms.Resize(227),
    transforms.CenterCrop(227),
    transforms.ToTensor(),
    normalize,
])

# Only the training split is augmented; val and test share the deterministic pipeline.
ds_train = VocDataset('/content/VOCdevkit_2007/VOC2007/', 'train', train_transform)
ds_val = VocDataset('/content/VOCdevkit_2007/VOC2007/', 'val', test_transform)
ds_test = VocDataset('/content/VOCdevkit_2007/VOC2007test/', 'test', test_transform)


In [None]:
# Run-wide settings shared by every experiment in this notebook.
num_epochs = 100      # epochs per experiment
test_frequency = 5    # evaluate on epoch 1 and every 5th epoch
batch_size = 64

# Only the training loader shuffles its samples.
train_loader = torch.utils.data.DataLoader(dataset=ds_train,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=1)

# Evaluation loaders keep a fixed order: the metrics computed on them do not
# depend on sample order, and not shuffling avoids drawing from the random
# number generator during evaluation.
val_loader = torch.utils.data.DataLoader(dataset=ds_val,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=1)

test_loader = torch.utils.data.DataLoader(dataset=ds_test,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          num_workers=1)

# Results

## Network structure

All networks shown below consist of three distinct modular parts: the feature extractor, the classifier, and a bridge between the two. The feature extractor is a CNN. The classifier consists of fully-connected layers. In all our cases the bridge is just a `flatten()` operation, although in principle any arbitrary transformation can be applied (an activation, for example).

Furthermore, we employ both optimization algorithms (SGD and Adam), but only report results for Adam.

## BaseNet
I recoded the base-network in the structure introduced above. The model summary shows the shape of the outputs as the features get refined. 

In [None]:
# Baseline network: summary() prints the per-layer output shapes and returns the
# model, which is then moved to `device`.
from classifier import MySimpleClassifier
classifier = summary(MySimpleClassifier()).to(device)

# Multi-label loss, optimised with Adam.
criterion = nn.MultiLabelSoftMarginLoss()
# SGD alternative, kept for reference (only Adam results are reported):
# optimizer = torch.optim.SGD(classifier.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)

We train the model now, but skip the output for brevity.

In [None]:
# Train the baseline; mAP and validation loss are recorded on epoch 1 and every `test_frequency` epochs.
classifier, train_losses, val_losses, train_mAPs, val_mAPs = train(classifier, num_epochs, train_loader, val_loader, criterion, optimizer, test_frequency)

In [None]:
# Learning curves for the baseline: loss first, then mAP.
plot_losses(train_losses, val_losses, test_frequency, num_epochs)
plot_mAP(train_mAPs, val_mAPs, test_frequency, num_epochs)

In [None]:
# Score the baseline on the test loader (prints per-class AP, mAP and average loss).
mAP_test, test_loss, test_aps = test_classifier(test_loader, classifier, criterion)
print(mAP_test)

In [None]:
# Save the baseline's weights (state_dict only) to the working directory.
torch.save(classifier.state_dict(), './voc_p1b_simple_classifier.pth')

## Version 1

From the base network, we make three changes.

- Inspired by **VGGnet**, we replace the first 5x5 conv layer with 2 3x3 layers. This retains the receptive fields, while decreasing the number of parameters to train.
- Drawing inspiration from **VGGnet** again, we increase the number of channels with depth from 3->64->128->256 (as opposed to the basenet which had decreasing number of channels, from 3->64->32->16). 
- To retain the classifier architecture from Basenet (so as to not make too many changes in one version), we use a 1x1 conv layer and reduce the 256 channels from the previous layers to 16 channels. This 1x1 conv is inspired by **Network in Network (NiN)**. 

In [None]:
# Version 1 network: summary() prints the per-layer output shapes and returns the
# model, which is then moved to `device`.
from classifier import MyClassifierV1
classifier = summary(MyClassifierV1()).to(device)

# Same loss and optimizer settings as the baseline, so only the architecture differs.
criterion = nn.MultiLabelSoftMarginLoss()
# SGD alternative, kept for reference (only Adam results are reported):
# optimizer = torch.optim.SGD(classifier.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)

We train the model now, but skip the output for brevity.

In [None]:
# Train Version 1; mAP and validation loss are recorded on epoch 1 and every `test_frequency` epochs.
classifier, train_losses, val_losses, train_mAPs, val_mAPs = train(classifier, num_epochs, train_loader, val_loader, criterion, optimizer, test_frequency)

In [None]:
# Learning curves for Version 1: loss first, then mAP.
plot_losses(train_losses, val_losses, test_frequency, num_epochs)
plot_mAP(train_mAPs, val_mAPs, test_frequency, num_epochs)

In [None]:
# Score Version 1 on the test loader (prints per-class AP, mAP and average loss).
mAP_test, test_loss, test_aps = test_classifier(test_loader, classifier, criterion)
print(mAP_test)

In [None]:
# Save Version 1's weights (state_dict only) to the working directory.
torch.save(classifier.state_dict(), './voc_p1b_my_classifier_v1_100epochs.pth')

## Version 2

We only see a marginal improvement in Version 1. One possible reason is that the network has enough representational power, but cannot effectively learn because of vanishing ReLU gradients in the deep convolutional layers.

From Version 1, then we make one change.

- Inspired by **GoogleNet (Inception v3)**, add BatchNorm layers after the convolution, but before the ReLU. The idea is to make at least one part of the network effectively learn. 

In [None]:
# Version 2 network: summary() prints the per-layer output shapes and returns the
# model, which is then moved to `device`.
from classifier import MyClassifierV2
classifier = summary(MyClassifierV2()).to(device)

# Same loss and optimizer settings as the earlier versions.
criterion = nn.MultiLabelSoftMarginLoss()
# SGD alternative, kept for reference (only Adam results are reported):
# optimizer = torch.optim.SGD(classifier.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)

We train the model now, but skip the output for brevity.

In [None]:
# Train Version 2; mAP and validation loss are recorded on epoch 1 and every `test_frequency` epochs.
classifier, train_losses, val_losses, train_mAPs, val_mAPs = train(classifier, num_epochs, train_loader, val_loader, criterion, optimizer, test_frequency)

In [None]:
# Learning curves for Version 2: loss first, then mAP.
plot_losses(train_losses, val_losses, test_frequency, num_epochs)
plot_mAP(train_mAPs, val_mAPs, test_frequency, num_epochs)

In [None]:
# Score Version 2 on the test loader (prints per-class AP, mAP and average loss).
mAP_test, test_loss, test_aps = test_classifier(test_loader, classifier, criterion)
print(mAP_test)

In [None]:
# Save Version 2's weights (state_dict only) to the working directory.
torch.save(classifier.state_dict(), './voc_p1b_my_classifier_v2_100epochs.pth')

## Version 3

We see a good improvement in mAP for Version 2. But here, there may be a chance to further improve generalization and learning. The reduction of 256 channels to 16 channels in the last layer using 1x1 convolutions seems aggressive---the classifier might be getting data that is too correlated.

To prevent this, we 

- Use 1x1 convolutions to reduce 256 channels to 128 channels. To accommodate these additional degrees of freedom, we make the classifier bulkier by adding width to the linear layers. This has its roots in **AlexNet**, where the MLP is very wide. Here we choose to map the 128 x 26 x 26 = 86528 outputs onto a 1024-wide linear layer, which in turn maps onto a 420-wide linear layer, which then maps to the 21 scores. The widths were chosen to keep the same ratio of neurons between consecutive layers as in Version 2.

In [None]:
# Version 3 network: summary() prints the per-layer output shapes and returns the
# model, which is then moved to `device`.
from classifier import MyClassifierV3
classifier = summary(MyClassifierV3()).to(device)

# Same loss and optimizer settings as the earlier versions.
criterion = nn.MultiLabelSoftMarginLoss()
# SGD alternative, kept for reference (only Adam results are reported):
# optimizer = torch.optim.SGD(classifier.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)

We train the model now, but skip the output for brevity.

In [None]:
# Train Version 3; mAP and validation loss are recorded on epoch 1 and every `test_frequency` epochs.
classifier, train_losses, val_losses, train_mAPs, val_mAPs = train(classifier, num_epochs, train_loader, val_loader, criterion, optimizer, test_frequency)

In [None]:
# Learning curves for Version 3: loss first, then mAP.
plot_losses(train_losses, val_losses, test_frequency, num_epochs)
plot_mAP(train_mAPs, val_mAPs, test_frequency, num_epochs)

In [None]:
# Score Version 3 on the test loader (prints per-class AP, mAP and average loss).
mAP_test, test_loss, test_aps = test_classifier(test_loader, classifier, criterion)
print(mAP_test)

In [None]:
# Save Version 3's weights (state_dict only) to the working directory.
torch.save(classifier.state_dict(), './voc_p1b_my_classifier_v3_100epochs.pth')
# Hand the per-class test APs to the course helper; presumably this writes the
# Kaggle submission file -- see kaggle_submission.py to confirm.
output_submission_csv('my_solution_for_v3.csv', test_aps)

## Version 4

We see a marginal improvement in mAP for Version 3. However, what is more concerning is that the generalization decreases, as is evident from the growing gap between training and validation losses. The probable reason is that by increasing the width of the linear layers, we have added too many degrees of freedom and the network starts overfitting. Additionally, these degrees of freedom imply that we need to train the network for a longer duration.

While there are several ways to prevent overfitting in the classification layers (dropout, L2 loss regularization), we also need means to reduce the degrees of freedom. We achieve this by

- Retaining the classifier structure from V3 (for its generalization capabilities), but adding an average pooling layer, inspired by **GoogleNet (Inception v3)** and **NiN**, at the end of the feature extractor. The average pooling size is 5x5 to recover roughly the same number of degrees of freedom as the V2 architecture. 

In [None]:
# Version 4 network: summary() prints the per-layer output shapes and returns the
# model, which is then moved to `device`.
from classifier import MyClassifierV4
classifier = summary(MyClassifierV4()).to(device)

# Same loss and optimizer settings as the earlier versions.
criterion = nn.MultiLabelSoftMarginLoss()
# SGD alternative, kept for reference (only Adam results are reported):
# optimizer = torch.optim.SGD(classifier.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)

We train the model now, but skip the output for brevity.

In [None]:
# Train Version 4; mAP and validation loss are recorded on epoch 1 and every `test_frequency` epochs.
classifier, train_losses, val_losses, train_mAPs, val_mAPs = train(classifier, num_epochs, train_loader, val_loader, criterion, optimizer, test_frequency)

In [None]:
# Learning curves for Version 4: loss first, then mAP.
plot_losses(train_losses, val_losses, test_frequency, num_epochs)
plot_mAP(train_mAPs, val_mAPs, test_frequency, num_epochs)

In [None]:
# Score Version 4 on the test loader (prints per-class AP, mAP and average loss).
mAP_test, test_loss, test_aps = test_classifier(test_loader, classifier, criterion)
print(mAP_test)

In [None]:
# Save Version 4's weights (state_dict only) to the working directory.
torch.save(classifier.state_dict(), './voc_p1b_my_classifier_v4_100epochs.pth')
# Hand the per-class test APs to the course helper; presumably this writes the
# Kaggle submission file -- see kaggle_submission.py to confirm.
output_submission_csv('my_solution_for_v4.csv', test_aps)

## Version 5

We once again see a marginal improvement in mAP for Version 4. The network also seems to perform favorably (i.e. both training and validation loss decrease together).

To test the effects of potential overfitting in the classification layers, we now

- Add Dropout in the MLP inspired by **AlexNet**

In [None]:
# Version 5 network: summary() prints the per-layer output shapes and returns the
# model, which is then moved to `device`.
from classifier import MyClassifierV5
classifier = summary(MyClassifierV5()).to(device)

# Same loss and optimizer settings as the earlier versions.
criterion = nn.MultiLabelSoftMarginLoss()
# SGD alternative, kept for reference (only Adam results are reported):
# optimizer = torch.optim.SGD(classifier.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)

We train the model now, but skip the output for brevity.

In [None]:
# Train Version 5; mAP and validation loss are recorded on epoch 1 and every `test_frequency` epochs.
classifier, train_losses, val_losses, train_mAPs, val_mAPs = train(classifier, num_epochs, train_loader, val_loader, criterion, optimizer, test_frequency)

In [None]:
# Learning curves for Version 5: loss first, then mAP.
plot_losses(train_losses, val_losses, test_frequency, num_epochs)
plot_mAP(train_mAPs, val_mAPs, test_frequency, num_epochs)

In [None]:
# Score Version 5 on the test loader (prints per-class AP, mAP and average loss).
mAP_test, test_loss, test_aps = test_classifier(test_loader, classifier, criterion)
print(mAP_test)

In [None]:
# Save Version 5's weights (state_dict only) under the "best classifier" file name.
torch.save(classifier.state_dict(), './voc_best_classifier.pth')
# Hand the per-class test APs to the course helper; presumably this writes the
# Kaggle submission file -- see kaggle_submission.py to confirm.
output_submission_csv('my_solution.csv', test_aps)

In [None]:
# Restore the saved weights into the current model. map_location remaps the
# stored tensors onto the active `device`, so a checkpoint written on a GPU
# runtime can also be loaded on a CPU-only runtime.
classifier.load_state_dict(torch.load(
    './voc_best_classifier.pth',
    map_location=device,
))

In [None]:
# Version 6 network (defined in classifier.py; its design is not described in the
# surrounding notes). summary() prints the per-layer output shapes and returns
# the model, which is then moved to `device`.
from classifier import MyClassifierV6
classifier = summary(MyClassifierV6()).to(device)

# Same loss and optimizer settings as the earlier versions.
criterion = nn.MultiLabelSoftMarginLoss()
# SGD alternative, kept for reference (only Adam results are reported):
# optimizer = torch.optim.SGD(classifier.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)

In [None]:
# Train Version 6; mAP and validation loss are recorded on epoch 1 and every `test_frequency` epochs.
classifier, train_losses, val_losses, train_mAPs, val_mAPs = train(classifier, num_epochs, train_loader, val_loader, criterion, optimizer, test_frequency)

In [None]:
# Learning curves for Version 6: loss first, then mAP.
plot_losses(train_losses, val_losses, test_frequency, num_epochs)
plot_mAP(train_mAPs, val_mAPs, test_frequency, num_epochs)

In [None]:
# Score Version 6 on the test loader (prints per-class AP, mAP and average loss).
mAP_test, test_loss, test_aps = test_classifier(test_loader, classifier, criterion)
print(mAP_test)

Finally, we pick the best model among all (v4) and report it in Kaggle.