# Regularization layers

Some code borrowed from [this tutorial](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html)

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms

import numpy as np

# Pick one of the visible GPUs at random (to spread the load between users of
# a shared machine). The number of GPUs is queried instead of assumed, and we
# fall back to the CPU when CUDA is unavailable so the notebook still runs.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{np.random.randint(torch.cuda.device_count())}')
else:
    device = torch.device('cpu')
print(device)

Note: if you get GPU out of memory errors later in the code and you believe your model isn't that heavy, check out the available GPU memory on other devices by invoking the command:

```
!nvidia-smi
```

Then, select the suitable GPU (0, 1, 2 or 3). E.g. if you want to select GPU 2, run this:

```
device = torch.device('cuda:2')
```

## Getting the dataset (CIFAR-10)

Here we'll download and open the data from the CIFAR-10 dataset. (For more details see [this link](https://pytorch.org/docs/stable/torchvision/datasets.html).)

In [None]:
# The only preprocessing is the conversion of each image to a tensor.
transform = transforms.ToTensor()

BATCH_SIZE = 256
DATA_ROOT = '../../../share/2.8_NetworkRegularization/data/'

# Training split: shuffled, served in batches of BATCH_SIZE.
trainset = torchvision.datasets.CIFAR10(
    root=DATA_ROOT, train=True, download=True, transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2,
)

# Test split: fixed order, larger batches (no gradients are needed for it).
testset = torchvision.datasets.CIFAR10(
    root=DATA_ROOT, train=False, download=True, transform=transform,
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=1024, shuffle=False, num_workers=2,
)

# Human-readable class names, indexed by the integer label.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

In [None]:
import matplotlib.pyplot as plt
import matplotlib

In [None]:
# Show the first 16 images of one training batch as a 4x4 mosaic and print
# the matching class names in the same 4x4 layout.
batch_x, batch_y = next(iter(trainloader))
batch_x, batch_y = batch_x[:16], batch_y[:16]

mosaic = (
    batch_x
    .permute(0, 2, 3, 1)          # channels last: (16, 32, 32, 3)
    .reshape(4, 4, 32, 32, 3)     # (grid row, grid col, height, width, channel)
    .permute(0, 2, 1, 3, 4)       # (grid row, height, grid col, width, channel)
    .reshape(128, 128, 3)         # one 128x128 RGB image
)
plt.imshow(mosaic)

label_names = np.array(classes)[batch_y.numpy()]
print(label_names.reshape(4, 4))

## Defining the model

In [None]:
import torch.nn as nn
import torch.nn.functional as F


# We are going to build a model from several convolutional blocks.
# I.e. it's going to be:
#
#       [Conv2d -> Conv2d -> MaxPool2d] x 4
#     
# So why don't we define such a block as a separate Module?
class ConvBlock(nn.Module):
    """Conv2d -> [BatchNorm2d] -> ReLU -> Conv2d -> [BatchNorm2d] -> ReLU -> MaxPool2d.

    Args:
        in_channels: number of input channels of the 1st convolution.
        interm_channels: outputs of the 1st / inputs of the 2nd convolution.
        out_channels: number of output channels of the 2nd convolution.
        use_batchnorm: when truthy, a BatchNorm2d layer follows each convolution.
        initialization: callable applied to each convolution's weight tensor,
            or None to keep the default initialization.
    """

    def __init__(self, in_channels, interm_channels, out_channels,
                 use_batchnorm, initialization):
        super().__init__()

        # Both convolutions use 3x3 kernels with padding=1; the 2x2 pooling
        # with stride 2 is what halves the height and width.
        self.conv1 = nn.Conv2d(in_channels, interm_channels,
                               kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(interm_channels, out_channels,
                               kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Batchnorm layers are only created (and registered) when requested.
        self.use_batchnorm = use_batchnorm
        if self.use_batchnorm:
            self.bn1 = nn.BatchNorm2d(interm_channels)
            self.bn2 = nn.BatchNorm2d(out_channels)

        # Custom initialization touches the convolution weights only.
        if initialization is not None:
            for conv in (self.conv1, self.conv2):
                initialization(conv.weight)

    def forward(self, x):
        """Run both conv stages and return the max-pooled result."""
        norms = (self.bn1, self.bn2) if self.use_batchnorm else (None, None)
        for conv, norm in zip((self.conv1, self.conv2), norms):
            x = conv(x)
            if norm is not None:
                x = norm(x)
            x = F.relu(x)
        return self.pool(x)

# The model itself:
class Net(nn.Module):
    """Image classifier: four ConvBlocks followed by three fully connected layers.

    Args:
        use_batchnorm: forwarded to every ConvBlock.
        initialization: callable applied to the weight tensors of the
            convolutions (inside each ConvBlock) and of the linear layers,
            or None to keep the default initialization.
    """

    def __init__(self, use_batchnorm, initialization):
        super().__init__()

        block_options = (use_batchnorm, initialization)

        # Feature extractor. Shapes are Channels x Height x Width, starting
        # from a 3x32x32 input; each block ends by halving height and width.
        self.conv1 = ConvBlock(3, 8, 16, *block_options)     # -> 16x16x16
        self.conv2 = ConvBlock(16, 16, 32, *block_options)   # -> 32x8x8
        self.conv3 = ConvBlock(32, 32, 64, *block_options)   # -> 64x4x4
        self.conv4 = ConvBlock(64, 64, 128, *block_options)  # -> 128x2x2

        # Classifier head: 512 flattened features -> 10 outputs.
        self.fc1 = nn.Linear(128 * 2 * 2, 64)
        self.fc2 = nn.Linear(64, 16)
        self.fc3 = nn.Linear(16, 10)

        # Custom initialization touches the linear weights only.
        if initialization is not None:
            for linear in (self.fc1, self.fc2, self.fc3):
                initialization(linear.weight)

    def forward(self, x):
        """Return the 10 raw (un-normalized) outputs per input image."""
        for block in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = block(x)

        flat = x.view(x.shape[0], 128 * 2 * 2)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

## Training

In [None]:
import torch.optim as optim
from tqdm import tqdm

loss_fn = nn.CrossEntropyLoss() # softmax + neg. log likelihood

def train_model(model, epochs=3, lr=0.001):
    """Train `model` with Adam and evaluate it on the test set after every epoch.

    Reads the module-level `trainloader`, `testloader`, `device` and `loss_fn`;
    the model is expected to have been moved to `device` by the caller.

    Args:
        model: the network to optimize (modified in place).
        epochs: number of passes over `trainloader`.
        lr: learning rate passed to the Adam optimizer.

    Returns:
        A dict with three lists:
            train_loss: the loss of every training step,
            test_loss: the mean per-sample test loss after each epoch,
            test_accuracy: the fraction of correctly classified test samples
                after each epoch.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_loss = []
    test_loss = []
    test_accuracy = []
    for _ in range(epochs):
        model.train() # train mode (affects batchnorm layers:
                      # in the subsequent forward passes they'll
                      # exhibit 'train' behaviour, i.e. they'll
                      # normalize activations over batches)
        for X, y in tqdm(trainloader):
            X, y = X.to(device), y.to(device)

            # Clear gradients *before* the backward pass so that anything left
            # over on the parameters (e.g. from earlier use of the model)
            # cannot leak into the first update.
            optimizer.zero_grad()
            loss = loss_fn(model(X), y)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        model.eval() # test mode (affects batchnorm layers:
                     # in the subsequent forward passes they'll
                     # exhibit 'test' behaviour, i.e. they'll
                     # use the accumulated running statistics
                     # to normalize activations)

        # Accumulate per-sample totals rather than averaging per-batch means:
        # the last test batch is usually smaller than the others, so a plain
        # mean over batches would give its samples too much weight.
        loss_sum, n_correct, n_samples = 0.0, 0, 0
        with torch.no_grad(): # avoid calculating gradients during evaluation
            for X, y in testloader:
                X, y = X.to(device), y.to(device)

                logits = model(X)
                batch_len = len(y)

                # loss_fn returns the batch mean; scale it back to a batch sum
                loss_sum += loss_fn(logits, y).item() * batch_len
                _, pred = torch.max(logits, 1) # pred = index of maximal output along axis=1
                n_correct += (pred == y).sum().item()
                n_samples += batch_len

        if n_samples:
            test_loss.append(loss_sum / n_samples)
            test_accuracy.append(n_correct / n_samples)
        else: # empty test loader: nothing to average
            test_loss.append(float('nan'))
            test_accuracy.append(float('nan'))

    return dict(
        train_loss=train_loss,
        test_loss=test_loss,
        test_accuracy=test_accuracy
    )

## Experiments

In [None]:
# Weight initializers compared in the experiment below.
def _fixed_normal(w):
    # in-place normal samples with a fixed, very small standard deviation
    return w.data.normal_(std=0.001)


def _he_normal(w):
    # Kaiming (He) normal initialization for ReLU nonlinearities
    return torch.nn.init.kaiming_normal_(w, nonlinearity='relu')


# Each entry holds the keyword arguments for one `Net` variant.
configurations = {
    'fixed_normal_init': {
        'use_batchnorm': False,
        'initialization': _fixed_normal,
    },
    'he_normal_init': {
        'use_batchnorm': False,
        'initialization': _he_normal,
    },
    'he_normal_init_with_batchnorm': {
        'use_batchnorm': True,
        'initialization': _he_normal,
    },
}


# Train every configuration and collect the metrics by configuration name.
# `Net(**config)` unpacks the dictionary into keyword arguments, i.e. it is
# the same as calling:
#
#     Net(use_batchnorm=config['use_batchnorm'],
#         initialization=config['initialization'])
result = {
    name: train_model(Net(**config).to(device))
    for name, config in configurations.items()
}

In [None]:
# One row with two panels: ax0 (left) and ax1 (right).
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), dpi=100)

# per step loss values are too noisy, so we'll use a function to
# average them with a running window
def running_mean(x, win_size):
    """Return the means of all full-length sliding windows over `x`.

    Args:
        x: 1-D sequence of numbers (list or array).
        win_size: window length, a positive integer.

    Returns:
        A float array of length max(len(x) - win_size + 1, 0) whose i-th
        element is the mean of x[i:i + win_size].

    Raises:
        ValueError: if `win_size` is not positive.
    """
    if win_size < 1:
        raise ValueError(f"win_size must be positive, got {win_size}")

    cumulative = np.cumsum(x, dtype=float)

    # Sum of window i is cumulative[i + win_size - 1] - cumulative[i - 1];
    # the very first window has nothing to subtract, so it is handled by
    # leaving element 0 untouched (it used to be dropped altogether).
    window_sums = cumulative[win_size - 1:].copy()
    window_sums[1:] -= cumulative[:-win_size]
    return window_sums / win_size

# One colour per configuration, taken from matplotlib's default colour cycle.
palette = matplotlib.rcParams['axes.prop_cycle'].by_key()['color']

for (name, metrics), color in zip(result.items(), palette):
    step_losses = metrics['train_loss']
    epoch_losses = metrics['test_loss']

    # Solid line: training loss smoothed over a 20-step window.
    smoothed = running_mean(step_losses, 20)
    ax0.plot(smoothed, color=color, label=name, alpha=0.8)

    # Dashed line: test loss, one point per epoch, positioned on the
    # training-step axis at the step where that epoch ends.
    epoch_ends = np.linspace(0, len(step_losses), len(epoch_losses) + 1)[1:]
    ax0.plot(epoch_ends, epoch_losses, '--', color=color, alpha=0.8)
    ax0.set_ylabel("Loss")

    # Right panel: test accuracy per epoch.
    ax1.plot(metrics['test_accuracy'], color=color, label=name)
    ax1.set_ylabel("Test accuracy")

ax1.legend();

## Your turn!

Try improving the score. Since we don't have much time to train the model thoroughly, see if you can change the model design and/or use regularization layers (batchnorm, dropout) to get a better score more quickly.

In [None]:
# your code here
# (replace the `raise` below with your own model definition and training;
# the checking cells that follow read a variable named `model`)
raise NotImplementedError

# E.g.:
#
#     model = ...
#     result['MyModel'] = train_model(model, epochs=..., lr=...)
#
# (and then run the plotting code again)

In [None]:
# Check: `model` must reach at least 65% accuracy on the test set.

# Switch to evaluation mode first, so that layers such as dropout and
# batchnorm behave deterministically and the score does not depend on
# whichever mode the training code left the model in.
model.eval()

# Run on whatever device the model's parameters live on.
model_device = next(model.parameters()).device
correct, total = 0, 0
with torch.no_grad():
    for X, y in testloader:
        X, y = X.to(model_device), y.to(model_device)

        # predicted class = index of the largest output
        _, pred = torch.max(model(X), 1)

        total += len(y)
        correct += (pred == y).sum().item()

print('accuracy =', correct / total)
assert correct / total >= 0.65

In [None]:
# Check: `model` must reach at least 70% accuracy on the test set.

# Switch to evaluation mode first, so that layers such as dropout and
# batchnorm behave deterministically and the score does not depend on
# whichever mode the training code left the model in.
model.eval()

# Run on whatever device the model's parameters live on.
model_device = next(model.parameters()).device
correct, total = 0, 0
with torch.no_grad():
    for X, y in testloader:
        X, y = X.to(model_device), y.to(model_device)

        # predicted class = index of the largest output
        _, pred = torch.max(model(X), 1)

        total += len(y)
        correct += (pred == y).sum().item()

print('accuracy =', correct / total)
assert correct / total >= 0.7

In [None]:
# Check: `model` must reach at least 75% accuracy on the test set.

# Switch to evaluation mode first, so that layers such as dropout and
# batchnorm behave deterministically and the score does not depend on
# whichever mode the training code left the model in.
model.eval()

# Run on whatever device the model's parameters live on.
model_device = next(model.parameters()).device
correct, total = 0, 0
with torch.no_grad():
    for X, y in testloader:
        X, y = X.to(model_device), y.to(model_device)

        # predicted class = index of the largest output
        _, pred = torch.max(model(X), 1)

        total += len(y)
        correct += (pred == y).sum().item()

print('accuracy =', correct / total)
assert correct / total >= 0.75