In [59]:
# !pip install pandas
# !pip install torch
# !pip install torchvision

In [84]:
import os.path
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor

In [3]:
# Configure hardware
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using mps device


In [69]:
# Define train, test, evaluation functions
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()

    losses = []
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            losses.append(loss)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    return np.mean(losses)

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct, test_loss

def eval(train_dataloader, test_dataloader, model, loss_fn, optimizer, epochs):
    overall_train_loss = []
    overall_test_acc = []
    overall_test_loss = []
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loss = train(train_dataloader, model, loss_fn, optimizer)
        test_acc, test_loss = test(test_dataloader, model, loss_fn)
        overall_train_loss.append(train_loss)
        overall_test_acc.append(test_acc)
        overall_test_loss.append(test_loss)
        print("Done!")
    return pd.DataFrame({
        'train loss' : overall_train_loss, 
        'test_acc' : overall_test_acc, 
        'test_loss' : overall_test_loss
    })

## 1. Fully Connected Network Classifier on Fashion MNIST Dataset

In [7]:
# Fully connected network with variable activation function
class FCNet(nn.Module):
    def __init__(self, activation):
        super().__init__()
        self.flatten = nn.Flatten()
        self.sequential = nn.Sequential(
            nn.Linear(28*28, 512),
            activation(),
            nn.Linear(512, 512),
            activation(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.sequential(x)
        return logits

fcnet = FCNet(nn.ReLU).to(device)
silunet = FCNet(nn.SiLU).to(device)

print(fcnet)
print(silunet)

FCNet(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (sequential): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)
FCNet(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (sequential): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): SiLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): SiLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [65]:
# Download training and test data from PyTorch open datasets and create dataloaders
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

batch_size = 64
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64


In [70]:
# Evaluate ReLU network
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(fcnet.parameters(), lr=1e-3)
epochs = 10

results = eval(train_dataloader, test_dataloader, fcnet, loss_fn, optimizer, epochs)
results.to_csv('results/fcnet_relu.csv')

Epoch 1
-------------------------------
loss: 0.117952  [   64/60000]
loss: 0.143804  [ 6464/60000]
loss: 0.171468  [12864/60000]
loss: 0.132781  [19264/60000]
loss: 0.303117  [25664/60000]
loss: 0.226374  [32064/60000]
loss: 0.165133  [38464/60000]
loss: 0.229570  [44864/60000]
loss: 0.194307  [51264/60000]
loss: 0.223545  [57664/60000]
Test Error: 
 Accuracy: 87.8%, Avg loss: 0.384517 

Done!
Epoch 2
-------------------------------
loss: 0.116424  [   64/60000]
loss: 0.184845  [ 6464/60000]
loss: 0.129399  [12864/60000]
loss: 0.123533  [19264/60000]
loss: 0.221683  [25664/60000]
loss: 0.190467  [32064/60000]
loss: 0.211879  [38464/60000]
loss: 0.228477  [44864/60000]
loss: 0.153745  [51264/60000]
loss: 0.232126  [57664/60000]
Test Error: 
 Accuracy: 87.9%, Avg loss: 0.391903 

Done!
Epoch 3
-------------------------------
loss: 0.090314  [   64/60000]
loss: 0.128335  [ 6464/60000]
loss: 0.141976  [12864/60000]
loss: 0.126144  [19264/60000]
loss: 0.228877  [25664/60000]
loss: 0.208667

In [73]:
# Evaluate SiLU network
optimizer = torch.optim.Adam(silunet.parameters())

results = eval(train_dataloader, test_dataloader, silunet, loss_fn, optimizer, epochs)
results.to_csv('results/fcnet_silu.csv')

Epoch 1
-------------------------------
loss: 2.308833  [   64/60000]
loss: 0.596282  [ 6464/60000]
loss: 0.379886  [12864/60000]
loss: 0.561692  [19264/60000]
loss: 0.463238  [25664/60000]
loss: 0.416341  [32064/60000]
loss: 0.385961  [38464/60000]
loss: 0.508026  [44864/60000]
loss: 0.492627  [51264/60000]
loss: 0.487733  [57664/60000]
Test Error: 
 Accuracy: 84.7%, Avg loss: 0.415579 

Done!
Epoch 2
-------------------------------
loss: 0.286252  [   64/60000]
loss: 0.339467  [ 6464/60000]
loss: 0.283954  [12864/60000]
loss: 0.379861  [19264/60000]
loss: 0.414202  [25664/60000]
loss: 0.345941  [32064/60000]
loss: 0.296767  [38464/60000]
loss: 0.435048  [44864/60000]
loss: 0.410662  [51264/60000]
loss: 0.410783  [57664/60000]
Test Error: 
 Accuracy: 86.2%, Avg loss: 0.375773 

Done!
Epoch 3
-------------------------------
loss: 0.206058  [   64/60000]
loss: 0.291727  [ 6464/60000]
loss: 0.251315  [12864/60000]
loss: 0.298110  [19264/60000]
loss: 0.362192  [25664/60000]
loss: 0.335226

## 2. ResNet on CIFAR10 and CIFAR100

In [82]:
# ResNet 164 implementation, adapted from https://github.com/a-martyn/resnet/blob/master/resnet.py

class block(nn.Module):
    def __init__(self, activation, filters, subsample=False):
        super().__init__()
        """
        A 2-layer residual learning building block as illustrated by Fig.2
        in "Deep Residual Learning for Image Recognition"
        
        Parameters:
        
        - filters:   int
                     the number of filters for all layers in this block
                   
        - subsample: boolean
                     whether to subsample the input feature maps with stride 2
                     and doubling in number of filters
                     
        Attributes:
        
        - shortcuts: boolean
                     When false the residual shortcut is removed
                     resulting in a 'plain' convolutional block.
        """
        # Determine subsampling
        s = 0.5 if subsample else 1.0
        
        # Setup layers
        self.conv1 = nn.Conv2d(int(filters*s), filters, kernel_size=3, 
                               stride=int(1/s), padding=1, bias=False)
        self.bn1   = nn.BatchNorm2d(filters, track_running_stats=True)
        self.activation = activation()
        self.conv2 = nn.Conv2d(filters, filters, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2   = nn.BatchNorm2d(filters, track_running_stats=True)

        # Shortcut downsampling
        self.downsample = nn.AvgPool2d(kernel_size=1, stride=2)

        # Initialise weights according to the method described in 
        # “Delving deep into rectifiers: Surpassing human-level performance on ImageNet 
        # classification” - He, K. et al. (2015)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)   
        
    def shortcut(self, z, x):
        """ 
        Implements parameter free shortcut connection by identity mapping.
        If dimensions of input x are greater than activations then this
        is rectified by downsampling and then zero padding dimension 1
        as described by option A in paper.
        
        Parameters:
        - x: tensor
             the input to the block
        - z: tensor
             activations of block prior to final non-linearity
        """
        if x.shape != z.shape:
            d = self.downsample(x)
            p = torch.mul(d, 0)
            return z + torch.cat((d, p), dim=1)
        else:
            return z + x        
    
    def forward(self, x, shortcuts=False):
        z = self.conv1(x)
        z = self.bn1(z)
        z = self.activation(z)
        
        z = self.conv2(z)
        z = self.bn2(z)
        
        # Shortcut connection
        # This if statement is the only difference between
        # a convolutional net and a resnet!
        if shortcuts:
            z = self.shortcut(z, x)

        z = self.activation(z)
        
        return z
    


class ResNet(nn.Module):
    def __init__(self, activation, n, shortcuts=True):
        super().__init__()
        self.shortcuts = shortcuts
        
        # Input
        self.convIn = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bnIn   = nn.BatchNorm2d(16, track_running_stats=True)
        self.activation   = activation()
        
        # Stack1
        self.stack1 = nn.ModuleList([block(activation, 16, subsample=False) for _ in range(n)])

        # Stack2
        self.stack2a = block(activation, 32, subsample=True)
        self.stack2b = nn.ModuleList([block(activation, 32, subsample=False) for _ in range(n-1)])

        # Stack3
        self.stack3a = block(activation, 64, subsample=True)
        self.stack3b = nn.ModuleList([block(activation, 64, subsample=False) for _ in range(n-1)])
        
        # Output
        # The parameters of this average pool are not specified in paper.
        # Initially I tried kernel_size=2 stride=2 resulting in 
        # 64*4*4= 1024 inputs to the fully connected layer. More aggresive
        # pooling as implemented below results in better results and also
        # better matches the total model parameter count cited by authors.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fcOut   = nn.Linear(64, 10, bias=True)
        self.softmax = nn.LogSoftmax(dim=-1)
        
        # Initilise weights in fully connected layer 
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                m.bias.data.zero_()      
        
        
    def forward(self, x):     
        z = self.convIn(x)
        z = self.bnIn(z)
        z = self.activation(z)
        
        for l in self.stack1: z = l(z, shortcuts=self.shortcuts)
        
        z = self.stack2a(z, shortcuts=self.shortcuts)
        for l in self.stack2b: 
            z = l(z, shortcuts=self.shortcuts)
        
        z = self.stack3a(z, shortcuts=self.shortcuts)
        for l in self.stack3b: 
            z = l(z, shortcuts=self.shortcuts)

        z = self.avgpool(z)
        z = z.view(z.size(0), -1)
        z = self.fcOut(z)
        return self.softmax(z)

In [76]:
# Download training and test data from PyTorch open datasets and create dataloaders
train_transform = transforms.Compose([
    transforms.Pad(4),
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomCrop(32),
    transforms.ToTensor()
])

training_data = datasets.CIFAR10(
    root="data",
    train=True,
    download=True,
    transform=train_transform
)

test_transform = transforms.ToTensor()


test_data = datasets.CIFAR10(
    root="data",
    train=False,
    download=True,
    transform=test_transform
)

batch_size = 64
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Files already downloaded and verified
Files already downloaded and verified
Shape of X [N, C, H, W]: torch.Size([64, 3, 32, 32])
Shape of y: torch.Size([64]) torch.int64


In [77]:
resnet_relu = ResNet(nn.ReLU, 3).to(device)
resnet_silu = ResNet(nn.SiLU, 3).to(device)

  nn.init.kaiming_normal(m.weight)


In [80]:
loss_fn = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(resnet_relu.parameters())

results = eval(train_dataloader, test_dataloader, resnet_relu, loss_fn, optimizer, epochs)
results.to_csv('resnet_relu_3.csv')

Epoch 1
-------------------------------
loss: 1.755440  [   64/50000]
loss: 1.586624  [ 6464/50000]
loss: 1.262855  [12864/50000]
loss: 1.403428  [19264/50000]
loss: 1.402931  [25664/50000]
loss: 1.450590  [32064/50000]
loss: 1.366369  [38464/50000]
loss: 1.283585  [44864/50000]
Test Error: 
 Accuracy: 55.4%, Avg loss: 1.232394 

Done!
Epoch 2
-------------------------------
loss: 1.190705  [   64/50000]
loss: 1.117294  [ 6464/50000]
loss: 0.969083  [12864/50000]
loss: 1.103904  [19264/50000]
loss: 1.104823  [25664/50000]
loss: 1.336930  [32064/50000]
loss: 1.065357  [38464/50000]
loss: 1.095886  [44864/50000]
Test Error: 
 Accuracy: 65.6%, Avg loss: 0.993317 

Done!
Epoch 3
-------------------------------
loss: 0.903309  [   64/50000]
loss: 0.936006  [ 6464/50000]
loss: 0.724649  [12864/50000]
loss: 0.843482  [19264/50000]
loss: 0.965253  [25664/50000]
loss: 1.189905  [32064/50000]
loss: 0.865692  [38464/50000]
loss: 0.889967  [44864/50000]
Test Error: 
 Accuracy: 67.2%, Avg loss: 0.9

In [81]:
optimizer = torch.optim.Adam(resnet_silu.parameters())

results = eval(train_dataloader, test_dataloader, resnet_silu, loss_fn, optimizer, epochs)
results.to_csv('resnet_silu_3.csv')

Epoch 1
-------------------------------
loss: 1.889300  [   64/50000]
loss: 1.614534  [ 6464/50000]
loss: 1.318044  [12864/50000]
loss: 1.403661  [19264/50000]
loss: 1.437201  [25664/50000]
loss: 1.346170  [32064/50000]
loss: 1.205990  [38464/50000]
loss: 1.208679  [44864/50000]
Test Error: 
 Accuracy: 61.0%, Avg loss: 1.108664 

Done!
Epoch 2
-------------------------------
loss: 1.076397  [   64/50000]
loss: 1.109718  [ 6464/50000]
loss: 0.923419  [12864/50000]
loss: 1.063451  [19264/50000]
loss: 1.149035  [25664/50000]
loss: 1.212288  [32064/50000]
loss: 0.910367  [38464/50000]
loss: 1.097296  [44864/50000]
Test Error: 
 Accuracy: 70.2%, Avg loss: 0.857176 

Done!
Epoch 3
-------------------------------
loss: 0.900486  [   64/50000]
loss: 0.927178  [ 6464/50000]
loss: 0.690153  [12864/50000]
loss: 0.882510  [19264/50000]
loss: 0.937455  [25664/50000]
loss: 1.196976  [32064/50000]
loss: 0.840949  [38464/50000]
loss: 0.861300  [44864/50000]
Test Error: 
 Accuracy: 72.5%, Avg loss: 0.8

In [None]:
# Make classes for functions not available in library
class MaxXSigmoid(nn.Module):
    __constants__ = ['inplace']
    inplace: bool

    def __init__(self, inplace: bool = False):
        super().__init__()
        self.inplace = inplace

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return torch.max(input, nn.Sigmoid(input, inplace=self.inplace))

    def extra_repr(self) -> str:
        inplace_str = 'inplace=True' if self.inplace else ''
        return inplace_str

class CosxMinusX(nn.Module):
    __constants__ = ['inplace']
    inplace: bool

    def __init__(self, inplace: bool = False):
        super().__init__()
        self.inplace = inplace

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return torch.cos(input) - input

    def extra_repr(self) -> str:
        inplace_str = 'inplace=True' if self.inplace else ''
        return inplace_str

# Map function names to functions
functions = {
    'relu' : nn.ReLU,
    'silu' : nn.SiLU,
    'max_x_sigmoid' : MaxXSigmoid,
    'cosx_minus_x' : CosxMinusX,
    'lrelu' : nn.LeakyReLU,
    'prelu' : nn.PReLU,
    'softplus' : nn.Softplus
}

# Specify ResNet model sizes. Size parameter n will create a 6n+2 layer model.
model_sizes = [3, 5, 7, 9]

In [87]:
for n in model_sizes:
    for funcname, function in functions.items():
        save_to = f'results/resnet_{6*n+2}_{funcname}.csv'
        print(f'Evaluating {6*n+2} layer model with activation function: {funcname}')
        if os.path.isfile(save_to):
            print(f'Results already generated, skipping')
        else:
            training_data = datasets.CIFAR10(
                root="data",
                train=True,
                download=True,
                transform=train_transform
            )

            test_data = datasets.CIFAR10(
                root="data",
                train=False,
                download=True,
                transform=test_transform
            )
            train_dataloader = DataLoader(training_data, batch_size=batch_size)
            test_dataloader = DataLoader(test_data, batch_size=batch_size)

            model = ResNet(function, n)
            loss_fn = torch.nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters())

            results = eval(train_dataloader, test_dataloader, resnet_relu, loss_fn, optimizer, epochs)
            results.to_csv(save_to)

Evaluating 20 layer model with activation function: relu
Results already generated, skipping
Evaluating 20 layer model with activation function: silu
Results already generated, skipping
Evaluating 20 layer model with activation function: max_x_sigmoid
Files already downloaded and verified
Files already downloaded and verified
Epoch 1
-------------------------------
loss: 0.368309  [   64/50000]
loss: 0.475862  [ 6464/50000]
loss: 0.364805  [12864/50000]
loss: 0.521528  [19264/50000]
loss: 0.655674  [25664/50000]
loss: 0.835961  [32064/50000]
loss: 0.575536  [38464/50000]
loss: 0.482475  [44864/50000]
Test Error: 
 Accuracy: 83.1%, Avg loss: 0.515004 

Done!
Epoch 2
-------------------------------
loss: 0.432974  [   64/50000]
loss: 0.586704  [ 6464/50000]
loss: 0.279422  [12864/50000]
loss: 0.531659  [19264/50000]
loss: 0.564877  [25664/50000]
loss: 0.846933  [32064/50000]
loss: 0.484522  [38464/50000]
loss: 0.457927  [44864/50000]
Test Error: 
 Accuracy: 83.0%, Avg loss: 0.514386 

Do