# Homework 04 - Valentina Blasone

## Deep Learning - A.A. 2020/2021

> 1. Now that you have all the tools to train an MLP with high performance on MNIST, try reaching 0-loss on the training data (with a small epsilon, e.g. 99.99% training performance -- don't worry if you overfit!).
The implementation is completely up to you. You just need to keep it an MLP without using fancy layers (e.g., keep the `Linear` layers, don't use `Conv1d` or something like this, don't use attention). You are free to use any LR scheduler or optimizer, any one of batchnorm/groupnorm, regularization methods... If you use something we haven't seen during lectures, please motivate your choice and explain (as briefly as possible) how it works.
> 2. Try reaching 0-loss on the training data with **permuted labels**. Assess the model on the test data (without permuted labels) and comment. Help yourself with [3](https://arxiv.org/abs/1611.03530).
*Tip*: To permute the labels, act on the `trainset.targets` with an appropriate torch function.
Then, you can pass this "permuted" `Dataset` to a `DataLoader` like so: `trainloader_permuted = torch.utils.data.DataLoader(trainset_permuted, batch_size=batch_size_train, shuffle=True)`. You can now use this `DataLoader` inside the training function.
Additional view for motivating this exercise: ["The statistical significance perfect linear separation", by Jared Tanner (Oxford U.)](https://www.youtube.com/watch?v=vl2QsVWEqdA).

In [1]:
import torch
import os
from torch import nn
from matplotlib import pyplot as plt

from scripts import mnist
from scripts.train_utils import accuracy, AverageMeter

## 1

In [4]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.flat = nn.Flatten()
        self.h1 = nn.Linear(28*28, 16)
        self.h2 = nn.Linear(16, 32)
        self.h3 = nn.Linear(32, 24)
        self.out = nn.Linear(24, 10)
    
    def forward(self, X, activ_hidden=nn.functional.relu):
        out = self.flat(X)
        out = activ_hidden(self.h1(out))
        out = activ_hidden(self.h2(out))
        out = activ_hidden(self.h3(out))
        out = self.out(out)
        return out

In [12]:
def train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device): # note: I've added a generic performance to replace accuracy and the device
    for X, y in dataloader:
        # TRANSFER X AND y TO GPU IF SPECIFIED
        X = X.to(device)
        y = y.to(device)
        # ... like last time
        optimizer.zero_grad() 
        y_hat = model(X)
        loss = loss_fn(y_hat, y)
        loss.backward()
        optimizer.step()
        acc = performance(y_hat, y)
        loss_meter.update(val=loss.item(), n=X.shape[0])
        performance_meter.update(val=acc, n=X.shape[0])

def test_model(model, dataloader, performance=accuracy, loss_fn=None, device=None):
    # establish device
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # create an AverageMeter for the loss if passed
    if loss_fn is not None:
        loss_meter = AverageMeter()
    
    performance_meter = AverageMeter()

    model.to(device)
    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            y_hat = model(X)
            loss = loss_fn(y_hat, y) if loss_fn is not None else None
            acc = performance(y_hat, y)
            if loss_fn is not None:
                loss_meter.update(loss.item(), X.shape[0])
            performance_meter.update(acc, X.shape[0])
    # get final performances
    fin_loss = loss_meter.sum if loss_fn is not None else None
    fin_perf = performance_meter.avg
    print(f"TESTING - loss {fin_loss if fin_loss is not None else '--'} - performance {fin_perf}")
    return fin_loss, fin_perf

In [None]:
def train_model(model, dataloader, loss_fn, optimizer, num_epochs, alpha=None, k=None, checkpoint_loc=None, checkpoint_name="checkpoint.pt", performance=accuracy, lr_scheduler=None, epoch_start_scheduler=1, device=None):
    # added lr_scheduler

    # create the folder for the checkpoints (if it's not None)
    if checkpoint_loc is not None:
        os.makedirs(checkpoint_loc, exist_ok=True)
    
    # establish device
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Training on {device}")   

    if alpha is not None and k is None:
        print("Training with early stopping: GL")

    if k is not None:
        print("Training with early stopping: PL")
        j = 1
        strip = []
    
    model.to(device)
    model.train()

    # epoch loop
    for epoch in range(num_epochs):

        loss_meter = AverageMeter()
        performance_meter = AverageMeter()

        # added print for LR
        print(f"Epoch {epoch+1} --- learning rate {optimizer.param_groups[0]['lr']:.5f}")

        train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance)

        print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum} - average: {loss_meter.avg}; Performance: {performance_meter.avg}")

        # produce checkpoint dictionary -- but only if the name and folder of the checkpoint are not None
        if checkpoint_name is not None and checkpoint_loc is not None:
            checkpoint_dict = {
                "parameters": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "epoch": epoch
            }
            torch.save(checkpoint_dict, os.path.join(checkpoint_loc, checkpoint_name))
        
        if lr_scheduler is not None:
            if epoch >= epoch_start_scheduler:
                lr_scheduler.step()
            # or you can use a MultiStepLR with milestones=[6, 11] thus deleting the `if` construct for the epoch

        if alpha is not None or k is not None:
            E_va, _ = test_model(model, testloader, loss_fn=loss_fn)
            E_opt = E_va if E_opt is None else min(E_opt, E_va)
            GL = 100 * (E_va/E_opt - 1)
            if k is None and GL > alpha:
                print(f"GL = {GL} > {alpha} - Stop training")
                return loss_meter.sum, performance_meter.avg
            
            if k is not None:
                strip.append(loss_meter.sum)
                if j % k != 0:
                    j += 1
                else:
                    print(GL, strip, sum(strip), min(strip))
                    Pk = 1000 * (sum(strip)/(k*min(strip)) - 1)
                    if GL/Pk > alpha:
                        print(f"GL/Pk = {GL/Pk} > {alpha} - Stop training")
                        return loss_meter.sum, performance_meter.avg
                    else:
                        print(f"GL/Pk = {GL/Pk} < {alpha} - Continue training")
                        j = 1
                        strip = []         

    return loss_meter.sum, performance_meter.avg

In [None]:
def train_model(model, dataloader, loss_fn, optimizer, num_epochs, alpha=None, k=None, performance=accuracy, device=None): # note: I've added a generic performance to replace accuracy and an object where to store the trajectory and the device on which to run our training

    # establish device
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Training on {device}")
    
    if alpha is not None and k is None:
        print("Training with early stopping: GL")

    if k is not None:
        print("Training with early stopping: PL")
        j = 1
        strip = []

    model.to(device)
    model.train()

    E_opt = None

    # epoch loop
    for epoch in range(num_epochs):
        loss_meter = AverageMeter()
        performance_meter = AverageMeter()

        train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device, l1_rate)

        print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum} - average: {loss_meter.avg}; Performance: {performance_meter.avg}")

        if alpha is not None or k is not None:
            E_va, _ = test_model(model, testloader, loss_fn=loss_fn)
            E_opt = E_va if E_opt is None else min(E_opt, E_va)
            GL = 100 * (E_va/E_opt - 1)
            if k is None and GL > alpha:
                print(f"GL = {GL} > {alpha} - Stop training")
                return loss_meter.sum, performance_meter.avg
            
            if k is not None:
                strip.append(loss_meter.sum)
                if j % k != 0:
                    j += 1
                else:
                    print(GL, strip, sum(strip), min(strip))
                    Pk = 1000 * (sum(strip)/(k*min(strip)) - 1)
                    if GL/Pk > alpha:
                        print(f"GL/Pk = {GL/Pk} > {alpha} - Stop training")
                        return loss_meter.sum, performance_meter.avg
                    else:
                        print(f"GL/Pk = {GL/Pk} < {alpha} - Continue training")
                        j = 1
                        strip = []                       

    return loss_meter.sum, performance_meter.avg

In [15]:
minibatch_size_train = 256
minibatch_size_test = 512

trainloader, testloader, trainset, testset = mnist.get_data(batch_size_train=minibatch_size_test, batch_size_test=minibatch_size_test)

learn_rate = 0.1
num_epochs = 15

l1_rate = 1e-05

model = MLP()
loss_fn = nn.CrossEntropyLoss()


optimizer = torch.optim.SGD(model.parameters(), lr=learn_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=.1)

In [16]:
train_loss, train_acc = train_model(model, trainloader, loss_fn, optimizer, num_epochs, lr_scheduler=scheduler, device="cuda:0")

TypeError: train_model() got an unexpected keyword argument 'lr_scheduler'

In [7]:
final_loss, final_perf = test_model(model, testloader, loss_fn=loss_fn)

TESTING - loss 11253.26155424118 - performance 0.9414833333333333
