# Homework 03 - Valentina Blasone

## Deep Learning - A.A. 2020/2021

> 1. Implement L1 norm regularization as a custom loss function
> 2. Implement early stopping in the $E_{\text{opt}}$ specification
> 3. Implement early stopping in one of the additional specifications as of [4](https://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf) 

In [1]:
import torch
import os
from torch import nn
from matplotlib import pyplot as plt

from scripts import mnist
from scripts.train_utils import accuracy, AverageMeter

## 1

The L1 norm regularization can be implemented as a custom loss function, that takes in input a given loss function to be regularized and the model, from which the weights values are derived.

In [2]:
def l1_regularization(loss_values, learn_rate, model):
    '''Custom loss functions that adds the L1
    regularization to a given loss functions'''
    l1 = sum(p.abs().sum() for name, p in model.named_parameters() if 'weight' in name)
    loss = loss_values + learn_rate * l1
    return loss

We can use the example of LAB 3 to check the correctness of the L1 norm regularization just implemented.

In [3]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.flat = nn.Flatten()
        self.h1 = nn.Linear(28*28, 16)
        self.h2 = nn.Linear(16, 32)
        self.h3 = nn.Linear(32, 24)
        self.out = nn.Linear(24, 10)
    
    def forward(self, X, activ_hidden=nn.functional.relu):
        out = self.flat(X)
        out = activ_hidden(self.h1(out))
        out = activ_hidden(self.h2(out))
        out = activ_hidden(self.h3(out))
        out = self.out(out)
        return out

In [4]:
def train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device, l1_rate=None): # note: I've added a generic performance to replace accuracy and the device
    for X, y in dataloader:
        # TRANSFER X AND y TO GPU IF SPECIFIED
        X = X.to(device)
        y = y.to(device)
        # ... like last time
        optimizer.zero_grad() 
        y_hat = model(X)
        if l1_rate is not None:
            loss = l1_regularization(loss_fn(y_hat,y), l1_rate, model)
        else:
            loss = loss_fn(y_hat,y)
        loss.backward()
        optimizer.step()
        acc = performance(y_hat, y)
        loss_meter.update(val=loss.item(), n=X.shape[0])
        performance_meter.update(val=acc, n=X.shape[0])


def train_model(model, dataloader, loss_fn, optimizer, num_epochs, l1_rate=None, performance=accuracy, device=None): # note: I've added a generic performance to replace accuracy and an object where to store the trajectory and the device on which to run our training

    # establish device
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Training on {device}")

    if l1_rate is not None:
        print("Training with L1 regularization")
    
    model.to(device)
    model.train()

    # epoch loop
    for epoch in range(num_epochs):
        loss_meter = AverageMeter()
        performance_meter = AverageMeter()

        train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device, l1_rate)

        print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum} - average: {loss_meter.avg}; Performance: {performance_meter.avg}")

    return loss_meter.sum, performance_meter.avg

def test_model(model, dataloader, performance=accuracy, loss_fn=None, device=None):
    # establish device
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # create an AverageMeter for the loss if passed
    if loss_fn is not None:
        loss_meter = AverageMeter()
    
    performance_meter = AverageMeter()

    model.to(device)
    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            y_hat = model(X)
            loss = loss_fn(y_hat, y) if loss_fn is not None else None
            acc = performance(y_hat, y)
            if loss_fn is not None:
                loss_meter.update(loss.item(), X.shape[0])
            performance_meter.update(acc, X.shape[0])
    # get final performances
    fin_loss = loss_meter.sum if loss_fn is not None else None
    fin_perf = performance_meter.avg
    print(f"TESTING - loss {fin_loss if fin_loss is not None else '--'} - performance {fin_perf}")
    return fin_loss, fin_perf

In [5]:
minibatch_size_train = 256
minibatch_size_test = 512

trainloader, testloader, trainset, testset = mnist.get_data(batch_size_train=minibatch_size_test, batch_size_test=minibatch_size_test)

learn_rate = 0.1
num_epochs = 10

l1_rate = 1e-05

model = MLP()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learn_rate)

In [6]:
train_loss, train_acc = train_model(model, trainloader, loss_fn, optimizer, num_epochs, l1_rate, device="cuda:0")

Training on cuda:0
Training with L1 regularization
Epoch 1 completed. Loss - total: 80186.87865447998 - average: 1.3364479775746663; Performance: 0.5613666666666667
Epoch 2 completed. Loss - total: 25101.48248386383 - average: 0.41835804139773053; Performance: 0.8748166666666667
Epoch 3 completed. Loss - total: 19658.83703660965 - average: 0.3276472839434942; Performance: 0.9042
Epoch 4 completed. Loss - total: 16609.475224494934 - average: 0.2768245870749156; Performance: 0.9192333333333333
Epoch 5 completed. Loss - total: 14976.65601181984 - average: 0.24961093353033065; Performance: 0.9262333333333334
Epoch 6 completed. Loss - total: 13650.245250701904 - average: 0.2275040875116984; Performance: 0.9342666666666667
Epoch 7 completed. Loss - total: 12588.35791015625 - average: 0.20980596516927083; Performance: 0.9389666666666666
Epoch 8 completed. Loss - total: 11860.14377784729 - average: 0.1976690629641215; Performance: 0.9423166666666667
Epoch 9 completed. Loss - total: 10802.18730

In [7]:
final_loss, final_perf = test_model(model, testloader, loss_fn=loss_fn)

TESTING - loss 11253.26155424118 - performance 0.9414833333333333


## 2. a.

We define a MLP architecture in which the early stopping in the $E_{opt}$ specification is implemented. The idea is to stop the training as soon as the error on the validation set is higher than it was the last time it was checked.

The value $E_{opt}(t)$ is defined to be the lowest validation set error obtained in epochs up to $t$: $$E_{opt}(t) := \min_{t'\leq t} E_{va}(t')$$

Where $E_{va}$ is the validation error. We can then define the _generalization loss_ at epoch $t$ to be the relative increase of the validation error over the minimum-so-far in percent: $$GL(t)=100\cdot \left( \dfrac{E_{va}(t)}{E_{opt}(t)}-1\right)$$

In this case, the stopping criterion says to stop as soon as the generalization loss exceeds a certain threshold, $\alpha$: $$GL(t)>\alpha$$

In [8]:
def train_model(model, dataloader, loss_fn, optimizer, num_epochs, alpha=None, l1_rate=None, performance=accuracy, device=None): # note: I've added a generic performance to replace accuracy and an object where to store the trajectory and the device on which to run our training

    # establish device
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Training on {device}")

    if l1_rate is not None:
        print("Training with L1 regularization")
    
    if alpha is not None:
        print("Training with early stopping: GL")

    model.to(device)
    model.train()

    E_opt = None

    # epoch loop
    for epoch in range(num_epochs):
        loss_meter = AverageMeter()
        performance_meter = AverageMeter()

        train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device, l1_rate)

        print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum} - average: {loss_meter.avg}; Performance: {performance_meter.avg}")

        if alpha is not None:
            E_va, _ = test_model(model, testloader, loss_fn=loss_fn)
            E_opt = E_va if E_opt is None else min(E_opt, E_va)
            GL = 100 * (E_va/E_opt - 1)
            if GL > alpha:
                print(f"GL = {GL} > {alpha} - Stop training")
                return loss_meter.sum, performance_meter.avg

    return loss_meter.sum, performance_meter.avg

In [9]:
minibatch_size_train = 256
minibatch_size_test = 512

trainloader, testloader, trainset, testset = mnist.get_data(batch_size_train=minibatch_size_test, batch_size_test=minibatch_size_test)

learn_rate = 0.1
num_epochs = 30

model = MLP()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learn_rate)

train_loss, train_acc = train_model(model, trainloader, loss_fn, optimizer, num_epochs, alpha=5, device="cuda:0")

Training on cuda:0
Training with early stopping: GL
Epoch 1 completed. Loss - total: 82812.12795066833 - average: 1.3802021325111389; Performance: 0.50945
TESTING - loss 34024.57218551636 - performance 0.8098833333333333
Epoch 2 completed. Loss - total: 24887.656105041504 - average: 0.4147942684173584; Performance: 0.8734166666666666
TESTING - loss 23664.80724620819 - performance 0.87845
Epoch 3 completed. Loss - total: 18715.289516448975 - average: 0.31192149194081625; Performance: 0.9067833333333334
TESTING - loss 25743.16288471222 - performance 0.8646
GL = 8.782474401252705 > 5 - Stop training


## 2. b.

Another stopping criteria presented in the paper is based on the fact that if the training is still progressing rapidly, we might want to avoid stopping the algorithm. The reason is that if the training error is still decreasing quickly, there is more chance that the generalization losses can be somehow "repaired" and overfitting has probably not yet begun. The formalization of this reasoning is the definition of a so called _training strip of length k_, which is a training sequence of _k_ epochs ($n+1 ... n+k$, where $n$ is divisible by $k$). We can then measure the training progress (in per thousand) after a training strip as: $$P_k(t):=1000\cdot\left(\dfrac{\sum_{t'=t-k+1}^t E_{tr}(t')}{k\cdot\min_{t'=t-k+1}^t E_{tr}(t')}-1\right)$$ 

which is a comparison between the average training error during the strip and the minimum training error during the same strip.

The criterion tells to stop after the first end-of-strip epoch $t$ with $$\dfrac{GL(t)}{P_k(t)}>\alpha$$

In [10]:
def train_model(model, dataloader, loss_fn, optimizer, num_epochs, alpha=None, k=None, l1_rate=None, performance=accuracy, device=None): # note: I've added a generic performance to replace accuracy and an object where to store the trajectory and the device on which to run our training

    # establish device
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Training on {device}")

    if l1_rate is not None:
        print("Training with L1 regularization")
    
    if alpha is not None and k is None:
        print("Training with early stopping: GL")

    if k is not None:
        print("Training with early stopping: PL")
        j = 1
        strip = []

    model.to(device)
    model.train()

    E_opt = None

    # epoch loop
    for epoch in range(num_epochs):
        loss_meter = AverageMeter()
        performance_meter = AverageMeter()

        train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device, l1_rate)

        print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum} - average: {loss_meter.avg}; Performance: {performance_meter.avg}")

        if alpha is not None or k is not None:
            E_va, _ = test_model(model, testloader, loss_fn=loss_fn)
            E_opt = E_va if E_opt is None else min(E_opt, E_va)
            GL = 100 * (E_va/E_opt - 1)
            if k is None and GL > alpha:
                print(f"GL = {GL} > {alpha} - Stop training")
                return loss_meter.sum, performance_meter.avg
            
            if k is not None:
                strip.append(loss_meter.sum)
                if j % k != 0:
                    j += 1
                else:
                    print(GL, strip, sum(strip), min(strip))
                    Pk = 1000 * (sum(strip)/(k*min(strip)) - 1)
                    if GL/Pk > alpha:
                        print(f"GL/Pk = {GL/Pk} > {alpha} - Stop training")
                        return loss_meter.sum, performance_meter.avg
                    else:
                        print(f"GL/Pk = {GL/Pk} < {alpha} - Continue training")
                        j = 1
                        strip = []                       

    return loss_meter.sum, performance_meter.avg

In [11]:
minibatch_size_train = 256
minibatch_size_test = 512

trainloader, testloader, trainset, testset = mnist.get_data(batch_size_train=minibatch_size_test, batch_size_test=minibatch_size_test)

learn_rate = 0.1
num_epochs = 30

model = MLP()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learn_rate)

train_loss, train_acc = train_model(model, trainloader, loss_fn, optimizer, num_epochs, alpha=0.5, k=5, device="cuda:0")

Training on cuda:0
Training with early stopping: PL
Epoch 1 completed. Loss - total: 96991.61274909973 - average: 1.6165268791516623; Performance: 0.45568333333333333
TESTING - loss 39627.57803916931 - performance 0.7799833333333334
Epoch 2 completed. Loss - total: 29560.72462081909 - average: 0.4926787436803182; Performance: 0.846
TESTING - loss 28653.597908973694 - performance 0.8504166666666667
Epoch 3 completed. Loss - total: 20760.78544807434 - average: 0.346013090801239; Performance: 0.89635
TESTING - loss 21107.402194023132 - performance 0.89295
Epoch 4 completed. Loss - total: 16382.692322731018 - average: 0.273044872045517; Performance: 0.9178666666666667
TESTING - loss 16101.186066627502 - performance 0.9167833333333333
Epoch 5 completed. Loss - total: 14067.8447971344 - average: 0.23446407995224; Performance: 0.9294833333333333
TESTING - loss 16175.528463363647 - performance 0.91615
0.4617200026663326 [96991.61274909973, 29560.72462081909, 20760.78544807434, 16382.6923227310