# Assignment 2

> Tristan PERROT

## Utils

### Imports


In [37]:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import math
import time
import os
import tqdm

np.random.seed(42)

DATASET_PATH = '../Dataset/'

In [38]:
def load_batch(filename):
    """ Copied from the dataset website """
    import pickle
    with open(DATASET_PATH + filename, 'rb') as fo:
        data = pickle.load(fo, encoding='bytes')
    return data


def load_data(filename):
    """ Read the data from the file """
    data = load_batch(filename)
    X = data[b'data'].T / 255
    y = np.array(data[b'labels'])
    Y = np.zeros((10, X.shape[1]))
    for i in range(y.shape[0]):
        Y[y[i], i] = 1
    return X, Y, y

In [39]:
def compute_relative_error(grad_analytical, grad_numerical, eps=1e-9):
    """ Compute the relative error between the analytical and numerical gradients """
    return torch.max(
        torch.abs(grad_analytical - grad_numerical) / torch.clamp(torch.abs(grad_analytical) + torch.abs(grad_numerical) + eps, min=eps))

## Exercises

### Model


In [43]:
class Classifier():
    def __init__(self, x_train, x_val, x_test, y_train, y_val, y_test, y_hot_train, y_hot_val, y_hot_test):
        self.x_train = torch.tensor(x_train)
        self.x_val = torch.tensor(x_val)
        self.x_test = torch.tensor(x_test)
        self.y_train = torch.tensor(y_train)
        self.y_val = torch.tensor(y_val)
        self.y_test = torch.tensor(y_test)
        self.y_hot_train = torch.tensor(y_hot_train)
        self.y_hot_val = torch.tensor(y_hot_val)
        self.y_hot_test = torch.tensor(y_hot_test)

        self.eta = 0.001
        self.lambd = 0.0

    def preprocess(self):
        mean_X = torch.mean(self.x_train, axis=1).reshape(-1, 1)
        std_X = torch.std(self.x_train, axis=1).reshape(-1, 1)
        self.x_train = (self.x_train - mean_X) / std_X
        self.x_val = (self.x_val - mean_X) / std_X
        self.x_test = (self.x_test - mean_X) / std_X

    def initialize_weights(self, hidden_size):
        features = self.x_train.shape[0]
        outputs = self.y_train.shape[0]

        self.W1 = torch.randn(hidden_size, features,
                              dtype=self.x_train.dtype) / math.sqrt(features)
        self.b1 = torch.zeros(hidden_size, 1, dtype=self.x_train.dtype)
        self.W2 = torch.randn(outputs, hidden_size,
                              dtype=self.x_train.dtype) / math.sqrt(hidden_size)
        self.b2 = torch.zeros(outputs, 1, dtype=self.x_train.dtype)
        self.W1.requires_grad = True
        self.b1.requires_grad = True
        self.W2.requires_grad = True
        self.b2.requires_grad = True

    def forward(self, x):
        self.z1 = self.W1 @ x + self.b1
        self.h = torch.relu(self.z1)
        self.z2 = self.W2 @ self.h + self.b2
        self.p = torch.softmax(self.z2, dim=0)
        return self.p
    
    def forward_test(self, x, W1, b1, W2, b2):
        z1 = W1 @ x + b1
        h = torch.relu(z1)
        z2 = W2 @ h + b2
        p = torch.softmax(z2, dim=0)
        return p

    def cost(self, y, W1, W2):
        return -torch.mean(y.T @ torch.log(self.p)) + self.lambd * (torch.sum(W1 ** 2) + torch.sum(W2 ** 2))

    def cost_grad(self, x, y):
        m = x.shape[1]
        self.p = self.forward(x)
        dZ2 = self.p - y
        dW2 = dZ2 @ self.h.T / m + 2 * self.lambd * self.W2
        db2 = torch.sum(dZ2, axis=1, keepdim=True) / m
        dH = self.W2.T @ dZ2
        dZ1 = dH * (self.z1 > 0)
        dW1 = dZ1 @ x.T / m + 2 * self.lambd * self.W1
        db1 = torch.sum(dZ1, axis=1, keepdim=True) / m
        return dW1, db1, dW2, db2
    
    def compute_grads_num(self, h):
        dW1 = torch.zeros(self.W1.shape)
        dW2 = torch.zeros(self.W2.shape)
        db1 = torch.zeros(self.b1.shape)
        db2 = torch.zeros(self.b2.shape)

        W1_try = self.W1.clone()
        W2_try = self.W2.clone()
        b1_try = self.b1.clone()
        b2_try = self.b2.clone()

        self.forward(self.x_train)
        c = self.cost(self.y_train, self.W1, self.W2)
        
        for i in range(self.W1.shape[0]):
            for j in range(self.W1.shape[1]):
                W1_try[i, j] = self.W1[i, j] - h
                self.forward_test(self.x_train, W1_try, self.b1, self.W2, self.b2)
                c1 = self.cost(self.y_train, W1_try, self.W2)

                W1_try[i, j] = self.W1[i, j] + h
                self.forward_test(self.x_train, W1_try, self.b1, self.W2, self.b2)
                c2 = self.cost(self.y_train, W1_try, self.W2)

                dW1[i, j] = (c2 - c1) / (2 * h)
        
        for i in range(self.W2.shape[0]):
            for j in range(self.W2.shape[1]):
                W2_try[i, j] = self.W2[i, j] - h
                self.forward_test(self.x_train, self.W1, self.b1, W2_try, self.b2)
                c1 = self.cost(self.y_train, self.W1, W2_try)

                W2_try[i, j] = self.W2[i, j] + h
                self.forward_test(self.x_train, self.W1, self.b1, W2_try, self.b2)
                c2 = self.cost(self.y_train, self.W1, W2_try)

                dW2[i, j] = (c2 - c1) / (2 * h)

        for i in range(self.b1.shape[0]):
            b1_try[i] = self.b1[i] - h
            self.forward_test(self.x_train, self.W1, b1_try, self.W2, self.b2)
            c1 = self.cost(self.y_train, self.W1, self.W2)

            b1_try[i] = self.b1[i] + h
            self.forward_test(self.x_train, self.W1, b1_try, self.W2, self.b2)
            c2 = self.cost(self.y_train, self.W1, self.W2)

            db1[i] = (c2 - c1) / (2 * h)

        for i in range(self.b2.shape[0]):
            b2_try[i] = self.b2[i] - h
            self.forward_test(self.x_train, self.W1, self.b1, self.W2, b2_try)
            c1 = self.cost(self.y_train, self.W1, self.W2)

            b2_try[i] = self.b2[i] + h
            self.forward_test(self.x_train, self.W1, self.b1, self.W2, b2_try)
            c2 = self.cost(self.y_train, self.W1, self.W2)

            db2[i] = (c2 - c1) / (2 * h)

        return dW1, db1, dW2, db2

    def check_grad(self):
        dW1, db1, dW2, db2 = self.cost_grad(self.x_train, self.y_hot_train)
        cost = self.cost(self.y_train, self.W1, self.W2)
        cost.backward()
        grad_W1 = self.W1.grad
        grad_b1 = self.b1.grad
        grad_W2 = self.W2.grad
        grad_b2 = self.b2.grad

        print('Between analytical and torch gradients:')
        print('dW1:', dW1.shape, 'grad_W1:', grad_W1.shape)
        print('db1:', db1.shape, 'grad_b1:', grad_b1.shape)
        print('dW2:', dW2.shape, 'grad_W2:', grad_W2.shape)
        print('db2:', db2.shape, 'grad_b2:', grad_b2.shape)
        print('Difference for W1:', torch.max(torch.abs(dW1 - grad_W1)))
        print('Difference for b1:', torch.max(torch.abs(db1 - grad_b1)))
        print('Difference for W2:', torch.max(torch.abs(dW2 - grad_W2)))
        print('Difference for b2:', torch.max(torch.abs(db2 - grad_b2)))
        print('Relative error for W1:', compute_relative_error(dW1, grad_W1))
        print('Relative error for b1:', compute_relative_error(db1, grad_b1))
        print('Relative error for W2:', compute_relative_error(dW2, grad_W2))
        print('Relative error for b2:', compute_relative_error(db2, grad_b2))

        dW1_num, db1_num, dW2_num, db2_num = self.compute_grads_num(1e-9)

        print('\nBetween analytical and numerical gradients:')
        print('Difference for W1:', torch.max(torch.abs(dW1 - dW1_num)))
        print('Difference for b1:', torch.max(torch.abs(db1 - db1_num)))
        print('Difference for W2:', torch.max(torch.abs(dW2 - dW2_num)))
        print('Difference for b2:', torch.max(torch.abs(db2 - db2_num)))
        print('Relative error for W1:', compute_relative_error(dW1, dW1_num))
        print('Relative error for b1:', compute_relative_error(db1, db1_num))
        print('Relative error for W2:', compute_relative_error(dW2, dW2_num))
        print('Relative error for b2:', compute_relative_error(db2, db2_num))

        print('\nBetween torch and numerical gradients:')
        print('Difference for W1:', torch.max(torch.abs(grad_W1 - dW1_num)))
        print('Difference for b1:', torch.max(torch.abs(grad_b1 - db1_num)))
        print('Difference for W2:', torch.max(torch.abs(grad_W2 - dW2_num)))
        print('Difference for b2:', torch.max(torch.abs(grad_b2 - db2_num)))
        print('Relative error for W1:', compute_relative_error(grad_W1, dW1_num))
        print('Relative error for b1:', compute_relative_error(grad_b1, db1_num))
        print('Relative error for W2:', compute_relative_error(grad_W2, dW2_num))
        print('Relative error for b2:', compute_relative_error(grad_b2, db2_num))


In [46]:
X_train, Y_train, y_train = load_data('data_batch_1')
X_val, Y_val, y_val = load_data('data_batch_2')
X_test, Y_test, y_test = load_data('test_batch')

n = 10
dim = 50
m = 5

X_train = X_train[:dim, :n]
Y_train = Y_train[:, :n]
y_train = y_train[:n]
X_val = X_val[:dim, :n]
Y_val = Y_val[:, :n]
y_val = y_val[:n]
X_test = X_test[:dim, :n]
Y_test = Y_test[:, :n]
y_test = y_test[:n]

self = Classifier(X_train, X_val, X_test, Y_train,
                        Y_val, Y_test, y_train, y_val, y_test)
self.preprocess()
self.initialize_weights(m)
print(X_train.shape, Y_train.shape, y_train.shape)
print(X_val.shape, Y_val.shape, y_val.shape)
print(X_test.shape, Y_test.shape, y_test.shape)
print(self.W1.shape, self.b1.shape,
      self.W2.shape, self.b2.shape)

(50, 10) (10, 10) (10,)
(50, 10) (10, 10) (10,)
(50, 10) (10, 10) (10,)
torch.Size([5, 50]) torch.Size([5, 1]) torch.Size([10, 5]) torch.Size([10, 1])


In [47]:
self.check_grad()

Between analytical and torch gradients:
dW1: torch.Size([5, 50]) grad_W1: torch.Size([5, 50])
db1: torch.Size([5, 1]) grad_b1: torch.Size([5, 1])
dW2: torch.Size([10, 5]) grad_W2: torch.Size([10, 5])
db2: torch.Size([10, 1]) grad_b2: torch.Size([10, 1])
Difference for W1: tensor(4.7554, dtype=torch.float64, grad_fn=<MaxBackward1>)
Difference for b1: tensor(5.3112, dtype=torch.float64, grad_fn=<MaxBackward1>)
Difference for W2: tensor(2.7605, dtype=torch.float64, grad_fn=<MaxBackward1>)
Difference for b2: tensor(5.0000, dtype=torch.float64, grad_fn=<MaxBackward1>)
Relative error for W1: tensor(1.0000, dtype=torch.float64, grad_fn=<MaxBackward1>)
Relative error for b1: tensor(1.0000, dtype=torch.float64, grad_fn=<MaxBackward1>)
Relative error for W2: tensor(1.0000, dtype=torch.float64, grad_fn=<MaxBackward1>)
Relative error for b2: tensor(1.0000, dtype=torch.float64, grad_fn=<MaxBackward1>)

Between analytical and numerical gradients:
Difference for W1: tensor(4.7623, dtype=torch.float64