## Laboratoire 2

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import math
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from deeplib.visualization import make_vizualization_autograd

### Graphe computationnel et backprop

- Autograd
- Appeler backprop deux fois (qu'est-ce qui arrive)?
- Volatile
- `requires_grad` à True ou False : True pour les variables à entraîner, False pour les entrées et les variables gelées (« freezées »)

In [None]:
# Three independent 3x3 leaf variables, each filled in place with samples drawn
# uniformly from [-1, 1] and flagged so autograd records operations on them.
x, y, z = (
    Variable(torch.Tensor(3, 3).uniform_(-1, 1), requires_grad=True)
    for _ in range(3)
)
# Small expression combining a matrix product with element-wise sums; the
# operand order is kept as-is so the recorded graph has the same structure.
xy_product = torch.matmul(x, y)
w = xy_product + x + y + z

In [None]:
# Pass the result `w` to the visualization helper imported from
# deeplib.visualization (presumably it renders the autograd graph that
# produced `w` -- confirm against the deeplib implementation).
make_vizualization_autograd(w)

### Fonction d'activation
- Avantage de la ReLU sur sigmoid vs. tanh
- Exemple sur le vanishing gradient
- Réduction d'un réseau à plusieurs couches sans non-linéarité à un réseau à une seule couche.

In [None]:
class RandomModel(torch.nn.Module):
    """Base class for a stack of randomly initialised 5x5 linear layers.

    Subclasses provide ``forward`` and choose the activation applied between
    layers. The global torch RNG is re-seeded in the constructor so that every
    instance built with the same ``n_layers`` gets identical weights.
    """

    def __init__(self, n_layers):
        """Create ``n_layers`` linear layers of size 5 -> 5.

        Args:
            n_layers: Number of ``nn.Linear(5, 5)`` layers to stack.
        """
        super().__init__()
        # Both Tanh model and ReLU model will have the same random weights.
        # NOTE: this reseeds the *global* torch RNG as a side effect.
        torch.manual_seed(12345)
        # nn.ModuleList (instead of a plain Python list) registers the layers
        # as sub-modules, so parameters(), zero_grad(), state_dict() and
        # .to(device) actually see their weights.
        self.layers = nn.ModuleList()
        for _ in range(n_layers):
            layer = nn.Linear(5, 5)
            # Weights ~ N(0, sqrt(2 / 5)), i.e. std = sqrt(2 / fan_in) for
            # these 5-input layers; biases start at zero.
            layer.weight.data.normal_(0, math.sqrt(2 / 5))
            layer.bias.data.fill_(0)
            self.layers.append(layer)
        # Filled by print_weights_grads(): one (count, mean) tuple per layer
        # that currently has a weight gradient.
        self.nonzero_grad_stats = None

    def forward(self, x=None):
        """Abstract forward pass; concrete subclasses must override it.

        The optional ``x`` mirrors the subclasses' signature so that calling
        the base implementation with an input reports the real problem.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('Defined in children classes')

    def print_weights_grads(self):
        """Print each layer's weights and weight gradients and record statistics.

        ``self.nonzero_grad_stats`` is reset, then for every layer whose weight
        gradient is not ``None`` a tuple ``(number of nonzero gradient entries,
        mean absolute value of those entries)`` is appended. When a gradient
        has no nonzero entry the mean is NaN.
        """
        self.nonzero_grad_stats = []
        for layer_index, layer in enumerate(self.layers):
            print("-----\nLayer %d" % layer_index)
            print("Weight:\n%sWeight gradient:\n%s\n" % (str(layer.weight.data),
                                                         str(layer.weight.grad)))
            if layer.weight.grad is None:
                # No backward pass has reached this layer yet.
                continue
            grad = layer.weight.grad.data
            # Select the nonzero entries with a mask instead of indexing the
            # tensor element by element from Python.
            nonzero_values = grad[grad != 0]
            n_nonzero = int(nonzero_values.numel())
            if n_nonzero > 0:
                nonzero_grad_mean = np.mean(np.abs(nonzero_values.cpu().numpy()))
            else:
                # Mean of an empty selection: keep NaN (as a NumPy scalar, so
                # later arithmetic follows NumPy rules) without computing a
                # mean over an empty array.
                nonzero_grad_mean = np.float64("nan")
            self.nonzero_grad_stats.append((n_nonzero, nonzero_grad_mean))
            print("Number of nonzero gradient: %f" % n_nonzero)
            print("Nonzero grad mean: %f" % nonzero_grad_mean)

        
class RandomReluModel(RandomModel):
    """Stack of the random linear layers from RandomModel with ReLU in between.

    The constructor is inherited unchanged: ``RandomReluModel(n_layers)``.
    """

    def forward(self, x):
        """Run ``x`` through every layer, applying ReLU after all but the last.

        Args:
            x: Input passed to the first linear layer.

        Returns:
            The output of the last linear layer (no activation applied to it).
        """
        out = x
        for layer in self.layers[:-1]:  # All but last layer
            # Call the module itself rather than .forward() so that
            # nn.Module.__call__ (and any registered hooks) runs.
            out = F.relu(layer(out))
        return self.layers[-1](out)
        
        
        
class RandomTanhModel(RandomModel):
    """Stack of the random linear layers from RandomModel with tanh in between.

    The constructor is inherited unchanged: ``RandomTanhModel(n_layers)``.
    """

    def forward(self, x):
        """Run ``x`` through every layer, applying tanh after all but the last.

        Args:
            x: Input passed to the first linear layer.

        Returns:
            The output of the last linear layer (no activation applied to it).
        """
        out = x
        for layer in self.layers[:-1]:  # All but last layer
            # torch.tanh replaces the deprecated F.tanh; the layer is called
            # directly (not via .forward()) so nn.Module.__call__ runs.
            out = torch.tanh(layer(out))
        return self.layers[-1](out)


In [None]:
# Build one 10-layer network per activation, then dump their weights; no
# backward pass has run yet, so every printed gradient is still None.
relu_model, tanh_model = RandomReluModel(10), RandomTanhModel(10)
for model in (relu_model, tanh_model):
    model.print_weights_grads()

In [None]:
# A single random 5-dimensional input is fed to both models so that their
# outputs can be compared directly.
random_input = Variable(torch.randn(5))
# Call the modules themselves rather than .forward() so nn.Module.__call__
# (and any registered hooks) runs.
relu_output = relu_model(random_input)
tanh_output = tanh_model(random_input)
print(random_input)
print("ReLU model output:\n", relu_output)
print("tanh model output:\n", tanh_output)

In [None]:
# Use the norm of each model's output as a scalar loss to backpropagate from.
relu_loss, tanh_loss = torch.norm(relu_output), torch.norm(tanh_output)
print(relu_loss, tanh_loss)

In [None]:
# Backpropagate both losses first, then print the weights and the freshly
# populated gradients of each model (same order as the forward pass cell).
for loss in (relu_loss, tanh_loss):
    loss.backward()
for model in (relu_model, tanh_model):
    model.print_weights_grads()

In [None]:
# Number of nonzero weight-gradient entries per layer, ReLU curve first and
# tanh curve second (first element of each recorded statistics tuple).
relu_counts = [count for count, _ in relu_model.nonzero_grad_stats]
tanh_counts = [count for count, _ in tanh_model.nonzero_grad_stats]
plt.plot(np.arange(len(relu_counts)), relu_counts)
plt.plot(np.arange(len(tanh_counts)), tanh_counts)

In [None]:
# Top axes: mean absolute nonzero gradient per layer.
# Bottom axes: that mean divided by the number of nonzero entries, restricted
# to the first layers only.
fig, axs = plt.subplots(2)
max_layers_shown = 4
for model in (relu_model, tanh_model):  # ReLU is drawn first, then tanh
    stats = model.nonzero_grad_stats
    axs[0].plot(np.arange(len(stats)), [mean for _, mean in stats])
    # Slice instead of assuming at least 4 layers, so the x and y lengths
    # always match even for shallower networks.
    head = stats[:max_layers_shown]
    # A layer without any nonzero gradient has a zero count: plot NaN (a gap)
    # rather than dividing by zero.
    ratios = [mean / count if count else float("nan") for count, mean in head]
    axs[1].plot(np.arange(len(head)), ratios)
plt.show()

In [None]:
# For 1000 random inputs, count how often each weight of the first ReLU layer
# receives a nonzero gradient.
heatmap = np.zeros((5, 5))
first_layer = relu_model.layers[0]
for _ in range(1000):
    random_input = Variable(torch.randn(5))
    # Clear the gradients left by the previous backward pass; otherwise they
    # accumulate and the statistics would not be per-input. Each layer is reset
    # individually so this works however the model stores its layers.
    for layer in relu_model.layers:
        layer.zero_grad()
    # A backward pass is required for every sample: a forward pass alone
    # leaves .grad untouched, so the same stale gradient would be counted
    # 1000 times.
    sample_loss = torch.norm(relu_model(random_input))
    sample_loss.backward()
    # Add 1 to every cell whose gradient is nonzero for this input.
    heatmap += (first_layer.weight.grad.data != 0).cpu().numpy()
print(heatmap)

### Questions
- Observez la distribution du gradient lors de la backprop. Quelles différences y a-t-il entre la backprop à travers ReLU et à travers tanh?
- Est-ce que, pour deux entrées différentes, les mêmes poids ont un gradient élevé?
- Changez le nombre de couches du réseau. Qu'observez-vous?

### Couche de sortie
Voir ce qui est vu en classe et faire un exemple en lien avec ça. Idées:
- Comment utiliser softmax
- Non-linéarité après le fully-connected en sortie (erreur classique, exercice du genre trouvez l'erreur)