In [1]:
# import libs that we will use
import os
import numpy as np
import matplotlib.pyplot as plt
import math
import torch as th 
import torch.nn as nn 
import torch.nn.functional as F

# To load the data we will use the script of Gaetan Marceau Caron
# You can download it from the course webiste and move it to the same directory that contains this ipynb file
import dataset_loader

%matplotlib inline

In [2]:
# # Download mnist dataset 
# if("mnist.pkl.gz" not in os.listdir(".")):
#     !wget http://deeplearning.net/data/mnist/mnist.pkl.gz

# if you have it somewhere else, you can comment the lines above
# and overwrite the path below
mnist_path = "../mnist.pkl.gz"

In [3]:
# load the 3 splits
train_data, dev_data, test_data = dataset_loader.load_mnist(mnist_path)

# Part one: MNIST classification with Pytorch
The goal of the first part is to learn how to use Pytorch and to observe the impact of regularization during training. You should test different network architectures, e.g. with hidden layers of size 128-128, 128-64-32-16, 256-128-64-32-16, 512-256-128-64-32-16, 800-800, and different activation functions (tanh, relu, sigmoid).

Remember that Pytorch expects data in a different format than in the previous lab exercise: the first dimension is always the batch dimension.

In [4]:
image = th.from_numpy(train_data[0][0])
print(image.shape) # flat image of dim (784,)

# reshape the tensor so it is represented as a batch containing a single image
# -1 means "all remaining elements", here it would be equivalent to image.reshape(1, 784)
image = image.reshape(1, -1)
print(image.shape) # flat image of dim (1, 784)

torch.Size([784])
torch.Size([1, 784])


In [5]:
# Constructing a batched input
batch_size = 10
first = 20

# the cat() function concatenates a list of tensor along a dimension
batch_input = th.cat(
    [
        # we reshape the image tensor so it has dimension (1, 784)
        th.from_numpy(image).reshape(1, -1)
        for image in train_data[0][first:first + batch_size]
    ],
    # we want to concatenate on the batch dimension
    dim=0
)
print(batch_input.shape)  # batch of ten flat images (10, 784)

torch.Size([10, 784])


## 1.2. Layer initialization¶
By default, Pytorch will apply Kaiming initialization to linear layers. However, I recommend you to always explicitly initialize you network by hand in the constructor.

In [6]:
linear = th.nn.Linear(10, 20, bias=bias)

# initialization are always in-place operations
# linear.weight is a Parameter, linear.weight.data is the tensor containing the parameter values
th.nn.init.xavier_uniform_(linear.weight.data)  # Xavier/Glorot init for tanh
th.nn.init.kaiming_uniform_(linear.weight.data)  # Kaiming/He init for tanh

if bias:
    th.nn.init.zeros_(linear.bias.data)

NameError: name 'bias' is not defined

## 1.3. Regularization

You can try two types of regularization (they can be combined together):

weight decay:��it is a parameter of the optimizer
dropout: see slides

## 1.4. Gradient clipping

A commong trick for training neural networks is gradient clipping: if the norm of the gradient is too big, we rescale the gradient. This trick can be used to prevent exploding gradients and also to make "too big steps" in the wrong direction due the use of approximate gradient computation in SGD.

In [10]:
batch_loss.backward()  # compute gradient
torch.nn.utils.clip_grad_value_(network.parameters(), 5.)  # clip gradient if its norm exceed 5
optimizer.step()  # update parameters

NameError: name 'batch_loss' is not defined

In [93]:
class Model_1(nn.Module):
    def __init__(self):
        super(Model_1, self).__init__()
        self.dense1 = nn.Linear(784, 128)
        self.dense2 = nn.Linear(128, 10)

        self.dropout = nn.Dropout(0.5)
        nn.init.xavier_uniform_(self.dense1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense1.bias.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense2.bias.data)  # Xavier/Glorot init for tanh
        
    def forward(self, x):
        x = F.relu(self.dense1(x))
        x = self.dropout(x)
        return F.softmax(self.dense2(x), dim=1)
    
class Model_2(nn.Module):
    def __init__(self):
        super(Model_2, self).__init__()
        self.dense1 = nn.Linear(784, 10)
        nn.init.xavier_uniform_(self.dense1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense1.bias.data)  # Xavier/Glorot init for tanh
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        return F.softmax(self.dropout(self.dense1(x)), dim=1)
    
# 256-128-64-32-16    
class Model_3(nn.Module):
    def __init__(self):
        super(Model_3, self).__init__()
        self.dense1 = nn.Linear(784, 256)
        self.dense2 = nn.Linear(256, 128)
        self.dense3 = nn.Linear(128, 64)
        self.dense4 = nn.Linear(64, 32)
        self.dense5 = nn.Linear(32, 16)
        self.dense6 = nn.Linear(16, 10)
        
        nn.init.xavier_uniform_(self.dense1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense1.bias.data)  # Xavier/Glorot init for tanh
        
        nn.init.xavier_uniform_(self.dense2.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense2.bias.data)  # Xavier/Glorot init for tanh
        
        nn.init.xavier_uniform_(self.dense3.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense3.bias.data)  # Xavier/Glorot init for tanh
        
        nn.init.xavier_uniform_(self.dense4.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense4.bias.data)  # Xavier/Glorot init for tanh
        
        nn.init.xavier_uniform_(self.dense5.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense5.bias.data)  # Xavier/Glorot init for tanh
        
        nn.init.xavier_uniform_(self.dense6.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense6.bias.data)  # Xavier/Glorot init for tanh
        

        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = F.relu(self.dense1(x))
#         x = self.dropout(x)
        x = F.relu(self.dense2(x))
#         x = self.dropout(x)
        x = F.relu(self.dense3(x))
#         x = self.dropout(x)
        x = F.relu(self.dense4(x))
#         x = self.dropout(x)
        x = F.relu(self.dense5(x))
#         x = self.dropout(x)
        return F.softmax(self.dropout(self.dense6(x)), dim=1)
    
class Model_4(nn.Module):
    def __init__(self):
        super(Model_4, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

In [25]:
def get_batches(batch_size):
    for i in range(0, len(train_data[0]), batch_size):
        yield train_data[0][i:i+batch_size], train_data[1][i:i+batch_size]

In [72]:
lr = 1e-2
model1 = Model_1()
batch_size = 10
fn = nn.NLLLoss()
max_epoch = 10

In [61]:
model2 = Model_2()

In [81]:
model3 = Model_3()

In [95]:
model4 = Model_4()

In [94]:
def train(model, batch_size, max_epochs=5, conv=False):
    optim = th.optim.SGD(params=model.parameters(), lr =lr, weight_decay=1e-4)
    model.train()
    for i in range(max_epoch):
        np.random.shuffle(idx)
        accuracy = 0
        mean_loss = 0
        batches = get_batches(batch_size)
        for x, y in batches:
            if conv :
                x = x.reshape(batch_size, 1, 28,28)
            data = th.from_numpy(x)#.reshape(x.shape[0],x.shape[1],1))
            label = th.LongTensor(y)
            pred = model(data)
            loss = fn(pred, label)

            mean_loss+=loss.item()
            optim.zero_grad()
            loss.backward()
            optim.step()
            _,p = th.max(pred,1)

            accuracy+=len([1 for i, j in zip(label, p) if i == j])
        print("EPOCH {}".format(i+1))
        print("Accuracy : ",accuracy/len(train_data[0]))
        print("Mean loss : ",mean_loss/len(train_data[0]))
        print("----------------------------------------")

In [73]:
train(model1, 10)

EPOCH 1
Accuracy :  0.47382
Mean loss :  -1899.6645382866263
----------------------------------------
EPOCH 2
Accuracy :  0.70156
Mean loss :  -3281.501865029335
----------------------------------------
EPOCH 3
Accuracy :  0.75574
Mean loss :  -3605.012201502919
----------------------------------------
EPOCH 4
Accuracy :  0.8016
Mean loss :  -3854.4228487312794
----------------------------------------
EPOCH 5
Accuracy :  0.81262
Mean loss :  -3942.9916238188744
----------------------------------------
EPOCH 6
Accuracy :  0.82266
Mean loss :  -4006.8974404633045
----------------------------------------
EPOCH 7
Accuracy :  0.82558
Mean loss :  -4037.788662701845
----------------------------------------
EPOCH 8
Accuracy :  0.83016
Mean loss :  -4068.9664451777935
----------------------------------------
EPOCH 9
Accuracy :  0.83254
Mean loss :  -4086.0620645284653
----------------------------------------
EPOCH 10
Accuracy :  0.83536
Mean loss :  -4103.097301006317
-------------------------

In [51]:
train(model2, 10)



EPOCH 1
Accuracy :  0.49768
Mean loss :  -2482.722016237676
----------------------------------------
EPOCH 2
Accuracy :  0.49992
Mean loss :  -2498.326222091913
----------------------------------------
EPOCH 3
Accuracy :  0.50252
Mean loss :  -2508.5511846393347
----------------------------------------
EPOCH 4
Accuracy :  0.50498
Mean loss :  -2524.6669537201524
----------------------------------------
EPOCH 5
Accuracy :  0.50866
Mean loss :  -2543.958852380514
----------------------------------------
EPOCH 6
Accuracy :  0.50936
Mean loss :  -2547.138530880213
----------------------------------------
EPOCH 7
Accuracy :  0.514
Mean loss :  -2568.0107804089785
----------------------------------------
EPOCH 8
Accuracy :  0.51412
Mean loss :  -2563.2451852038503
----------------------------------------


KeyboardInterrupt: 

In [82]:
train(model3, 10)

EPOCH 1
Accuracy :  0.35084
Mean loss :  -0.03299043372314423
----------------------------------------
EPOCH 2
Accuracy :  0.51066
Mean loss :  -0.05074212283015251
----------------------------------------
EPOCH 3
Accuracy :  0.53664
Mean loss :  -0.05367386571764946
----------------------------------------
EPOCH 4
Accuracy :  0.54456
Mean loss :  -0.05465708997815848
----------------------------------------
EPOCH 5
Accuracy :  0.55368
Mean loss :  -0.055373174402713776
----------------------------------------
EPOCH 6
Accuracy :  0.55954
Mean loss :  -0.056128567058146
----------------------------------------
EPOCH 7
Accuracy :  0.55802
Mean loss :  -0.056090947017371655
----------------------------------------
EPOCH 8
Accuracy :  0.5637
Mean loss :  -0.05654029991388321
----------------------------------------
EPOCH 9
Accuracy :  0.56546
Mean loss :  -0.05670385486841202
----------------------------------------
EPOCH 10
Accuracy :  0.57232
Mean loss :  -0.057393597545027736
----------

In [96]:
train(model4, 10, conv=True)

EPOCH 1
Accuracy :  0.55366
Mean loss :  -0.04659384910196066
----------------------------------------
EPOCH 2
Accuracy :  0.84
Mean loss :  -0.08310517079651356
----------------------------------------
EPOCH 3
Accuracy :  0.93874
Mean loss :  -0.09306505871772766
----------------------------------------
EPOCH 4
Accuracy :  0.9578
Mean loss :  -0.09525022202134133
----------------------------------------
EPOCH 5
Accuracy :  0.96704
Mean loss :  -0.0962427853679657
----------------------------------------
EPOCH 6
Accuracy :  0.9724
Mean loss :  -0.09682691614866257
----------------------------------------
EPOCH 7
Accuracy :  0.97638
Mean loss :  -0.09727106639027595
----------------------------------------
EPOCH 8
Accuracy :  0.97918
Mean loss :  -0.09759943298101426
----------------------------------------
EPOCH 9
Accuracy :  0.98174
Mean loss :  -0.09786574044585228
----------------------------------------
EPOCH 10
Accuracy :  0.98324
Mean loss :  -0.09806167255043984
----------------

In [99]:
th.save(model4.state_dict(), "conv_dict.pth")

## 1.4. Bonus: Convolutional Neural Network

You can try to rely on a CNN instead of a MLP to classify MNIST images (you can still have a single layer MLP on top of convolutions, after pooling!). Note that this will requires you to reshape the input images!

# Part 2: Variational Auto-Encoder

To build a new Variational Auto-Encoder, you need two networks:

- An encoder that will take as input an image and compute the parameters of list of Normal distributions
- A decoder that will take a sample from each Normal distribution and will output an image

For simplicity we will assume that:

- each network as a single hidden layer of size 100
- the latent space contains only 2 points

To understand exactly what a VAE is, you can:


- check the slides of Michèle Sebag
- check this tutorial: https://arxiv.org/abs/1606.05908

## 1.2. Encoder
- Compute an hidden representation:  $z = relu(W^1x+b^1)$
- Compute the means of the normal distributions:  $mu=W^2x+b^2$
- Compute the log variance of the normal distributions:  $log_{sigmasquared}=W^3x+b^3$

## 1.2. Decoder
This a simple MLP, nothing new here!

## 1.3. Training loss
To compute the training loss, you must compute two terms:

- a Monte-Carlo estimation of the reconstruction loss
- the KL divergence between the distributions computed by the encoder and the prior

To sample values, you can use the reparameterization trick as follows:

In [None]:
e = torch.normal(0, 1., mu.shape)
z = mu + e * torch.sqrt(torch.exp(log_sigma_squared))

For the reconstruction loss, use the Binary Cross Entropy loss:

In [97]:
loss_builder = torch.nn.BCEWithLogitsLoss(reduction="sum")

NameError: name 'torch' is not defined

The formula of the KL divergence with the prior is as follows:

In [None]:
-0.5 * torch.sum(1 + log_sigma_squared - mu.pow(2) - log_sigma_squared.exp())

## 1.4. Recomended hyper parameters
- Optimizer: Adam
- N. epochs: 50
- Use gradient clipping!
- Large batch size, e.g. 128

In [None]:
# use itertools.chain to join parameters of the two networks
optimizer = torch.optim.Adam(itertools.chain(encoder.parameters(), decoder.parameters()))
torch.nn.utils.clip_grad_value_(itertools.chain(encoder.parameters(), decoder.parameters()), 5.)

## 1.5. Generate new images
Note: they will be blurry, but that's ok!

In [None]:
e = torch.normal(0, 1., (10, 2))
images = decoder(e).sigmoid()

for i in range(10):
    picture = images[i].clone().detach().numpy()
    plt.imshow(picture.reshape(28,28), cmap='Greys')
    plt.show()