In [2]:
# import libs that we will use
import os
import numpy as np
import matplotlib.pyplot as plt
import math
import torch as th 
import torch.nn as nn 
import torch.nn.functional as F

# To load the data we will use the script of Gaetan Marceau Caron
# You can download it from the course webiste and move it to the same directory that contains this ipynb file
import dataset_loader

%matplotlib inline

In [2]:
# # Download mnist dataset 
# if("mnist.pkl.gz" not in os.listdir(".")):
#     !wget http://deeplearning.net/data/mnist/mnist.pkl.gz

# if you have it somewhere else, you can comment the lines above
# and overwrite the path below
mnist_path = "../mnist.pkl.gz"

In [3]:
# load the 3 splits
train_data, dev_data, test_data = dataset_loader.load_mnist(mnist_path)

# Part one: MNIST classification with Pytorch
The goal of the first part is to learn how to use Pytorch and to observe the impact of regularization during training. You should test different network architectures, e.g. with hidden layers of size 128-128, 128-64-32-16, 256-128-64-32-16, 512-256-128-64-32-16, 800-800, and different activation functions (tanh, relu, sigmoid).

Remember that Pytorch expects data in a different format than in the previous lab exercise: the first dimension is always the batch dimension.

In [7]:
image = th.from_numpy(train_data[0][0])
print(image.shape) # flat image of dim (784,)

# reshape the tensor so it is represented as a batch containing a single image
# -1 means "all remaining elements", here it would be equivalent to image.reshape(1, 784)
image = image.reshape(1, -1)
print(image.shape) # flat image of dim (1, 784)

torch.Size([784])
torch.Size([1, 784])


In [13]:
# Constructing a batched input
batch_size = 10
first = 20

# the cat() function concatenates a list of tensor along a dimension
batch_input = th.cat(
    [
        # we reshape the image tensor so it has dimension (1, 784)
        th.from_numpy(image).reshape(1, -1)
        for image in train_data[0][first:first + batch_size]
    ],
    # we want to concatenate on the batch dimension
    dim=0
)
print(batch_input.shape)  # batch of ten flat images (10, 784)

torch.Size([10, 784])


## 1.2. Layer initialization¶
By default, Pytorch will apply Kaiming initialization to linear layers. However, I recommend you to always explicitly initialize you network by hand in the constructor.

In [9]:
linear = th.nn.Linear(10, 20, bias=bias)

# initialization are always in-place operations
# linear.weight is a Parameter, linear.weight.data is the tensor containing the parameter values
th.nn.init.xavier_uniform_(linear.weight.data)  # Xavier/Glorot init for tanh
th.nn.init.kaiming_uniform_(linear.weight.data)  # Kaiming/He init for tanh

if bias:
    th.nn.init.zeros_(linear.bias.data)

NameError: name 'bias' is not defined

## 1.3. Regularization

You can try two types of regularization (they can be combined together):

weight decay:��it is a parameter of the optimizer
dropout: see slides

## 1.4. Gradient clipping

A commong trick for training neural networks is gradient clipping: if the norm of the gradient is too big, we rescale the gradient. This trick can be used to prevent exploding gradients and also to make "too big steps" in the wrong direction due the use of approximate gradient computation in SGD.

In [10]:
batch_loss.backward()  # compute gradient
torch.nn.utils.clip_grad_value_(network.parameters(), 5.)  # clip gradient if its norm exceed 5
optimizer.step()  # update parameters

NameError: name 'batch_loss' is not defined

In [119]:
class Model_1(nn.Module):
    def __init__(self):
        super(Model_1, self).__init__()
        self.dense1 = nn.Linear(784, 128)
        self.dense2 = nn.Linear(128, 10)

        self.dropout = nn.Dropout(0.5)
        nn.init.xavier_uniform_(self.dense1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense1.bias.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense2.bias.data)  # Xavier/Glorot init for tanh
        
    def forward(self, x):
        x = F.relu(self.dense1(x))
        x = self.dropout(x)
        return F.softmax(self.dense2(x))
    
class Model_2(nn.Module):
    def __init__(self):
        super(Model_2, self).__init__()
        self.dense1 = nn.Linear(784, 10)
        nn.init.xavier_uniform_(self.dense1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.dense1.bias.data)  # Xavier/Glorot init for tanh
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        return F.softmax(self.dropout(self.dense1(x)))

In [120]:
lr = 1e-2
model1 = Model_1()
model2 = Model_2()

optim1 = th.optim.SGD(params=model1.parameters(), lr =lr, weight_decay=1e-4)
optim2 = th.optim.SGD(params=model2.parameters(), lr =lr, weight_decay=1e-4)

fn = nn.NLLLoss()
idx = np.arange(len(train_data[0]))
max_epoch = 10

In [123]:
def train(model, optim, max_epochs=5):
    for i in range(max_epoch):
        np.random.shuffle(idx)
        accuracy = 0
        mean_loss = 0
        for id in idx:

            data = th.from_numpy(train_data[0][id].reshape(1, -1))
            label = th.LongTensor([train_data[1][id]])

            pred = model(data)
            loss = fn(pred, label)

            mean_loss+=loss.item()
            optim.zero_grad()
            loss.backward()
            optim.step()
            _,p = th.max(pred,1)

            if(p==label):
                accuracy+=1
        print("EPOCH {}".format(i+1))
        print("Accuracy : ",accuracy/len(idx))
        print("Mean loss : ",mean_loss)

In [124]:
train(model1, optim1, 10)

  from ipykernel import kernelapp as app


EPOCH 1
Accuracy :  0.75972
Mean loss :  -36489.70196761612
EPOCH 2
Accuracy :  0.84682
Mean loss :  -41729.32121706661


KeyboardInterrupt: 

## 1.4. Bonus: Convolutional Neural Network

You can try to rely on a CNN instead of a MLP to classify MNIST images (you can still have a single layer MLP on top of convolutions, after pooling!). Note that this will requires you to reshape the input images!

In [12]:
t = th.rand((10, 100))  # t is batch of 10 "flat" pictures
t = t.reshape(10, 10, 10)  # we reshape t so each batch contains a 10x10 picture that is not flat