<a href="https://colab.research.google.com/github/yesoly/MachineLearningProject/blob/master/Assignment_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Optimal Selection of the hyper-parameters associated with the classification on MNIST
Choose an optimal set of hyper-parameters and design a neural network for the classification of MNIST dataset

In [None]:
import copy
import os
import time

# load data
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# train
import torch
from torch import nn, optim
from torch.nn import functional as F
import numpy as np

# visualization
import matplotlib.pyplot as plt
import pandas as pd

Check whether a CUDA GPU is available and select the device to run on.

In [None]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Device: {}'.format(device))

## 1. Data
* you can use any data normalisation method
* one example of data normalisation is whitening, as given by:

In [None]:
# Preprocessing applied to every image: convert to a tensor, then standardise
# with mean 0.1307 and standard deviation 0.3081.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

* load the MNIST dataset
* use the original training dataset for testing your model
* use the original testing dataset for training your model

In [None]:
data_path = './MNIST'

# The assignment deliberately swaps the official splits: the MNIST "train"
# split (train=True) is held out for testing and the "test" split
# (train=False) is used for training.
data_test   = datasets.MNIST(root = data_path, train= True, download=True, transform= transform)
data_train  = datasets.MNIST(root = data_path, train= False, download=True, transform= transform)

# Mini-batch loaders read by train_model(). Training batches are reshuffled
# every epoch; evaluation order does not matter, so it is left unshuffled and
# uses a larger batch because no gradients are stored there.
batch_size = 64
train_dl = DataLoader(data_train, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(data_test, batch_size=1000, shuffle=False)

* Note that the number of your training data must be 10,000
* Note that the number of your testing data must be 60,000

In [None]:
# Report the split sizes and fail fast if they do not match the swapped-split
# sizes the assignment requires (10,000 for training, 60,000 for testing).
print("the number of your training data (must be 10,000) = ", len(data_train))
print("the number of your testing data (must be 60,000) = ", len(data_test))
assert len(data_train) == 10000, "training set must contain 10,000 samples"
assert len(data_test) == 60000, "testing set must contain 60,000 samples"

## 2. Model

* design a neural network architecture with three layers (input layer, one hidden layer and output layer)
* the input dimension of the input layer should be 784 (28 * 28)
* the output dimension of the output layer should be 10 (class of digits)
* all the layers should be fully connected layers
* use any type of activation functions

In [None]:
class classification(nn.Module):
    """Fully connected MNIST classifier: 784 -> dim_hidden1 -> dim_hidden2 -> 10.

    Args:
        dim_hidden1: width of the first fully connected layer (default 20*20).
        dim_hidden2: width of the second fully connected layer (default 10*10).

    The forward pass returns raw, unnormalised class scores (logits) of shape
    [batchSize, 10]. No activation is applied to the output layer because the
    notebook trains with nn.CrossEntropyLoss, which normalises the scores
    itself.
    """

    def __init__(self, dim_hidden1=20 * 20, dim_hidden2=10 * 10):
        super(classification, self).__init__()

        # construct layers for a neural network
        self.classifier1 = nn.Sequential(
            nn.Linear(in_features=28*28, out_features=dim_hidden1),
            nn.ReLU(),
        )
        self.classifier2 = nn.Sequential(
            nn.Linear(in_features=dim_hidden1, out_features=dim_hidden2),
            nn.ReLU(),
        )
        # Kept as a Sequential so all three stages are addressed the same way
        # (classifierN[0] is always the Linear layer).
        self.classifier3 = nn.Sequential(
            nn.Linear(in_features=dim_hidden2, out_features=10),
        )

    def forward(self, inputs):                 # [batchSize, 1, 28, 28]
        """Flatten each image and return the class logits, [batchSize, 10]."""
        x = inputs.view(inputs.size(0), -1)    # [batchSize, 28*28]
        x = self.classifier1(x)                # [batchSize, dim_hidden1]
        x = self.classifier2(x)                # [batchSize, dim_hidden2]
        out = self.classifier3(x)              # [batchSize, 10]

        return out


## 3. Loss function
* use any type of loss function
* design the output of the output layer considering your loss function

In [None]:
# Multi-class classification loss, averaged over the mini-batch
# ('mean' is the library default, spelled out here for the reader).
criterion = nn.CrossEntropyLoss(reduction='mean')

## 4. Optimization
* use any stochastic gradient descent algorithm for the optimization
* use any size of the mini-batch
* use any optimization algorithm (for example, Momentum, AdaGrad, RMSProp, Adam)
* use any regularization algorithm (for example, Dropout, Weight Decay)
* use any annealing scheme for the learning rate (for example, constant, decay, staircase)

In [None]:
# Build the classifier and place its parameters on the selected device in one
# step; the bare name on the last line keeps the cell's module summary output.
net = classification().to(device)
net

In [None]:
# Initial learning rate shared by the optimizer and its schedule.
lr = 1e-4

# RMSprop over all network parameters.
optimizer = optim.RMSprop(params=net.parameters(), lr=lr, alpha=0.9)

# Staircase annealing: multiply the learning rate by 0.8 every 5 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.8)

## 5. Training

In [None]:
def accuracy(out, y):
    """Return ``(fraction_correct, batch_size)`` for one batch.

    Args:
        out: per-class scores, one row per sample; the predicted class is the
            index of the largest score along dim 1.
        y: true class indices, one per sample.

    Returns:
        A tuple of the mean accuracy as a Python float and ``len(y)``.
    """
    predicted = out.argmax(dim=1)
    hit_rate = predicted.eq(y).float().mean()
    return hit_rate.item(), len(y)


def loss_batch(net, x, y, loss_fn=None):
    """Return ``(loss_value, batch_size)`` for one batch.

    Args:
        net: callable mapping the input batch ``x`` to model outputs.
        x: input batch.
        y: targets for ``x``.
        loss_fn: callable ``loss_fn(outputs, y)`` returning a scalar tensor.
            Defaults to the notebook-level ``criterion``; the original code
            referenced a ``loss_fn`` name that was never defined.

    Returns:
        A tuple of the loss as a Python float and ``len(x)``.
    """
    if loss_fn is None:
        loss_fn = criterion
    loss = loss_fn(net(x), y)
    return loss.item(), len(x)


def train_model():
    """Train the notebook-level ``net`` for ``epochs`` epochs and plot the results.

    Reads these notebook-level names: ``net``, ``criterion``, ``optimizer``,
    ``scheduler``, ``device``, ``train_dl``, ``valid_dl`` and ``epochs``.

    After every epoch the model is evaluated on ``valid_dl``. The weights with
    the highest validation accuracy seen so far are remembered and loaded back
    into ``net`` before the function returns. Two figures are shown at the
    end: validation loss and validation accuracy per epoch.

    Returns:
        None.
    """
    print('Train model')
    best_model_wts = copy.deepcopy(net.state_dict())
    best_acc = 0.0
    loss_show = []
    acc_show = []

    for epoch in range(epochs):
        epoch_since = time.time()
        print("Epoch {}:".format(epoch), end=' ')

        # ---- training pass -------------------------------------------------
        net.train()
        for b_x, b_y in train_dl:
            b_x, b_y = b_x.to(device), b_y.to(device)
            optimizer.zero_grad()
            loss = criterion(net(b_x), b_y)
            loss.backward()
            optimizer.step()

        # ---- validation pass -----------------------------------------------
        # One forward pass per batch feeds both the loss and the accuracy.
        net.eval()
        losses, accs, nums = [], [], []
        with torch.no_grad():
            for x, y in valid_dl:
                x, y = x.to(device), y.to(device)
                out = net(x)
                losses.append(criterion(out, y).item())
                batch_acc, batch_n = accuracy(out, y)
                accs.append(batch_acc)
                nums.append(batch_n)

        # Per-batch means are weighted by batch size so a smaller final batch
        # does not skew the epoch average.
        total = np.sum(nums)
        val_loss = np.sum(np.multiply(losses, nums)) / total
        val_acc = np.sum(np.multiply(accs, nums)) / total

        epoch_time_elapsed = time.time() - epoch_since
        print("time:{:.0f}s".format(epoch_time_elapsed), "loss:{:.10f}".format(val_loss), "accuracy:{:.10f}".format(val_acc))

        if val_acc > best_acc:
            best_acc = val_acc
            best_model_wts = copy.deepcopy(net.state_dict())

        # StepLR counts epochs itself. Passing val_loss here (as the original
        # did) would be taken as the deprecated ``epoch`` argument and
        # overwrite the scheduler's epoch counter with the loss value.
        scheduler.step()

        loss_show.append(val_loss)
        acc_show.append(val_acc)

        # Every 10th epoch, roll the network back to the best weights so far.
        if epoch % 10 == 9:
            net.load_state_dict(best_model_wts)
            # torch.save(net.state_dict(), net_path)

    net.load_state_dict(best_model_wts)
    # Save your model if you want
    # torch.save(net.state_dict(), net_path)

    epoch_axis = range(len(loss_show))

    plt.figure()
    plt.plot(epoch_axis, loss_show)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.title('Validation loss')

    plt.figure()
    plt.plot(epoch_axis, acc_show)
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.title('Validation accuracy')

    plt.show()
    


# Number of passes over the training set; train_model() reads this global,
# so it must be assigned before the call below.
# NOTE(review): the original note said "Set epoch to 30 to get 99.4% accuracy";
# that figure is not demonstrated anywhere in this notebook — confirm before
# relying on it.
epochs = 10
train_model()

# Submission

1. Plot the training and testing losses over epochs [2pt]

2. Plot the training and testing accuracies over epochs [2pt]

3. Print the final training and testing losses at convergence [2pt]

4. Print the final training and testing accuracies at convergence [20pt]