<a href="https://colab.research.google.com/github/venomouscyanide/dl_sain/blob/master/week3/week3_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gdown==3.13.0

Collecting gdown==3.13.0
  Downloading https://files.pythonhosted.org/packages/52/b9/d426f164f35bb50d512a77d6a7c5eb70b2bea3459dc10f73f130ba732810/gdown-3.13.0.tar.gz
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gdown
  Building wheel for gdown (PEP 517) ... [?25l[?25hdone
  Created wheel for gdown: filename=gdown-3.13.0-cp37-none-any.whl size=9046 sha256=c491804ff49821c78205fa5ec5cb544a3c6d8f6b4193864618cec7cc33032a82
  Stored in directory: /root/.cache/pip/wheels/ba/fa/c5/12813d7496f34652c43a471e11a780e769889d06e34735c32e
Successfully built gdown
Installing collected packages: gdown
  Found existing installation: gdown 3.6.4
    Uninstalling gdown-3.6.4:
      Successfully uninstalled gdown-3.6.4
Successfully installed gdown-3.13.0


## All Imports

In [2]:
import gzip
import shutil
import copy
import random
from typing import Tuple, List, Union
import numpy as np
import struct

# third party
import gdown

## Add Dataloader to get the validation/testing/training MNIST datasets

In [3]:
class MNISTDataLoader:
    # explanation of idx file formats: http://yann.lecun.com/exdb/mnist/
    # help wrt parsing data: https://stackoverflow.com/a/53181925

    TRAINING_DATA_URL: str = 'https://drive.google.com/uc?id=1pmI9wAdNtJkOvkJpdTqM9bmIAwPkGyMU'
    TRAINING_DATA_LABELS_URL: str = 'https://drive.google.com/uc?id=1R8BZL67U1N0GUGnf6AQIBZNVDCWO9QLS'
    TESTING_DATA_URL: str = 'https://drive.google.com/uc?id=10FdcUHw3BcQAU6keKaUwtDwJm4sC00Hu'
    TESTING_DATA_LABELS_URL: str = 'https://drive.google.com/uc?id=1GvsacEnI1eQ1vYZM-oYdERvaE2SPh0Lj'

    def load_data_wrapper(self):
        testing_data_tuple = self.load_data_as_ndarray(self.TESTING_DATA_URL, self.TESTING_DATA_LABELS_URL, False)
        training_data_tuple = self.load_data_as_ndarray(self.TRAINING_DATA_URL, self.TRAINING_DATA_LABELS_URL, True)
        return training_data_tuple, testing_data_tuple

    def load_data_wrapper_with_validation(self):
        testing_data_tuple = self.load_data_as_ndarray(self.TESTING_DATA_URL, self.TESTING_DATA_LABELS_URL, False)
        training_data_tuple = self.load_data_as_ndarray(self.TRAINING_DATA_URL, self.TRAINING_DATA_LABELS_URL, True)

        slice_threshold = len(training_data_tuple) - len(testing_data_tuple)
        training_data_tuple, validation_data_tuple = training_data_tuple[:slice_threshold], training_data_tuple[
                                                                                            slice_threshold:]
        self._convert_label_to_int(validation_data_tuple)
        return training_data_tuple, testing_data_tuple, validation_data_tuple

    def load_data_as_ndarray(self, data_file_url: str, data_labels_file_url: str, train: bool) -> List[
        Tuple[np.ndarray, Union[int, np.ndarray]]]:
        uncompressed_dataset = self._download_and_uncompressed_file(data_file_url)
        uncompressed_labels = self._download_and_uncompressed_file(data_labels_file_url)
        pixel_data = self._get_pixel_data(uncompressed_dataset)
        label_data = self._get_labels(uncompressed_labels)
        zipped_data = [
            (x.reshape(784, 1), self._one_hot_enc(y) if train else y[0]) for x, y in zip(pixel_data, label_data)
        ]
        return zipped_data

    def _one_hot_enc(self, y: np.ndarray):
        one_hot_vector = np.zeros((10, 1))
        one_hot_vector[y[0]][0] = 1
        return one_hot_vector

    def _download_and_uncompressed_file(self, url: str) -> str:
        downloaded_gzip = gdown.download(url)
        decompressed_data_file = self._write_decompressed_data(downloaded_gzip)
        return decompressed_data_file

    def _write_decompressed_data(self, downloaded_gzip: str) -> str:
        with gzip.open(downloaded_gzip, 'rb') as compressed:
            uncompressed_dataset = downloaded_gzip.replace('.gz', '')
            with open(uncompressed_dataset, 'wb') as decompressed:
                shutil.copyfileobj(compressed, decompressed)
        return uncompressed_dataset

    def _get_pixel_data(self, data_file: str) -> np.ndarray:
        with open(data_file, "rb") as dataset:
            _, num_data = struct.unpack(">II", dataset.read(8))
            num_rows, num_colums = struct.unpack(">II", dataset.read(8))
            pixel_data = np.fromfile(dataset, dtype=np.uint8) / 255
            pixel_data = pixel_data.reshape((num_data, num_rows * num_colums))
        return pixel_data

    def _get_labels(self, data_labels_file: str) -> Union[np.ndarray, int]:
        with open(data_labels_file, "rb") as labels:
            _, num_data = struct.unpack(">II", labels.read(8))
            label_data = np.fromfile(labels, dtype=np.uint8)
            label_data = label_data.reshape((num_data, -1))
        return label_data

    def _convert_label_to_int(self, validation_data_tuple: List[Tuple[np.ndarray, np.ndarray]]):
        for index, (inp, label) in enumerate(validation_data_tuple):
            validation_data_tuple[index] = (inp, label.argmax())

## Add network utility functions

In [4]:
class NetworkUtils:
    # Replace sigmoid with relu for hidden and softmax for output layer
    @staticmethod
    def relu(z: np.ndarray) -> np.ndarray:
        return np.maximum(z, 0.0)

    @staticmethod
    def relu_prime(z: np.ndarray) -> np.ndarray:
        return (z > 0.0) * 1

    @staticmethod
    def softmax(z):
        exp_z = np.exp(z)
        return exp_z / sum(exp_z)

## Init the network class

Changes from week2:
- Add dropout

In [5]:
class Network:
    TESTING_DATA_TYPE: str = "testing"
    TRAINING_DATA_TYPE: str = "training"

    def __init__(self, training_data: List[Tuple[np.ndarray, np.ndarray]],
                 testing_data: List[Tuple[np.ndarray, int]],
                 size: List[int], learning_rate: float, epochs: int,
                 mini_batch_size: int, lmda: int, training_accuracy_print: bool, testing_accuracy_print: bool):
        self.training_data = training_data
        self.testing_data = testing_data
        self.size = size
        self.num_layers = len(size)
        self.learning_rate = learning_rate
        self.biases = []
        self.weights = []
        self.retained_indices = {}
        self._init_weights()
        self._init_biases()
        self.epochs = epochs
        self.mini_batch_size = mini_batch_size
        self.lmda = lmda
        self.testing_accuracy_print = testing_accuracy_print
        self.training_accuracy_print = training_accuracy_print

    def _init_biases(self):
        for i in range(1, self.num_layers):
            self.biases.append(np.random.randn(self.size[i], 1))

    def _init_weights(self):
        bias_matrix_sizes = [(self.size[x + 1], self.size[x]) for x in range(self.num_layers - 1)]
        # Init weights by dividing by sqrt of each neuron's input size
        for x, y in bias_matrix_sizes:
            std_dev = 1 / np.sqrt(y)
            self.weights.append(np.random.randn(x, y) * std_dev)

    def _init_dropout_data(self):
        # Init dropout
        self.retained_indices = {}
        self._init_dropout_weights()
        self._init_dropout_biases()

    def _init_dropout_biases(self):
        # Retain biases only for the retained weights
        self.dropout_biases = copy.deepcopy(self.biases)
        for layer in range(self.num_layers - 2):
            indices_retained = self.retained_indices[layer + 1]
            self.dropout_biases[layer] = self.biases[layer][indices_retained]

    def _init_dropout_weights(self):
        # Add dropout to the weights
        self.dropout_weights = copy.deepcopy(self.weights)
        for layer in range(self.num_layers - 1):
            weight = self.dropout_weights[layer]

            input_size = np.size(weight, 1)
            indices_retained = random.sample(range(input_size), int(Hyperparameters.DROPOUT_RETAIN * input_size))
            indices_retained.sort()
            self.retained_indices[layer] = indices_retained
            self.dropout_weights[layer] = weight[:, indices_retained]

            if layer > 0:
                previous_weight = self.dropout_weights[layer - 1]
                self.dropout_weights[layer - 1] = previous_weight[indices_retained]

    def train(self):
        for epoch in range(self.epochs):
            # init dropout for each epoch
            self._init_dropout_data()
            np.random.shuffle(self.training_data)
            print(f"Start training for epoch: {epoch + 1} of {self.epochs}")

            mini_batches = self._create_mini_batches()

            for batch, mini_batch in enumerate(mini_batches, start=1):
                self._update_b_w(mini_batch)

            self._update_wt_bias()
            if self.training_accuracy_print:
                self._calc_accuracy(epoch + 1, self.training_data, self.TRAINING_DATA_TYPE)
            if self.testing_accuracy_print:
                self._calc_accuracy(epoch + 1, self.testing_data, self.TESTING_DATA_TYPE)

    def _create_mini_batches(self) -> List[List[Tuple[np.ndarray, np.ndarray]]]:
        mini_batches = [
            self.training_data[multiple:multiple + self.mini_batch_size] for multiple in
            range(0, len(self.training_data), self.mini_batch_size)
        ]
        return mini_batches

    def _update_b_w(self, mini_batch: List[Tuple[np.ndarray, np.ndarray]]):
        nabla_bias = self._get_nabla_bias_zeroes()
        nabla_wt = self._get_nabla_wt_zeroes()

        for x, y in mini_batch:
            del_bias, del_wt = self._run_back_propagation(x, y)

            nabla_bias = [curr_b + del_b for curr_b, del_b in zip(nabla_bias, del_bias)]
            nabla_wt = [curr_wt + del_w for curr_wt, del_w in zip(nabla_wt, del_wt)]

        self.dropout_biases = [
            b - ((self.learning_rate / self.mini_batch_size) * nb) for b, nb in zip(self.dropout_biases, nabla_bias)
        ]
        # Add L2 normalization
        self.dropout_weights = [
            np.dot(w, 1 - (self.learning_rate * self.lmda) / len(self.training_data)) -
            ((self.learning_rate / self.mini_batch_size) * nw) for w, nw in zip(self.dropout_weights, nabla_wt)
        ]

    def _get_nabla_bias_zeroes(self) -> List[np.ndarray]:
        return [np.zeros(np.shape(bias)) for bias in self.dropout_biases]

    def _get_nabla_wt_zeroes(self) -> List[np.ndarray]:
        return [np.zeros(np.shape(wt)) for wt in self.dropout_weights]

    def _run_back_propagation(self, x: np.ndarray, y: np.ndarray) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        nabla_bias = self._get_nabla_bias_zeroes()
        nabla_wt = self._get_nabla_wt_zeroes()

        activations, z_list = self.feedforward(x[self.retained_indices[0]], self.dropout_weights, self.dropout_biases)
        # Delta for cross entropy
        error_l = self._delta_cross_entropy(activations[-1], y)

        nabla_bias[-1] = error_l
        nabla_wt[-1] = np.dot(error_l, np.transpose(activations[-2]))

        for layer in range(self.num_layers - 2, 0, -1):
            error_l = np.multiply(
                np.dot(np.transpose(self.dropout_weights[layer]), error_l), NetworkUtils.relu_prime(z_list[layer - 1])
            )

            nabla_bias[layer - 1] = error_l
            nabla_wt[layer - 1] = np.dot(error_l, activations[layer - 1].transpose())

        return nabla_bias, nabla_wt

    def _delta_cross_entropy(self, a_l: np.ndarray, y: np.ndarray) -> np.ndarray:
        return a_l - y

    def _update_wt_bias(self):
        self._update_wt()
        self._update_bias()

    def _update_wt(self):
        # Update the weights
        for layer in range(self.num_layers - 1):
            y_indices_for_wt = self.retained_indices[layer]
            if layer + 1 < self.num_layers - 1:
                x_indices_for_wt = self.retained_indices[layer + 1]

                temp = self.weights[layer].copy()[:, y_indices_for_wt]
                temp[x_indices_for_wt] = self.dropout_weights[layer]
                self.weights[layer][:, y_indices_for_wt] = temp
            else:
                self.weights[layer][:, y_indices_for_wt] = self.dropout_weights[layer]

    def _update_bias(self):
        # Update the biases
        for layer in range(self.num_layers - 2):
            x_indices_for_wt = self.retained_indices[layer + 1]
            self.biases[layer][x_indices_for_wt] = self.dropout_biases[layer]

    def _calc_accuracy(self, epoch: int, data: List[Tuple[np.ndarray, Union[np.ndarray, int]]], data_type: str):
        correct_results = 0
        total_results = len(data)
        for x, y in data:
            if data_type == self.TRAINING_DATA_TYPE:
                y = y.argmax()
            activations, _ = self.feedforward(x, self.weights, self.biases)
            logit = activations[-1]
            if logit.argmax() == y:
                correct_results += 1
        accuracy = round((correct_results / total_results) * 100, 2)
        print(
            f"Accuracy on {data_type} data for epoch {epoch}: {accuracy}"
        )

    def feedforward(self, x: np.ndarray, wt: List[np.ndarray], bias: List[np.ndarray]) -> \
            Tuple[List[np.ndarray], List[np.ndarray]]:
        a = x
        activations, z_list = list(), list()
        activations.append(x)
        self._set_relu_activations(a, z_list, activations, wt, bias)
        self._set_softmax_activation(activations[-1], z_list, activations, wt, bias)
        return activations, z_list

    def _set_relu_activations(self, a: np.ndarray, z_list: List[np.ndarray], activations: List[np.ndarray],
                              wt: List[np.ndarray], bias: List[np.ndarray]):
        for layer in range(self.num_layers - 2):
            # hidden layers(relu activation)
            z = np.dot(wt[layer], a) + bias[layer]
            z_list.append(z)
            a = NetworkUtils.relu(z)
            activations.append(a)

    def _set_softmax_activation(self, a: np.ndarray, z_list: List[np.ndarray], activations: List[np.ndarray],
                                wt: List[np.ndarray], bias: List[np.ndarray]):
        # output layer(softmax activation)
        z = np.dot(wt[-1], a) + bias[-1]
        z_list.append(z)
        a = NetworkUtils.softmax(z)
        activations.append(a)

## Get datasets

In [6]:
training, testing, validation = MNISTDataLoader().load_data_wrapper_with_validation()

Downloading...
From: https://drive.google.com/uc?id=10FdcUHw3BcQAU6keKaUwtDwJm4sC00Hu
To: /content/t10k-images-idx3-ubyte.gz
100%|██████████| 1.65M/1.65M [00:00<00:00, 113MB/s]
Downloading...
From: https://drive.google.com/uc?id=1GvsacEnI1eQ1vYZM-oYdERvaE2SPh0Lj
To: /content/t10k-labels-idx1-ubyte.gz
100%|██████████| 4.54k/4.54k [00:00<00:00, 5.24MB/s]
Downloading...
From: https://drive.google.com/uc?id=1pmI9wAdNtJkOvkJpdTqM9bmIAwPkGyMU
To: /content/train-images-idx3-ubyte.gz
9.91MB [00:00, 45.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1R8BZL67U1N0GUGnf6AQIBZNVDCWO9QLS
To: /content/train-labels-idx1-ubyte.gz
100%|██████████| 28.9k/28.9k [00:00<00:00, 5.42MB/s]


## Tune the hyperparameters on validation data and a subset of the training data

### [Validation dataset] 5 eta; 3 layer; 100 mini_batch; 100% neuron retention; 25 epochs

---


Start out trial with 3 level deep network and learning rate = 5 and by using subsets slices of the training data and validation data

In [21]:
class Hyperparameters:
    SIZE: List[int] = [784, 30, 10]
    LEARNING_RATE: float = 5
    EPOCHS: int = 25
    MINI_BATCH_SIZE: int = 100
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 1

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training[: 1000], validation[:1000], params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()


Hyperparameters set are as follows 
 SIZE: [784, 30, 10] 
 LEARNING_RATE: 5 
 EPOCHS: 25 
 MINI_BATCH_SIZE: 100 
 LMDA: 5 
 DROPOUT_RETAIN: 1
Start training for epoch: 1 of 25
Accuracy on training data for epoch 1: 9.7
Accuracy on testing data for epoch 1: 10.0
Start training for epoch: 2 of 25
Accuracy on training data for epoch 2: 9.7
Accuracy on testing data for epoch 2: 10.0
Start training for epoch: 3 of 25
Accuracy on training data for epoch 3: 9.7
Accuracy on testing data for epoch 3: 10.0
Start training for epoch: 4 of 25
Accuracy on training data for epoch 4: 11.6
Accuracy on testing data for epoch 4: 12.2
Start training for epoch: 5 of 25
Accuracy on training data for epoch 5: 9.7
Accuracy on testing data for epoch 5: 10.0
Start training for epoch: 6 of 25
Accuracy on training data for epoch 6: 9.7
Accuracy on testing data for epoch 6: 10.0
Start training for epoch: 7 of 25
Accuracy on training data for epoch 7: 9.7
Accuracy on testing data for epoch 7: 10.0
Start training fo

### [Validation dataset] 1 eta; 3 layer; 100 mini_batch; 100% neuron retention; 10 epochs

---


No learning occurs, try with reduced learning rate of 1

In [22]:
class Hyperparameters:
    SIZE: List[int] = [784, 30, 10]
    LEARNING_RATE: float = 1
    EPOCHS: int = 10
    MINI_BATCH_SIZE: int = 100
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 1

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training[: 1000], validation[:1000], params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()


Hyperparameters set are as follows 
 SIZE: [784, 30, 10] 
 LEARNING_RATE: 1 
 EPOCHS: 10 
 MINI_BATCH_SIZE: 100 
 LMDA: 5 
 DROPOUT_RETAIN: 1
Start training for epoch: 1 of 10
Accuracy on training data for epoch 1: 32.9
Accuracy on testing data for epoch 1: 28.9
Start training for epoch: 2 of 10
Accuracy on training data for epoch 2: 40.9
Accuracy on testing data for epoch 2: 36.5
Start training for epoch: 3 of 10
Accuracy on training data for epoch 3: 44.9
Accuracy on testing data for epoch 3: 37.6
Start training for epoch: 4 of 10
Accuracy on training data for epoch 4: 51.8
Accuracy on testing data for epoch 4: 46.4
Start training for epoch: 5 of 10
Accuracy on training data for epoch 5: 62.6
Accuracy on testing data for epoch 5: 53.2
Start training for epoch: 6 of 10
Accuracy on training data for epoch 6: 38.2
Accuracy on testing data for epoch 6: 31.9
Start training for epoch: 7 of 10
Accuracy on training data for epoch 7: 50.2
Accuracy on testing data for epoch 7: 43.3
Start train

### [Validation dataset] 0.1 eta; 3 layer; 100 mini_batch; 100% neuron retention; 20 epochs

---


Better results, try lowering to 0.1

In [23]:
class Hyperparameters:
    SIZE: List[int] = [784, 30, 10]
    LEARNING_RATE: float = 0.1
    EPOCHS: int = 20
    MINI_BATCH_SIZE: int = 100
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 1

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training[: 1000], validation[:1000], params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()


Hyperparameters set are as follows 
 SIZE: [784, 30, 10] 
 LEARNING_RATE: 0.1 
 EPOCHS: 20 
 MINI_BATCH_SIZE: 100 
 LMDA: 5 
 DROPOUT_RETAIN: 1
Start training for epoch: 1 of 20
Accuracy on training data for epoch 1: 48.2
Accuracy on testing data for epoch 1: 45.1
Start training for epoch: 2 of 20
Accuracy on training data for epoch 2: 68.1
Accuracy on testing data for epoch 2: 62.1
Start training for epoch: 3 of 20
Accuracy on training data for epoch 3: 77.5
Accuracy on testing data for epoch 3: 68.6
Start training for epoch: 4 of 20
Accuracy on training data for epoch 4: 81.0
Accuracy on testing data for epoch 4: 73.8
Start training for epoch: 5 of 20
Accuracy on training data for epoch 5: 83.2
Accuracy on testing data for epoch 5: 76.7
Start training for epoch: 6 of 20
Accuracy on training data for epoch 6: 84.9
Accuracy on testing data for epoch 6: 78.5
Start training for epoch: 7 of 20
Accuracy on training data for epoch 7: 85.8
Accuracy on testing data for epoch 7: 80.0
Start tra

### [Validation dataset] 0.01 eta; 3 layer(deeper hidden); 100 mini_batch; 100% neuron retention; 20 epochs

---


Try increasing the number of neurons in the hidden layer from 30 to 100 & reducing learning rate to 0.01


In [24]:
class Hyperparameters:
    SIZE: List[int] = [784, 100, 10]
    LEARNING_RATE: float = 0.01
    EPOCHS: int = 20
    MINI_BATCH_SIZE: int = 100
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 1

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training[: 1000], validation[:1000], params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()

Hyperparameters set are as follows 
 SIZE: [784, 100, 10] 
 LEARNING_RATE: 0.01 
 EPOCHS: 20 
 MINI_BATCH_SIZE: 100 
 LMDA: 5 
 DROPOUT_RETAIN: 1
Start training for epoch: 1 of 20
Accuracy on training data for epoch 1: 9.3
Accuracy on testing data for epoch 1: 8.3
Start training for epoch: 2 of 20
Accuracy on training data for epoch 2: 17.6
Accuracy on testing data for epoch 2: 13.7
Start training for epoch: 3 of 20
Accuracy on training data for epoch 3: 28.0
Accuracy on testing data for epoch 3: 23.2
Start training for epoch: 4 of 20
Accuracy on training data for epoch 4: 39.7
Accuracy on testing data for epoch 4: 31.2
Start training for epoch: 5 of 20
Accuracy on training data for epoch 5: 47.7
Accuracy on testing data for epoch 5: 37.4
Start training for epoch: 6 of 20
Accuracy on training data for epoch 6: 51.2
Accuracy on testing data for epoch 6: 41.7
Start training for epoch: 7 of 20
Accuracy on training data for epoch 7: 54.8
Accuracy on testing data for epoch 7: 44.0
Start tra

### [Validation dataset] 0.01 eta; 4 layer(deeper hidden); 10 mini_batch; 100% neuron retention; 20 epochs

---


Slower learning observed, reducing batch size in SGD to 10
Also make the network deeper by one more hidden layer

In [28]:
class Hyperparameters:
    SIZE: List[int] = [784, 100, 100, 10]
    LEARNING_RATE: float = 0.01
    EPOCHS: int = 20
    MINI_BATCH_SIZE: int = 10
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 1

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training[: 2000], validation[:2000], params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()

Hyperparameters set are as follows 
 SIZE: [784, 100, 100, 10] 
 LEARNING_RATE: 0.01 
 EPOCHS: 20 
 MINI_BATCH_SIZE: 10 
 LMDA: 5 
 DROPOUT_RETAIN: 1
Start training for epoch: 1 of 20
Accuracy on training data for epoch 1: 65.0
Accuracy on testing data for epoch 1: 60.85
Start training for epoch: 2 of 20
Accuracy on training data for epoch 2: 80.7
Accuracy on testing data for epoch 2: 77.85
Start training for epoch: 3 of 20
Accuracy on training data for epoch 3: 84.5
Accuracy on testing data for epoch 3: 82.7
Start training for epoch: 4 of 20
Accuracy on training data for epoch 4: 87.25
Accuracy on testing data for epoch 4: 84.4
Start training for epoch: 5 of 20
Accuracy on training data for epoch 5: 88.25
Accuracy on testing data for epoch 5: 84.2
Start training for epoch: 6 of 20
Accuracy on training data for epoch 6: 89.5
Accuracy on testing data for epoch 6: 85.0
Start training for epoch: 7 of 20
Accuracy on training data for epoch 7: 90.75
Accuracy on testing data for epoch 7: 87.

### [Validation dataset] 0.01 eta; 4 layer(deeper hidden); 10 mini_batch; 80% neuron retention; 20 epochs

---


Better results with lower mini batch size observed, introduce dropout to retain 80% of neurons

In [29]:
class Hyperparameters:
    SIZE: List[int] = [784, 100, 100, 10]
    LEARNING_RATE: float = 0.01
    EPOCHS: int = 20
    MINI_BATCH_SIZE: int = 10
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 0.80

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training[: 2000], validation[:2000], params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()

Hyperparameters set are as follows 
 SIZE: [784, 100, 100, 10] 
 LEARNING_RATE: 0.01 
 EPOCHS: 20 
 MINI_BATCH_SIZE: 10 
 LMDA: 5 
 DROPOUT_RETAIN: 0.8
Start training for epoch: 1 of 20
Accuracy on training data for epoch 1: 21.0
Accuracy on testing data for epoch 1: 19.95
Start training for epoch: 2 of 20
Accuracy on training data for epoch 2: 53.9
Accuracy on testing data for epoch 2: 50.85
Start training for epoch: 3 of 20
Accuracy on training data for epoch 3: 53.35
Accuracy on testing data for epoch 3: 49.6
Start training for epoch: 4 of 20
Accuracy on training data for epoch 4: 65.65
Accuracy on testing data for epoch 4: 62.65
Start training for epoch: 5 of 20
Accuracy on training data for epoch 5: 66.6
Accuracy on testing data for epoch 5: 62.35
Start training for epoch: 6 of 20
Accuracy on training data for epoch 6: 70.2
Accuracy on testing data for epoch 6: 66.7
Start training for epoch: 7 of 20
Accuracy on training data for epoch 7: 77.05
Accuracy on testing data for epoch 7:

### [Validation dataset] 0.01 eta; 4 layer(deeper hidden); 10 mini_batch; 70% neuron retention; 20 epochs

---


Similar results if not worse, further reduce dropout to 70%

In [31]:
class Hyperparameters:
    SIZE: List[int] = [784, 100, 100, 10]
    LEARNING_RATE: float = 0.01
    EPOCHS: int = 20
    MINI_BATCH_SIZE: int = 10
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 0.70

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training[: 2000], validation[:2000], params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()

Hyperparameters set are as follows 
 SIZE: [784, 100, 100, 10] 
 LEARNING_RATE: 0.01 
 EPOCHS: 20 
 MINI_BATCH_SIZE: 10 
 LMDA: 5 
 DROPOUT_RETAIN: 0.7
Start training for epoch: 1 of 20
Accuracy on training data for epoch 1: 23.0
Accuracy on testing data for epoch 1: 22.75
Start training for epoch: 2 of 20
Accuracy on training data for epoch 2: 52.55
Accuracy on testing data for epoch 2: 49.85
Start training for epoch: 3 of 20
Accuracy on training data for epoch 3: 55.45
Accuracy on testing data for epoch 3: 53.35
Start training for epoch: 4 of 20
Accuracy on training data for epoch 4: 63.1
Accuracy on testing data for epoch 4: 59.75
Start training for epoch: 5 of 20
Accuracy on training data for epoch 5: 64.65
Accuracy on testing data for epoch 5: 61.8
Start training for epoch: 6 of 20
Accuracy on training data for epoch 6: 63.9
Accuracy on testing data for epoch 6: 62.55
Start training for epoch: 7 of 20
Accuracy on training data for epoch 7: 73.8
Accuracy on testing data for epoch 7

### [Validation dataset] 0.01 eta; 4 layer(deeper hidden); 10 mini_batch; 70% neuron retention; 100 epochs

---


Try increasing epochs from 20 to 100 to see if networks keeps learning

In [35]:
class Hyperparameters:
    SIZE: List[int] = [784, 100, 100, 10]
    LEARNING_RATE: float = 0.01
    EPOCHS: int = 100
    MINI_BATCH_SIZE: int = 10
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 0.70

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training[: 2000], validation[:2000], params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()

Hyperparameters set are as follows 
 SIZE: [784, 100, 100, 10] 
 LEARNING_RATE: 0.01 
 EPOCHS: 100 
 MINI_BATCH_SIZE: 10 
 LMDA: 5 
 DROPOUT_RETAIN: 0.7
Start training for epoch: 1 of 100
Accuracy on training data for epoch 1: 39.85
Accuracy on testing data for epoch 1: 36.45
Start training for epoch: 2 of 100
Accuracy on training data for epoch 2: 57.2
Accuracy on testing data for epoch 2: 56.35
Start training for epoch: 3 of 100
Accuracy on training data for epoch 3: 40.3
Accuracy on testing data for epoch 3: 37.75
Start training for epoch: 4 of 100
Accuracy on training data for epoch 4: 57.55
Accuracy on testing data for epoch 4: 55.95
Start training for epoch: 5 of 100
Accuracy on training data for epoch 5: 65.85
Accuracy on testing data for epoch 5: 62.05
Start training for epoch: 6 of 100
Accuracy on training data for epoch 6: 66.05
Accuracy on testing data for epoch 6: 63.3
Start training for epoch: 7 of 100
Accuracy on training data for epoch 7: 72.3
Accuracy on testing data fo

### [Testing dataset] 0.01 eta; 4 layer(deeper hidden); 10 mini_batch; 70% neuron retention; 100 epochs


---


Promising results, maybe with higher epochs will go higher. Verify data with testing data

In [36]:
class Hyperparameters:
    SIZE: List[int] = [784, 100, 100, 10]
    LEARNING_RATE: float = 0.01
    EPOCHS: int = 100
    MINI_BATCH_SIZE: int = 10
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 0.70

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training[: 2000], testing[:2000], params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()

Hyperparameters set are as follows 
 SIZE: [784, 100, 100, 10] 
 LEARNING_RATE: 0.01 
 EPOCHS: 100 
 MINI_BATCH_SIZE: 10 
 LMDA: 5 
 DROPOUT_RETAIN: 0.7
Start training for epoch: 1 of 100
Accuracy on training data for epoch 1: 24.45
Accuracy on testing data for epoch 1: 21.6
Start training for epoch: 2 of 100
Accuracy on training data for epoch 2: 44.3
Accuracy on testing data for epoch 2: 41.35
Start training for epoch: 3 of 100
Accuracy on training data for epoch 3: 42.35
Accuracy on testing data for epoch 3: 38.3
Start training for epoch: 4 of 100
Accuracy on training data for epoch 4: 51.55
Accuracy on testing data for epoch 4: 46.7
Start training for epoch: 5 of 100
Accuracy on training data for epoch 5: 63.05
Accuracy on testing data for epoch 5: 57.75
Start training for epoch: 6 of 100
Accuracy on training data for epoch 6: 65.6
Accuracy on testing data for epoch 6: 63.55
Start training for epoch: 7 of 100
Accuracy on training data for epoch 7: 60.15
Accuracy on testing data for

### [Testing dataset] 0.01 eta; 4 layer(deeper hidden); 10 mini_batch; 70% neuron retention; 200 epochs

---


Results hold on testing data subset as well, run with current setting on whole training and testing datasets with epochs=200

In [37]:
class Hyperparameters:
    SIZE: List[int] = [784, 100, 100, 10]
    LEARNING_RATE: float = 0.01
    EPOCHS: int = 200
    MINI_BATCH_SIZE: int = 10
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 0.70

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training, testing, params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()

Hyperparameters set are as follows 
 SIZE: [784, 100, 100, 10] 
 LEARNING_RATE: 0.01 
 EPOCHS: 200 
 MINI_BATCH_SIZE: 10 
 LMDA: 5 
 DROPOUT_RETAIN: 0.7
Start training for epoch: 1 of 200
Accuracy on training data for epoch 1: 87.38
Accuracy on testing data for epoch 1: 88.19
Start training for epoch: 2 of 200
Accuracy on training data for epoch 2: 88.86
Accuracy on testing data for epoch 2: 88.87
Start training for epoch: 3 of 200
Accuracy on training data for epoch 3: 88.39
Accuracy on testing data for epoch 3: 88.71
Start training for epoch: 4 of 200
Accuracy on training data for epoch 4: 85.23
Accuracy on testing data for epoch 4: 85.8
Start training for epoch: 5 of 200
Accuracy on training data for epoch 5: 90.48
Accuracy on testing data for epoch 5: 90.37
Start training for epoch: 6 of 200
Accuracy on training data for epoch 6: 91.53
Accuracy on testing data for epoch 6: 91.55
Start training for epoch: 7 of 200
Accuracy on training data for epoch 7: 88.77
Accuracy on testing data

### [Testing dataset] 0.01 eta; 4 layer(deeper hidden); 10 mini_batch; 80% neuron retention; 100 epochs
Previous setting shows a peak accuray of around ~97% on testing data. Try decreasing dropout

In [7]:
class Hyperparameters:
    SIZE: List[int] = [784, 100, 100, 10]
    LEARNING_RATE: float = 0.01
    EPOCHS: int = 100
    MINI_BATCH_SIZE: int = 10
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 0.85

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training, testing, params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()

Hyperparameters set are as follows 
 SIZE: [784, 100, 100, 10] 
 LEARNING_RATE: 0.01 
 EPOCHS: 100 
 MINI_BATCH_SIZE: 10 
 LMDA: 5 
 DROPOUT_RETAIN: 0.85
Start training for epoch: 1 of 100
Accuracy on training data for epoch 1: 90.88
Accuracy on testing data for epoch 1: 91.36
Start training for epoch: 2 of 100
Accuracy on training data for epoch 2: 91.12
Accuracy on testing data for epoch 2: 91.18
Start training for epoch: 3 of 100
Accuracy on training data for epoch 3: 90.21
Accuracy on testing data for epoch 3: 90.38
Start training for epoch: 4 of 100
Accuracy on training data for epoch 4: 90.82
Accuracy on testing data for epoch 4: 90.94
Start training for epoch: 5 of 100
Accuracy on training data for epoch 5: 89.5
Accuracy on testing data for epoch 5: 89.26
Start training for epoch: 6 of 100
Accuracy on training data for epoch 6: 91.81
Accuracy on testing data for epoch 6: 91.19
Start training for epoch: 7 of 100
Accuracy on training data for epoch 7: 93.31
Accuracy on testing dat

### [Testing dataset] 0.01 eta; 4 layer(deeper hidden); 10 mini_batch; 90% neuron retention; 100 epochs
More consistent results observed, use lesser dropout

In [9]:
class Hyperparameters:
    SIZE: List[int] = [784, 100, 100, 10]
    LEARNING_RATE: float = 0.01
    EPOCHS: int = 100
    MINI_BATCH_SIZE: int = 10
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 0.9

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training, testing, params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()

Hyperparameters set are as follows 
 SIZE: [784, 100, 100, 10] 
 LEARNING_RATE: 0.01 
 EPOCHS: 100 
 MINI_BATCH_SIZE: 10 
 LMDA: 5 
 DROPOUT_RETAIN: 0.9
Start training for epoch: 1 of 100
Accuracy on training data for epoch 1: 89.75
Accuracy on testing data for epoch 1: 90.32
Start training for epoch: 2 of 100
Accuracy on training data for epoch 2: 91.83
Accuracy on testing data for epoch 2: 92.39
Start training for epoch: 3 of 100
Accuracy on training data for epoch 3: 90.61
Accuracy on testing data for epoch 3: 90.91
Start training for epoch: 4 of 100
Accuracy on training data for epoch 4: 93.14
Accuracy on testing data for epoch 4: 93.02
Start training for epoch: 5 of 100
Accuracy on training data for epoch 5: 93.74
Accuracy on testing data for epoch 5: 93.54
Start training for epoch: 6 of 100
Accuracy on training data for epoch 6: 93.34
Accuracy on testing data for epoch 6: 92.82
Start training for epoch: 7 of 100
Accuracy on training data for epoch 7: 93.21
Accuracy on testing dat

### [Testing dataset] 0.01 eta; 4 layer(deeper hidden); 10 mini_batch; 100% neuron retention; 100 epochs

---

Accuray with dropout setting seems to not be that consistent. Remove dropout and try

In [10]:
class Hyperparameters:
    SIZE: List[int] = [784, 100, 100, 10]
    LEARNING_RATE: float = 0.01
    EPOCHS: int = 100
    MINI_BATCH_SIZE: int = 10
    # Add lambda hyperparameter
    LMDA: int = 5
    # Add dropout to the weights. Specify the % of neurons to retain
    DROPOUT_RETAIN: float = 1.0

    def __str__(self) -> str:
        str_rep = ""
        str_rep += "Hyperparameters set are as follows"
        for hyper_param in self.__annotations__:
            str_rep += f' \n {hyper_param}: {getattr(self, hyper_param)}'
        return str_rep

params = Hyperparameters()
print(params)
mlp = Network(training, testing, params.SIZE, params.LEARNING_RATE, params.EPOCHS,
              params.MINI_BATCH_SIZE,
              params.LMDA, training_accuracy_print=True, testing_accuracy_print=True)
mlp.train()

Hyperparameters set are as follows 
 SIZE: [784, 100, 100, 10] 
 LEARNING_RATE: 0.01 
 EPOCHS: 100 
 MINI_BATCH_SIZE: 10 
 LMDA: 5 
 DROPOUT_RETAIN: 1.0
Start training for epoch: 1 of 100
Accuracy on training data for epoch 1: 91.49
Accuracy on testing data for epoch 1: 91.32
Start training for epoch: 2 of 100
Accuracy on training data for epoch 2: 94.01
Accuracy on testing data for epoch 2: 94.01
Start training for epoch: 3 of 100
Accuracy on training data for epoch 3: 95.35
Accuracy on testing data for epoch 3: 94.98
Start training for epoch: 4 of 100
Accuracy on training data for epoch 4: 96.01
Accuracy on testing data for epoch 4: 95.59
Start training for epoch: 5 of 100
Accuracy on training data for epoch 5: 96.64
Accuracy on testing data for epoch 5: 95.99
Start training for epoch: 6 of 100
Accuracy on training data for epoch 6: 97.05
Accuracy on testing data for epoch 6: 96.25
Start training for epoch: 7 of 100
Accuracy on training data for epoch 7: 97.48
Accuracy on testing dat

Peak accuracy of **~97.7%** seen on testing data. Results were more consistent without dropout. Beyond certain epoch(46th) training data accuracy is at 100% and testing data accuracy does not improve much.