# Colab Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [2]:
"""
Change directory to where this file is located
"""
#%cd 'COPY&PASTE FILE DIRECTORY HERE'

'\nChange directory to where this file is located\n'

In [3]:
!unzip mnist.zipy


unzip:  cannot find or open mnist.zipy, mnist.zipy.zip or mnist.zipy.ZIP.


# Import Modules

In [1]:
import copy
import numpy as np
import matplotlib.pyplot as plt
from mnist.data_utils import load_data
import os

#Utils

In [2]:
def sigmoid(z):
    """
    Do NOT modify this function
    """
    return 1/(1+np.exp(-z))

def softmax(X):
    """
    Do NOT modify this function
    """
    logit = np.exp(X-np.amax(X, axis=1, keepdims=True))
    numer = logit
    denom = np.sum(logit, axis=1, keepdims=True)
    return numer/denom

def load_batch(X, Y, batch_size, shuffle=True):
    """
    Generates batches with the remainder dropped.

    Do NOT modify this function
    """
    if shuffle:
        permutation = np.random.permutation(X.shape[0])
        X = X[permutation, :]
        Y = Y[permutation, :]
    num_steps = int(X.shape[0])//batch_size
    step = 0
    while step<num_steps:
        X_batch = X[batch_size*step:batch_size*(step+1)]
        Y_batch = Y[batch_size*step:batch_size*(step+1)]
        step+=1
        yield X_batch, Y_batch



#2-Layer Neural Network

In [3]:
class TwoLayerNN:
    """ a neural network with 2 layers """

    def __init__(self, input_dim, num_hiddens, num_classes):
        """
        Do NOT modify this function.
        """
        self.input_dim = input_dim
        self.num_hiddens = num_hiddens
        self.num_classes = num_classes
        self.params = self.initialize_parameters(input_dim, num_hiddens, num_classes)
    def initialize_parameters(self, input_dim, num_hiddens, num_classes):
        """
        initializes parameters with Xavier Initialization.

        Question (a)
        - refer to https://paperswithcode.com/method/xavier-initialization for Xavier initialization 
        
        Inputs
        - input_dim
        - num_hiddens
        - num_classes
        Returns
        - params: a dictionary with the initialized parameters.
        """
        params = {}
        params["W1"]=np.random.uniform(-1/np.sqrt(input_dim), 1/np.sqrt(input_dim),(input_dim,num_hiddens))
        params["W2"]=np.random.uniform(-1/np.sqrt(num_hiddens), 1/np.sqrt(num_hiddens),(num_hiddens,num_classes))
        params["b1"]=np.zeros(num_hiddens)
        params["b2"]=np.zeros(num_classes)


        return params

    def forward(self, X):
        """
        Define and perform the feed forward step of a two-layer neural network.
        Specifically, the network structue is given by

          y = softmax(sigmoid(X W1 + b1) W2 + b2)

        where X is the input matrix of shape (N, D), y is the class distribution matrix
        of shape (N, C), N is the number of examples (either the entire dataset or
        a mini-batch), D is the feature dimensionality, and C is the number of classes.

        Question (b)
        - ff_dict will be used to run backpropagation in backward method.

        Inputs
        - X: the input matrix of shape (N, D)

        Returns
        - y: the output of the model
        - ff_dict: a dictionary with all the fully connected units and activations.
        """
        ff_dict = {}
        ff_dict=self.params
        ff_dict["h"]=sigmoid((np.matmul(X,ff_dict["W1"])+ff_dict["b1"]))

        ff_dict["minus h"]=1-sigmoid((np.matmul(X,ff_dict["W1"])+ff_dict["b1"]))

        y=softmax(np.matmul(ff_dict["h"],ff_dict["W2"])+ff_dict["b2"])
        ff_dict["y"]=y
        
        return y, ff_dict

    def backward(self, X, Y, ff_dict):
        """
        Performs backpropagation over the two-layer neural network, and returns
        a dictionary of gradients of all model parameters.

        Question (c)

        Inputs:
         - X: the input matrix of shape (B, D), where B is the number of examples
              in a mini-batch, D is the feature dimensionality.
         - Y: the matrix of one-hot encoded ground truth classes of shape (B, C),
              where B is the number of examples in a mini-batch, C is the number
              of classes.
         - ff_dict: the dictionary containing all the fully connected units and
              activations.

        Returns:
         - grads: a dictionary containing the gradients of corresponding weights and biases.
        """


        grads = {}
        grads["h"]=np.matmul(ff_dict["y"]-Y,np.transpose(ff_dict["W2"]))
        grads["dW2"]=np.matmul((np.transpose(ff_dict["h"])),ff_dict["y"]-Y)
        grads["db1"] =grads["h"]*ff_dict["h"]*ff_dict["minus h"]
        grads["db1"]=grads["db1"].sum(0)
        grads["dW1"] = np.matmul(np.transpose(X),grads["h"]*ff_dict["h"]*ff_dict["minus h"])
        grads["db2"] = (ff_dict["y"]-Y)
        grads["db2"] = grads["db2"].sum(0)

        return grads

    def compute_loss(self, Y, Y_hat):
        """
        Computes cross entropy loss.

        Do NOT modify this function.

        Inputs
            Y:
            Y_hat:
        Returns
            loss:
        """
        loss = -(1/Y.shape[0]) * np.sum(np.multiply(Y, np.log(Y_hat)))
        return loss

    def train(self, X, Y, X_val, Y_val, lr, n_epochs, batch_size, log_interval=1):
        """
        Runs mini-batch gradient descent.

        Do NOT Modify this method.

        Inputs
        - X
        - Y
        - X_val
        - Y_Val
        - lr
        - n_epochs
        - batch_size
        - log_interval
        """
        for epoch in range(n_epochs):
            for X_batch, Y_batch in load_batch(X, Y, batch_size):
                self.train_step(X_batch, Y_batch, batch_size, lr)
            if epoch % log_interval==0:
              
                Y_hat, ff_dict = self.forward(X)
                train_loss = self.compute_loss(Y, Y_hat)
                train_acc = self.evaluate(Y, Y_hat)
                Y_hat, ff_dict = self.forward(X_val)
                valid_loss = self.compute_loss(Y_val, Y_hat)
                valid_acc = self.evaluate(Y_val, Y_hat)
                print('epoch {:02} - train loss/acc: {:.3f} {:.3f}, valid loss/acc: {:.3f} {:.3f}'.\
                      format(epoch, train_loss, train_acc, valid_loss, valid_acc))

    def train_step(self, X_batch, Y_batch, batch_size, lr):
        """
        Updates the parameters using gradient descent.

        Do NOT Modify this method.

        Inputs
        - X_batch
        - Y_batch
        - batch_size
        - lr
        """
        _, ff_dict = self.forward(X_batch)
        grads = self.backward(X_batch, Y_batch, ff_dict)
        self.params["W1"] -= lr * grads["dW1"]/batch_size
        self.params["b1"] -= lr * grads["db1"]/batch_size
        self.params["W2"] -= lr * grads["dW2"]/batch_size
        self.params["b2"] -= lr * grads["db2"]/batch_size

    def evaluate(self, Y, Y_hat):
        """
        Computes classification accuracy.
        
        Do NOT modify this function

        Inputs
        - Y: A numpy array of shape (N, C) containing the softmax outputs,
             where C is the number of classes.
        - Y_hat: A numpy array of shape (N, C) containing the one-hot encoded labels,
             where C is the number of classes.

        Returns
            accuracy: the classification accuracy in float
        """        
        classes_pred = np.argmax(Y_hat, axis=1)
        classes_gt = np.argmax(Y, axis=1)
        accuracy = float(np.sum(classes_pred==classes_gt)) / Y.shape[0]
        return accuracy

#Load MNIST

In [4]:

X_train, Y_train, X_test, Y_test = load_data()

idxs = np.arange(len(X_train))
np.random.shuffle(idxs)
split_idx = int(np.ceil(len(idxs)*0.8))
X_valid, Y_valid = X_train[idxs[split_idx:]], Y_train[idxs[split_idx:]]
X_train, Y_train = X_train[idxs[:split_idx]], Y_train[idxs[:split_idx]]
print()
print('Set validation data aside')
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', Y_train.shape)
print('Validation data shape: ', X_valid.shape)
print('Validation labels shape: ', Y_valid.shape)

MNIST data loaded:
Training data shape: (60000, 784)
Training labels shape: (60000, 10)
Test data shape: (10000, 784)
Test labels shape: (10000, 10)

Set validation data aside
Training data shape:  (48000, 784)
Training labels shape:  (48000, 10)
Validation data shape:  (12000, 784)
Validation labels shape:  (12000, 10)


#Training & Evaluation

In [5]:
### 
# Question (d)
# Tune the hyperparameters with validation data, 
# and print the results by running the lines below.
###

In [6]:
# model instantiation
model = TwoLayerNN(input_dim=784, num_hiddens=256, num_classes=10)

In [7]:
# train the model
lr, n_epochs, batch_size = 0.1, 500, 256
model.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size)

epoch 00 - train loss/acc: 1.242 0.762, valid loss/acc: 1.242 0.757
epoch 01 - train loss/acc: 0.688 0.835, valid loss/acc: 0.693 0.829
epoch 02 - train loss/acc: 0.526 0.870, valid loss/acc: 0.534 0.862
epoch 03 - train loss/acc: 0.453 0.881, valid loss/acc: 0.462 0.875
epoch 04 - train loss/acc: 0.411 0.891, valid loss/acc: 0.419 0.884
epoch 05 - train loss/acc: 0.384 0.895, valid loss/acc: 0.394 0.888
epoch 06 - train loss/acc: 0.365 0.898, valid loss/acc: 0.375 0.893
epoch 07 - train loss/acc: 0.352 0.901, valid loss/acc: 0.361 0.896
epoch 08 - train loss/acc: 0.339 0.903, valid loss/acc: 0.349 0.898
epoch 09 - train loss/acc: 0.331 0.906, valid loss/acc: 0.342 0.900
epoch 10 - train loss/acc: 0.324 0.908, valid loss/acc: 0.334 0.902
epoch 11 - train loss/acc: 0.317 0.910, valid loss/acc: 0.328 0.904
epoch 12 - train loss/acc: 0.312 0.911, valid loss/acc: 0.323 0.906
epoch 13 - train loss/acc: 0.306 0.912, valid loss/acc: 0.317 0.906
epoch 14 - train loss/acc: 0.302 0.914, valid lo

epoch 121 - train loss/acc: 0.113 0.969, valid loss/acc: 0.138 0.959
epoch 122 - train loss/acc: 0.112 0.969, valid loss/acc: 0.137 0.960
epoch 123 - train loss/acc: 0.111 0.970, valid loss/acc: 0.136 0.960
epoch 124 - train loss/acc: 0.110 0.970, valid loss/acc: 0.135 0.959
epoch 125 - train loss/acc: 0.109 0.970, valid loss/acc: 0.135 0.960
epoch 126 - train loss/acc: 0.109 0.970, valid loss/acc: 0.134 0.960
epoch 127 - train loss/acc: 0.108 0.970, valid loss/acc: 0.134 0.960
epoch 128 - train loss/acc: 0.107 0.971, valid loss/acc: 0.133 0.960
epoch 129 - train loss/acc: 0.106 0.971, valid loss/acc: 0.132 0.961
epoch 130 - train loss/acc: 0.106 0.971, valid loss/acc: 0.132 0.961
epoch 131 - train loss/acc: 0.105 0.971, valid loss/acc: 0.131 0.962
epoch 132 - train loss/acc: 0.104 0.972, valid loss/acc: 0.130 0.961
epoch 133 - train loss/acc: 0.104 0.971, valid loss/acc: 0.131 0.962
epoch 134 - train loss/acc: 0.103 0.972, valid loss/acc: 0.129 0.962
epoch 135 - train loss/acc: 0.103 

epoch 240 - train loss/acc: 0.056 0.986, valid loss/acc: 0.094 0.973
epoch 241 - train loss/acc: 0.056 0.986, valid loss/acc: 0.093 0.973
epoch 242 - train loss/acc: 0.055 0.986, valid loss/acc: 0.093 0.973
epoch 243 - train loss/acc: 0.055 0.986, valid loss/acc: 0.093 0.973
epoch 244 - train loss/acc: 0.055 0.986, valid loss/acc: 0.093 0.973
epoch 245 - train loss/acc: 0.055 0.986, valid loss/acc: 0.093 0.973
epoch 246 - train loss/acc: 0.054 0.986, valid loss/acc: 0.093 0.973
epoch 247 - train loss/acc: 0.054 0.986, valid loss/acc: 0.092 0.974
epoch 248 - train loss/acc: 0.054 0.986, valid loss/acc: 0.092 0.973
epoch 249 - train loss/acc: 0.054 0.986, valid loss/acc: 0.092 0.973
epoch 250 - train loss/acc: 0.053 0.986, valid loss/acc: 0.092 0.973
epoch 251 - train loss/acc: 0.053 0.986, valid loss/acc: 0.092 0.974
epoch 252 - train loss/acc: 0.053 0.987, valid loss/acc: 0.092 0.974
epoch 253 - train loss/acc: 0.053 0.987, valid loss/acc: 0.092 0.974
epoch 254 - train loss/acc: 0.052 

epoch 359 - train loss/acc: 0.032 0.994, valid loss/acc: 0.080 0.977
epoch 360 - train loss/acc: 0.032 0.993, valid loss/acc: 0.080 0.977
epoch 361 - train loss/acc: 0.032 0.994, valid loss/acc: 0.080 0.977
epoch 362 - train loss/acc: 0.032 0.994, valid loss/acc: 0.080 0.977
epoch 363 - train loss/acc: 0.032 0.993, valid loss/acc: 0.080 0.977
epoch 364 - train loss/acc: 0.032 0.994, valid loss/acc: 0.080 0.977
epoch 365 - train loss/acc: 0.032 0.994, valid loss/acc: 0.080 0.977
epoch 366 - train loss/acc: 0.032 0.994, valid loss/acc: 0.080 0.977
epoch 367 - train loss/acc: 0.031 0.994, valid loss/acc: 0.080 0.977
epoch 368 - train loss/acc: 0.031 0.994, valid loss/acc: 0.080 0.977
epoch 369 - train loss/acc: 0.031 0.994, valid loss/acc: 0.079 0.977
epoch 370 - train loss/acc: 0.031 0.994, valid loss/acc: 0.080 0.977
epoch 371 - train loss/acc: 0.031 0.994, valid loss/acc: 0.079 0.977
epoch 372 - train loss/acc: 0.031 0.994, valid loss/acc: 0.080 0.977
epoch 373 - train loss/acc: 0.031 

epoch 478 - train loss/acc: 0.020 0.997, valid loss/acc: 0.076 0.978
epoch 479 - train loss/acc: 0.020 0.997, valid loss/acc: 0.075 0.978
epoch 480 - train loss/acc: 0.020 0.997, valid loss/acc: 0.075 0.978
epoch 481 - train loss/acc: 0.020 0.997, valid loss/acc: 0.076 0.977
epoch 482 - train loss/acc: 0.020 0.997, valid loss/acc: 0.075 0.978
epoch 483 - train loss/acc: 0.020 0.997, valid loss/acc: 0.075 0.978
epoch 484 - train loss/acc: 0.020 0.997, valid loss/acc: 0.075 0.978
epoch 485 - train loss/acc: 0.020 0.997, valid loss/acc: 0.075 0.978
epoch 486 - train loss/acc: 0.020 0.997, valid loss/acc: 0.075 0.978
epoch 487 - train loss/acc: 0.020 0.997, valid loss/acc: 0.075 0.978
epoch 488 - train loss/acc: 0.020 0.997, valid loss/acc: 0.075 0.978
epoch 489 - train loss/acc: 0.019 0.997, valid loss/acc: 0.075 0.978
epoch 490 - train loss/acc: 0.019 0.997, valid loss/acc: 0.075 0.978
epoch 491 - train loss/acc: 0.019 0.997, valid loss/acc: 0.075 0.978
epoch 492 - train loss/acc: 0.019 

In [8]:
# evalute the model on test data
Y_hat, _ = model.forward(X_test)
test_loss = model.compute_loss(Y_test, Y_hat)
test_acc = model.evaluate(Y_test, Y_hat)
print("Final test loss = {:.3f}, acc = {:.3f}".format(test_loss, test_acc))

Final test loss = 0.067, acc = 0.978


# Extra Credit (Optional)

In [6]:
def Relu(X):
    return X*(X>0)
def Relu_diffential(X):
    return 1*(X>0)
def initialize_parameters(self, input_dim, num_hiddens, num_classes):
    """
    initializes parameters with He Initialization.

    Question (e)
    - refer to https://paperswithcode.com/method/he-initialization for He initialization

    Inputs
    - input_dim
    - num_hiddens
    - num_classes
    Returns
    - params: a dictionary with the initialized parameters.
    """
    params = {}
    params["W1"]=np.random.normal(0, 1/np.sqrt(input_dim/2),(input_dim,num_hiddens))
    params["W2"]=np.random.normal(0, 1/np.sqrt(num_hiddens/2),(num_hiddens,num_classes))
    params["b1"]=np.zeros(num_hiddens)
    params["b2"]=np.zeros(num_classes)

    return params

def forward_relu(self, X):
    """
    Defines and performs the feed forward step of a two-layer neural network.
    Specifically, the network structue is given by

        y = softmax(relu(X W1 + b1) W2 + b2)

    where X is the input matrix of shape (N, D), y is the class distribution matrix
    of shape (N, C), N is the number of examples (either the entire dataset or
    a mini-batch), D is the feature dimensionality, and C is the number of classes.

    Question (e)

    Inputs
        X: the input matrix of shape (N, D)

    Returns
        y: the output of the model
        ff_dict: a dictionary containing all the fully connected units and activations.
    """
    ff_dict = {}
    ff_dict=self.params

    ff_dict=self.params
    ff_dict["h"]=Relu((np.matmul(X,ff_dict["W1"])+ff_dict["b1"]))
    y=softmax(np.matmul(ff_dict["h"],ff_dict["W2"])+ff_dict["b2"])
    ff_dict["y"]=y

    return y, ff_dict

def backward_relu(self, X, Y, ff_dict):
    """
    Performs backpropagation over the two-layer neural network, and returns
    a dictionary of gradients of all model parameters.

    Question (e)

    Inputs:
        - X: the input matrix of shape (B, D), where B is the number of examples
            in a mini-batch, D is the feature dimensionality.
        - Y: the matrix of one-hot encoded ground truth classes of shape (B, C),
            where B is the number of examples in a mini-batch, C is the number
            of classes.
        - ff_dict: the dictionary containing all the fully connected units and
            activations.

    Returns:
        - grads: a dictionary containing the gradients of corresponding weights
            and biases.
    """
    grads = {}
    grads["h"]=np.matmul(ff_dict["y"]-Y,np.transpose(ff_dict["W2"]))
    grads["dW2"]=np.matmul((np.transpose(ff_dict["h"])),ff_dict["y"]-Y)
    grads["db1"] =grads["h"]*Relu_diffential(X)
    grads["db1"]=grads["db1"].sum(0)
    grads["dW1"] = np.matmul(np.transpose(X),Relu_diffential(X)*grads["h"])
            # grads["dW1"] = np.matmul(np.transpose(X),grads["h"]*ff_dict["h"]*ff_dict["minus h"])
    grads["db2"] = (ff_dict["y"]-Y)
    grads["db2"] = grads["db2"].sum(0)

    return grads

TwoLayerNNRelu = copy.copy(TwoLayerNN)
TwoLayerNNRelu.initialize_parameters = initialize_parameters

TwoLayerNNRelu.feed_forward = forward_relu
TwoLayerNNRelu.back_propagate = backward_relu

In [7]:
### 
# Question (e)
# Tune the hyperparameters with validation data,
# and print the results by running the lines below.
###

In [8]:
# model instantiation
model_relu = TwoLayerNNRelu(input_dim=784, num_hiddens=256, num_classes=10)


In [9]:
# train the model
lr, n_epochs, batch_size =0.5, 100, 256
history = model_relu.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size)

epoch 00 - train loss/acc: 0.418 0.873, valid loss/acc: 0.421 0.869
epoch 01 - train loss/acc: 0.326 0.906, valid loss/acc: 0.331 0.903
epoch 02 - train loss/acc: 0.296 0.915, valid loss/acc: 0.299 0.912
epoch 03 - train loss/acc: 0.284 0.918, valid loss/acc: 0.289 0.916
epoch 04 - train loss/acc: 0.269 0.921, valid loss/acc: 0.277 0.919
epoch 05 - train loss/acc: 0.252 0.926, valid loss/acc: 0.260 0.923
epoch 06 - train loss/acc: 0.233 0.935, valid loss/acc: 0.242 0.930
epoch 07 - train loss/acc: 0.223 0.936, valid loss/acc: 0.234 0.932
epoch 08 - train loss/acc: 0.209 0.941, valid loss/acc: 0.220 0.938
epoch 09 - train loss/acc: 0.195 0.944, valid loss/acc: 0.207 0.940
epoch 10 - train loss/acc: 0.187 0.947, valid loss/acc: 0.200 0.943
epoch 11 - train loss/acc: 0.177 0.950, valid loss/acc: 0.191 0.946
epoch 12 - train loss/acc: 0.167 0.953, valid loss/acc: 0.181 0.949
epoch 13 - train loss/acc: 0.160 0.955, valid loss/acc: 0.174 0.950
epoch 14 - train loss/acc: 0.154 0.957, valid lo

In [10]:
Y_hat, _ = model_relu.forward(X_test)
test_loss = model_relu.compute_loss(Y_test, Y_hat)
test_acc = model_relu.evaluate(Y_test, Y_hat)
print("Final test loss = {:.3f}, acc = {:.3f}".format(test_loss, test_acc))

Final test loss = 0.069, acc = 0.980
