# Import Modules

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# This cell relies on the google.colab package, so it is only meaningful on Colab.
from google.colab import drive
drive.mount('/content/drive')

In [10]:
# Colab variant (disabled): change into the Drive folder named by the placeholder path.
#%cd '/content/drive/PATH/TO/MNIST_DATA_FILE'
# Local variant: change into the project directory (machine-specific absolute path).
%cd '/Users/jspark/workspace/PycharmProjects/pythonProject/MLDL1_HW3/'

/Users/jspark/workspace/PycharmProjects/pythonProject/MLDL1_HW3


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [1]:
import copy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from mnist.data_utils import load_data

# Utils

In [2]:
def leaky_relu(z, alpha=0.01):
    """
    Apply the leaky ReLU activation element-wise.

    Returns ``z`` where ``z >= 0`` and ``alpha * z`` elsewhere. The piecewise
    form (instead of ``max(alpha * z, z)``) stays correct for every ``alpha``,
    including ``alpha > 1``, and uses the same ``>= 0`` split as
    ``deriv_leaky_relu`` so the forward pass and its derivative always agree.

    Please DO NOT MODIFY the default alpha value.

    Question (a)

    Inputs
    - z: scalar or array-like of pre-activation values.
    - alpha: slope applied to negative inputs.

    Returns
    - NumPy array with the same shape as z.
    """
    ##### YOUR CODE #####
    # Accept lists/tuples as well as arrays; ndarrays pass through uncopied.
    z = np.asarray(z)
    # z >= 0 -> z, z < 0 -> alpha * z
    return np.where(z >= 0, z, alpha * z)
    #####################


def softmax(X):
    """
    Compute the softmax of X in a numerically stable way.

    For inputs with two or more dimensions the normalisation runs along
    axis 1, so for a 2-D matrix every row sums to 1. A 1-D input is treated
    as a single score vector and normalised over its only axis.

    Question (a)

    Inputs
    - X: array-like of scores, e.g. shape (N, C) or (C,).

    Returns
    - NumPy array with the same shape as X holding exp(x_i) / sum(exp(x_i)).
    """
    ##### YOUR CODE #####
    X = np.asarray(X)
    # Keep the original axis for matrices; fall back to the last axis for vectors.
    axis = 1 if X.ndim > 1 else -1
    # Subtracting the maximum leaves the result unchanged but keeps exp()
    # from overflowing on large scores.
    shifted = X - np.max(X, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)
    #####################

def deriv_leaky_relu(x, alpha=0.01):
    """
    Element-wise derivative of the leaky ReLU activation.

    The slope is 1 wherever ``x >= 0`` and ``alpha`` everywhere else.
    Please DO NOT MODIFY the alpha value.

    Question (a)

    Inputs
    - x: array of pre-activation values.
    - alpha: slope used for negative inputs.

    Returns
    - Array shaped like x containing the slope at each entry.
    """
    ##### YOUR CODE #####
    is_non_negative = x >= 0
    slopes = np.where(is_non_negative, 1, alpha)
    return slopes
    #####################

def load_batch(X, Y, batch_size, shuffle=True):
    """
    Yield (X_batch, Y_batch) mini-batches; a trailing partial batch is dropped.

    When ``shuffle`` is true, the rows of X and Y are reordered with one
    shared random permutation before batching, so examples and labels stay
    aligned.

    (Assignment-provided helper: its behaviour is meant to stay unchanged.)
    """
    if shuffle:
        order = np.random.permutation(X.shape[0])
        X, Y = X[order, :], Y[order, :]

    # Only whole batches are produced.
    n_batches = int(X.shape[0]) // batch_size
    for batch_idx in range(n_batches):
        lo = batch_size * batch_idx
        hi = batch_size * (batch_idx + 1)
        yield X[lo:hi], Y[lo:hi]

# 2-Layer Neural Network

In [3]:
class TwoLayerNN:
    """
    A neural network with 2 layers:

        y = softmax(LeakyReLU(X W1 + b1) W2 + b2)

    The parameters are stored in ``self.params`` under the keys
    "W1", "b1", "W2" and "b2".
    """

    def __init__(self, input_dim, num_hiddens, num_classes):
        """
        Store the layer sizes and create the initial parameters.

        Do NOT modify this function.

        Inputs
        - input_dim: number of input features (D).
        - num_hiddens: number of hidden units.
        - num_classes: number of output classes (C).
        """
        self.input_dim = input_dim
        self.num_hiddens = num_hiddens
        self.num_classes = num_classes
        self.params = self.initialize_parameters(input_dim, num_hiddens, num_classes)

    def initialize_parameters(self, input_dim, num_hiddens, num_classes):
        """
        Initializes parameters with Xavier-style scaling.

        Weights are drawn from a standard normal distribution and divided by
        sqrt(fan_in) (the number of inputs of the layer); biases start at zero.

        Question (b)
        - refer to https://paperswithcode.com/method/xavier-initialization for Xavier initialization

        Inputs
        - input_dim
        - num_hiddens
        - num_classes
        Returns
        - params: a dictionary with the initialized parameters
          ("W1": (input_dim, num_hiddens), "b1": (num_hiddens,),
           "W2": (num_hiddens, num_classes), "b2": (num_classes,)).
        """
        params = {}
        ##### YOUR CODE #####
        params["W1"] = np.random.randn(input_dim, num_hiddens) / np.sqrt(input_dim)
        params["b1"] = np.zeros(num_hiddens)
        params["W2"] = np.random.randn(num_hiddens, num_classes) / np.sqrt(num_hiddens)
        params["b2"] = np.zeros(num_classes)
        #####################
        return params

    def forward(self, X):
        """
        Define and perform the feed forward step of a two-layer neural network.
        Specifically, the network structure is given by

          y = softmax(LeakyReLU(X W1 + b1) W2 + b2)

        where X is the input matrix of shape (N, D), y is the class distribution matrix
        of shape (N, C), N is the number of examples (either the entire dataset or
        a mini-batch), D is the feature dimensionality, and C is the number of classes.

        Question (c)
        - ff_dict will be used to run backpropagation in backward method.

        Inputs
        - X: the input matrix of shape (N, D)

        Returns
        - y: the output of the model
        - ff_dict: a dictionary with all the fully connected units and activations
          (keys "Z1", "A1", "Z2" and "y").
        """
        ff_dict = {}
        ##### YOUR CODE #####
        # Z1 = X dot W1 + b1
        ff_dict["Z1"] = np.dot(X, self.params["W1"]) + self.params["b1"]
        # A1 = LeakyReLU(Z1)
        ff_dict["A1"] = leaky_relu(ff_dict["Z1"])
        # Z2 = A1 dot W2 + b2
        ff_dict["Z2"] = np.dot(ff_dict["A1"], self.params["W2"]) + self.params["b2"]
        # y = softmax(Z2)
        y = softmax(ff_dict["Z2"])
        ff_dict["y"] = y
        #####################
        return y, ff_dict

    def backward(self, X, Y, ff_dict):
        """
        Performs backpropagation over the two-layer neural network, and returns
        a dictionary of gradients of all model parameters.

        The gradients are SUMMED over the mini-batch (not averaged); train_step
        divides them by the batch size when applying the update.

        Question (d)

        Inputs:
         - X: the input matrix of shape (B, D), where B is the number of examples
              in a mini-batch, D is the feature dimensionality.
         - Y: the matrix of one-hot encoded ground truth classes of shape (B, C),
              where B is the number of examples in a mini-batch, C is the number
              of classes.
         - ff_dict: the dictionary containing all the fully connected units and
              activations.

        Returns:
         - grads: a dictionary containing the gradients of corresponding weights
              and biases (keys "dW1", "db1", "dW2", "db2").
        """
        grads = {}
        ##### YOUR CODE #####
        # Softmax + cross-entropy: dL/dZ2 = y - Y                     # (B, C)
        # ref : https://math.stackexchange.com/questions/3993037/computing-the-gradient-of-cross-entropy-loss
        dz2 = ff_dict["y"] - Y
        # dL/dW2 = A1^T dot dL/dZ2                                     # (num_hiddens, C)
        grads["dW2"] = np.dot(ff_dict["A1"].T, dz2)
        # dL/db2 = sum of dL/dZ2 over the batch                        # (C,)
        grads["db2"] = np.sum(dz2, axis=0)
        # dL/dA1 = dL/dZ2 dot W2^T                                     # (B, num_hiddens)
        dA1 = np.dot(dz2, self.params["W2"].T)
        # dL/dZ1 = dL/dA1 * LeakyReLU'(Z1), element-wise               # (B, num_hiddens)
        dZ1 = dA1 * deriv_leaky_relu(ff_dict["Z1"])
        # dL/dW1 = X^T dot dL/dZ1                                      # (D, num_hiddens)
        grads["dW1"] = np.dot(X.T, dZ1)
        # dL/db1 = sum of dL/dZ1 over the batch                        # (num_hiddens,)
        grads["db1"] = np.sum(dZ1, axis=0)
        #####################
        return grads

    def compute_loss(self, Y, Y_hat):
        """
        Computes the mean cross entropy loss.

        Entries whose label is exactly 0 contribute exactly 0 to the loss,
        even when the predicted probability has underflowed to 0. Computing
        0 * log(0) directly would yield NaN and poison the whole sum. Every
        entry with a non-zero label is computed as Y * log(Y_hat), so a
        zero probability on the true class still gives an infinite loss.

        Inputs
            Y: one-hot encoded labels of shape (N, C).
            Y_hat: predicted class probabilities of shape (N, C).
        Returns
            loss: sum of -Y * log(Y_hat) divided by N.
        """
        # Replace probabilities at zero-label positions by 1 so that
        # log() gives 0 there instead of -inf.
        safe_probs = np.where(Y != 0, Y_hat, 1.0)
        loss = -(1/Y.shape[0]) * np.sum(np.multiply(Y, np.log(safe_probs)))
        return loss

    def train(self, X, Y, X_val, Y_val, lr, n_epochs, batch_size, log_interval=1):
        """
        Runs mini-batch gradient descent.

        Do NOT Modify this method.

        Every ``log_interval`` epochs the loss and accuracy on the full
        training set and on the validation set are printed.

        Inputs
        - X: training inputs
        - Y: training labels (one-hot)
        - X_val: validation inputs
        - Y_val: validation labels (one-hot)
        - lr: learning rate
        - n_epochs: number of passes over the training data
        - batch_size: number of examples per mini-batch
        - log_interval: print metrics every this many epochs
        """
        for epoch in range(n_epochs):
            for X_batch, Y_batch in load_batch(X, Y, batch_size):
                self.train_step(X_batch, Y_batch, batch_size, lr)
            if epoch % log_interval==0:
                Y_hat, _ = self.forward(X)
                train_loss = self.compute_loss(Y, Y_hat)
                train_acc = self.evaluate(Y, Y_hat)
                Y_hat, _ = self.forward(X_val)
                valid_loss = self.compute_loss(Y_val, Y_hat)
                valid_acc = self.evaluate(Y_val, Y_hat)
                print('epoch {:02} - train loss/acc: {:.3f} {:.3f}, valid loss/acc: {:.3f} {:.3f}'.\
                      format(epoch, train_loss, train_acc, valid_loss, valid_acc))

    def train_step(self, X_batch, Y_batch, batch_size, lr):
        """
        Updates the parameters using gradient descent.

        Do NOT Modify this method.

        Runs one forward/backward pass on the mini-batch and subtracts
        lr * gradient / batch_size from each parameter in place.

        Inputs
        - X_batch
        - Y_batch
        - batch_size
        - lr
        """
        _, ff_dict = self.forward(X_batch)
        grads = self.backward(X_batch, Y_batch, ff_dict)
        self.params["W1"] -= lr * grads["dW1"]/batch_size
        self.params["b1"] -= lr * grads["db1"]/batch_size
        self.params["W2"] -= lr * grads["dW2"]/batch_size
        self.params["b2"] -= lr * grads["db2"]/batch_size

    def evaluate(self, Y, Y_hat):
        """
        Computes classification accuracy.

        Do NOT modify this function

        Inputs
        - Y: A numpy array of shape (N, C) containing the one-hot encoded labels,
             where C is the number of classes.
        - Y_hat: A numpy array of shape (N, C) containing the softmax outputs,
             where C is the number of classes.

        Returns
            accuracy: the classification accuracy in float
        """
        # The predicted / true class is the index of the largest entry in each row.
        classes_pred = np.argmax(Y_hat, axis=1)
        classes_gt = np.argmax(Y, axis=1)
        accuracy = float(np.sum(classes_pred==classes_gt)) / Y.shape[0]
        return accuracy

# Load MNIST

In [4]:
# Load the MNIST arrays through the project's loader.
X_train, Y_train, X_test, Y_test = load_data()

# Shuffle the row indices, then keep the first ceil(80%) rows for training
# and set the remaining rows aside for validation.
idxs = np.arange(len(X_train))
np.random.shuffle(idxs)
split_idx = int(np.ceil(0.8 * len(idxs)))
train_rows = idxs[:split_idx]
valid_rows = idxs[split_idx:]

# Validation arrays are taken first, while X_train/Y_train still hold all rows.
X_valid = X_train[valid_rows]
Y_valid = Y_train[valid_rows]
X_train = X_train[train_rows]
Y_train = Y_train[train_rows]

print()
print('Set validation data aside')
for caption, array in (
    ('Training data shape: ', X_train),
    ('Training labels shape: ', Y_train),
    ('Validation data shape: ', X_valid),
    ('Validation labels shape: ', Y_valid),
):
    print(caption, array.shape)

MNIST data loaded:
Training data shape: (60000, 784)
Training labels shape: (60000, 10)
Test data shape: (10000, 784)
Test labels shape: (10000, 10)

Set validation data aside
Training data shape:  (48000, 784)
Training labels shape:  (48000, 10)
Validation data shape:  (12000, 784)
Validation labels shape:  (12000, 10)


# Training & Evaluation

In [27]:
###
# Question (e)
# Tune the hyperparameters with validation data,
# and print the results by running the lines below.
###

# Answer (e)
# - 1) Best validation accuracy when randomly sampling some of the hyperparameters : 0.9775833333333334
#      {'num_hiddens': 256, 'batch_size': 64, 'lr': 0.1, 'n_epochs': 50}
# - 2) Best validation accuracy after partially adjusting num_hiddens, batch_size, lr and n_epochs based on the results of 1) : 0.9798333333333333
#      {'num_hiddens': 512, 'batch_size': 32, 'lr': 0.1, 'n_epochs': 50}
# - Final) Starting from the results of 2), training was stopped at the epoch where the validation loss starts to increase (overfitting), and the model was evaluated on the test data
#   best parameters : {'num_hiddens': 512, 'batch_size': 32, 'lr': 0.1, 'n_epochs': 17}
#    ==> Final test loss = 0.062, acc = 0.981

In [5]:
# model instantiation: 784 input features, 512 hidden units, 10 output classes
model = TwoLayerNN(input_dim=784, num_hiddens=512, num_classes=10)

In [6]:
# train the model with mini-batch gradient descent
# (learning rate 0.1, 17 epochs, mini-batches of 32 examples),
# using the validation split for the per-epoch validation metrics
lr, n_epochs, batch_size = 0.1, 17, 32
model.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size)

epoch 00 - train loss/acc: 0.177 0.950, valid loss/acc: 0.189 0.947
epoch 01 - train loss/acc: 0.114 0.969, valid loss/acc: 0.137 0.962
epoch 02 - train loss/acc: 0.082 0.978, valid loss/acc: 0.107 0.969
epoch 03 - train loss/acc: 0.068 0.982, valid loss/acc: 0.101 0.972
epoch 04 - train loss/acc: 0.053 0.986, valid loss/acc: 0.089 0.973
epoch 05 - train loss/acc: 0.041 0.990, valid loss/acc: 0.082 0.976
epoch 06 - train loss/acc: 0.039 0.990, valid loss/acc: 0.082 0.975
epoch 07 - train loss/acc: 0.027 0.994, valid loss/acc: 0.073 0.979
epoch 08 - train loss/acc: 0.026 0.994, valid loss/acc: 0.076 0.977
epoch 09 - train loss/acc: 0.021 0.996, valid loss/acc: 0.072 0.978
epoch 10 - train loss/acc: 0.017 0.997, valid loss/acc: 0.073 0.980
epoch 11 - train loss/acc: 0.015 0.998, valid loss/acc: 0.072 0.979
epoch 12 - train loss/acc: 0.012 0.998, valid loss/acc: 0.069 0.979
epoch 13 - train loss/acc: 0.011 0.999, valid loss/acc: 0.071 0.979
epoch 14 - train loss/acc: 0.009 0.999, valid lo

In [7]:
# evaluate the model on test data: run a forward pass, then report the
# cross-entropy loss and classification accuracy
Y_hat, _ = model.forward(X_test)
test_loss = model.compute_loss(Y_test, Y_hat)
test_acc = model.evaluate(Y_test, Y_hat)
print("Final test loss = {:.3f}, acc = {:.3f}".format(test_loss, test_acc))

Final test loss = 0.071, acc = 0.977


In [17]:
## Test 1 - the model with different hyperparameters
import itertools

# Possible hyperparameter values
num_hiddens_values = [32, 64, 128, 256]
batch_size_values = [64, 128, 256, 512]
lr_values = [0.1, 0.01, 0.001, 0.0001]
n_epochs_values = [10, 20, 30, 40, 50]

# Enumerate every combination once and visit a random subset WITHOUT
# replacement. Drawing each value independently with np.random.choice can pick
# the same combination several times and spend a full training run on a
# configuration that was already evaluated.
search_space = list(itertools.product(
    num_hiddens_values, batch_size_values, lr_values, n_epochs_values))
num_trials = min(50, len(search_space))  # Try (up to) 50 distinct random combinations
trial_order = np.random.permutation(len(search_space))[:num_trials]

best_acc = 0
best_params = {}

for trial in trial_order:
    # Values come straight from the lists above, so they are plain Python
    # ints/floats rather than NumPy scalars.
    num_hiddens, batch_size, lr, n_epochs = search_space[trial]

    # Create and train the model with these hyperparameters
    model = TwoLayerNN(input_dim=784, num_hiddens=num_hiddens, num_classes=10)
    print("######## TESTING WITH HYPERPARAMETERS ########")
    print(">>> num_hiddens: ", num_hiddens, "batch_size: ", batch_size, "lr: ", lr, "n_epochs: ", n_epochs)
    model.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size, log_interval=5)
    print("###########################################################")

    # Evaluate the model on the validation data
    Y_hat, _ = model.forward(X_valid)
    acc = model.evaluate(Y_valid, Y_hat)

    # Keep the configuration with the highest final validation accuracy
    if acc > best_acc:
        best_acc = acc
        best_params = {'num_hiddens': num_hiddens, 'batch_size': batch_size, 'lr': lr, 'n_epochs': n_epochs}

print("Best validation accuracy: ", best_acc)
print("Best parameters: ", best_params)

num_hiddens:  64 batch_size:  128 lr:  0.1 n_epochs:  30
epoch 00 - train loss/acc: 0.324 0.909, valid loss/acc: 0.341 0.903
epoch 01 - train loss/acc: 0.266 0.924, valid loss/acc: 0.283 0.920
epoch 02 - train loss/acc: 0.233 0.933, valid loss/acc: 0.253 0.927
epoch 03 - train loss/acc: 0.203 0.943, valid loss/acc: 0.226 0.935
epoch 04 - train loss/acc: 0.183 0.948, valid loss/acc: 0.207 0.940
epoch 05 - train loss/acc: 0.165 0.954, valid loss/acc: 0.191 0.945
epoch 06 - train loss/acc: 0.150 0.958, valid loss/acc: 0.177 0.949
epoch 07 - train loss/acc: 0.142 0.960, valid loss/acc: 0.170 0.952
epoch 08 - train loss/acc: 0.129 0.964, valid loss/acc: 0.158 0.953
epoch 09 - train loss/acc: 0.121 0.966, valid loss/acc: 0.152 0.956
epoch 10 - train loss/acc: 0.112 0.969, valid loss/acc: 0.142 0.958
epoch 11 - train loss/acc: 0.106 0.970, valid loss/acc: 0.139 0.958
epoch 12 - train loss/acc: 0.100 0.972, valid loss/acc: 0.134 0.960
epoch 13 - train loss/acc: 0.093 0.974, valid loss/acc: 0.1

In [19]:
## Test 2 - the model with different hyperparameters
import itertools

# Possible hyperparameter values
num_hiddens_values = [128, 256, 512]
batch_size_values = [32, 64, 128]
lr_values = [0.1, 0.01]
n_epochs_values = [40, 50, 60]

# This grid has only 3 * 3 * 2 * 3 = 54 combinations. Drawing each value
# independently with np.random.choice for 50 trials repeats many of them and
# retrains identical configurations. Enumerate the grid once and visit a
# random subset WITHOUT replacement instead.
search_space = list(itertools.product(
    num_hiddens_values, batch_size_values, lr_values, n_epochs_values))
num_trials = min(50, len(search_space))  # Try (up to) 50 distinct random combinations
trial_order = np.random.permutation(len(search_space))[:num_trials]

best_acc = 0
best_params = {}

for trial in trial_order:
    # Values come straight from the lists above, so they are plain Python
    # ints/floats rather than NumPy scalars.
    num_hiddens, batch_size, lr, n_epochs = search_space[trial]

    # Create and train the model with these hyperparameters
    model = TwoLayerNN(input_dim=784, num_hiddens=num_hiddens, num_classes=10)
    print("######## TESTING WITH HYPERPARAMETERS ########")
    print(">>> num_hiddens: ", num_hiddens, "batch_size: ", batch_size, "lr: ", lr, "n_epochs: ", n_epochs)
    model.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size, log_interval=5)
    print("###########################################################")

    # Evaluate the model on the validation data
    Y_hat, _ = model.forward(X_valid)
    acc = model.evaluate(Y_valid, Y_hat)

    # Keep the configuration with the highest final validation accuracy
    if acc > best_acc:
        best_acc = acc
        best_params = {'num_hiddens': num_hiddens, 'batch_size': batch_size, 'lr': lr, 'n_epochs': n_epochs}

print("Best validation accuracy: ", best_acc)
print("Best parameters: ", best_params)

######## TESTING WITH HYPERPARAMETERS ########
>>> num_hiddens:  512 batch_size:  128 lr:  0.01 n_epochs:  60
epoch 00 - train loss/acc: 0.809 0.840, valid loss/acc: 0.816 0.835
epoch 05 - train loss/acc: 0.353 0.903, valid loss/acc: 0.367 0.898
epoch 10 - train loss/acc: 0.294 0.917, valid loss/acc: 0.310 0.913
epoch 15 - train loss/acc: 0.262 0.927, valid loss/acc: 0.279 0.922
epoch 20 - train loss/acc: 0.237 0.934, valid loss/acc: 0.256 0.929
epoch 25 - train loss/acc: 0.217 0.939, valid loss/acc: 0.237 0.934
epoch 30 - train loss/acc: 0.200 0.945, valid loss/acc: 0.221 0.938
epoch 35 - train loss/acc: 0.185 0.949, valid loss/acc: 0.207 0.942
epoch 40 - train loss/acc: 0.172 0.952, valid loss/acc: 0.196 0.945
epoch 45 - train loss/acc: 0.161 0.956, valid loss/acc: 0.185 0.949
epoch 50 - train loss/acc: 0.151 0.959, valid loss/acc: 0.176 0.950
epoch 55 - train loss/acc: 0.142 0.961, valid loss/acc: 0.168 0.951
###########################################################
######## TESTI