# Colab Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
"""
Change directory to where this file is located
"""
#%cd 'COPY&PASTE FILE DIRECTORY HERE'
%cd "/content/drive/MyDrive/GSDS/2022-1/MLDL1/DL/homework3"

/content/drive/MyDrive/GSDS/2022-1/MLDL1/DL/homework3


# Import Modules

In [3]:
import copy
import numpy as np
import matplotlib.pyplot as plt
from mnist.data_utils import load_data

#Utils

In [4]:
def sigmoid(z):
    """
    Do NOT modify this function
    """
    return 1/(1+np.exp(-z))

def softmax(X):
    """
    Do NOT modify this function
    """
    logit = np.exp(X-np.amax(X, axis=1, keepdims=True))
    numer = logit
    denom = np.sum(logit, axis=1, keepdims=True)
    return numer/denom

def load_batch(X, Y, batch_size, shuffle=True):
    """
    Generates batches with the remainder dropped.

    Do NOT modify this function
    """
    if shuffle:
        permutation = np.random.permutation(X.shape[0])
        X = X[permutation, :]
        Y = Y[permutation, :]
    num_steps = int(X.shape[0])//batch_size
    step = 0
    while step<num_steps:
        X_batch = X[batch_size*step:batch_size*(step+1)]
        Y_batch = Y[batch_size*step:batch_size*(step+1)]
        step+=1
        yield X_batch, Y_batch

#2-Layer Neural Network

In [5]:
class TwoLayerNN:
    """ a neural network with 2 layers """

    def __init__(self, input_dim, num_hiddens, num_classes):
        """
        Do NOT modify this function.
        """
        self.input_dim = input_dim
        self.num_hiddens = num_hiddens
        self.num_classes = num_classes
        self.params = self.initialize_parameters(input_dim, num_hiddens, num_classes)

    def initialize_parameters(self, input_dim, num_hiddens, num_classes):
        """
        initializes parameters with Xavier Initialization.

        Question (a)
        - refer to https://paperswithcode.com/method/xavier-initialization for Xavier initialization 
        
        Inputs
        - input_dim
        - num_hiddens
        - num_classes
        Returns
        - params: a dictionary with the initialized parameters.
        """
        params = {}
        params["W1"] = np.random.uniform(-(1/np.sqrt(input_dim)), 1/np.sqrt(input_dim), size=(input_dim*num_hiddens)).reshape(input_dim, num_hiddens)
        params["b1"] = np.zeros(shape=(num_hiddens, ))
        params["W2"] = np.random.uniform(-(1/np.sqrt(num_hiddens)), 1/np.sqrt(num_hiddens), size=(num_hiddens*num_classes)).reshape(num_hiddens, num_classes)
        params["b2"] = np.zeros(shape=(num_classes, ))

        # params["W1"] = np.random.randn(input_dim, num_hiddens) / np.sqrt(input_dim)
        # params["b1"] = np.zeros(shape=(num_hiddens, ))
        # params["W2"] = np.random.randn(num_hiddens, num_classes) / np.sqrt(num_hiddens)
        # params["b2"] = np.zeros(shape=(num_classes, ))

        return params

    def forward(self, X):
        """
        Define and perform the feed forward step of a two-layer neural network.
        Specifically, the network structue is given by

          y = softmax(sigmoid(X W1 + b1) W2 + b2)

        where X is the input matrix of shape (N, D), y is the class distribution matrix
        of shape (N, C), N is the number of examples (either the entire dataset or
        a mini-batch), D is the feature dimensionality, and C is the number of classes.

        Question (b)
        - ff_dict will be used to run backpropagation in backward method.

        Inputs
        - X: the input matrix of shape (N, D)

        Returns
        - y: the output of the model
        - ff_dict: a dictionary with all the fully connected units and activations.
        """
        ff_dict = {}

        W1, b1 = self.params["W1"], self.params["b1"]
        W2, b2 = self.params["W2"], self.params["b2"]
        
        layer1 = X.dot(W1) + b1
        activation1 = sigmoid(layer1)
        layer2 = activation1.dot(W2) + b2
        y = softmax(layer2)

        ff_dict["layer1"] = layer1
        ff_dict["activation1"] = activation1
        ff_dict["layer2"] = layer2
        ff_dict["y_hat"] = y
        
        return y, ff_dict

    def backward(self, X, Y, ff_dict):
        """
        Performs backpropagation over the two-layer neural network, and returns
        a dictionary of gradients of all model parameters.

        Question (c)

        Inputs:
         - X: the input matrix of shape (B, D), where B is the number of examples
              in a mini-batch, D is the feature dimensionality.
         - Y: the matrix of one-hot encoded ground truth classes of shape (B, C),
              where B is the number of examples in a mini-batch, C is the number
              of classes.
         - ff_dict: the dictionary containing all the fully connected units and
              activations.

        Returns:
         - grads: a dictionary containing the gradients of corresponding weights and biases.
        """
        layer1 = ff_dict["layer1"] # (B, num_hiddens)
        activation1 = ff_dict["activation1"] # (B, num_hiddens)
        layer2 = ff_dict["layer2"] # (B, C)
        y_hat = ff_dict["y_hat"] # (B, C)

        grads = {}

        # partial Loss over partial yhat - upstream to the layer2
        dy = (y_hat - Y) # (B, C)
        grads["dW2"] = activation1.T.dot(dy) # local gradient = dy/dW2 = Z (B, num_hiddens) -> (num_hiddens, C)가 되어야 연산이 가능
        grads["db2"] = dy.T.dot(np.ones(shape=(dy.shape[0], )))

        # X -> W1X+b1 -> sigmoid(WX+b1) = activation1 = Z -> W2(Z)+b2 = y -> softmax(y)
        # W1: (D, num_hiddens), W2: (num_hiddens, C)
        W2 = self.params["W2"]
        dz = dy.dot(W2.T) # (B, num_hiddens)
        da = dz * (1 - activation1) * activation1 # (B, num_hiddens) -> derivative of sigmoid
        
        grads["dW1"] = X.T.dot(da) # local gradient = X(B, D) -> (D, num_hiddens)가 되어야 연산이 가능
        grads["db1"] = da.T.dot(np.ones(shape=(da.shape[0], )))

        return grads

    def compute_loss(self, Y, Y_hat):
        """
        Computes cross entropy loss.

        Do NOT modify this function.

        Inputs
            Y:
            Y_hat:
        Returns
            loss:
        """
        loss = -(1/Y.shape[0]) * np.sum(np.multiply(Y, np.log(Y_hat)))
        return loss

    def train(self, X, Y, X_val, Y_val, lr, n_epochs, batch_size, log_interval=1):
        """
        Runs mini-batch gradient descent.

        Do NOT Modify this method.

        Inputs
        - X
        - Y
        - X_val
        - Y_Val
        - lr
        - n_epochs
        - batch_size
        - log_interval
        """
        for epoch in range(n_epochs):
            for X_batch, Y_batch in load_batch(X, Y, batch_size):
                self.train_step(X_batch, Y_batch, batch_size, lr)
            if epoch % log_interval==0:
                Y_hat, ff_dict = self.forward(X) # <- 따라서 forward에서 반환되는 Y_hat도 확률값
                train_loss = self.compute_loss(Y, Y_hat) # <- compute_loss에 들어는 Y_hat은 확률값
                train_acc = self.evaluate(Y, Y_hat)
                Y_hat, ff_dict = self.forward(X_val)
                valid_loss = self.compute_loss(Y_val, Y_hat)
                valid_acc = self.evaluate(Y_val, Y_hat)
                print('epoch {:02} - train loss/acc: {:.3f} {:.3f}, valid loss/acc: {:.3f} {:.3f}'.\
                      format(epoch, train_loss, train_acc, valid_loss, valid_acc))

    def train_step(self, X_batch, Y_batch, batch_size, lr):
        """
        Updates the parameters using gradient descent.

        Do NOT Modify this method.

        Inputs
        - X_batch
        - Y_batch
        - batch_size
        - lr
        """
        _, ff_dict = self.forward(X_batch)
        grads = self.backward(X_batch, Y_batch, ff_dict)
        self.params["W1"] -= lr * grads["dW1"]/batch_size
        self.params["b1"] -= lr * grads["db1"]/batch_size
        self.params["W2"] -= lr * grads["dW2"]/batch_size
        self.params["b2"] -= lr * grads["db2"]/batch_size

    def evaluate(self, Y, Y_hat):
        """
        Computes classification accuracy.
        
        Do NOT modify this function

        Inputs
        - Y: A numpy array of shape (N, C) containing the softmax outputs,
             where C is the number of classes.
        - Y_hat: A numpy array of shape (N, C) containing the one-hot encoded labels,
             where C is the number of classes.

        Returns
            accuracy: the classification accuracy in float
        """        
        classes_pred = np.argmax(Y_hat, axis=1)
        classes_gt = np.argmax(Y, axis=1)
        accuracy = float(np.sum(classes_pred==classes_gt)) / Y.shape[0]
        return accuracy

#Load MNIST

In [6]:
X_train, Y_train, X_test, Y_test = load_data()

idxs = np.arange(len(X_train))
np.random.shuffle(idxs)
split_idx = int(np.ceil(len(idxs)*0.8))
X_valid, Y_valid = X_train[idxs[split_idx:]], Y_train[idxs[split_idx:]]
X_train, Y_train = X_train[idxs[:split_idx]], Y_train[idxs[:split_idx]]
print()
print('Set validation data aside')
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', Y_train.shape)
print('Validation data shape: ', X_valid.shape)
print('Validation labels shape: ', Y_valid.shape)

MNIST data loaded:
Training data shape: (60000, 784)
Training labels shape: (60000, 10)
Test data shape: (10000, 784)
Test labels shape: (10000, 10)

Set validation data aside
Training data shape:  (48000, 784)
Training labels shape:  (48000, 10)
Validation data shape:  (12000, 784)
Validation labels shape:  (12000, 10)


#Training & Evaluation

In [7]:
### 
# Question (d)
# Tune the hyperparameters with validation data, 
# and print the results by running the lines below.
###

In [8]:
# model instantiation
model = TwoLayerNN(input_dim=784, num_hiddens=64, num_classes=10)

In [9]:
# train the model
lr, n_epochs, batch_size = 2.0, 20, 256
model.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size)

epoch 00 - train loss/acc: 0.274 0.920, valid loss/acc: 0.278 0.916
epoch 01 - train loss/acc: 0.213 0.939, valid loss/acc: 0.222 0.938
epoch 02 - train loss/acc: 0.169 0.951, valid loss/acc: 0.180 0.948
epoch 03 - train loss/acc: 0.143 0.960, valid loss/acc: 0.160 0.953
epoch 04 - train loss/acc: 0.123 0.965, valid loss/acc: 0.146 0.957
epoch 05 - train loss/acc: 0.108 0.969, valid loss/acc: 0.133 0.960
epoch 06 - train loss/acc: 0.098 0.972, valid loss/acc: 0.127 0.961
epoch 07 - train loss/acc: 0.090 0.975, valid loss/acc: 0.118 0.965
epoch 08 - train loss/acc: 0.082 0.978, valid loss/acc: 0.112 0.967
epoch 09 - train loss/acc: 0.076 0.979, valid loss/acc: 0.111 0.967
epoch 10 - train loss/acc: 0.069 0.982, valid loss/acc: 0.105 0.968
epoch 11 - train loss/acc: 0.063 0.983, valid loss/acc: 0.101 0.970
epoch 12 - train loss/acc: 0.059 0.984, valid loss/acc: 0.098 0.970
epoch 13 - train loss/acc: 0.056 0.986, valid loss/acc: 0.098 0.970
epoch 14 - train loss/acc: 0.052 0.987, valid lo

In [10]:
# evalute the model on test data
Y_hat, _ = model.forward(X_test)
test_loss = model.compute_loss(Y_test, Y_hat)
test_acc = model.evaluate(Y_test, Y_hat)
print("Final test loss = {:.3f}, acc = {:.3f}".format(test_loss, test_acc))

Final test loss = 0.092, acc = 0.973


## Grid Search

* Here, I perform a grid search in order to find the best combination of parameters among the given candidates. To make the conditions equal for each experiment, I'll evaluate the combinations under the same epochs - 30.

In [19]:
hidden = [32, 64, 96, 128]
lr_candidates = np.linspace(0.1, 2.5, num=8)
batch_size = [(2**i) for i in range(6, 10)] # from 64 to 512

In [20]:
params = []
for h in hidden:
    for l in lr_candidates:
        for b in batch_size:
            params.append((h, l, b))

print(len(params))
params[:5]

128


[(32, 0.1, 64),
 (32, 0.1, 128),
 (32, 0.1, 256),
 (32, 0.1, 512),
 (32, 0.44285714285714284, 64)]

In [21]:
from time import perf_counter

test_accs = np.zeros(shape=(len(params), ))
test_losses = np.zeros(shape=(len(params), ))
elapsed_time = []
for i in range(len(params)):
    p = params[i]
    num_hiddens, lr, batch_size = p
    start = perf_counter()
    model = TwoLayerNN(input_dim=784, num_hiddens=num_hiddens, num_classes=10)
    model.train(X_train, Y_train, X_valid, Y_valid, lr=lr, n_epochs=30, batch_size=batch_size)
    end = perf_counter()
    Y_hat, _ = model.forward(X_test)
    test_loss = model.compute_loss(Y_test, Y_hat)
    test_acc = model.evaluate(Y_test, Y_hat)

    test_accs[i] = test_acc
    test_losses[i] = test_loss
    elapsed = end - start
    elapsed_time.append(elapsed)
    print(f"================== Training {i} is done! Elapsed time: {elapsed:.4f} seconds ==================")

print(f"Average training time: {np.mean(elapsed_time):.4f} seconds")

epoch 00 - train loss/acc: 0.535 0.874, valid loss/acc: 0.536 0.875
epoch 01 - train loss/acc: 0.377 0.897, valid loss/acc: 0.377 0.899
epoch 02 - train loss/acc: 0.322 0.910, valid loss/acc: 0.325 0.909
epoch 03 - train loss/acc: 0.292 0.917, valid loss/acc: 0.296 0.916
epoch 04 - train loss/acc: 0.272 0.923, valid loss/acc: 0.279 0.920
epoch 05 - train loss/acc: 0.253 0.929, valid loss/acc: 0.262 0.926
epoch 06 - train loss/acc: 0.240 0.932, valid loss/acc: 0.251 0.929
epoch 07 - train loss/acc: 0.229 0.935, valid loss/acc: 0.242 0.929
epoch 08 - train loss/acc: 0.216 0.939, valid loss/acc: 0.230 0.934
epoch 09 - train loss/acc: 0.206 0.942, valid loss/acc: 0.221 0.936
epoch 10 - train loss/acc: 0.198 0.943, valid loss/acc: 0.214 0.938
epoch 11 - train loss/acc: 0.190 0.945, valid loss/acc: 0.207 0.940
epoch 12 - train loss/acc: 0.183 0.948, valid loss/acc: 0.202 0.942
epoch 13 - train loss/acc: 0.179 0.949, valid loss/acc: 0.198 0.942
epoch 14 - train loss/acc: 0.172 0.951, valid lo

In [22]:
max_acc_idx = np.argmax(test_accs)
max_acc_param = params[max_acc_idx]

min_loss_idx = np.argmin(test_losses)
min_loss_param = params[min_loss_idx]

print(f"Best iteration - Accuracy: {max_acc_idx}\tLoss: {min_loss_idx}")
print(f"Combination of parameters with the highest accuracy: {max_acc_param} with score: {test_accs[max_acc_idx]}")
print(f"Combination of parameters with the lowest loss: {min_loss_param} with score: {test_losses[min_loss_idx]}")

Best iteration - Accuracy: 113	Loss: 100
Combination of parameters with the highest accuracy: (128, 1.4714285714285715, 128) with score: 0.9801
Combination of parameters with the lowest loss: (128, 0.44285714285714284, 64) with score: 0.068134105544498


# Extra Credit (Optional)

In [23]:
def relu(x):
    return np.maximum(x, 0)

In [24]:
def initialize_parameters(self, input_dim, num_hiddens, num_classes):
    """
    initializes parameters with He Initialization.

    Question (e)
    - refer to https://paperswithcode.com/method/he-initialization for He initialization 
    
    Inputs
    - input_dim
    - num_hiddens
    - num_classes
    Returns
    - params: a dictionary with the initialized parameters.
    """
    params = {}
    params["W1"] = np.random.randn(input_dim, num_hiddens) * np.sqrt(2 / input_dim)
    params["b1"] = np.zeros(shape=(num_hiddens, ))
    params["W2"] = np.random.randn(num_hiddens, num_classes) * np.sqrt(2 / num_hiddens)
    params["b2"] = np.zeros(shape=(num_classes, ))
    
    return params

def forward_relu(self, X):
    """
    Defines and performs the feed forward step of a two-layer neural network.
    Specifically, the network structue is given by

        y = softmax(relu(X W1 + b1) W2 + b2)

    where X is the input matrix of shape (N, D), y is the class distribution matrix
    of shape (N, C), N is the number of examples (either the entire dataset or
    a mini-batch), D is the feature dimensionality, and C is the number of classes.

    Question (e)

    Inputs
        X: the input matrix of shape (N, D)

    Returns
        y: the output of the model
        ff_dict: a dictionary containing all the fully connected units and activations.
    """
    ff_dict = {}

    W1, b1 = self.params["W1"], self.params["b1"]
    W2, b2 = self.params["W2"], self.params["b2"]
    
    layer1 = X.dot(W1) + b1
    activation1 = relu(layer1)
    layer2 = activation1.dot(W2) + b2
    y = softmax(layer2)

    ff_dict["layer1"] = layer1
    ff_dict["activation1"] = activation1
    ff_dict["layer2"] = layer2
    ff_dict["y_hat"] = y
    
    return y, ff_dict

def backward_relu(self, X, Y, ff_dict):
    """
    Performs backpropagation over the two-layer neural network, and returns
    a dictionary of gradients of all model parameters.

    Question (e)

    Inputs:
        - X: the input matrix of shape (B, D), where B is the number of examples
            in a mini-batch, D is the feature dimensionality.
        - Y: the matrix of one-hot encoded ground truth classes of shape (B, C),
            where B is the number of examples in a mini-batch, C is the number
            of classes.
        - ff_dict: the dictionary containing all the fully connected units and
            activations.

    Returns:
        - grads: a dictionary containing the gradients of corresponding weights
            and biases.
    """
    layer1 = ff_dict["layer1"] # (B, num_hiddens)
    activation1 = ff_dict["activation1"] # (B, num_hiddens)
    layer2 = ff_dict["layer2"] # (B, C)
    y_hat = ff_dict["y_hat"] # (B, C)

    grads = {}
    # partial Loss over partial yhat - upstream to the layer2
    dy = (y_hat - Y) # (B, C)
    grads["dW2"] = activation1.T.dot(dy)
    grads["db2"] = dy.T.dot(np.ones(shape=(dy.shape[0], )))

    # W1: (D, num_hiddens), W2: (num_hiddens, C)
    W2 = self.params["W2"]
    dh = dy.dot(W2.T) # (B, num_hiddens)
    overzero = (activation1 > 0).astype(np.uint8)
    da = dh * overzero # (B, num_hiddens)

    grads["dW1"] = X.T.dot(da)
    grads["db1"] = da.T.dot(np.ones(shape=(da.shape[0], )))

    return grads

TwoLayerNNRelu = copy.copy(TwoLayerNN)
TwoLayerNNRelu.initialize_parameters = initialize_parameters
TwoLayerNNRelu.forward = forward_relu
TwoLayerNNRelu.backward = backward_relu

In [25]:
### 
# Question (e)
# Tune the hyperparameters with validation data,
# and print the results by running the lines below.
###

In [26]:
# model instantiation
model_relu = TwoLayerNNRelu(input_dim=784, num_hiddens=64, num_classes=10)

In [27]:
# train the model
lr, n_epochs, batch_size = 0.4, 20, 256
history = model_relu.train(X_train, Y_train, X_valid, Y_valid, lr, n_epochs, batch_size)

epoch 00 - train loss/acc: 0.255 0.926, valid loss/acc: 0.260 0.922
epoch 01 - train loss/acc: 0.191 0.945, valid loss/acc: 0.202 0.942
epoch 02 - train loss/acc: 0.154 0.956, valid loss/acc: 0.170 0.953
epoch 03 - train loss/acc: 0.129 0.963, valid loss/acc: 0.150 0.957
epoch 04 - train loss/acc: 0.114 0.967, valid loss/acc: 0.138 0.959
epoch 05 - train loss/acc: 0.106 0.969, valid loss/acc: 0.132 0.961
epoch 06 - train loss/acc: 0.088 0.976, valid loss/acc: 0.117 0.966
epoch 07 - train loss/acc: 0.087 0.974, valid loss/acc: 0.122 0.963
epoch 08 - train loss/acc: 0.081 0.976, valid loss/acc: 0.118 0.964
epoch 09 - train loss/acc: 0.070 0.981, valid loss/acc: 0.105 0.968
epoch 10 - train loss/acc: 0.065 0.982, valid loss/acc: 0.104 0.968
epoch 11 - train loss/acc: 0.056 0.984, valid loss/acc: 0.098 0.970
epoch 12 - train loss/acc: 0.054 0.985, valid loss/acc: 0.100 0.970
epoch 13 - train loss/acc: 0.053 0.985, valid loss/acc: 0.103 0.969
epoch 14 - train loss/acc: 0.045 0.989, valid lo

In [28]:
Y_hat, _ = model_relu.forward(X_test)
test_loss = model_relu.compute_loss(Y_test, Y_hat)
test_acc = model_relu.evaluate(Y_test, Y_hat)
print("Final test loss = {:.3f}, acc = {:.3f}".format(test_loss, test_acc))

Final test loss = 0.091, acc = 0.973


## Grid Search

In [29]:
hidden = [32, 64, 96, 128]
lr_candidates = np.linspace(0.1, 2.5, num=8)
batch_size = [(2**i) for i in range(6, 10)] # from 64 to 512

In [30]:
params = []
for h in hidden:
    for l in lr_candidates:
        for b in batch_size:
            params.append((h, l, b))

print(len(params))
params[:5]

128


[(32, 0.1, 64),
 (32, 0.1, 128),
 (32, 0.1, 256),
 (32, 0.1, 512),
 (32, 0.44285714285714284, 64)]

In [31]:
test_accs = np.zeros(shape=(len(params), ))
test_losses = np.zeros(shape=(len(params), ))
elasped_time = []
for i in range(len(params)):
    p = params[i]
    num_hiddens, lr, batch_size = p
    start = perf_counter()
    model_r = TwoLayerNNRelu(input_dim=784, num_hiddens=num_hiddens, num_classes=10)
    model_r.train(X_train, Y_train, X_valid, Y_valid, lr=lr, n_epochs=30, batch_size=batch_size)
    end = perf_counter()
    Y_hat, _ = model_r.forward(X_test)
    test_loss = model_r.compute_loss(Y_test, Y_hat)
    test_acc = model_r.evaluate(Y_test, Y_hat)

    test_accs[i] = test_acc
    test_losses[i] = test_loss
    elapsed = end - start
    elapsed_time.append(elapsed)
    print(f"================== Training {i} is done! Elapsed time: {elapsed:.4f} seconds ==================")

print(f"Average training time: {np.mean(elapsed_time):.4f} seconds")

epoch 00 - train loss/acc: 0.263 0.923, valid loss/acc: 0.277 0.921
epoch 01 - train loss/acc: 0.203 0.944, valid loss/acc: 0.220 0.939
epoch 02 - train loss/acc: 0.174 0.951, valid loss/acc: 0.191 0.946
epoch 03 - train loss/acc: 0.151 0.958, valid loss/acc: 0.172 0.950
epoch 04 - train loss/acc: 0.133 0.962, valid loss/acc: 0.157 0.954
epoch 05 - train loss/acc: 0.132 0.961, valid loss/acc: 0.160 0.953
epoch 06 - train loss/acc: 0.116 0.967, valid loss/acc: 0.147 0.957
epoch 07 - train loss/acc: 0.112 0.969, valid loss/acc: 0.146 0.958
epoch 08 - train loss/acc: 0.106 0.971, valid loss/acc: 0.145 0.956
epoch 09 - train loss/acc: 0.093 0.974, valid loss/acc: 0.132 0.961
epoch 10 - train loss/acc: 0.088 0.975, valid loss/acc: 0.132 0.962
epoch 11 - train loss/acc: 0.086 0.975, valid loss/acc: 0.131 0.960
epoch 12 - train loss/acc: 0.078 0.978, valid loss/acc: 0.125 0.963
epoch 13 - train loss/acc: 0.072 0.980, valid loss/acc: 0.120 0.964
epoch 14 - train loss/acc: 0.070 0.980, valid lo



epoch 21 - train loss/acc: nan 0.295, valid loss/acc: nan 0.292
epoch 22 - train loss/acc: 1.719 0.297, valid loss/acc: 1.755 0.291
epoch 23 - train loss/acc: 1.719 0.302, valid loss/acc: 1.750 0.306
epoch 24 - train loss/acc: 1.730 0.299, valid loss/acc: 1.769 0.303
epoch 25 - train loss/acc: 1.713 0.306, valid loss/acc: 1.744 0.298
epoch 26 - train loss/acc: 1.748 0.292, valid loss/acc: 1.759 0.294
epoch 27 - train loss/acc: 1.716 0.299, valid loss/acc: 1.758 0.298
epoch 28 - train loss/acc: 1.715 0.301, valid loss/acc: 1.750 0.303
epoch 29 - train loss/acc: 1.736 0.298, valid loss/acc: 1.780 0.303
epoch 00 - train loss/acc: 1.771 0.299, valid loss/acc: 1.779 0.294
epoch 01 - train loss/acc: 2.069 0.196, valid loss/acc: 2.089 0.199
epoch 02 - train loss/acc: 2.041 0.188, valid loss/acc: 2.053 0.186
epoch 03 - train loss/acc: 2.305 0.113, valid loss/acc: 2.305 0.110
epoch 04 - train loss/acc: 2.303 0.113, valid loss/acc: 2.303 0.110
epoch 05 - train loss/acc: 2.303 0.106, valid loss/a

In [32]:
max_acc_idx_r = np.argmax(test_accs)
max_acc_param_r = params[max_acc_idx]

min_loss_idx_r = np.argmin(test_losses)
min_loss_param_r = params[min_loss_idx_r]

print(f"Best iteration - Accuracy: {max_acc_idx_r}\tLoss: {min_loss_idx_r}")
print(f"Combination of parameters with the highest accuracy: {max_acc_param_r} with score: {test_accs[max_acc_idx_r]}")
print(f"Combination of parameters with the lowest loss: {min_loss_param_r} with score: {test_losses[min_loss_idx_r]}")

Best iteration - Accuracy: 105	Loss: 107
Combination of parameters with the highest accuracy: (128, 1.4714285714285715, 128) with score: 0.9804
Combination of parameters with the lowest loss: (128, 0.7857142857142857, 512) with score: 0.07044334456754013
