## COMP4329 Assignment 1

SIDs:
- 510428929
- 510429339
- 510429203

### Import Libraries

Note: pandas was not used in the model implementation, only for the results DataFrames.

In [None]:
# Import relevant libraries
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

### Load Datasets

In [None]:
# Read the four .npy arrays (train/test features and labels) in one pass;
# the names are bound in the same order the files are listed.
train_data, train_label, test_data, test_label = (
    np.load(npy_path)
    for npy_path in (
        'Assignment1-Dataset/train_data.npy',
        'Assignment1-Dataset/train_label.npy',
        'Assignment1-Dataset/test_data.npy',
        'Assignment1-Dataset/test_label.npy',
    )
)

In [None]:
# Print the shape of every loaded array, one per line
# (train data, train labels, test data, test labels).
print(*(arr.shape for arr in (train_data, train_label, test_data, test_label)), sep='\n')

(50000, 128)
(50000, 1)
(10000, 128)
(10000, 1)


### Activation Class

In [None]:
class Activation:
    '''
    Looks up an activation function and its derivative by name.

    Attributes:
    f (callable or None): The forward activation function. None when the requested activation is None (linear layer).
    f_derivative (callable or None): The derivative matching f. None when the requested activation is None.
    f_deriv (callable or None): Legacy alias of f_derivative, kept for code that used the old attribute name.
    '''

    def __logistic(self, x):
        return 1.0 / (1.0 + np.exp(-x))

    def __logistic_derivative(self, a):
        # Expressed in terms of the activation output: a = logistic(x)
        return a * (1 - a)

    def __relu(self, x):
        return np.maximum(0, x)

    def __relu_derivative(self, a):
        # 1 for positive inputs, 0 otherwise (the value at exactly 0 is taken as 0)
        return np.heaviside(a, 0)

    def __leakyrelu(self, x, alpha=0.01):
        return np.where(x >= 0, x, alpha * x)

    def __leakyrelu_derivative(self, x, alpha=0.01):
        # 1 for x >= 0 and alpha for x < 0
        return np.heaviside(x, 1) * (1 - alpha) + alpha

    def _gelu(self, x):
        # tanh approximation of GELU
        return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3))))

    def _gelu_derivative(self, x):
        # Inner term of the tanh approximation
        k = np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3))
        tanh_k = np.tanh(k)

        # First term of the product rule
        term1 = 0.5 * (1.0 + tanh_k)

        # Second term, using sech^2(k) = 1 - tanh^2(k)
        term2 = 0.5 * x * (1 - np.power(tanh_k, 2)) * np.sqrt(2.0 / np.pi) * (1 + 3 * 0.044715 * np.power(x, 2))

        return term1 + term2

    def __softmax(self, z):
        # Always works on (and returns) a 2-D array; subtracting the row maximum
        # keeps np.exp from overflowing without changing the result.
        z = np.atleast_2d(z)
        max_z = np.max(z, axis=1, keepdims=True)
        z = z - max_z
        return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)

    def __softmax_derivative(self, z, z_hat):
        # Takes two arguments and returns their difference (second minus first).
        return z_hat - z

    def __init__(self, activation_function = 'relu'):
        '''
        Parameters:
        activation_function (str or None): One of 'logistic', 'relu', 'leakyrelu', 'softmax', 'gelu', or None for no activation.

        Raises:
        ValueError: If activation_function is not one of the supported names.
        '''
        registry = {
            'logistic': (self.__logistic, self.__logistic_derivative),
            'relu': (self.__relu, self.__relu_derivative),
            'leakyrelu': (self.__leakyrelu, self.__leakyrelu_derivative),
            'softmax': (self.__softmax, self.__softmax_derivative),
            'gelu': (self._gelu, self._gelu_derivative),
        }

        if activation_function is None:
            # No activation requested: expose None so callers can treat the layer as linear.
            self.f = None
            self.f_derivative = None
        elif activation_function in registry:
            self.f, self.f_derivative = registry[activation_function]
        else:
            raise ValueError(
                f"Unknown activation function {activation_function!r}; "
                f"expected one of {sorted(registry)} or None."
            )

        # The logistic branch used to expose its derivative only under this name,
        # which every other branch (and every consumer) spelled f_derivative.
        self.f_deriv = self.f_derivative

### Hidden Layer Class

In [None]:
class HiddenLayer(object):
    def __init__(self,
                 n_in,
                 n_out,
                 activation_last_layer = 'relu',
                 activation = 'relu',
                 initialisation = 'xavier',
                 W = None,
                 b = None,
                 v_W = None,
                 v_b = None,
                 last_hidden_layer = False,
                 use_batchnorm=False,
                 dropout_prob=None):

        '''
        The class for a single fully-connected layer in a MLP.

        Attributes:
        n_in (int): The dimensionality of the input to the layer.
        n_out (int): The dimensionality of the output, i.e. the number of units.

        activation_last_layer (str): The activation function of the previous layer. Its derivative is applied when the error is propagated back through this layer. A falsy value disables that propagation.
        activation (str): The activation function of this layer.
        initialisation (str): Weight initialisation scheme: 'xavier', 'zeros' or 'random_small'.

        W, b, v_W, v_b: Accepted for signature compatibility but currently ignored; weights, bias and momentum buffers are always created here.

        last_hidden_layer (bool): True for the final layer of the network. That layer skips dropout and batch normalisation.
        use_batchnorm (bool): Whether to batch-normalise the linear output before the activation (never applied on the final layer).
        dropout_prob (float or None): Probability of dropping a unit. When None, the module-level DROPOUT_PROB is read at forward time.

        Raises:
        ValueError: If initialisation is not a supported scheme.
        '''

        self.last_hidden_layer = last_hidden_layer
        self.input = None
        self.initialisation = initialisation
        self.dropout_prob = dropout_prob

        # Forward activation function of this layer
        self.activation = Activation(activation).f

        # Derivative of the previous layer's activation; None when there is no previous activation
        self.activation_deriv = None
        if activation_last_layer:
            self.activation_deriv = Activation(activation_last_layer).f_derivative

        if self.initialisation == 'xavier':
            # Xavier/Glorot uniform initialisation
            limit = np.sqrt(6. / (n_in + n_out))
            self.W = np.random.uniform(low = -limit, high = limit, size = (n_in, n_out))
        elif self.initialisation == 'zeros':
            self.W = np.zeros((n_in, n_out))
        elif self.initialisation == 'random_small':
            self.W = np.random.randn(n_in, n_out) * 0.01
        else:
            # Previously this fell through and failed later with an opaque
            # AttributeError because self.W was never created.
            raise ValueError(
                f"Unknown initialisation {initialisation!r}; "
                "expected 'xavier', 'zeros' or 'random_small'."
            )

        # Bias has one entry per output unit, initialised to zero
        self.b = np.zeros(n_out,)

        # Gradients share the shape of the parameters they belong to
        self.grad_W = np.zeros(self.W.shape)
        self.grad_b = np.zeros(self.b.shape)

        # Momentum buffers, and the dropout mask of the most recent training forward pass
        self.v_W = np.zeros_like(self.grad_W)
        self.v_b = np.zeros_like(self.grad_b)
        self.binomial_array = np.zeros(n_out)

        # Batch normalisation state
        self.use_batchnorm = use_batchnorm
        self.eps = 1e-8
        self.gamma = np.ones(n_out)  # Scale
        self.beta = np.zeros(n_out)  # Shift
        self.running_mean = np.zeros(n_out)
        self.running_var = np.ones(n_out)

        self.grad_gamma = np.zeros(n_out)
        self.grad_beta = np.zeros(n_out)

        self.v_gamma = np.zeros_like(self.gamma)
        self.v_beta = np.zeros_like(self.beta)

        # Accumulators used by finalise_batchnorm_stats
        self.num_batches_tracked = 0
        self.sum_batch_means = np.zeros(n_out)
        self.sum_batch_vars  = np.zeros(n_out)
        self.fixed_batch_size = None  # set after first training forward pass

    @staticmethod
    def dropout_forward(X, p_dropout):
        '''
        Applies a random dropout mask to X.

        Parameters:
        X (numpy array): The activations to be masked.
        p_dropout (float): The probability of dropping each element.

        Returns:
        out (numpy array): X with dropped elements set to 0 and kept elements unchanged (no rescaling).
        binomial_array (numpy array): The 0/1 mask, same shape as X (1 = kept).
        '''

        binomial_array = np.random.binomial(1, 1 - p_dropout, size=X.shape)
        out = X * binomial_array
        return out, binomial_array


    @staticmethod
    def dropout_backward(delta, binomial_array, layer_num):
        '''
        Masks delta with the dropout mask of the layer that feeds layer `layer_num`.

        Parameters:
        delta (numpy array): The delta being propagated back. It is modified in place.
        binomial_array (numpy array): Unused. The caller passes its own output mask, but the mask needed here belongs to the previous layer.
        layer_num (int): The index of the calling layer within the network.

        Returns:
        delta (numpy array): The masked delta.
        '''

        # NOTE(review): this reads the module-level `nn` object, so the network must be
        # bound to that exact global name. Removing the coupling needs the MLP to hand
        # the previous layer's mask to this layer.
        delta *= nn.layers[layer_num - 1].binomial_array
        return delta

    def finalise_batchnorm_stats(self):
        '''
        Converts the accumulated per-batch statistics into the running mean/variance used at inference.
        Does nothing when no training batch has been seen yet.
        '''
        if self.num_batches_tracked == 0:
            return  # No training batches processed

        # Running mean is the average of all batch means
        self.running_mean = self.sum_batch_means / self.num_batches_tracked

        mean_batch_var = self.sum_batch_vars / self.num_batches_tracked

        # Bessel correction m / (m - 1) gives an unbiased variance estimate. It is
        # undefined for a batch size of 1, so fall back to the uncorrected average.
        m = self.fixed_batch_size
        if m is not None and m > 1:
            self.running_var = (m / (m - 1)) * mean_batch_var
        else:
            self.running_var = mean_batch_var

    # US spelling alias: the training loop looks this name up.
    finalize_batchnorm_stats = finalise_batchnorm_stats

    def forward(self, input, training=True):
        '''
        The feedforward pass of a single layer.
        Applies weights and bias, optional batch normalisation, the activation function and (on non-final layers) dropout.

        Parameters:
        input (numpy array): The input data, either the output of the previous layer or the initial input data.
        training (bool): True during training (batch statistics and random dropout are used). False at inference (running statistics are used and the output is scaled by the keep probability instead of being randomly masked).

        Returns:
        self.output (numpy array): The resulting output.
        '''
        # Keep the input for the backward pass
        self.input = input

        # Affine transform
        lin_output = np.dot(input, self.W) + self.b

        # Un-normalised linear output, needed by the batch-norm backward pass
        self.unnorm_lin_out = lin_output

        # Batch normalisation (before the activation function)
        if self.use_batchnorm and not self.last_hidden_layer:
            if training:
                # Batch statistics
                batch_mean = np.mean(lin_output, axis=0)
                batch_var = np.var(lin_output, axis=0)

                self.batch_mean = batch_mean
                self.batch_var = batch_var

                # Normalise, then scale and shift
                lin_output_norm = (lin_output - batch_mean) / np.sqrt(batch_var + self.eps)
                lin_output = self.gamma * lin_output_norm + self.beta
                self.bn_output = lin_output

                # Accumulate for the inference statistics
                self.num_batches_tracked += 1
                self.sum_batch_means += batch_mean
                self.sum_batch_vars  += batch_var

                if self.fixed_batch_size is None:
                    self.fixed_batch_size = lin_output.shape[0]

            else:
                # Inference mode: use the precomputed running statistics
                mean = self.running_mean
                var  = self.running_var
                lin_output_norm = (lin_output - mean) / np.sqrt(var + self.eps)
                lin_output = self.gamma * lin_output_norm + self.beta
                self.bn_output = lin_output

        # Activation (linear when no activation is set)
        self.output = (
            lin_output if self.activation is None
            else self.activation(lin_output)
        )

        if not self.last_hidden_layer:
            # Per-layer rate if given, otherwise the notebook-level global
            p_dropout = getattr(self, 'dropout_prob', None)
            if p_dropout is None:
                p_dropout = DROPOUT_PROB

            if training:
                self.output, self.binomial_array = self.dropout_forward(self.output, p_dropout)
            else:
                # No random masking at inference. Training does not rescale the kept
                # units, so scale by the keep probability to match the expected
                # magnitude seen during training. Not done in place: self.output may
                # alias the stored linear output.
                self.output = self.output * (1.0 - p_dropout)

        return self.output

    def backward(self, delta, layer_num, output_layer = False):
        '''
        The backward pass of a single layer.
        Stores the parameter gradients on the layer and returns the delta for the layer before it.

        Parameters:
        delta (numpy array): The delta arriving at this layer.
        layer_num (int): The index of this layer in the network; dropout masking is skipped for index 0.
        output_layer (bool): Accepted for compatibility; not used.

        Returns:
        delta (numpy array): The delta to pass to the previous layer.
        '''

        # Back-propagate through batch normalisation (never active on the final layer)
        if self.use_batchnorm and not self.last_hidden_layer:
            # Gradients w.r.t. gamma and beta
            x_norm = (self.unnorm_lin_out - self.batch_mean) / np.sqrt(self.batch_var + self.eps)
            self.grad_gamma = np.sum(delta * x_norm, axis=0)
            self.grad_beta = np.sum(delta, axis=0)

            N, _ = delta.shape

            # Through scale and shift
            d_bn = delta * self.gamma

            # Through the normalisation itself
            x_mu = self.unnorm_lin_out - self.batch_mean
            std_inv = 1. / np.sqrt(self.batch_var + self.eps)

            d_var = np.sum(d_bn * x_mu * -0.5 * std_inv**3, axis=0)
            d_mean = np.sum(d_bn * -std_inv, axis=0) + d_var * np.mean(-2. * x_mu, axis=0)

            delta = d_bn * std_inv + d_var * 2 * x_mu / N + d_mean / N

        # Weight gradient: input^T . delta (summed over the batch)
        self.grad_W = np.atleast_2d(self.input).T.dot(np.atleast_2d(delta))

        # Bias gradient: delta averaged over the batch
        self.grad_b = np.average(delta, axis=0)

        if self.activation_deriv:
            # Propagate the error through the weights and apply the derivative of the
            # previous layer's activation, evaluated on this layer's input
            delta = delta.dot(self.W.T) * self.activation_deriv(self.input)

        # The first layer receives raw data, which had no dropout applied
        if layer_num != 0:
            delta = self.dropout_backward(delta, self.binomial_array, layer_num)

        return delta

### Network Class

In [None]:
class MLP:
    '''
    Main class holding the structure of the Multi-Layer Perceptron.

    Attributes:
    layers (list of HiddenLayer): The layer objects, in forward order.
    activation (list): The activation name per layer, as passed to the constructor.
    weight_decay (float): Factor applied to the reported cross-entropy loss.
    batch_normal (bool): Whether the layers were built with batch normalisation.
    '''

    def __init__(self, layers, activation = [None, 'relu', 'relu','relu', 'softmax'], weight_decay = 1.0, initialisation='xavier', batch_normal=False):

        '''
        The initialisation of the MLP.

        Attributes:
        layers (list of int): The number of neurons in each respective layer, input layer first.
        activation (list of str): The activation function of each respective layer (None for the input layer). Must have one entry per entry of layers.
        weight_decay (float): Factor multiplied into the reported loss. 1.0 leaves the loss unchanged.
        initialisation (str): Weight initialisation scheme forwarded to every HiddenLayer.
        batch_normal (bool): Whether the layers use batch normalisation.

        Raises:
        ValueError: If activation and layers have different lengths.
        '''

        # A mismatch would silently pair layers with the wrong activations and use
        # an output derivative that does not belong to the real last layer.
        if len(activation) != len(layers):
            raise ValueError(
                f"activation has {len(activation)} entries but layers has {len(layers)}; "
                "one activation is required per layer."
            )

        self.batch_normal = batch_normal
        # Will contain all the layer objects
        self.layers = []
        self.params = []

        # Activation functions for each layer
        self.activation = activation

        # Loss scaling coefficient
        self.weight_decay = weight_decay

        # One HiddenLayer per pair of consecutive sizes; the last one is the output layer
        final_idx = len(layers) - 2
        for i in range(len(layers) - 1):
            self.layers.append(HiddenLayer(layers[i],
                                           layers[i+1],
                                           activation[i],
                                           activation[i+1],
                                           initialisation=initialisation,
                                           last_hidden_layer=(i == final_idx),
                                           use_batchnorm=self.batch_normal))

    def forward(self, input, training=True):
        '''
        The feedforward process conducted sequentially through each layer in the MLP.

        Parameters:
        input (numpy array): The input array to be fed through the network.
        training (bool): Forwarded to every layer's forward().

        Returns:
        output (numpy array): The output of the final layer.
        '''

        for layer in self.layers:
            output = layer.forward(input, training=training)
            input = output
        return output


    def CE_loss(self, y, y_hat):
        '''
        The calculation of the Cross-Entropy loss and of the delta that starts backpropagation.

        Parameters:
        y (numpy array): The one-hot encoded true labels.
        y_hat (numpy array): The network output for the same samples.

        Returns:
        loss (float): The batch-averaged cross-entropy, multiplied by weight_decay.
        delta (numpy array): The output of the last activation's derivative function, called with (y, y_hat).
        '''
        epsilon = 1e-12
        y_hat = np.clip(y_hat, epsilon, 1. - epsilon)  # avoid log(0)
        loss = - np.nansum(y * np.log(y_hat)) / y.shape[0]
        # NOTE(review): this only rescales the reported loss. It does not touch the
        # weights or the gradients, so it is not weight decay in the usual sense.
        loss *= self.weight_decay
        delta = Activation(self.activation[-1]).f_derivative(y, y_hat)
        return loss, delta

    def backward(self, delta):
        '''
        The backpropagation process conducted backwards across each layer in the MLP.
        Each layer stores its own gradients and hands its delta to the layer before it.

        Parameters:
        delta (numpy array): The delta calculated by the loss function.

        Returns:
        None
        '''

        delta = self.layers[-1].backward(delta, len(self.layers) -1, output_layer = True)
        for layer_num, layer in reversed(list(enumerate(self.layers[:-1]))):
            delta = layer.backward(delta, layer_num)

    def update(self, lr, SGD_optim):
        '''
        Updates the parameters of every layer from the gradients stored on it.

        Parameters:
        lr (float): The learning rate for the parameter updates.
        SGD_optim (dict or None): None for plain SGD, or a dictionary with 'Type': 'Momentum' and 'Parameter': the momentum coefficient.

        Returns:
        None

        Raises:
        ValueError: If SGD_optim names an optimiser type other than 'Momentum'.
        '''

        # Update without momentum
        if SGD_optim is None:
            for layer in self.layers:
                layer.W -= lr * layer.grad_W
                layer.b -= lr * layer.grad_b

                if layer.use_batchnorm:
                    layer.gamma -= lr * layer.grad_gamma
                    layer.beta  -= lr * layer.grad_beta

        # Update with momentum
        elif SGD_optim['Type'] == 'Momentum':
            momentum = SGD_optim['Parameter']
            for layer in self.layers:
                layer.v_W = (momentum * layer.v_W) + (lr * layer.grad_W)
                layer.v_b = (momentum * layer.v_b) + (lr * layer.grad_b)
                layer.W = layer.W - layer.v_W
                layer.b = layer.b - layer.v_b

                if layer.use_batchnorm:
                    layer.v_gamma = (momentum * layer.v_gamma) + (lr * layer.grad_gamma)
                    layer.v_beta = (momentum * layer.v_beta) + (lr * layer.grad_beta)

                    layer.gamma -= layer.v_gamma
                    layer.beta -= layer.v_beta

        else:
            # An unrecognised type used to skip every update without any signal.
            raise ValueError(f"Unsupported SGD optimisation type: {SGD_optim['Type']!r}")

    def fit(self, X, y, learning_rate = 0.1, epochs = 100, SGD_optim = None, batch_size = 1):
        '''
        The method to fit the MLP with mini-batch gradient descent.
        Each epoch shuffles the data, then runs forward, loss, backward and update per batch.

        After every epoch the accuracy is measured on the module-level `train_df` / `test_df`
        objects against the module-level `train_label` / `test_label` arrays, not on the X and y
        passed in, so those globals must exist.

        Parameters:
        X (numpy array): The input X values.
        y (numpy array): The corresponding one-hot encoded labels.
        learning_rate (float): The learning rate to be used in the parameter updates. Set to 0.1 by default.
        epochs (int): The number of times the dataset is passed through the MLP. Set to 100 by default.
        SGD_optim (dict or None): Optimiser description, see update(). Set as None by default.
        batch_size (int): The number of samples per mini-batch.

        Returns:
        output_dict (dict of list): Per-epoch 'Training Loss', 'Training Accuracy' and 'Testing Accuracy'.

        Raises:
        ValueError: If batch_size is smaller than 1.
        '''

        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")

        X = np.array(X)
        y = np.array(y)
        training_loss = []
        training_accuracy = []
        testing_accuracy = []

        # Built up front so a call with epochs == 0 still returns a well-formed result
        output_dict = {'Training Loss': training_loss, 'Training Accuracy': training_accuracy, 'Testing Accuracy': testing_accuracy}

        # Start offset of every batch. Slicing clamps the last, possibly shorter, batch,
        # so batch_size itself is never modified and every epoch covers the full dataset.
        batch_starts = range(0, X.shape[0], batch_size)

        for k in range(epochs):

            loss = np.zeros(len(batch_starts))

            # Shuffle so that each epoch sees the observations in a different order
            X, y = Utils.shuffle(X, y)

            for batch_idx, start in enumerate(batch_starts):
                stop = start + batch_size

                # forward pass
                y_hat = self.forward(X[start:stop, :], training=True)

                # loss and initial delta
                loss[batch_idx], delta = self.CE_loss(y[start:stop], y_hat)

                # backward pass
                self.backward(delta)

                # parameter update
                self.update(learning_rate, SGD_optim)

            # Refresh the batch-norm running statistics used at inference
            for layer in self.layers:
                finalise = getattr(layer, 'finalise_batchnorm_stats', None)
                if finalise is not None:
                    finalise()

            # Predict and compute metrics for this epoch (uses notebook-level globals)
            test_predict = self.predict(test_df.X)
            train_predict = self.predict(train_df.X)
            test_predict = test_df.decode(test_predict)
            train_predict = train_df.decode(train_predict)
            test_accuracy = np.sum(test_predict == test_label[:, 0]) / test_predict.shape[0]
            train_accuracy = np.sum(train_predict == train_label[:, 0]) / train_predict.shape[0]

            training_loss.append(np.mean(loss))
            training_accuracy.append(train_accuracy)
            testing_accuracy.append(test_accuracy)

            print(f'Epoch {k+1}/{epochs} has been trained with Train Loss: {str(round(training_loss[-1], 4))}, Training Accuracy: {str(round(training_accuracy[-1] * 100, 4))}% and Testing Accuracy: {str(round(testing_accuracy[-1] * 100, 4))}%.')

        return output_dict

    def predict(self, x):
        '''
        The method to predict values for input x by running the forward process in inference mode.

        Parameters:
        x (numpy array): The input x values, one sample per row.

        Returns:
        output (numpy array): The network output per sample. With a softmax output layer the shape is (n_samples, 1, n_classes), otherwise (n_samples, n_out).
        '''

        x = np.array(x)

        # One batched forward pass instead of one Python-level call per sample
        output = np.asarray(self.forward(x, training=False))

        # The per-sample implementation returned a (1, n_classes) row for each sample
        # when the output layer is softmax; keep that shape for existing callers.
        if self.activation[-1] == 'softmax' and output.ndim == 2:
            output = output[:, np.newaxis, :]
        return output

### Data Preprocessing methods

In [None]:
class Preprocessing:
    '''
    The Class to hold a dataset and apply preprocessing methods to it.

    Attributes:
    X (numpy array): The input array of X values.
    y (numpy array): The input array of y values (or labels).
    predictions: Placeholder for predicted labels; None until assigned by the caller.
    '''

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.predictions = None

    def normalize(self):
        '''
        Rescales X in place to [0, 1] using the minimum and maximum over the whole array.
        If every value is identical the result is all zeros rather than NaN.

        Parameters:
        None

        Returns:
        None
        '''

        lowest = np.min(self.X)
        value_range = np.max(self.X) - lowest
        if value_range == 0:
            # Constant data: avoid 0/0
            self.X = np.zeros_like(self.X, dtype=float)
        else:
            self.X = (self.X - lowest) / value_range

    def standardize(self):
        '''
        Standardizes X in place using the mean and standard deviation over the whole array.
        If the standard deviation is zero the result is all zeros rather than NaN.

        Parameters:
        None

        Returns:
        None
        '''

        spread = np.std(self.X)
        if spread == 0:
            # Constant data: avoid 0/0
            self.X = np.zeros_like(self.X, dtype=float)
        else:
            self.X = (self.X - np.mean(self.X)) / spread

    @staticmethod
    def label_encode(label_vector):
        '''
        Encodes the label (y) values with one-hot encoding.
        The number of columns is the number of distinct labels, and each label is used
        directly as a column index.

        Parameters:
        label_vector (numpy array): The labels, shaped (n,) or (n, 1).

        Returns:
        encoded_label_vector (numpy array): The one-hot encoded labels, shape (n, num_classes).
        '''

        # Flatten first: calling int() on each 1-element row is deprecated in NumPy
        labels = np.asarray(label_vector).ravel().astype(int)
        if labels.size == 0:
            return np.array([])

        num_classes = np.unique(label_vector).size

        encoded_label_vector = np.zeros((labels.size, num_classes))
        encoded_label_vector[np.arange(labels.size), labels] = 1

        return encoded_label_vector

    @staticmethod
    def decode(prediction_matrix):
        '''
        Transforms a matrix of per-class scores (or one-hot rows) back to class labels.
        Each label is the index of the largest value in that sample's entry.

        Parameters:
        prediction_matrix (numpy array): One entry per sample along the first axis; each entry may be a vector or a (1, n_classes) row.

        Returns:
        decoded_predictions (numpy array): A float array of shape (n_samples,) filled with the labels.
        '''

        predictions = np.asarray(prediction_matrix)
        n_samples = predictions.shape[0]
        if n_samples == 0:
            return np.zeros(0)

        # Flatten each sample's entry so argmax yields the same flat index as a per-row call
        return predictions.reshape(n_samples, -1).argmax(axis=1).astype(float)

### Miscellaneous methods

In [None]:
class Utils:
    '''
    Class used to contain miscellaneous methods.

    Attributes:
    None
    '''

    @staticmethod
    def shuffle(X, y):
        '''
        Randomly shuffles X and y with the same permutation.

        Parameters:
        X (numpy array): The X values to be shuffled.
        y (numpy array): The y values to be shuffled.

        Returns:
        X (numpy array), y (numpy array): The pair of the shuffled X & y numpy arrays.
        '''
        shuffled_idx = np.arange(X.shape[0])
        np.random.shuffle(shuffled_idx)

        return X[shuffled_idx], y[shuffled_idx]

    @staticmethod
    def create_confusion_mat(df):
        '''
        Creates a confusion matrix from an object that carries true labels and predicted labels.

        Parameters:
        df (Preprocessing): An object with `y` (true labels, shaped (n,) or (n, 1)) and `predictions` (predicted labels, used as row positions).

        Returns:
        confusion_mat (pandas DataFrame): Rows (index) are predicted labels and columns are actual labels; both axes are labelled with the distinct values of df.y.
        '''

        labels = np.unique(df.y)
        actual = np.asarray(df.y).ravel().astype(int)
        predicted = np.asarray(df.predictions).ravel().astype(int)[:actual.size]

        # Columns are addressed by label, rows by position
        actual_pos = np.searchsorted(labels, actual)

        # Count in a plain array: chained DataFrame assignment does not update the
        # frame under pandas Copy-on-Write. np.add.at accumulates repeated index pairs.
        counts = np.zeros((labels.size, labels.size), dtype=int)
        np.add.at(counts, (predicted, actual_pos), 1)

        return pd.DataFrame(counts, index = labels, columns = labels)

    @staticmethod
    def confusion_mat_measures(confusion_matrix):
        '''
        Produces a pandas DataFrame with Precision, Recall and F1 measures per class.
        A measure whose denominator is zero (class never predicted, or never present) is reported as 0.0.

        Parameters:
        confusion_matrix (pandas DataFrame): A square confusion matrix with predicted labels as rows and actual labels as columns.

        Returns:
        scores_df (pandas Dataframe): A DataFrame with labels as rows (index named 'Label') and Precision, Recall and F1 as float columns.
        '''

        counts = confusion_matrix.to_numpy(dtype=float)

        true_pos = np.diag(counts).copy()
        predicted_totals = counts.sum(axis=1)  # row sums: TP + FP
        actual_totals = counts.sum(axis=0)     # column sums: TP + FN

        precision = np.divide(true_pos, predicted_totals,
                              out=np.zeros_like(true_pos), where=predicted_totals > 0)
        recall = np.divide(true_pos, actual_totals,
                           out=np.zeros_like(true_pos), where=actual_totals > 0)

        pr_sum = precision + recall
        f1 = np.divide(2 * precision * recall, pr_sum,
                       out=np.zeros_like(true_pos), where=pr_sum > 0)

        # rename() returns a new index, so the caller's confusion matrix is left untouched
        scores_df = pd.DataFrame(
            {'Precision': precision, 'Recall': recall, 'F1': f1},
            index = confusion_matrix.index.rename('Label'),
        )

        return scores_df

### Testing Model

In [None]:
# Wrap the raw arrays in Preprocessing objects.
# NOTE: `train_df` and `test_df` are read by name (as module-level globals) inside
# MLP.fit for the per-epoch accuracy, together with `train_label` and `test_label`.
train_df = Preprocessing(train_data, train_label)
test_df = Preprocessing(test_data, test_label)

# Standardize X matrix (features). Each set is standardized with its own
# mean/std; min-max normalization is the commented-out alternative.
# train_df.normalize()
# test_df.normalize()
train_df.standardize()
test_df.standardize()

# Perform one-hot encoding for our label vector (ONLY ON TRAIN).
# test_df.y keeps the raw labels.
train_df.y = train_df.label_encode(train_df.y)


# Hyperparameters
# LAYER_NEURONS lists the layer sizes (first entry is the input layer) and
# LAYER_ACTIVATION_FUNCS the activation per layer (None for the input layer).
LAYER_NEURONS = [128, 150, 10]
LAYER_ACTIVATION_FUNCS = [None, 'leakyrelu', 'softmax']
LEARNING_RATE = 0.005
EPOCHS = 300
# NOTE: DROPOUT_PROB is read as a module-level global by HiddenLayer.forward,
# so it must be defined before any forward pass runs.
DROPOUT_PROB = 0.5
# None selects the plain (no momentum) update branch in MLP.update.
SGD_OPTIM = None
BATCH_SIZE = 100
# Multiplied into the reported loss in MLP.CE_loss.
WEIGHT_DECAY = 0.98
BATCH_NORMAL = False
INITIALISATION = 'xavier'

# Instantiate the multi-layer neural network.
# NOTE: HiddenLayer.dropout_backward looks the network up through the global
# name `nn`, so the instance has to be bound to exactly this name.
nn = MLP(LAYER_NEURONS, LAYER_ACTIVATION_FUNCS, weight_decay = WEIGHT_DECAY, initialisation=INITIALISATION, batch_normal=BATCH_NORMAL)

# Perform fitting using the training dataset and time the whole run;
# `trial1` holds the per-epoch metrics dictionary returned by fit.
t0 = time.time()
trial1 = nn.fit(train_df.X, train_df.y, learning_rate = LEARNING_RATE, epochs = EPOCHS, SGD_optim = SGD_OPTIM, batch_size=BATCH_SIZE )
t1 = time.time()
print(f"============= Model Build Done =============")
print(f"Time taken to build model: {round(t1 - t0, 4)} seconds.")

  encoded_label[int(label)] = 1


Epoch 1/300 has been trained with Train Loss: 1.7924, Training Accuracy: 39.218% and Testing Accuracy: 38.51%.


KeyboardInterrupt: 

### Performance Figures

In [None]:
# Two stacked panels: training loss on top, train/test accuracy below.
fig, ax = plt.subplots(2, 1, figsize = (20, 10))
loss_ax, accuracy_ax = ax

# Top panel: cross-entropy loss per epoch
loss_ax.plot(trial1['Training Loss'])
loss_ax.title.set_text("Cross-Entropy Loss over Epoch")
loss_ax.set(xlabel='Epoch', ylabel='Loss')

# Bottom panel: one curve per accuracy series, labelled with its own key
for series_name in ("Training Accuracy", "Testing Accuracy"):
    accuracy_ax.plot(trial1[series_name], label = series_name)
accuracy_ax.title.set_text("Training & Testing Accuracy over Epoch")
accuracy_ax.set(xlabel="Epoch", ylabel="Accuracy")
accuracy_ax.legend()

plt.show()

In [None]:
# Predict with the fitted network and decode the outputs to class labels
# (test set first, then training set).
test_df.predictions = test_df.decode(nn.predict(test_df.X))
train_df.predictions = train_df.decode(nn.predict(train_df.X))

# Confusion matrix and per-class precision / recall / F1 on the test set
CM = Utils.create_confusion_mat(test_df)
measures = Utils.confusion_mat_measures(CM)

# Accuracy & Performance Metrics
# Accuracy is the fraction of matching labels. The training side compares against
# the raw `train_label` because train_df.y was one-hot encoded earlier.
test_accuracy = np.mean(test_df.predictions == test_df.y[:, 0])
train_accuracy = np.mean(train_df.predictions == train_label[:, 0])
F1_avg = measures['F1'].mean()
CELoss = trial1['Training Loss'][-1]

print(f'Final Cross-Entropy Training Loss: {round(CELoss, 4)}.')
print(f'Final Train accuracy: {round(train_accuracy * 100, 4)}%.')
print(f'Final Test accuracy: {round(test_accuracy * 100, 4)}%.')
print(f'Final Average F1 Score: {round(F1_avg, 4)}.')


Final Cross-Entropy Training Loss: 1.4412.
Final Train accuracy: 48.556%.
Final Test accuracy: 44.64%.
Final Average F1 Score: 0.4407.


  confusion_mat[int(df.y[i])].iloc[int(df.predictions[i])] += 1
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  confusion_mat[int(df.y[i])].iloc[int(df.predictions[i])] += 1
  scores_df.loc[i, 'Precision'] = Precision
  scores_df.loc[i, 'Recall'] = Recall
  scores_df.loc[i, 'F1'] = F1


### Ablation testing

In [None]:
import time
import numpy as np
import datetime

# Fresh preprocessing wrappers around the raw arrays for the ablation runs.
train_df = Preprocessing(train_data, train_label)
test_df = Preprocessing(test_data, test_label)

# Each split is standardized through its own wrapper.
train_df.standardize()
test_df.standardize()

# Only the training labels are encoded; the test labels stay as loaded.
train_df.y = train_df.label_encode(train_df.y)


# --- Baseline/Default Hyperparameters ---
# Every phase below varies exactly one of these and keeps the rest fixed.
BASELINE_NUM_HIDDEN_LAYERS = 3
BASELINE_NEURONS_PER_LAYER = 100
BASELINE_HIDDEN_ACTIVATION = 'relu'
BASELINE_SGD_OPTIM = None
BASELINE_WEIGHT_DECAY = 0.98
BASELINE_INITIALISATION = 'xavier'
BASELINE_BATCH_SIZE = 100
BASELINE_DROPOUT_PROB = 0.5
BASELINE_LEARNING_RATE = 0.005
# Module-level dropout setting; Phase 7 rebinds it per trial.
DROPOUT_PROB = BASELINE_DROPOUT_PROB
EPOCHS = 300
OUTPUT_ACTIVATION = 'softmax'
INPUT_ACTIVATION = None


# Layer sizes are derived from the data instead of being hard-coded.
input_layer_size = train_df.X.shape[1]
try:
    # Preferred source: number of distinct values in the raw label array.
    output_layer_size = np.unique(train_label).size
except NameError:
    # `train_label` is not defined in this session: fall back to the wrapper.
    try:
        output_layer_size = np.unique(train_df.y).size
    except AttributeError:
        output_layer_size = train_df.y.shape[1]

print(f"Data Shapes: Input X={train_df.X.shape}, Train y={train_df.y.shape}")
print(f"Deduced/Set Parameters: Input Size={input_layer_size}, Output Size={output_layer_size}")

# Baseline architecture: input -> N equally wide hidden layers -> output.
BASELINE_HIDDEN_NEURONS = [BASELINE_NEURONS_PER_LAYER] * BASELINE_NUM_HIDDEN_LAYERS
BASELINE_LAYER_NEURONS = [input_layer_size, *BASELINE_HIDDEN_NEURONS, output_layer_size]
# One activation entry per layer, aligned with BASELINE_LAYER_NEURONS.
BASELINE_ACTIVATION_FUNCS = [
    INPUT_ACTIVATION,
    *([BASELINE_HIDDEN_ACTIVATION] * BASELINE_NUM_HIDDEN_LAYERS),
    OUTPUT_ACTIVATION,
]


# --- Storage for Results ---
# One dict per phase, keyed by a config string; filled in by the phase loops.
neuron_study_results = {}         # Phase 1
activation_study_results = {}     # Phase 2
num_layers_study_results = {}     # Phase 3
sgd_optim_study_results = {}      # Phase 4
weight_decay_study_results = {}   # Phase 5
batch_size_study_results = {}     # Phase 6
dropout_prob_study_results = {}   # Phase 7
learning_rate_study_results = {}  # Phase 8

# --- Helper Function for Metric Calculation (to reduce repetition) ---
def calculate_metrics(nn_model, trial_hist, test_dataframe, train_dataframe):
    """Compute test accuracy, mean F1 and final training loss for a fitted model.

    Every step is best-effort: a failure is printed and the affected metric
    keeps its default of -1.0 instead of aborting the surrounding study.

    Args:
        nn_model: Object exposing ``predict(X)``.
        trial_hist: Training history; the final loss is read from
            ``trial_hist['Training Loss'][-1]`` when this is a dict with a
            non-empty ``'Training Loss'`` entry.
        test_dataframe: Object exposing ``X``, ``y`` (indexed as ``y[:, 0]``)
            and ``decode(raw_predictions)``. Its ``predictions`` attribute is
            swapped temporarily and always put back before returning.
        train_dataframe: Unused; kept so existing call sites keep working.

    Returns:
        Tuple ``(test_accuracy, F1_avg, CELoss)``; any metric that could not
        be computed is -1.0.
    """
    print("    Calculating performance metrics...")
    test_accuracy, F1_avg, CELoss = -1.0, -1.0, -1.0 # Defaults

    try:
        test_preds_raw = nn_model.predict(test_dataframe.X)
        test_preds_decoded = test_dataframe.decode(test_preds_raw)
        try:
            test_accuracy = np.sum(test_preds_decoded == test_dataframe.y[:, 0]) / test_preds_decoded.shape[0]

            # The Utils helpers are handed the dataframe object itself, so the
            # decoded predictions are attached to it for the duration of the
            # call. A sentinel distinguishes "no attribute" from an attribute
            # whose value is None.
            missing = object()
            original_predictions_attr = getattr(test_dataframe, 'predictions', missing)
            test_dataframe.predictions = test_preds_decoded
            try:
                CM = Utils.create_confusion_mat(test_dataframe)
                measures = Utils.confusion_mat_measures(CM)
                F1_avg = measures['F1'].mean()
            finally:
                # Undo the swap even when the helpers raise, so a failed trial
                # cannot leak its predictions into the shared test object.
                if original_predictions_attr is missing:
                    delattr(test_dataframe, 'predictions')
                else:
                    test_dataframe.predictions = original_predictions_attr

        except Exception as metric_e:
            print(f"    ERROR calculating accuracy/F1: {metric_e}")
        try:
            if isinstance(trial_hist, dict) and 'Training Loss' in trial_hist and len(trial_hist['Training Loss']) > 0:
                CELoss = trial_hist['Training Loss'][-1]
            else:
                print(f"    WARNING: Could not find 'Training Loss' in trial_hist or it's empty/not a dict. trial_hist type: {type(trial_hist)}")
        except Exception as loss_e:
            print(f"    ERROR accessing training loss: {loss_e}")

        print(f"    Metrics: Test Accuracy={test_accuracy:.4f}, Avg F1={F1_avg:.4f}, Final Train Loss={CELoss:.4f}")
    except AttributeError as ae:
        print(f"    ERROR: Missing method/attribute like '.predict', '.decode', or '.y'? Error: {ae}")
    except Exception as e:
        print(f"    ERROR during metrics calculation: {e}")

    return test_accuracy, F1_avg, CELoss

# ================================================================
# PHASE 1: Neurons Per Hidden Layer
# ================================================================
# Varies the width shared by all hidden layers; depth, activations and the
# training settings stay at their baseline values.
neurons_per_layer_to_test = [50, 100, 150]
fixed_activation_funcs_p1 = [
    INPUT_ACTIVATION,
    *([BASELINE_HIDDEN_ACTIVATION] * BASELINE_NUM_HIDDEN_LAYERS),
    OUTPUT_ACTIVATION,
]
for i, neurons_per_layer in enumerate(neurons_per_layer_to_test):
    current_hidden_neurons = [neurons_per_layer] * BASELINE_NUM_HIDDEN_LAYERS
    current_layer_neurons = [input_layer_size, *current_hidden_neurons, output_layer_size]
    # Result key spells out every hidden width, e.g. "neurons_50_50_50".
    config_key = "neurons_" + "_".join(str(width) for width in current_hidden_neurons)
    print(f"\n--- Phase 1 / Trial {i+1}: Testing Neurons = {current_layer_neurons} ---")
    try:
        nn = MLP(
            current_layer_neurons,
            fixed_activation_funcs_p1,
            weight_decay=BASELINE_WEIGHT_DECAY,
            initialisation=BASELINE_INITIALISATION,
        )
        # Only the fit call is timed.
        t0 = time.time()
        trial_result = nn.fit(
            train_df.X,
            train_df.y,
            learning_rate=BASELINE_LEARNING_RATE,
            epochs=EPOCHS,
            SGD_optim=BASELINE_SGD_OPTIM,
            batch_size=BASELINE_BATCH_SIZE,
        )
        t1 = time.time()
        time_taken = round(t1 - t0, 4)
        print(f"    Fit completed in {time_taken} seconds.")
        test_accuracy, F1_avg, CELoss = calculate_metrics(nn, trial_result, test_df, train_df)
        active_dropout_prob = DROPOUT_PROB
        neuron_study_results[config_key] = {
            'time_taken': time_taken,
            'test_accuracy': test_accuracy,
            'F1_avg': F1_avg,
            'final_train_loss': CELoss,
            'layer_config': current_layer_neurons,
            'activation_config': fixed_activation_funcs_p1,
            'sgd_optim_tested': BASELINE_SGD_OPTIM,
            'weight_decay_tested': BASELINE_WEIGHT_DECAY,
            'batch_size_tested': BASELINE_BATCH_SIZE,
            'dropout_prob_tested': active_dropout_prob,
            'learning_rate_tested': BASELINE_LEARNING_RATE,
            'history': trial_result,  # full fit history kept alongside the summary
        }
    except Exception as e:
        # A failed trial is recorded and the study moves on to the next width.
        print(f"    ERROR: {e}")
        neuron_study_results[config_key] = {'error': str(e)}
print("\n" + "=" * 60 + "\nPhase 1 Complete.\n" + "=" * 60 + "\n")

# ================================================================
# PHASE 2: Activation Functions
# ================================================================
# Swaps the hidden-layer activation while the baseline layer sizes and
# training settings stay fixed.
hidden_activations_to_test = ['relu', 'leakyrelu', 'gelu']
fixed_layer_neurons_p2 = BASELINE_LAYER_NEURONS
for i, activation_func in enumerate(hidden_activations_to_test):
    # Same activation on every hidden layer; input/output entries unchanged.
    current_activation_funcs = [
        INPUT_ACTIVATION,
        *([activation_func] * BASELINE_NUM_HIDDEN_LAYERS),
        OUTPUT_ACTIVATION,
    ]
    config_key = "activation_" + activation_func
    print(f"\n--- Phase 2 / Trial {i+1}: Testing Activation = '{activation_func}' ---")
    try:
        nn = MLP(
            fixed_layer_neurons_p2,
            current_activation_funcs,
            weight_decay=BASELINE_WEIGHT_DECAY,
            initialisation=BASELINE_INITIALISATION,
        )
        # Only the fit call is timed.
        t0 = time.time()
        trial_result = nn.fit(
            train_df.X,
            train_df.y,
            learning_rate=BASELINE_LEARNING_RATE,
            epochs=EPOCHS,
            SGD_optim=BASELINE_SGD_OPTIM,
            batch_size=BASELINE_BATCH_SIZE,
        )
        t1 = time.time()
        time_taken = round(t1 - t0, 4)
        print(f"    Fit completed in {time_taken} seconds.")
        test_accuracy, F1_avg, CELoss = calculate_metrics(nn, trial_result, test_df, train_df)
        active_dropout_prob = DROPOUT_PROB
        activation_study_results[config_key] = {
            'time_taken': time_taken,
            'test_accuracy': test_accuracy,
            'F1_avg': F1_avg,
            'final_train_loss': CELoss,
            'layer_config': fixed_layer_neurons_p2,
            'activation_config': current_activation_funcs,
            'sgd_optim_tested': BASELINE_SGD_OPTIM,
            'weight_decay_tested': BASELINE_WEIGHT_DECAY,
            'batch_size_tested': BASELINE_BATCH_SIZE,
            'dropout_prob_tested': active_dropout_prob,
            'learning_rate_tested': BASELINE_LEARNING_RATE,
            'history': trial_result,
        }
    except Exception as e:
        # A failed trial is recorded and the study moves on to the next activation.
        print(f"    ERROR: {e}")
        activation_study_results[config_key] = {'error': str(e)}
print("\n" + "=" * 60 + "\nPhase 2 Complete.\n" + "=" * 60 + "\n")

# ================================================================
# PHASE 3: Number of Hidden Layers
# ================================================================
# Varies the depth; each hidden layer keeps the baseline width and activation.
num_hidden_layers_to_test = [1, 3, 5]
for i, num_hidden in enumerate(num_hidden_layers_to_test):
    current_hidden_neurons = [BASELINE_NEURONS_PER_LAYER] * num_hidden
    current_layer_neurons = [input_layer_size, *current_hidden_neurons, output_layer_size]
    # The activation list has to grow and shrink together with the layer list.
    current_activation_funcs = [
        INPUT_ACTIVATION,
        *([BASELINE_HIDDEN_ACTIVATION] * num_hidden),
        OUTPUT_ACTIVATION,
    ]
    config_key = "num_hidden_" + str(num_hidden)
    print(f"\n--- Phase 3 / Trial {i+1}: Testing Num Hidden Layers = {num_hidden} ---")
    try:
        nn = MLP(
            current_layer_neurons,
            current_activation_funcs,
            weight_decay=BASELINE_WEIGHT_DECAY,
            initialisation=BASELINE_INITIALISATION,
        )
        # Only the fit call is timed.
        t0 = time.time()
        trial_result = nn.fit(
            train_df.X,
            train_df.y,
            learning_rate=BASELINE_LEARNING_RATE,
            epochs=EPOCHS,
            SGD_optim=BASELINE_SGD_OPTIM,
            batch_size=BASELINE_BATCH_SIZE,
        )
        t1 = time.time()
        time_taken = round(t1 - t0, 4)
        print(f"    Fit completed in {time_taken} seconds.")
        test_accuracy, F1_avg, CELoss = calculate_metrics(nn, trial_result, test_df, train_df)
        active_dropout_prob = DROPOUT_PROB
        num_layers_study_results[config_key] = {
            'time_taken': time_taken,
            'test_accuracy': test_accuracy,
            'F1_avg': F1_avg,
            'final_train_loss': CELoss,
            'layer_config': current_layer_neurons,
            'activation_config': current_activation_funcs,
            'sgd_optim_tested': BASELINE_SGD_OPTIM,
            'weight_decay_tested': BASELINE_WEIGHT_DECAY,
            'batch_size_tested': BASELINE_BATCH_SIZE,
            'dropout_prob_tested': active_dropout_prob,
            'learning_rate_tested': BASELINE_LEARNING_RATE,
            'history': trial_result,
        }
    except Exception as e:
        # A failed trial is recorded and the study moves on to the next depth.
        print(f"    ERROR: {e}")
        num_layers_study_results[config_key] = {'error': str(e)}
print("\n" + "=" * 60 + "\nPhase 3 Complete.\n" + "=" * 60 + "\n")

# ================================================================
# PHASE 4: SGD Optimization Parameter (Using Dictionary Input, with Metrics)
# ================================================================
# Each candidate is passed to fit() unchanged as SGD_optim: either None or a
# dict with "Type" and "Parameter" entries.
sgd_optim_values_to_test = [
    None,
    {"Type": "Momentum", "Parameter": 0.5},
    {"Type": "Momentum", "Parameter": 0.0},
]
for i, sgd_optim_value in enumerate(sgd_optim_values_to_test):
    # Build a readable result key for whichever form the candidate takes.
    if isinstance(sgd_optim_value, dict):
        optim_type = sgd_optim_value.get("Type", "Unknown")
        optim_param = sgd_optim_value.get("Parameter", "Unknown")
        config_key = f"sgd_optim_{optim_type}_{optim_param}"
    elif sgd_optim_value is None:
        config_key = "sgd_optim_None"
    else:
        config_key = "sgd_optim_" + str(sgd_optim_value)

    print(f"\n--- Phase 4 / Trial {i+1}: Testing SGD_optim = {sgd_optim_value} ---")
    try:
        nn = MLP(
            BASELINE_LAYER_NEURONS,
            BASELINE_ACTIVATION_FUNCS,
            weight_decay=BASELINE_WEIGHT_DECAY,
            initialisation=BASELINE_INITIALISATION,
        )
        # Only the fit call is timed.
        t0 = time.time()
        trial_result = nn.fit(
            train_df.X,
            train_df.y,
            learning_rate=BASELINE_LEARNING_RATE,
            epochs=EPOCHS,
            SGD_optim=sgd_optim_value,
            batch_size=BASELINE_BATCH_SIZE,
        )
        t1 = time.time()
        time_taken = round(t1 - t0, 4)
        test_accuracy, F1_avg, CELoss = calculate_metrics(nn, trial_result, test_df, train_df)
        active_dropout_prob = DROPOUT_PROB
        sgd_optim_study_results[config_key] = {
            'time_taken': time_taken, 'test_accuracy': test_accuracy,
            'F1_avg': F1_avg, 'final_train_loss': CELoss,
            'layer_config': BASELINE_LAYER_NEURONS,
            'activation_config': BASELINE_ACTIVATION_FUNCS,
            'sgd_optim_tested': sgd_optim_value,
            'weight_decay_tested': BASELINE_WEIGHT_DECAY,
            'batch_size_tested': BASELINE_BATCH_SIZE,
            'dropout_prob_tested': active_dropout_prob,
            'learning_rate_tested': BASELINE_LEARNING_RATE,
            'history': trial_result,
        }
    except Exception as e:
        # A failed trial is recorded and the study moves on to the next setting.
        print(f"    ERROR: {e}")
        sgd_optim_study_results[config_key] = {'error': str(e)}
print("\n" + "=" * 60 + "\nPhase 4 Complete.\n" + "=" * 60 + "\n")

# ================================================================
# PHASE 5: Weight Decay Parameter
# ================================================================
# The candidate is handed to the MLP constructor as `weight_decay`; every
# other setting stays at baseline.
weight_decay_values_to_test = [0.98, 0.9, 1]
for i, weight_decay_value in enumerate(weight_decay_values_to_test):
    # Keep the key readable should a None candidate ever be added to the list.
    if weight_decay_value is None:
        weight_decay_key_str = 'None'
    else:
        weight_decay_key_str = str(weight_decay_value)
    config_key = "weight_decay_" + weight_decay_key_str
    print(f"\n--- Phase 5 / Trial {i+1}: Testing Weight Decay = {weight_decay_value} ---")
    try:
        nn = MLP(
            BASELINE_LAYER_NEURONS,
            BASELINE_ACTIVATION_FUNCS,
            weight_decay=weight_decay_value,
            initialisation=BASELINE_INITIALISATION,
        )
        # Only the fit call is timed.
        t0 = time.time()
        trial_result = nn.fit(
            train_df.X,
            train_df.y,
            learning_rate=BASELINE_LEARNING_RATE,
            epochs=EPOCHS,
            SGD_optim=BASELINE_SGD_OPTIM,
            batch_size=BASELINE_BATCH_SIZE,
        )
        t1 = time.time()
        time_taken = round(t1 - t0, 4)
        print(f"    Fit completed in {time_taken} seconds.")
        test_accuracy, F1_avg, CELoss = calculate_metrics(nn, trial_result, test_df, train_df)
        active_dropout_prob = DROPOUT_PROB
        weight_decay_study_results[config_key] = {
            'time_taken': time_taken,
            'test_accuracy': test_accuracy,
            'F1_avg': F1_avg,
            'final_train_loss': CELoss,
            'layer_config': BASELINE_LAYER_NEURONS,
            'activation_config': BASELINE_ACTIVATION_FUNCS,
            'sgd_optim_tested': BASELINE_SGD_OPTIM,
            'weight_decay_tested': weight_decay_value,
            'batch_size_tested': BASELINE_BATCH_SIZE,
            'dropout_prob_tested': active_dropout_prob,
            'learning_rate_tested': BASELINE_LEARNING_RATE,
            'history': trial_result,
        }
    except Exception as e:
        # A failed trial is recorded and the study moves on to the next value.
        print(f"    ERROR: {e}")
        weight_decay_study_results[config_key] = {'error': str(e)}
print("\n" + "=" * 60 + "\nPhase 5 Complete.\n" + "=" * 60 + "\n")

# ================================================================
# PHASE 6: Batch Size
# ================================================================
# The candidate is passed to fit() as `batch_size`; the model itself is
# always built from the baseline configuration.
batch_size_values_to_test = [100, 50, 1000]
for i, batch_size_value in enumerate(batch_size_values_to_test):
    config_key = "batch_size_" + str(batch_size_value)
    print(f"\n--- Phase 6 / Trial {i+1}: Testing Batch Size = {batch_size_value} ---")
    try:
        nn = MLP(
            BASELINE_LAYER_NEURONS,
            BASELINE_ACTIVATION_FUNCS,
            weight_decay=BASELINE_WEIGHT_DECAY,
            initialisation=BASELINE_INITIALISATION,
        )
        # Only the fit call is timed.
        t0 = time.time()
        trial_result = nn.fit(
            train_df.X,
            train_df.y,
            learning_rate=BASELINE_LEARNING_RATE,
            epochs=EPOCHS,
            SGD_optim=BASELINE_SGD_OPTIM,
            batch_size=batch_size_value,
        )
        t1 = time.time()
        time_taken = round(t1 - t0, 4)
        print(f"    Fit completed in {time_taken} seconds.")
        test_accuracy, F1_avg, CELoss = calculate_metrics(nn, trial_result, test_df, train_df)
        active_dropout_prob = DROPOUT_PROB
        batch_size_study_results[config_key] = {
            'time_taken': time_taken,
            'test_accuracy': test_accuracy,
            'F1_avg': F1_avg,
            'final_train_loss': CELoss,
            'layer_config': BASELINE_LAYER_NEURONS,
            'activation_config': BASELINE_ACTIVATION_FUNCS,
            'sgd_optim_tested': BASELINE_SGD_OPTIM,
            'weight_decay_tested': BASELINE_WEIGHT_DECAY,
            'batch_size_tested': batch_size_value,
            'dropout_prob_tested': active_dropout_prob,
            'learning_rate_tested': BASELINE_LEARNING_RATE,
            'history': trial_result,
        }
    except Exception as e:
        # A failed trial is recorded and the study moves on to the next size.
        print(f"    ERROR: {e}")
        batch_size_study_results[config_key] = {'error': str(e)}
print("\n" + "=" * 60 + "\nPhase 6 Complete.\n" + "=" * 60 + "\n")

# ================================================================
# PHASE 7: Dropout Probability (using global variable)
# ================================================================
# Dropout is not an argument of MLP(...) or fit(...) here; each trial rebinds
# the module-level DROPOUT_PROB instead.
# NOTE(review): presumably the MLP code reads that global - confirm.
dropout_prob_values_to_test = [0.5, 0.1, 0.9]
for i, dropout_prob_value in enumerate(dropout_prob_values_to_test):
    config_key = "dropout_prob_" + str(dropout_prob_value)
    print(f"\n--- Phase 7 / Trial {i+1}: Setting GLOBAL DROPOUT_PROB = {dropout_prob_value} ---")
    # Rebound before the model is built so the whole trial sees the new value.
    DROPOUT_PROB = dropout_prob_value
    try:
        nn = MLP(
            BASELINE_LAYER_NEURONS,
            BASELINE_ACTIVATION_FUNCS,
            weight_decay=BASELINE_WEIGHT_DECAY,
            initialisation=BASELINE_INITIALISATION,
        )
        # Only the fit call is timed.
        t0 = time.time()
        trial_result = nn.fit(
            train_df.X,
            train_df.y,
            learning_rate=BASELINE_LEARNING_RATE,
            epochs=EPOCHS,
            SGD_optim=BASELINE_SGD_OPTIM,
            batch_size=BASELINE_BATCH_SIZE,
        )
        t1 = time.time()
        time_taken = round(t1 - t0, 4)
        print(f"    Fit completed in {time_taken} seconds.")
        test_accuracy, F1_avg, CELoss = calculate_metrics(nn, trial_result, test_df, train_df)
        dropout_prob_study_results[config_key] = {
            'time_taken': time_taken,
            'test_accuracy': test_accuracy,
            'F1_avg': F1_avg,
            'final_train_loss': CELoss,
            'layer_config': BASELINE_LAYER_NEURONS,
            'activation_config': BASELINE_ACTIVATION_FUNCS,
            'sgd_optim_tested': BASELINE_SGD_OPTIM,
            'weight_decay_tested': BASELINE_WEIGHT_DECAY,
            'batch_size_tested': BASELINE_BATCH_SIZE,
            'dropout_prob_tested': dropout_prob_value,
            'learning_rate_tested': BASELINE_LEARNING_RATE,
            'history': trial_result,
        }
    except Exception as e:
        # A failed trial is recorded and the study moves on to the next value.
        print(f"    ERROR: {e}")
        dropout_prob_study_results[config_key] = {'error': str(e)}

# Put the global back so later phases run with the baseline dropout again.
DROPOUT_PROB = BASELINE_DROPOUT_PROB
print(f"\nGlobal DROPOUT_PROB reset to baseline: {DROPOUT_PROB} after Phase 7.")
print("\n" + "=" * 60 + "\nPhase 7 Complete.\n" + "=" * 60 + "\n")

# ================================================================
# PHASE 8: Learning Rate
# ================================================================
# The candidate is passed to fit() as `learning_rate`; the model itself is
# always built from the baseline configuration.
learning_rate_values_to_test = [0.005, 0.001, 0.01]
for i, learning_rate_value in enumerate(learning_rate_values_to_test):
    config_key = "learning_rate_" + str(learning_rate_value)
    print(f"\n--- Phase 8 / Trial {i+1}: Testing Learning Rate = {learning_rate_value} ---")
    try:
        nn = MLP(
            BASELINE_LAYER_NEURONS,
            BASELINE_ACTIVATION_FUNCS,
            weight_decay=BASELINE_WEIGHT_DECAY,
            initialisation=BASELINE_INITIALISATION,
        )
        # Only the fit call is timed.
        t0 = time.time()
        trial_result = nn.fit(
            train_df.X,
            train_df.y,
            learning_rate=learning_rate_value,
            epochs=EPOCHS,
            SGD_optim=BASELINE_SGD_OPTIM,
            batch_size=BASELINE_BATCH_SIZE,
        )
        t1 = time.time()
        time_taken = round(t1 - t0, 4)
        print(f"    Fit completed in {time_taken} seconds.")
        test_accuracy, F1_avg, CELoss = calculate_metrics(nn, trial_result, test_df, train_df)
        # Records whatever DROPOUT_PROB currently holds (reset after Phase 7).
        active_dropout_prob = DROPOUT_PROB
        learning_rate_study_results[config_key] = {
            'time_taken': time_taken,
            'test_accuracy': test_accuracy,
            'F1_avg': F1_avg,
            'final_train_loss': CELoss,
            'layer_config': BASELINE_LAYER_NEURONS,
            'activation_config': BASELINE_ACTIVATION_FUNCS,
            'sgd_optim_tested': BASELINE_SGD_OPTIM,
            'weight_decay_tested': BASELINE_WEIGHT_DECAY,
            'batch_size_tested': BASELINE_BATCH_SIZE,
            'dropout_prob_tested': active_dropout_prob,
            'learning_rate_tested': learning_rate_value,
            'history': trial_result,
        }
    except Exception as e:
        # A failed trial is recorded and the study moves on to the next rate.
        print(f"    ERROR: {e}")
        learning_rate_study_results[config_key] = {'error': str(e)}
print("\n" + "=" * 60 + "\nPhase 8 Complete.\n" + "=" * 60 + "\n")

# ================================================================
# WRITE ALL RESULTS TO FILE
# ================================================================
# Dumps the baseline configuration and the summary metrics of every phase
# into a timestamped plain-text report in the working directory.
print("All Independent Ablation Studies Finished. Writing results to file...")

# The timestamp makes each run write to its own file.
now = datetime.datetime.now()
timestamp_str = now.strftime("%Y-%m-%d_%H-%M-%S")
results_filename = f"results_{timestamp_str}.txt"

# (section title, results dict) pairs in the order they appear in the report.
all_results = [
    ("Phase 1: Neurons Per Layer", neuron_study_results),
    ("Phase 2: Activation Function", activation_study_results),
    ("Phase 3: Number of Hidden Layers", num_layers_study_results),
    ("Phase 4: SGD Optimization", sgd_optim_study_results),
    ("Phase 5: Weight Decay", weight_decay_study_results),
    ("Phase 6: Batch Size", batch_size_study_results),
    ("Phase 7: Dropout Probability", dropout_prob_study_results),
    ("Phase 8: Learning Rate", learning_rate_study_results),
]

try:
    # Explicit UTF-8: recorded error messages are arbitrary exception text, and
    # a non-ASCII character must not abort the report on platforms whose
    # default encoding cannot represent it.
    with open(results_filename, 'w', encoding='utf-8') as f:
        f.write(f"Ablation Study Results - Generated on: {now.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write("=" * 80 + "\n")

        # Write Baseline Configuration
        f.write("Baseline Configuration:\n")
        f.write(f"  Num Hidden Layers: {BASELINE_NUM_HIDDEN_LAYERS}\n")
        f.write(f"  Neurons Per Layer: {BASELINE_NEURONS_PER_LAYER}\n")
        f.write(f"  Hidden Activation: {BASELINE_HIDDEN_ACTIVATION}\n")
        f.write(f"  SGD Optim: {BASELINE_SGD_OPTIM}\n")
        f.write(f"  Weight Decay: {BASELINE_WEIGHT_DECAY}\n")
        f.write(f"  Initialisation: {BASELINE_INITIALISATION}\n")
        f.write(f"  Batch Size: {BASELINE_BATCH_SIZE}\n")
        f.write(f"  Dropout Prob: {BASELINE_DROPOUT_PROB}\n")
        f.write(f"  Learning Rate: {BASELINE_LEARNING_RATE}\n")
        f.write(f"  Epochs: {EPOCHS}\n")
        f.write("=" * 80 + "\n\n")

        for phase_name, results_dict in all_results:
            f.write(f"--- {phase_name} ---\n\n")
            if not results_dict:
                f.write("  No results recorded for this phase.\n\n")
                continue

            # Sort by config key so the report order is stable between runs.
            sorted_items = sorted(results_dict.items())

            for config_key, data in sorted_items:
                f.write(f"Config Key: {config_key}\n")
                if 'error' in data:
                    # The trial raised; only the captured message is available.
                    f.write("  Status: ERROR\n")
                    f.write(f"  Error Msg: {data['error']}\n")
                else:
                    # Safely get metrics with default -1 if missing
                    time_val = data.get('time_taken', -1)
                    acc = data.get('test_accuracy', -1)
                    f1 = data.get('F1_avg', -1)
                    loss = data.get('final_train_loss', -1)

                    f.write("  Status: Success\n")
                    f.write(f"  Time Taken (s): {time_val:.2f}\n")
                    f.write(f"  Test Accuracy: {acc:.4f}\n")
                    f.write(f"  Average F1 Score: {f1:.4f}\n")
                    f.write(f"  Final Train Loss: {loss:.4f}\n")

                f.write("-" * 40 + "\n") # Separator between trials
            f.write("\n") # Space after phase section

    print(f"Results successfully written to {results_filename}")

except OSError as e:
    # File-system failures (IOError is an alias of OSError in Python 3).
    print(f"ERROR: Could not write results to file {results_filename}. Error: {e}")
except Exception as e:
    # Anything else, e.g. a stored metric that cannot be float-formatted.
    print(f"An unexpected error occurred during file writing: {e}")