In [None]:
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('ignore', 'overflow')

GENERAL FUNCTIONS

In [None]:
# Baseline hyper-parameters. run_model falls back to these values for every key
# that a model configuration does not supply.
default_config = dict(
    layers=[784, 256, 64, 10],
    learning_rate=0.1,
    batch_size=32,
    iterations=30,
)

# Leaderboard of result dictionaries; test_configuration_set keeps at most
# three entries in it, chosen by validation score.
top3_models = list()

In [None]:
def run_model(X_train, y_train, X_val, y_val, X_test, y_test, model_config):
    """
    Train a single NeuralNetwork and report its accuracy on every data split.

    Hyper-parameters missing from model_config are taken from default_config.
    The effective configuration is printed, the network is fitted on the
    training split, scored on the three splits, and the confusion matrix of
    the test split is plotted.
    :param X_train: Training dataset features
    :param y_train: Training dataset labels
    :param X_val: Validation dataset features
    :param y_val: Validation dataset labels
    :param X_test: Test dataset features
    :param y_test: Test dataset labels
    :param model_config: Mapping with any of 'layers', 'learning_rate', 'batch_size', 'iterations'
    :return: Dictionary with the four hyper-parameters plus 'train_score', 'val_score' and 'test_score'
    """
    # Resolve each hyper-parameter, preferring the caller's value over the default.
    layers, learning_rate, batch_size, iterations = (
        model_config.get(key, default_config[key])
        for key in ('layers', 'learning_rate', 'batch_size', 'iterations')
    )
    print(f"Model configuration: layers = {layers}, learning_rate = {learning_rate}, batch_size = {batch_size}, iterations = {iterations}")

    network = NeuralNetwork(layers=layers, batch_size=batch_size, learning_rate=learning_rate, iterations=iterations)
    network.fit(X_train, y_train)

    report = {
        'layers': layers,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'iterations': iterations,
    }
    report['train_score'] = network.score(X_train, y_train)
    report['val_score'] = network.score(X_val, y_val)
    report['test_score'] = network.score(X_test, y_test)

    # The confusion matrix is drawn for the test split only.
    network.plot_confusion_matrix(X_test, y_test)
    return report

def test_configuration_set(X_train, y_train, X_val, y_val, X_test, y_test, config_set, param_name):
    """
    Train and evaluate one model per entry of config_set, varying a single parameter.

    Every entry is applied on top of a copy of default_config and passed to
    run_model. As a side effect, the module-level top3_models list is updated
    so that it holds at most three results, replacing its lowest
    validation-score entry whenever a strictly better result appears.
    :param X_train: Training dataset features
    :param y_train: Training dataset labels
    :param X_val: Validation dataset features
    :param y_val: Validation dataset labels
    :param X_test: Test dataset features
    :param y_test: Test dataset labels
    :param config_set: Set of configurations to test
    :param param_name: Name of the parameter being tested
    :return: List of results for each configuration
    """
    global top3_models  # shared leaderboard, modified in place below
    collected = []
    for override in config_set:
        # Start from the defaults and overlay the isolated parameter.
        model_config = dict(default_config)
        model_config.update(override)

        print(f"The isolated parameter is: {param_name}")
        result = run_model(X_train, y_train, X_val, y_val, X_test, y_test, model_config)
        collected.append(result)
        print(f"Validation Score: {result['val_score']}")
        print(f"Test Score: {result['test_score']}")
        print(f"\n\n\n                               ~~~~~~~~~~~~~~~~~~~~~~~~ NEXT MODEL ~~~~~~~~~~~~~~~~~~~~~~~~\n")

        # Leaderboard not full yet: every result gets in.
        if len(top3_models) < 3:
            top3_models.append(result)
            continue

        # Otherwise the newcomer must strictly beat the weakest entry.
        weakest = min(top3_models, key=lambda entry: entry['val_score'])
        if result['val_score'] > weakest['val_score']:
            top3_models.remove(weakest)
            top3_models.append(result)

    return collected

PLOT FUNCTIONS

In [None]:
# Plot the test scores for different configurations
def plot_test_scores(results, param_name):
    """
    Draw a horizontal bar chart of test accuracy for each tested value of one parameter.

    Bars are sorted by ascending test score. The x axis spans 0.85-1.0 when
    every score fits in that window and is widened downwards otherwise, so
    that weak configurations still get a visible bar.
    :param results: List of result dictionaries, each with 'test_score' and a param_name entry
    :param param_name: Name of the parameter whose values label the bars
    """
    # Ascending sort (stable, so ties keep their original order).
    ranked = sorted(results, key=lambda result: result['test_score'])
    param_values = [result[param_name] for result in ranked]
    test_scores = [result['test_score'] for result in ranked]

    y = range(len(param_values))

    plt.figure(figsize=(12, 6))
    bar_height = 0.4

    bars = plt.barh(y, test_scores, color='green', height=bar_height, label='Test Score', align='center')

    plt.ylabel(param_name)
    plt.xlabel('Test Accuracy')
    # A fixed 0.85 lower bound would hide any bar whose score is below it;
    # extend the axis (never below 0) when such a score is present.
    lowest_score = min(test_scores, default=1.0)
    x_min = min(0.85, max(0.0, lowest_score - 0.05))
    plt.xlim(x_min, 1.0)
    other_params = ", ".join([f"{k.capitalize().replace('_', ' ')}: {v}" for k, v in default_config.items() if k != param_name])
    plt.title(f'Test Accuracy vs {param_name}')
    plt.suptitle(other_params, fontsize=10)
    # Values such as layer lists are not strings; convert them explicitly for the tick labels.
    plt.yticks(y, [str(value) for value in param_values])
    plt.legend()

    # Annotate each bar with its score, just right of the bar's end.
    for bar in bars:
        xval = bar.get_width()
        plt.text(xval + 0.01, bar.get_y() + bar.get_height()/2, round(xval, 4), ha='left', va='center')

    plt.show()
    
    
def create_other_params_string(config, param_name):
    """
    Build a readable, comma-separated summary of a configuration, leaving one key out.

    Each remaining key is capitalised, has its underscores replaced by spaces,
    and is rendered as "Key: value".
    :param config: Mapping of parameter names to values
    :param param_name: Key to exclude from the summary
    :return: String such as "Learning rate: 0.1, Batch size: 32"
    """
    pieces = []
    for key, value in config.items():
        if key == param_name:
            continue
        label = key.capitalize().replace('_', ' ')
        pieces.append(f"{label}: {value}")
    return ", ".join(pieces)

In [None]:
class NeuralNetwork:
    """
    Fully connected feed-forward classifier trained with mini-batch gradient descent.

    Hidden layers use ReLU, the output layer uses softmax, and the loss tracked
    during training is cross-entropy. Data is laid out column-wise: the code
    multiplies ``W @ A`` and reads the sample count from ``X.shape[1]``, so X is
    expected to have shape (n_features, n_samples).
    """

    def __init__(self, layers, batch_size, learning_rate, iterations):
        """
        Initialize the neural network with given parameters
        :param layers: List of layer sizes for each layer
        :param batch_size: Size of each training batch
        :param learning_rate: Learning rate for gradient descent
        :param iterations: Number of iterations for training
        """
        self.layers = layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.weights = self.init_weights()

    def init_weights(self):
        """
        Initialize the weights and biases of the network
        :return: Dictionary of weights ('W1', 'W2', ...) and biases ('b1', 'b2', ...)
        """
        weights = {}
        for i in range(1, len(self.layers)):
            fan_in = self.layers[i - 1]
            # Gaussian weights scaled by sqrt(2 / fan_in); biases start at zero.
            weights[f'W{i}'] = np.random.randn(self.layers[i], fan_in) * np.sqrt(2. / fan_in)
            weights[f'b{i}'] = np.zeros((self.layers[i], 1))
        return weights

    # ReLU activation function
    def ReLU(self, Z):
        return np.maximum(Z, 0)

    # softmax activation function
    def softmax(self, Z):
        """
        Column-wise softmax.

        The per-column maximum is subtracted before exponentiating. This leaves
        the result mathematically unchanged but prevents np.exp from
        overflowing to inf (and the division from yielding NaN) on large inputs.
        :param Z: Pre-activation values, one column per sample
        :return: Array of the same shape whose columns sum to 1
        """
        shifted = Z - np.max(Z, axis=0, keepdims=True)
        exp_Z = np.exp(shifted)
        return exp_Z / np.sum(exp_Z, axis=0, keepdims=True)

    def forward_prop(self, X):
        """
        Perform forward propagation through the network
        :param X: Input data
        :return: Dictionary with 'A0' (the input) and 'Z{i}' / 'A{i}' for every layer i
        """
        last = len(self.layers) - 1
        memory = {'A0': X}
        A = X
        # Hidden layers: affine transform followed by ReLU.
        for i in range(1, last):
            Z = self.weights[f'W{i}'] @ A + self.weights[f'b{i}']
            A = self.ReLU(Z)
            memory[f'Z{i}'] = Z
            memory[f'A{i}'] = A
        # Output layer: affine transform followed by softmax.
        Z = self.weights[f'W{last}'] @ A + self.weights[f'b{last}']
        A = self.softmax(Z)
        memory[f'Z{last}'] = Z
        memory[f'A{last}'] = A
        return memory

    # derivative of ReLU activation function
    def ReLU_deriv(self, Z):
        return Z > 0

    def cross_entropy(self, one_hot_Y, pred_Y, epsilon=1e-12):
        """
        calculate the cross-entropy loss.
        :param one_hot_Y: One-hot encoded true labels
        :param pred_Y: Predicted class probabilities
        :param epsilon: Small value keeping the argument of the logarithm away from 0
        :return: Cross-entropy loss averaged over the samples
        """
        clip_pred_y = np.clip(pred_Y, epsilon, 1. - epsilon)  # clip predictions to avoid values of 0 and 1
        loss = -np.mean(np.sum(one_hot_Y * np.log(clip_pred_y), axis=0))
        return loss

    def one_hot(self, Y):
        """
        convert labels to one hot encoding
        :param Y: True labels (integer class indices, used directly as row indices)
        :return: One hot labels, shape (number of output units, number of labels)
        """
        one_hot_Y = np.zeros((self.layers[-1], Y.size))
        one_hot_Y[Y, np.arange(Y.size)] = 1
        return one_hot_Y

    def backward_prop(self, memory, X, Y):
        """
        Perform backward propagation through the network
        :param memory: dictionary containing previous results
        :param X: Input data
        :param Y: True labels
        :return: Dictionary of gradients ('dW{i}' and 'db{i}' for every layer i)
        """
        last = len(self.layers) - 1
        gradients = {}
        one_hot_Y = self.one_hot(Y)
        m = X.shape[1]  # number of samples in this batch
        # For softmax combined with cross-entropy the output error is (prediction - target).
        dZ = memory[f'A{last}'] - one_hot_Y
        gradients[f'dW{last}'] = 1 / m * dZ @ memory[f'A{last - 1}'].T
        gradients[f'db{last}'] = 1 / m * np.sum(dZ, axis=1, keepdims=True)
        # Propagate the error backwards through the hidden layers.
        for i in range(last - 1, 0, -1):
            dZ = (self.weights[f'W{i + 1}'].T @ dZ) * self.ReLU_deriv(memory[f'Z{i}'])
            gradients[f'dW{i}'] = 1 / m * dZ @ memory[f'A{i - 1}'].T
            gradients[f'db{i}'] = 1 / m * np.sum(dZ, axis=1, keepdims=True)
        return gradients

    def update_weights(self, gradients):
        """
        Update the weights and biases of the network using the computed gradients
        :param gradients: Dictionary of gradients
        """
        for i in range(1, len(self.layers)):
            self.weights[f'W{i}'] -= self.learning_rate * gradients[f'dW{i}']
            self.weights[f'b{i}'] -= self.learning_rate * gradients[f'db{i}']

    def plot_training_history(self, accuracy_history, loss_history):
        """
        Plot the training and validation loss over iterations
        :param accuracy_history: Values drawn as the 'validation Loss' curve
                                 (despite the parameter name, this is a loss history)
        :param loss_history: Values drawn as the 'Training Loss' curve
        """
        plt.figure(figsize=(10, 5))
        plt.plot(range(len(accuracy_history)), accuracy_history, label='validation Loss', color='blue')
        plt.plot(range(len(loss_history)), loss_history, label='Training Loss', color='green', linestyle='dotted')
        plt.title(f"Network's Configuration: {self.layers}")
        plt.xlabel('Iteration')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

    # Plot the confusion matrix
    def plot_confusion_matrix(self, X, Y):
        predictions = self.predict(X)
        cm = confusion_matrix(Y, predictions)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(cmap='viridis')
        plt.title(f'Confusion Matrix for Model configuration: layers = {self.layers}, learning_rate = {self.learning_rate}, batch_size = {self.batch_size}, iterations = {self.iterations}', fontsize=10)
        plt.show()

    def fit(self, X, Y, X_val=None, y_val=None):
        """
        Train the neural network on the given data - updates the weights (and biases) of neurons network
        by using the gradient descent algorithm.

        After every iteration the training loss (and, when validation data is
        available, the validation loss) is recorded; both curves are plotted at the end.
        :param X: Input data
        :param Y: True labels
        :param X_val: Optional validation features. When both X_val and y_val are
                      omitted, module-level variables named X_val / y_val are used
                      if they exist (legacy behaviour); otherwise validation
                      loss is not tracked.
        :param y_val: Optional validation labels (must be given together with X_val)
        :raises ValueError: If only one of X_val / y_val is supplied
        """
        if (X_val is None) != (y_val is None):
            raise ValueError("X_val and y_val must be provided together")
        if X_val is None:
            # Legacy fallback: earlier versions read the validation split from
            # the enclosing module's globals instead of taking it as arguments.
            module_scope = globals()
            X_val, y_val = module_scope.get('X_val'), module_scope.get('y_val')
        track_validation = X_val is not None and y_val is not None

        last = len(self.layers) - 1
        val_loss_history = []
        train_loss_history = []

        for i in range(self.iterations):
            # shuffle works on rows, so samples are moved to axis 0 and back.
            X, Y = shuffle(X.T, Y)
            X = X.T
            batches = [(X[:, k:k + self.batch_size], Y[k:k + self.batch_size]) for k in
                       range(0, X.shape[1], self.batch_size)]
            for batch_X, batch_Y in batches:
                memory = self.forward_prop(batch_X)
                gradients = self.backward_prop(memory, batch_X, batch_Y)
                self.update_weights(gradients)

            # calculate training loss
            train_probs = self.forward_prop(X)[f'A{last}']
            train_loss = self.cross_entropy(self.one_hot(Y), train_probs)
            train_loss_history.append(train_loss)

            # calculate validation loss
            if track_validation:
                val_probs = self.forward_prop(X_val)[f'A{last}']
                val_loss_history.append(self.cross_entropy(self.one_hot(y_val), val_probs))

            if i % 5 == 0 or i == self.iterations - 1:
                # Reuse the forward pass above instead of running score() again.
                train_accuracy = np.mean(np.argmax(train_probs, axis=0) == Y)
                print("-----------------------------------\n")
                print(f"Iteration {i}, Training Accuracy: {train_accuracy}, Training Loss: {train_loss}")
        # First argument is drawn as the validation curve, second as the training curve.
        self.plot_training_history(val_loss_history, train_loss_history)

    def predict(self, X):
        """
        predict labels for the given input data
        :param X: Input data
        :return: Predicted labels (index of the largest output per column)
        """
        memory = self.forward_prop(X)
        A_last = memory[f'A{len(self.layers) - 1}']
        return np.argmax(A_last, axis=0)

    def score(self, X, Y):
        """
        Calculate the accuracy of the model on the given data
        :param X: Input data
        :param Y: True labels
        :return: Accuracy of the model
        """
        predictions = self.predict(X)
        return np.mean(predictions == Y)


In [None]:

# Seed every source of randomness so that a fresh "Restart & Run All" gives the
# same shuffles and the same train/validation split. Seeding NumPy's global
# generator also fixes the np.random.randn draws used for weight initialisation.
SEED = 42
np.random.seed(SEED)

# Load datasets
train_data = pd.read_csv('MNIST-train.csv')
test_data = pd.read_csv('MNIST-test.csv')

# Shuffle the rows of both tables (seeded, so the order is repeatable)
train_data = train_data.sample(frac=1, random_state=SEED).reset_index(drop=True)
test_data = test_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

X_train_full = train_data.iloc[:, :-1].values  # Features (all columns except the last one)
y_train_full = train_data.iloc[:, -1].values  # Labels (last column)

# Split into training (90%) and validation (10%) sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.1, random_state=SEED)

# Normalize the data (divide by 255; presumably 8-bit pixel intensities - verify against the CSV)
X_train = X_train / 255.0
X_val = X_val / 255.0
X_test = test_data.iloc[:, :-1].values / 255.0
y_test = test_data.iloc[:, -1].values

# Transpose so that each column is one sample, the layout NeuralNetwork works with
X_train = X_train.T
X_val = X_val.T
X_test = X_test.T

In [None]:
# Hyper-parameter sweeps. Each list varies exactly one key; every entry is a
# partial configuration that is applied on top of default_config when tested.
layer_configs = [
    {'layers': architecture}
    for architecture in (
        [784, 128, 64, 10],
        [784, 10, 10],
        [784, 128, 64, 32, 10],
        [784, 10],
        [784, 256, 128, 32, 10],
    )
]

learning_rate_configs = [{'learning_rate': rate} for rate in (0.1, 0.01, 0.001, 0.005)]

batch_size_configs = [{'batch_size': size} for size in (8, 32, 64, 1)]

iteration_configs = [{'iterations': count} for count in (1, 10, 30, 100)]

In [None]:
# Test and plot for each isolated parameter
for config_set, param_name in [(layer_configs, 'layers'), (learning_rate_configs, 'learning_rate'), (batch_size_configs, 'batch_size'), (iteration_configs, 'iterations')]:
    results = test_configuration_set(X_train, y_train, X_val, y_val, X_test, y_test, config_set, param_name)
    plot_test_scores(results, param_name)

# Print the top 3 models (selected by validation score inside
# test_configuration_set, listed here from highest to lowest test score)
print("\nTop 3 Models:")
top3_models = sorted(top3_models, key=lambda x: x['test_score'], reverse=True)
for model in top3_models:
    # Convert the scores to plain floats so the printed dictionary is easy to read.
    for score_key in ('train_score', 'val_score', 'test_score'):
        model[score_key] = float(model[score_key])
    # Each label now matches the value it prints; previously the test score
    # was shown under the heading "Validation Score".
    print(f"Validation Score: {model['val_score']}")
    print(f"Test Score: {model['test_score']}")
    print(f"Model Configuration: {model}\n")