INSTALL THE FOLLOWING PYTHON PACKAGES FIRST BEFORE RUNNING THE PROGRAM

1) Numpy
2) NNFS


In [14]:
# Library imports
import numpy as np

Create classes for modularity

In [15]:
# Hidden Layers
# Dense
class Layer_Dense:
    # Layer initialization
    # randomly initialize weights and set biases to zero
    def __init__(self, n_inputs, n_neurons):
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))


    # Forward pass
    def forward(self, inputs):
        # Remember the input values
        self.inputs = inputs
        # Calculate the output values from inputs, weight and biases
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward pass/Backpropagation
    def backward(self, dvalues):
        # Gradients on parameters:
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)


In [16]:
# Activation Functions
# Included here are the functions for both the forward and backward pass

# Linear
class ActivationLinear:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = inputs

    def backward(self, dvalues):
        self.dinputs = dvalues.copy()

# Sigmoid
class ActivationSigmoid:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = 1 / (1 + np.exp(-inputs))

    def backward(self, dvalues):
        self.dinputs = dvalues * (self.output * (1 - self.output))

# TanH
class ActivationTanH:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.tanh(inputs)

    def backward(self, dvalues):
        self.dinputs = dvalues * (1 - self.output ** 2)

# ReLU
class Activation_ReLU:
    # Forward pass
    def forward(self, inputs):
        # Remember the input values
        self.inputs = inputs
        # Calculate the output values from inputs
        self.output = np.maximum(0, inputs)

    # Backward pass
    def backward(self, dvalues):
        # Make a copy of the original values first
        self.dinputs = dvalues.copy()
    
        # Zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0

# Softmax
class Activation_Softmax:
    # Forward pass
    def forward(self, inputs):
        # Remember the inputs values
        self.inputs = inputs

        # Get the unnormalized probabilities
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))

        # Normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        self.output = probabilities

    # Backward pass
    def backward(self, dvalues):
        # Create uninitialized array
        self.dinputs = np.empty_like(dvalues)

        # Enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):

            # Flatten output array
            single_output = single_output.reshape(-1, 1)
            # Calculate Jacobian matrix of the output
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            # Calculate the sample-wise gradient
            # and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)

In [17]:
# Loss functions

class Loss:
    # Calculate the data and regularization losses
    # Given the model output and grou truth/target values
    def calculate(self, output, y):
        # Calculate sample losses
        sample_losses = self.forward(output, y)
        # Calculate the mean loss
        data_loss = np.mean(sample_losses)
        # Return the mean loss
        return data_loss

# MSE
class Loss_MSE:
    def forward(self, y_pred, y_true):
        # Calculate Mean Squared Error
        return np.mean((y_true - y_pred) ** 2, axis=-1)

    def backward(self, y_pred, y_true):
        # Gradient of MSE loss
        samples = y_true.shape[0]
        outputs = y_true.shape[1]
        self.dinputs = -2 * (y_true - y_pred) / outputs
        # Normalize gradients over samples
        self.dinputs = self.dinputs / samples

# Binary Cross-Entropy
class Loss_BinaryCrossEntropy:
    def forward(self, y_pred, y_true):
        # Clip predictions
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        # Calculate Binary Cross Entropy
        return -(y_true * np.log(y_pred_clipped) + (1 - y_true) * np.log(1 - y_pred_clipped))

    def backward(self, y_pred, y_true):
        # Gradient of BCE loss
        samples = y_true.shape[0]
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        self.dinputs = - (y_true / y_pred_clipped - (1 - y_true) / (1 - y_pred_clipped))
        # Normalize gradients over samples
        self.dinputs = self.dinputs / samples

# Categorical Cross-Entropy
class Loss_CategoricalCrossEntropy(Loss):
    # Forward pass
    def forward(self, y_pred, y_true):
        # Number of samples in a batch
        samples = y_pred.shape[0]

        # Clip data to prevent division by 0
        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Probabilities for target values
        # Only if categorical labels
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[range(samples), y_true]
        # Mask values - only for one-hot encoded labels
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    # Backward pass
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)
        # Number of labels in every sample
        # Use the first sample to count them
        labels = len(dvalues[0])

        # Check if labels are sparse, turn them into one-hot vector values
        # the eye function creates a 2D array with ones on the diagonal and zeros elsewhere
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Calculate the gradient
        self.dinputs = -y_true / dvalues
        self.dinputs = self.dinputs / samples


<!-- Star -->

In [18]:
# Start of Optimizers (commented, just in case)
"""
class Optimizer_SGD:
    # Initialie the optimizer - default learning rate to 1
    def __init__(self, learning_rate=1.0):
        self.learning_rate = learning_rate
        self.iterations = 0

    # Update the parameters
    def update_params(self, layer):
        layer.weights += -self.learning_rate * layer.dweights
        layer.biases += -self.learning_rate * layer.dbiases
    
    def post_update_params(self):
        self.iterations += 1

"""

'\nclass Optimizer_SGD:\n    # Initialie the optimizer - default learning rate to 1\n    def __init__(self, learning_rate=1.0):\n        self.learning_rate = learning_rate\n        self.iterations = 0\n\n    # Update the parameters\n    def update_params(self, layer):\n        layer.weights += -self.learning_rate * layer.dweights\n        layer.biases += -self.learning_rate * layer.dbiases\n    \n    def post_update_params(self):\n        self.iterations += 1\n\n'

In [None]:
# No.2: Start of Optimizers (UPDATED VERSION that handles 3 optimizers discussed: 
# Learning Rate Decay, Momentum, and Adaptive Gradient)

class Optimizer_SGD:
    # Initialize the optimizer - default learning rate to 1
    # decay rate is set to 0.01, momentum to 0.0, and adaptive off because it hasn't been specified
    # As a result, training becomes smoother and more stable, improving the loss and accuracy trends

    # === Momentum (uses momentum=0.9 to accelerate learning and reduce oscillations) ===
    #def __init__(self, learning_rate=1.0, decay=0.01, momentum=0.9, adaptive=False, epsilon=1e-7):

    # === Adagrad (adaptive=True enables adaptive learning rate adjustment per parameter) ===
    #def __init__(self, learning_rate=1.0, decay=0.01, momentum=0.0, adaptive=True, epsilon=1e-7):

    # === Vanilla SGD (default) ===
    def __init__(self, learning_rate=1.0, decay=0.01, momentum=0.0, adaptive=False, epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.momentum = momentum
        self.adaptive = adaptive
        self.epsilon = epsilon
        self.iterations = 0

        # Initialize dictionaries for momentum and adaptive gradient
        self.weight_momentums = {}
        self.bias_momentums = {}
        self.weight_cache = {}
        self.bias_cache = {}

    # <----- LEARNING RATE DECAY ----->
    # Update learning rate using decay before running both FP and BP
    # Formula used: lr = lr / (1 + decay * iteration)
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate / (1 + self.decay * self.iterations)
        else:
            self.current_learning_rate = self.learning_rate


    # Update the parameters
    def update_params(self, layer):
       
        # <----- MOMENTUM IMPLEMENTATION ----->
        if self.momentum:
            # Initialize momentums if not present
            if layer not in self.weight_momentums:
                self.weight_momentums[layer] = 0
                self.bias_momentums[layer] = 0

            # Apply momentum formula:
            # v = (momentum * v) - (current_lr * gradient)
            weight_updates = (self.momentum * self.weight_momentums[layer]) - (
                self.current_learning_rate * layer.dweights
            )
            bias_updates = (self.momentum * self.bias_momentums[layer]) - (
                self.current_learning_rate * layer.dbiases
            )

            # Store momentum updates
            self.weight_momentums[layer] = weight_updates
            self.bias_momentums[layer] = bias_updates

        else:
            # Vanilla SGD
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        
        # <----- ADAPTIVE GRADIENT IMPLEMENTATION ----->       
        if self.adaptive:
            # Initialize cache if not present
            if layer not in self.weight_cache:
                self.weight_cache[layer] = 0
                self.bias_cache[layer] = 0

            # Accumulate squared gradients
            # cache = cache + (gradient ** 2)
            self.weight_cache[layer] += layer.dweights ** 2
            self.bias_cache[layer] += layer.dbiases ** 2

            # Update parameters using Adagrad formula:
            # w = w - (lr / sqrt(cache + epsilon)) * gradient
            layer.weights += -self.current_learning_rate * layer.dweights / (
                (self.weight_cache[layer] ** 0.5) + self.epsilon
            )
            layer.biases += -self.current_learning_rate * layer.dbiases / (
                (self.bias_cache[layer] ** 0.5) + self.epsilon
            )
        else:
            # Apply either Vanilla SGD or Momentum update
            layer.weights += weight_updates
            layer.biases += bias_updates

    # Increment iteration count after parameter updates
    def post_update_params(self):
        self.iterations += 1

        


Use most of the classes to create a functioning neural network, capable of performing a forward and backward pass

We can use a sample dataset from the Spiral module
We can also use the IRIS dataset

In [20]:
# Imports for the package
import nnfs
from nnfs.datasets import spiral_data

# Neural Network initialization
# Create the dataset
X, y = spiral_data(samples = 100, classes = 3)

# Create a Dense Layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 3)

# Create a ReLU activation for the first Dense layer
activation1 = Activation_ReLU()

# Create a 2nd dense layer with 3 input and 3 output values
dense2 = Layer_Dense(3, 3)

# Create a Softmax activation for the 2nd Dense layer
activation2 = Activation_Softmax()

# Create a loss function
loss_function = Loss_CategoricalCrossEntropy()

# Create the optimizer
optimizer = Optimizer_SGD()

In [21]:
# Iris Dataset

PERFORM ONLY 1 PASS

In [22]:
# Perform a forward pass of our training data
# give the input from the dataset to the first layer
dense1.forward(X)

# Activation function
activation1.forward(dense1.output)

# Pass on the 2nd layer
dense2.forward(activation1.output)

activation2.forward(dense2.output)

# Calculate the loss
loss_function.forward(activation2.output, y)

# Check the model's performance
predictions = np.argmax(activation2.output, axis=1)
if len(y.shape) == 2:
    y = np.argmax(y, axis=1)
accuracy = np.mean(predictions == y)

# Print the accuracy
print('acc:', accuracy)

acc: 0.33666666666666667


In [23]:
# Perform a backward pass of our training data
# From loss to 2nd softmax activation
loss_function.backward(activation2.output, y)
dvalues = loss_function.dinputs # Gradient of the loss w.r.t softmax output

print(dvalues.shape)
# print(dvalues)

# From 2nd softmax to 2nd dense layer
activation2.backward(dvalues)
# From 2nd dense layer to 1st ReLU activation
dense2.backward(activation2.dinputs)

# From 1st ReLU activation to 1st dense layer
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

(300, 3)


In [24]:
# Check the gradient values of the weights and biases of the established layers
print(dense1.dweights)
print(dense1.dbiases)
print(dense2.dweights)
print(dense2.dbiases)


# Update the weights and biases
optimizer.update_params(dense1)
optimizer.update_params(dense2)

[[ 3.3042468e-06 -3.9488236e-06 -9.9410361e-05]
 [-2.2006872e-05  3.0671345e-04  1.6974623e-04]]
[[-1.8163289e-05 -5.1999168e-04  1.4667865e-05]]
[[ 9.1446236e-05 -2.5220116e-04  1.6075492e-04]
 [-1.7278346e-04  3.9700870e-04 -2.2422522e-04]
 [ 4.4883702e-05 -1.2783038e-04  8.2946674e-05]]
[[ 4.6686037e-06 -8.4029743e-06  3.6098063e-06]]


EXERCISE FOUR:
1) Set up the code so that it will perform the Forward Pass (FP), Backpropagation (BP) and weight update in 1000 epochs

2) Modify the Optimizer class so that it will accept 3 optimizers we've discussed
    
    a) Learning rate decay

    b) Momentum

    c) Adaptive Gradient

    Hint: Updating the learning decay rate happens before running both FP and BP, implementing momentum, and vanilla SGD happens after the learning rate decay


In [25]:
# No.1A: TRAINING SETUP AND EXECUTION


# reimporting packages for clarity
import nnfs
from nnfs.datasets import spiral_data

# Initialize nnfs settings
nnfs.init()

# Create the dataset (100 samples and 3 classes)
X, y = spiral_data(samples = 100, classes = 3)

# Create the model layers
layer1 = Layer_Dense(2, 3)        # Input layer: 2 inputs going to 3 neurons
activation1 = Activation_ReLU()   # ReLU activation
layer2 = Layer_Dense(3, 3)        # Hidden layer: 3 neurons going to 3 outputs
activation2 = Activation_Softmax()# used Softmax for multi-class output

# Setting hyperparameters
learning_rate = 0.01
epochs = 1000

In [26]:
# ============================
# No.1B: TRAINING LOOP (for 1000 Epochs)
# ============================

# Result title
print("Results displaying every 100-epoch interval:\n")

# Initialize the optimizer (using only learning rate)
optimizer = Optimizer_SGD(learning_rate=1.0)

for epoch in range(epochs):

    # <---- Forward Pass ---- >
    layer1.forward(X)
    activation1.forward(layer1.output)
    layer2.forward(activation1.output)
    activation2.forward(layer2.output)

    # Initialize the loss object (only once outside loop would also be fine)
    loss_function = Loss_CategoricalCrossEntropy()

    # Calculate the loss using the class method
    loss = loss_function.calculate(activation2.output, y)

    # Accuracy: how many predictions match labels
    predictions = np.argmax(activation2.output, axis=1)
    accuracy = np.mean(predictions == y)

    # prints only 100 epochs per interval
    if epoch % 100 == 0 or epoch == epochs - 1:
        print(f'Epoch {epoch:4d} | Loss: {loss:.4f} | Accuracy: {accuracy:.3f}')

    # < ---- Backward Pass ---- >
    dvalues = activation2.output.copy()
    dvalues[range(len(X)), y] -= 1
    dvalues = dvalues / len(X)

    layer2.backward(dvalues)
    activation1.backward(layer2.dinputs)
    layer1.backward(activation1.dinputs)

    # < ---- Weight and Bias Updates ---- >
    # In this part, we use the Optimizer class instead of manually updating the weights and biases
    # The optimizer handles the parameter updates and includes learning rate decay adjustment
    # This means that in the pre-update, the learning rate is recalculated if decay is enabled
    # Pre-update: apply learning rate decay (currently set to 0.01)
    optimizer.pre_update_params()

    # Update parameters using the optimizer
    optimizer.update_params(layer1)
    optimizer.update_params(layer2)

    # Post-update: increment iteration counter
    optimizer.post_update_params()

Results displaying every 100-epoch interval:

Epoch    0 | Loss: 1.0986 | Accuracy: 0.340
Epoch  100 | Loss: 1.0368 | Accuracy: 0.427
Epoch  200 | Loss: 1.0297 | Accuracy: 0.437
Epoch  300 | Loss: 1.0259 | Accuracy: 0.430
Epoch  400 | Loss: 1.0246 | Accuracy: 0.417
Epoch  500 | Loss: 1.0237 | Accuracy: 0.417
Epoch  600 | Loss: 1.0233 | Accuracy: 0.417
Epoch  700 | Loss: 1.0229 | Accuracy: 0.417
Epoch  800 | Loss: 1.0226 | Accuracy: 0.417
Epoch  900 | Loss: 1.0222 | Accuracy: 0.417
Epoch  999 | Loss: 1.0220 | Accuracy: 0.417
