INSTALL THE FOLLOWING PYTHON PACKAGES FIRST BEFORE RUNNING THE PROGRAM

1) Numpy
2) NNFS


In [34]:
# Library imports
import numpy as np

Create classes for modularity

In [35]:
# Hidden Layers
# Dense
class Layer_Dense:
    """Fully connected (dense) layer: ``output = inputs . weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        """Create the layer parameters.

        Weights are drawn from ``np.random.randn`` and scaled by 0.01, with
        shape ``(n_inputs, n_neurons)``; biases start at zero with shape
        ``(1, n_neurons)``.
        """
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Store ``inputs`` and write the layer result to ``self.output``."""
        # backward() needs the inputs again to compute the weight gradient.
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Backpropagate ``dvalues`` (gradient w.r.t. this layer's output).

        Sets ``self.dweights`` and ``self.dbiases`` (parameter gradients) and
        ``self.dinputs`` (gradient handed to the previous layer).
        """
        inputs_t = self.inputs.T
        weights_t = self.weights.T

        # Parameter gradients: biases collect the column sums of dvalues.
        self.dweights = np.dot(inputs_t, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Gradient with respect to the layer inputs.
        self.dinputs = np.dot(dvalues, weights_t)


In [36]:
# Activation Functions
# Included here are the functions for both the forward and backward pass

# Linear
class ActivationLinear:
    """Identity activation: values and gradients pass through unchanged."""

    def forward(self, inputs):
        """Expose ``inputs`` as both ``self.inputs`` and ``self.output``."""
        # No copy is made: output refers to the same object as the input.
        self.inputs = self.output = inputs

    def backward(self, dvalues):
        """Store a copy of the incoming gradient in ``self.dinputs``."""
        upstream_copy = dvalues.copy()
        self.dinputs = upstream_copy

# Sigmoid
class ActivationSigmoid:
    """Logistic sigmoid activation, ``1 / (1 + exp(-x))``, applied element-wise."""

    def forward(self, inputs):
        """Store ``inputs`` and write the sigmoid of them to ``self.output``."""
        self.inputs = inputs
        denominator = 1 + np.exp(-inputs)
        self.output = 1 / denominator

    def backward(self, dvalues):
        """Scale ``dvalues`` by the sigmoid derivative ``s * (1 - s)``."""
        # The derivative is expressed through the cached forward output.
        sig = self.output
        local_slope = sig * (1 - sig)
        self.dinputs = dvalues * local_slope

# TanH
class ActivationTanH:
    """Hyperbolic tangent activation, applied element-wise."""

    def forward(self, inputs):
        """Store ``inputs`` and write ``tanh(inputs)`` to ``self.output``."""
        self.inputs = inputs
        squashed = np.tanh(inputs)
        self.output = squashed

    def backward(self, dvalues):
        """Scale ``dvalues`` by the tanh derivative ``1 - tanh(x)^2``."""
        # Reuses the cached forward output instead of recomputing tanh.
        tanh_out = self.output
        self.dinputs = dvalues * (1 - tanh_out ** 2)

# ReLU
class Activation_ReLU:
    """Rectified linear unit activation: ``max(0, x)`` element-wise."""

    def forward(self, inputs):
        """Store ``inputs`` and write the rectified values to ``self.output``."""
        # The inputs are needed by backward() to know where the unit was off.
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Pass ``dvalues`` through where the input was positive, else zero."""
        # Work on a copy so the caller's gradient array is left untouched.
        gradient = dvalues.copy()

        # Inputs that were zero or negative contribute no gradient.
        gradient[self.inputs <= 0] = 0
        self.dinputs = gradient

# Softmax
class Activation_Softmax:
    """Softmax activation: turns each row of scores into probabilities."""

    # Forward pass
    def forward(self, inputs):
        """Store ``inputs`` and write the row-wise softmax to ``self.output``.

        ``inputs`` is indexed with ``axis=1``, so it must be at least 2-D;
        each row is normalized independently and sums to 1.
        """
        # Remember the input values
        self.inputs = inputs

        # Subtract each row's maximum before exponentiating so the largest
        # exponent is exp(0) = 1; this avoids overflow without changing the
        # normalized result.
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))

        # Normalize them for each sample
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    # Backward pass
    def backward(self, dvalues):
        """Backpropagate ``dvalues`` through the softmax into ``self.dinputs``.

        For one sample with output ``s`` the Jacobian is
        ``diag(s) - s s^T``, so ``J @ d = s * (d - sum(s * d))``.  That closed
        form is evaluated for all samples at once instead of building one
        Jacobian matrix per sample in a Python loop.
        """
        # Per-sample dot product of the softmax output with its gradient.
        weighted_sum = np.sum(self.output * dvalues, axis=1, keepdims=True)

        # Keep the result's shape and dtype tied to dvalues.
        self.dinputs = np.empty_like(dvalues)
        self.dinputs[...] = self.output * (dvalues - weighted_sum)

In [37]:
# Loss functions

class Loss:
    """Base class for losses; subclasses supply ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean data loss for a batch.

        ``output`` is the model output and ``y`` the ground-truth targets;
        both are handed unchanged to ``self.forward``, and the values it
        returns are averaged with ``np.mean``.
        """
        per_sample = self.forward(output, y)
        return np.mean(per_sample)

# MSE
class Loss_MSE:
    """Mean squared error loss."""

    def forward(self, y_pred, y_true):
        """Return the squared error averaged over the last axis."""
        squared_error = (y_true - y_pred) ** 2
        return np.mean(squared_error, axis=-1)

    def backward(self, y_pred, y_true):
        """Store the gradient of the loss w.r.t. ``y_pred`` in ``self.dinputs``.

        ``y_true`` must expose ``shape[0]`` (samples) and ``shape[1]``
        (outputs per sample); the gradient is divided by both.
        """
        n_samples = y_true.shape[0]
        n_outputs = y_true.shape[1]

        # Derivative of the per-sample mean of squared errors.
        per_output_grad = -2 * (y_true - y_pred) / n_outputs

        # Normalize over the batch.
        self.dinputs = per_output_grad / n_samples

# Binary Cross-Entropy
class Loss_BinaryCrossEntropy:
    """Binary cross-entropy loss, evaluated element-wise."""

    def forward(self, y_pred, y_true):
        """Return the element-wise binary cross-entropy (no averaging)."""
        # Keep predictions away from 0 and 1 so neither log() sees zero.
        p = np.clip(y_pred, 1e-7, 1 - 1e-7)
        positive_term = y_true * np.log(p)
        negative_term = (1 - y_true) * np.log(1 - p)
        return -(positive_term + negative_term)

    def backward(self, y_pred, y_true):
        """Store the gradient w.r.t. ``y_pred`` in ``self.dinputs``.

        The gradient is divided by the number of samples
        (``y_true.shape[0]``) only.
        """
        n_samples = y_true.shape[0]

        # Same clipping as forward(), here to avoid division by zero.
        p = np.clip(y_pred, 1e-7, 1 - 1e-7)
        elementwise_grad = - (y_true / p - (1 - y_true) / (1 - p))

        # Normalize over the batch.
        self.dinputs = elementwise_grad / n_samples

# Categorical Cross-Entropy
class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy loss for class-probability predictions.

    Labels may be sparse class indices (1-D) or one-hot rows (2-D).
    """

    # Forward pass
    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class per sample.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        # Number of samples in a batch
        samples = y_pred.shape[0]

        # Clip so log() never receives 0; clipping both ends keeps the
        # treatment of values near 0 and near 1 symmetric.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_rank = len(y_true.shape)
        if label_rank == 1:
            # Sparse labels: pick the predicted probability of each true class.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif label_rank == 2:
            # One-hot labels: the mask keeps only the true-class probability.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # Without this branch the code below would fail with an
            # UnboundLocalError that hides the real problem.
            raise ValueError(
                "y_true must be 1-D (class indices) or 2-D (one-hot), "
                f"got {label_rank} dimensions"
            )

        # Losses
        return -np.log(correct_confidences)

    # Backward pass
    def backward(self, dvalues, y_true):
        """Store the gradient w.r.t. the predictions in ``self.dinputs``.

        ``dvalues`` holds the predicted probabilities (one row per sample).
        """
        # Number of samples
        samples = len(dvalues)
        # Number of labels in every sample, counted on the first sample
        labels = len(dvalues[0])

        # Sparse labels are expanded to one-hot rows: indexing an identity
        # matrix by class index returns the matching one-hot vector.
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Clip exactly as forward() does; an exact 0 probability would
        # otherwise produce inf (or nan for 0/0) gradients.
        dvalues_clipped = np.clip(dvalues, 1e-7, 1 - 1e-7)

        # Calculate the gradient and normalize it over the batch
        self.dinputs = -y_true / dvalues_clipped
        self.dinputs = self.dinputs / samples


<!-- Star -->

In [38]:
# Start of Optimizers

# Optimizer with Learning Rate Decay, Momentum, and Adaptive Gradient
class Optimizer_SGD:
    """SGD optimizer with optional learning-rate decay, momentum and AdaGrad.

    Call order per training step: ``pre_update_params()``, then
    ``update_params(layer)`` for each layer, then ``post_update_params()``.
    """

    def __init__(self, learning_rate=1.0, decay=0.0, momentum=0.0, adaptive=False, epsilon=1e-7):
        """Store the hyperparameters; the step counter starts at zero."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum
        self.adaptive = adaptive
        self.epsilon = epsilon

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` for the upcoming step."""
        rate = self.learning_rate
        if self.decay:
            # Inverse-time decay driven by the number of completed steps.
            rate = self.learning_rate * (1.0 / (1.0 + self.decay * self.iterations))
        self.current_learning_rate = rate

    def update_params(self, layer):
        """Update ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases``.  Momentum buffers (and,
        in adaptive mode, gradient caches) are created on the layer the first
        time it is seen.
        """
        lr = self.current_learning_rate

        # Momentum buffers are created for every layer, whatever the mode.
        if not hasattr(layer, 'weight_momentums'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)

        if self.adaptive:
            # ----- Adaptive Gradient -----
            if not hasattr(layer, 'weight_cache'):
                layer.weight_cache = np.zeros_like(layer.weights)
                layer.bias_cache = np.zeros_like(layer.biases)

            # Accumulate squared gradients.
            layer.weight_cache += layer.dweights * layer.dweights
            layer.bias_cache += layer.dbiases * layer.dbiases

            # Per-parameter step; epsilon keeps the divisor away from zero.
            weight_step = -(lr * layer.dweights) / (np.sqrt(layer.weight_cache) + self.epsilon)
            bias_step = -(lr * layer.dbiases) / (np.sqrt(layer.bias_cache) + self.epsilon)

            if self.momentum:
                # Blend the adaptive step into the running momentum.
                layer.weight_momentums = self.momentum * layer.weight_momentums + weight_step
                layer.bias_momentums = self.momentum * layer.bias_momentums + bias_step
                weight_step = layer.weight_momentums
                bias_step = layer.bias_momentums

            layer.weights += weight_step
            layer.biases += bias_step
            return

        # ----- Momentum or Vanilla SGD -----
        if self.momentum:
            weight_step = self.momentum * layer.weight_momentums - lr * layer.dweights
            bias_step = self.momentum * layer.bias_momentums - lr * layer.dbiases

            # The step just taken becomes the momentum for the next call.
            layer.weight_momentums = weight_step
            layer.bias_momentums = bias_step
        else:
            weight_step = -lr * layer.dweights
            bias_step = -lr * layer.dbiases

        layer.weights += weight_step
        layer.biases += bias_step

    def post_update_params(self):
        """Count one completed optimization step."""
        self.iterations += 1


Use most of the classes to create a functioning neural network, capable of performing a forward and backward pass

We can use a sample dataset from the Spiral module
We can also use the IRIS dataset

In [39]:
# Imports for the package
import nnfs
from nnfs.datasets import spiral_data

# Neural Network initialization
# These module-level names (X, y, dense1, activation1, dense2, activation2,
# loss_function, optimizer) are reused by the single-pass cells that follow.

# Create the dataset: spiral_data is called with samples=100 and classes=3.
# X has 2 features per sample (it feeds the 2-input dense layer below).
X, y = spiral_data(samples = 100, classes = 3)

# Create a Dense Layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 3)

# Create a ReLU activation for the first Dense layer
activation1 = Activation_ReLU()

# Create a 2nd dense layer with 3 input and 3 output values
# (3 inputs to match dense1's outputs, 3 outputs to match the 3 classes)
dense2 = Layer_Dense(3, 3)

# Create a Softmax activation for the 2nd Dense layer
activation2 = Activation_Softmax()

# Create a loss function
loss_function = Loss_CategoricalCrossEntropy()

# Create the optimizer with its defaults: learning_rate=1.0, no decay,
# no momentum, adaptive gradient disabled
optimizer = Optimizer_SGD()

In [40]:
# Iris Dataset

PERFORM ONLY 1 PASS

In [41]:
# Perform a forward pass of our training data
# give the input from the dataset to the first layer
dense1.forward(X)

# Activation function
activation1.forward(dense1.output)

# Pass on the 2nd layer
dense2.forward(activation1.output)

# Softmax turns the 2nd layer's scores into class probabilities
activation2.forward(dense2.output)

# Calculate the loss and keep it: calculate() averages the per-sample losses
# that forward() returns (the bare forward() result used to be discarded)
loss = loss_function.calculate(activation2.output, y)

# Check the model's performance
# The predicted class is the index of the highest probability in each row
predictions = np.argmax(activation2.output, axis=1)
# One-hot labels are converted to class indices so they can be compared
if len(y.shape) == 2:
    y = np.argmax(y, axis=1)
accuracy = np.mean(predictions == y)

# Print the accuracy
print('acc:', accuracy)

acc: 0.3466666666666667


In [42]:
# Perform a backward pass of our training data
# Each step consumes the .dinputs produced by the step after it in the
# forward order, so the calls below must stay in this sequence.

# Loss: gradient with respect to the softmax output
loss_function.backward(activation2.output, y)
dvalues = loss_function.dinputs # Gradient of the loss w.r.t softmax output

# Shape check: one row per sample, one column per class
print(dvalues.shape)
# print(dvalues)

# Through the softmax activation (gradient w.r.t. the 2nd dense layer's output)
activation2.backward(dvalues)
# Through the 2nd dense layer (sets dense2.dweights, dense2.dbiases, dense2.dinputs)
dense2.backward(activation2.dinputs)

# Through the ReLU activation, then through the 1st dense layer
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

(300, 3)


In [43]:
# Check the gradient values of the weights and biases of the established layers
# (these attributes are set by the backward() calls of the dense layers)
print(dense1.dweights)
print(dense1.dbiases)
print(dense2.dweights)
print(dense2.dbiases)


# Update the weights and biases
# Single manual step: pre_update_params()/post_update_params() are not called
# here, so the optimizer's initial current_learning_rate is used and its
# iteration counter is not advanced.
optimizer.update_params(dense1)
optimizer.update_params(dense2)

[[-2.38984618e-05  4.52121902e-05 -5.36579938e-04]
 [ 6.63203993e-05  9.92777199e-05 -3.94164364e-05]]
[[ 0.00012147 -0.00025889  0.0006916 ]]
[[ 1.69591678e-04 -3.28835917e-04  1.59244239e-04]
 [-1.69836264e-05  1.09616758e-04 -9.26331318e-05]
 [ 5.66655808e-05  1.87660143e-04 -2.44325724e-04]]
[[ 1.69481633e-05  3.77978828e-06 -2.07279516e-05]]


EXERCISE FOUR:
1) Set up the code so that it will perform the Forward Pass (FP), Backpropagation (BP) and weight update in 1000 epochs

2) Modify the Optimizer class so that it supports the 3 optimization techniques we've discussed:
    
    a) Learning rate decay

    b) Momentum

    c) Adaptive Gradient

    Hint: Update the decayed learning rate before running FP and BP; the momentum or vanilla SGD parameter update happens afterwards, using that decayed learning rate.


# ============================================================
# EXERCISE FOUR: Training for 1000 Epochs
# ============================================================

In [None]:

# Regenerate the spiral dataset (same call as before: samples=100, classes=3).
# train_and_report() below reads these module-level X and y.
X, y = spiral_data(samples=100, classes=3)

def train_and_report(name, optimizer, epochs=1000, report_every=100):
    """Train a fresh 2-3-3 network on the module-level ``X``/``y`` and report.

    A new ``Layer_Dense(2, 3)`` -> ReLU -> ``Layer_Dense(3, 3)`` -> softmax
    network with categorical cross-entropy loss is built on every call, so
    runs do not share weights.  The optimizer object is used as passed in,
    including whatever state it already carries.

    Args:
        name: Label printed in the header and returned with the results.
        optimizer: Object providing ``pre_update_params()``,
            ``update_params(layer)``, ``post_update_params()`` and a
            ``current_learning_rate`` attribute.
        epochs: Number of full-batch training steps; must be at least 1.
        report_every: Print progress every this many epochs.  A falsy value
            (0 or None) prints the final epoch only.

    Returns:
        Tuple ``(name, loss, acc)`` holding the loss and accuracy measured in
        the last epoch (before that epoch's parameter update).

    Raises:
        ValueError: if ``epochs`` is less than 1, because there would be no
            final loss/accuracy to return.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    print("\n" + "="*60)
    print(f"Training with: {name.upper()}")
    print("="*60)

    # Initialize layers
    dense1 = Layer_Dense(2, 3)
    activation1 = Activation_ReLU()
    dense2 = Layer_Dense(3, 3)
    activation2 = Activation_Softmax()
    loss_function = Loss_CategoricalCrossEntropy()

    # The labels never change during training, so one-hot labels are decoded
    # to class indices once instead of on every epoch.
    y_labels = np.argmax(y, axis=1) if len(y.shape) == 2 else y
    last_epoch = epochs - 1

    for epoch in range(epochs):
        # Learning-rate decay is applied before the forward/backward passes
        optimizer.pre_update_params()

        # Forward pass
        dense1.forward(X)
        activation1.forward(dense1.output)
        dense2.forward(activation1.output)
        activation2.forward(dense2.output)

        # Compute loss and accuracy
        loss = loss_function.calculate(activation2.output, y)
        predictions = np.argmax(activation2.output, axis=1)
        acc = np.mean(predictions == y_labels)

        # Backward pass
        loss_function.backward(activation2.output, y)
        activation2.backward(loss_function.dinputs)
        dense2.backward(activation2.dinputs)
        activation1.backward(dense2.dinputs)
        dense1.backward(activation1.dinputs)

        # Update parameters
        optimizer.update_params(dense1)
        optimizer.update_params(dense2)
        optimizer.post_update_params()

        # Report every N epochs and always on the last one; a falsy
        # report_every skips the periodic reports instead of dividing by zero.
        periodic = bool(report_every) and epoch % report_every == 0
        if periodic or epoch == last_epoch:
            print(f"Epoch: {epoch:4d} | Loss: {loss:.4f} | Acc: {acc:.4f} | LR: {optimizer.current_learning_rate:.6f}")

    return name, loss, acc


# Run the vanilla baseline plus the 3 optimizer variants and collect the
# (name, loss, acc) tuple returned by each training run
results = []

# 1) Vanilla SGD (baseline: fixed learning rate, no decay/momentum/adaptive)
opt_vanilla = Optimizer_SGD(learning_rate=1.0)
results.append(train_and_report("vanilla", opt_vanilla))

# 2) Learning Rate Decay
opt_decay = Optimizer_SGD(learning_rate=1.0, decay=1e-3)
results.append(train_and_report("decay", opt_decay))

# 3) Momentum
opt_momentum = Optimizer_SGD(learning_rate=1.0, momentum=0.9)
results.append(train_and_report("momentum", opt_momentum))

# 4) Adaptive Gradient (this run uses learning_rate=0.5 rather than 1.0)
opt_adagrad = Optimizer_SGD(learning_rate=0.5, adaptive=True)
results.append(train_and_report("adaptive", opt_adagrad))

# SUMMARY COMPARISON
# One line per run with the loss and accuracy from its final epoch
print("\nSUMMARY COMPARISON")
print("="*60)
for name, loss, acc in results:
    print(f"{name.lower():12s} - Loss: {loss:.4f}, Accuracy: {acc:.4f}")
print("="*60)



Training with: VANILLA
Epoch:    0 | Loss: 1.0986 | Acc: 0.3267 | LR: 1.000000
Epoch:  100 | Loss: 1.0978 | Acc: 0.4167 | LR: 1.000000
Epoch:  200 | Loss: 1.0736 | Acc: 0.4133 | LR: 1.000000
Epoch:  300 | Loss: 1.0723 | Acc: 0.4033 | LR: 1.000000
Epoch:  400 | Loss: 1.0700 | Acc: 0.4133 | LR: 1.000000
Epoch:  500 | Loss: 1.0659 | Acc: 0.4167 | LR: 1.000000
Epoch:  600 | Loss: 1.0650 | Acc: 0.4200 | LR: 1.000000
Epoch:  700 | Loss: 1.0646 | Acc: 0.4100 | LR: 1.000000
Epoch:  800 | Loss: 1.0643 | Acc: 0.4200 | LR: 1.000000
Epoch:  900 | Loss: 1.0641 | Acc: 0.4133 | LR: 1.000000
Epoch:  999 | Loss: 1.0640 | Acc: 0.4200 | LR: 1.000000

Training with: DECAY
Epoch:    0 | Loss: 1.0986 | Acc: 0.2933 | LR: 1.000000
Epoch:  100 | Loss: 1.0963 | Acc: 0.4200 | LR: 0.909091
Epoch:  200 | Loss: 1.0748 | Acc: 0.4033 | LR: 0.833333
Epoch:  300 | Loss: 1.0727 | Acc: 0.4367 | LR: 0.769231
Epoch:  400 | Loss: 1.0694 | Acc: 0.4233 | LR: 0.714286
Epoch:  500 | Loss: 1.0688 | Acc: 0.4300 | LR: 0.666667
Ep