In [21]:
# Library imports
import numpy as np
try:
    import nnfs
    from nnfs.datasets import spiral_data
    nnfs.init()
except Exception:
    # nnfs could not be imported or initialised: provide a local generator
    # instead. Its first parameter is named `samples` because the rest of the
    # notebook calls spiral_data(samples=..., classes=...).
    def spiral_data(samples=100, classes=3):
        """Generate a 2-D spiral classification dataset.

        Args:
            samples: Number of points generated for each class.
            classes: Number of spiral arms (one class label per arm).

        Returns:
            Tuple ``(X, y)`` where ``X`` has shape ``(samples * classes, 2)``
            and ``y`` is a ``uint8`` vector of class indices with shape
            ``(samples * classes,)``.
        """
        X = np.zeros((samples * classes, 2))
        y = np.zeros(samples * classes, dtype='uint8')
        for class_number in range(classes):
            # Rows reserved for this class.
            ix = range(samples * class_number, samples * (class_number + 1))
            # Radius grows linearly from the origin outwards.
            r = np.linspace(0.0, 1, samples)
            # Angle sweep for this arm, perturbed with Gaussian noise.
            t = np.linspace(class_number * 4, (class_number + 1) * 4, samples) + np.random.randn(samples) * 0.2
            X[ix] = np.c_[r * np.sin(t * 2.5), r * np.cos(t * 2.5)]
            y[ix] = class_number
        return X, y

In [None]:
# Hidden Layers
# Dense
class Layer_Dense:
    """Fully connected layer: ``output = inputs . weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        """Create the parameters for a layer mapping n_inputs to n_neurons.

        Weights are drawn from a standard normal scaled by 0.01; biases start
        at zero with shape ``(1, n_neurons)``.
        """
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros((1, n_neurons))
        # Gradients are not known until backward() has run.
        self.dweights = self.dbiases = None
        # Zero-filled buffers shaped like the parameters.
        self.weight_momentum = np.zeros_like(self.weights)
        self.bias_momentum = np.zeros_like(self.biases)

    def forward(self, inputs):
        """Store ``inputs`` and compute ``self.output`` for them."""
        self.inputs = inputs
        projected = np.dot(inputs, self.weights)
        self.output = projected + self.biases

    def backward(self, dvalues):
        """Compute gradients w.r.t. inputs, biases and weights from ``dvalues``.

        Uses the inputs remembered by the most recent forward() call.
        """
        self.dinputs = np.dot(dvalues, self.weights.T)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dweights = np.dot(self.inputs.T, dvalues)

In [50]:
# Activation Functions
# Included here are the functions for both the forward and backward pass

# Linear
class ActivationLinear:
    """Identity activation: the output is the input, unchanged."""

    def forward(self, inputs):
        """Remember ``inputs`` and expose the same object as ``self.output``."""
        self.inputs = self.output = inputs

    def backward(self, dvalues):
        """Pass the incoming gradient through as an independent copy."""
        passthrough = dvalues.copy()
        self.dinputs = passthrough

# Sigmoid
class ActivationSigmoid:
    """Logistic sigmoid activation, ``1 / (1 + exp(-x))``."""

    def forward(self, inputs):
        """Store ``inputs`` and compute the sigmoid into ``self.output``.

        The direct formula evaluates ``exp(-x)``, which overflows for large
        negative ``x``. Here only ``exp(-|x|)`` is evaluated, which is always
        in (0, 1], and the algebraically equivalent branch is picked per
        element:
            x >= 0:  1 / (1 + exp(-x))
            x <  0:  exp(x) / (1 + exp(x))
        """
        self.inputs = inputs
        values = np.asarray(inputs)
        decay = np.exp(-np.abs(values))
        self.output = np.where(values >= 0, 1 / (1 + decay), decay / (1 + decay))

    def backward(self, dvalues):
        """Chain rule with the sigmoid derivative ``s * (1 - s)``."""
        self.dinputs = dvalues * (self.output * (1 - self.output))

# TanH
class ActivationTanH:
    """Hyperbolic tangent activation."""

    def forward(self, inputs):
        """Store ``inputs`` and their element-wise tanh in ``self.output``."""
        self.inputs = inputs
        self.output = np.tanh(inputs)

    def backward(self, dvalues):
        """Chain rule with the tanh derivative ``1 - tanh(x)^2``."""
        local_slope = 1 - np.square(self.output)
        self.dinputs = dvalues * local_slope

# ReLU
class Activation_ReLU:
    """Rectified linear unit: ``max(0, x)`` element-wise."""

    def forward(self, inputs):
        """Store ``inputs`` and clamp negative values to zero in ``self.output``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Copy ``dvalues`` and zero the entries whose forward input was <= 0."""
        blocked = np.less_equal(self.inputs, 0)
        gradient = dvalues.copy()
        gradient[blocked] = 0
        self.dinputs = gradient

# Softmax
class Activation_Softmax:
    """Row-wise softmax activation."""

    def forward(self, inputs):
        """Store ``inputs`` and write row-wise probabilities to ``self.output``.

        Each row is shifted by its own maximum before exponentiation so that
        the largest exponent is exp(0) = 1, which avoids overflow.
        """
        self.inputs = inputs
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        # backward() reads self.output, so it must be set here.
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def backward(self, dvalues):
        """Back-propagate ``dvalues`` through the softmax.

        For one sample with output ``s`` the Jacobian is
        ``diag(s) - s s^T``, so its product with the incoming gradient ``d``
        is ``s * (d - sum(s * d))``. Computing that directly for all rows
        avoids building a Jacobian matrix per sample, and the result is
        always floating point even if ``dvalues`` has an integer dtype.
        """
        dvalues = np.asarray(dvalues)
        # Per-row inner product <s, d>, kept as a column for broadcasting.
        weighted_sum = np.sum(self.output * dvalues, axis=1, keepdims=True)
        self.dinputs = self.output * (dvalues - weighted_sum)

In [112]:
# Loss functions

class Loss:
    """Helper that reduces per-sample losses to a single mean value.

    This class does not define ``forward`` itself; ``calculate`` calls
    ``self.forward(output, y)``, so it must be available on the instance.
    """

    def calculate(self, output, y):
        """Return the mean of the losses produced by ``self.forward``.

        Args:
            output: Model output.
            y: Ground-truth / target values.
        """
        return np.mean(self.forward(output, y))

# MSE
class Loss_MSE:
    """Mean squared error loss."""

    def forward(self, y_pred, y_true):
        """Return the squared error averaged over the last axis."""
        residual = y_true - y_pred
        return np.mean(residual ** 2, axis=-1)

    def backward(self, y_pred, y_true):
        """Store the loss gradient w.r.t. ``y_pred`` in ``self.dinputs``.

        The gradient is divided by the number of outputs (``y_true.shape[1]``)
        and then by the number of samples (``y_true.shape[0]``).
        """
        samples, outputs = y_true.shape[0], y_true.shape[1]
        per_output_grad = -2 * (y_true - y_pred) / outputs
        self.dinputs = per_output_grad / samples

# Binary Cross-Entropy
class Loss_BinaryCrossEntropy:
    """Binary cross-entropy loss on probabilities in (0, 1)."""

    def forward(self, y_pred, y_true):
        """Return the element-wise binary cross-entropy.

        Predictions are clipped to [1e-7, 1 - 1e-7] so neither logarithm
        receives 0. The result has one loss per element and is not reduced
        here.
        """
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        return -(y_true * np.log(y_pred_clipped) + (1 - y_true) * np.log(1 - y_pred_clipped))

    def backward(self, y_pred, y_true):
        """Store in ``self.dinputs`` the gradient of the mean loss w.r.t. ``y_pred``.

        The element-wise losses from forward() are averaged over every
        element (samples and outputs), so the gradient is divided by both
        counts. With a single output per sample this equals dividing by the
        number of samples only.
        """
        samples = y_true.shape[0]
        # Elements per sample; the product of an empty shape is 1, which
        # covers 1-D targets.
        outputs = int(np.prod(y_true.shape[1:]))
        # Same clipping as forward(), which also prevents division by zero.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        self.dinputs = -(y_true / y_pred_clipped - (1 - y_true) / (1 - y_pred_clipped)) / outputs
        # Normalize gradients over samples
        self.dinputs = self.dinputs / samples

# Categorical Cross-Entropy
class Loss_CategoricalCrossEntropy:
    """Categorical cross-entropy for sparse (1-D) or one-hot (2-D) targets."""

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the target class per sample.

        Args:
            y_pred: Predicted probabilities, one row per sample.
            y_true: Class indices (1-D) or one-hot rows (2-D).

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        samples = len(y_pred)
        # Clip so that log() never receives exactly 0.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if len(y_true.shape) == 1:
            # Sparse labels: pick the predicted probability of each target.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif len(y_true.shape) == 2:
            # One-hot labels: the mask keeps only the target probability.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                "y_true must be 1-D class indices or 2-D one-hot rows, "
                "got shape %r" % (y_true.shape,)
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Store in ``self.dinputs`` the gradient w.r.t. the predictions.

        Args:
            dvalues: Predicted probabilities (the values the loss was
                evaluated on), one row per sample.
            y_true: Class indices (1-D) or one-hot rows (2-D).
        """
        # Number of samples
        samples = len(dvalues)
        # Number of labels in each sample
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vectors
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Clip exactly as forward() does. Without this, a prediction of
        # exactly 0 yields -0/0 = NaN for non-target classes and -inf for
        # the target class.
        dvalues_clipped = np.clip(dvalues, 1e-7, 1 - 1e-7)

        # Calculate gradient
        self.dinputs = -y_true / dvalues_clipped
        # Normalize gradient
        self.dinputs = self.dinputs / samples

# Combined Softmax + Categorical Cross-Entropy
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation fused with categorical cross-entropy loss."""

    def forward(self, inputs, y_true):
        """Apply softmax to ``inputs`` and return the mean cross-entropy.

        The softmax probabilities are kept in ``self.output``. ``y_true`` may
        be 1-D class indices; any other shape is treated as one-hot rows.
        """
        # Softmax, shifted by the row maximum before exponentiation.
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(shifted)
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        # Clip so the logarithm below never receives 0.
        clipped = np.clip(self.output, 1e-7, 1 - 1e-7)
        n_samples = len(inputs)

        if y_true.ndim == 1:
            target_probs = clipped[np.arange(n_samples), y_true]
        else:
            target_probs = (clipped * y_true).sum(axis=1)

        return np.mean(-np.log(target_probs))

    def backward(self, dvalues, y_true):
        """Store the combined softmax + loss gradient in ``self.dinputs``.

        ``dvalues`` is copied, 1 is subtracted at each sample's target index,
        and the result is divided by the number of samples. One-hot targets
        are converted to indices first.
        """
        n_samples = len(dvalues)
        targets = np.argmax(y_true, axis=1) if y_true.ndim == 2 else y_true

        gradient = dvalues.copy()
        gradient[np.arange(n_samples), targets] -= 1
        self.dinputs = gradient / n_samples

In [121]:
class Optimizer:
    """Parameter updater offering SGD (optionally with momentum) and Adagrad.

    Intended call order per training step: ``pre_update()``, one
    ``update_*`` call per layer, then ``post_update()``.
    """

    def __init__(self, learning_rate=1.0, decay=0.0, momentum=0.0):
        """Record the hyper-parameters and reset the iteration counter."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update(self):
        """Refresh ``current_learning_rate`` when a non-zero decay is set."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    # SGD optimizer
    def update_sgd(self, layer, use_momentum=False):
        """Apply one SGD step to ``layer.weights`` and ``layer.biases`` in place.

        With ``use_momentum=True`` the step is
        ``momentum * previous_step - lr * gradient`` and is remembered on the
        layer as ``weight_momentums`` / ``bias_momentums`` (created as zeros
        on first use). Otherwise the step is ``-lr * gradient``.
        """
        lr = self.current_learning_rate

        if not use_momentum:
            weight_step = -lr * layer.dweights
            bias_step = -lr * layer.dbiases
        else:
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            weight_step = self.momentum * layer.weight_momentums - lr * layer.dweights
            bias_step = self.momentum * layer.bias_momentums - lr * layer.dbiases

            # Remember this step for the next call.
            layer.weight_momentums = weight_step
            layer.bias_momentums = bias_step

        layer.weights += weight_step
        layer.biases += bias_step

    # Adagrad optimizer
    def update_adagrad(self, layer, epsilon=1e-7):
        """Apply one Adagrad step to ``layer`` in place.

        Squared gradients are accumulated on the layer as ``weight_cache`` /
        ``bias_cache`` (created as zeros on first use). Each gradient is
        divided by the square root of its cache plus ``epsilon``.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2

        lr = self.current_learning_rate
        weight_scale = np.sqrt(layer.weight_cache) + epsilon
        bias_scale = np.sqrt(layer.bias_cache) + epsilon
        layer.weights += -lr * layer.dweights / weight_scale
        layer.biases += -lr * layer.dbiases / bias_scale

    def post_update(self):
        """Advance the iteration counter; call once after the layer updates."""
        self.iterations += 1



In [114]:
def accuracy(predictions, y):
    """Return the fraction of rows whose arg-max prediction equals the target.

    Args:
        predictions: 2-D array of per-class scores, one row per sample.
        y: Targets as 1-D class indices or as 2-D one-hot rows; one-hot rows
            are reduced to indices first, as the loss classes do.
    """
    y = np.asarray(y)
    if y.ndim == 2:
        y = np.argmax(y, axis=1)
    return np.mean(np.argmax(predictions, axis=1) == y)

In [115]:
# Spiral Data
# spiral_data is already provided by the first cell, either imported from
# nnfs or defined locally when nnfs is unavailable. Re-importing nnfs here
# without a guard would fail in exactly the case that fallback exists for.

# Create the dataset: 100 samples per class, 3 classes. The arguments are
# passed positionally so the call does not depend on parameter names.
X, y = spiral_data(100, 3)


print(X[:5])
print(X.shape)
print(y[:5])
print(y.shape)

[[ 0.          0.        ]
 [ 0.00150691  0.00998797]
 [-0.0110788   0.01689325]
 [ 0.01739753  0.02481128]
 [-0.01510686  0.03747358]]
(300, 2)
[0 0 0 0 0]
(300,)


In [106]:
# Iris Dataset
# From the scikit-learn library
# from sklearn.datasets import load_iris
# iris = load_iris()
# X = iris.data # Features
# y = iris.target # Target labels

# print(X[:5])
# print(X.shape)
# print(y[:5])
# print(y.shape)

In [107]:
# Neural network initialization: 2 input features -> 3 hidden units -> 3 classes.
# The first layer's input size must match the number of feature columns in X;
# for a 4-feature dataset use Layer_Dense(4, 3) instead.
dense1 = Layer_Dense(2, 3)
dense2 = Layer_Dense(3, 3)

# Activations: ReLU after the first dense layer, softmax after the second.
activation1 = Activation_ReLU()
activation2 = Activation_Softmax()

# Loss function and optimizer (default hyper-parameters).
loss_function = Loss_CategoricalCrossEntropy()
optimizer = Optimizer()

In [108]:
# Perform a forward pass of our training data
# Give the input from the dataset to the first layer
dense1.forward(X)

# Activation function
activation1.forward(dense1.output)

# Pass on to the 2nd layer
dense2.forward(activation1.output)

activation2.forward(dense2.output)

# Calculate the loss: forward() returns one value per sample, so average them
# and keep the result instead of discarding it.
loss = np.mean(loss_function.forward(activation2.output, y))

# Check the model's performance
predictions = np.argmax(activation2.output, axis=1)
if len(y.shape) == 2:
    y = np.argmax(y, axis=1)
# Stored as `acc`, not `accuracy`: an earlier cell defines the accuracy()
# function and the training cells call it, so that name must stay a function.
acc = np.mean(predictions == y)

# Print the accuracy
print('acc:', acc)

acc: 0.31


In [109]:
# Backward pass: start from the loss and walk the network in reverse.
loss_function.backward(activation2.output, y)
dvalues = loss_function.dinputs  # gradient of the loss w.r.t. the softmax output

print(dvalues.shape)

# Every stage receives the gradient produced by the stage that follows it in
# the forward direction: softmax -> dense2 -> ReLU -> dense1.
upstream_grad = dvalues
for stage in (activation2, dense2, activation1, dense1):
    stage.backward(upstream_grad)
    upstream_grad = stage.dinputs

(300, 3)


In [None]:
# Forward pass: feed X through every stage, each consuming the previous output.
signal = X
for stage in (dense1, activation1, dense2, activation2):
    stage.forward(signal)
    signal = stage.output

# Mean loss over all samples
loss = np.mean(loss_function.forward(activation2.output, y))
print("Loss:", loss)

# Backward pass: propagate the loss gradient through the stages in reverse.
loss_function.backward(activation2.output, y)
upstream_grad = loss_function.dinputs
for stage in (activation2, dense2, activation1, dense1):
    stage.backward(upstream_grad)
    upstream_grad = stage.dinputs

# Inspect the parameter gradients of both dense layers.
print("\nGradients (Dense1):")
print("Weights gradient:\n", dense1.dweights)
print("Bias gradient:\n", dense1.dbiases)

print("\nGradients (Dense2):")
print("Weights gradient:\n", dense2.dweights)
print("Bias gradient:\n", dense2.dbiases)

# Parameter update. Pass use_momentum=False instead for vanilla SGD.
optimizer.pre_update()
for trainable in (dense1, dense2):
    optimizer.update_sgd(trainable, use_momentum=True)
optimizer.post_update()

print("\nWeights and biases updated successfully")

Loss: 1.0986134

Gradients (Dense1):
Weights gradient:
 [[ 1.7130929e-05  6.6266992e-05 -3.5919115e-06]
 [ 5.3872594e-05 -1.2405423e-04  2.2485861e-05]]
Bias gradient:
 [[-7.7243094e-05  9.2893482e-05  5.7905792e-05]]

Gradients (Dense2):
Weights gradient:
 [[ 5.1033556e-05  1.4065001e-04 -1.9168356e-04]
 [-8.2749262e-05  2.1760711e-04 -1.3485785e-04]
 [ 2.3539627e-05 -2.8032053e-04  2.5678091e-04]]
Bias gradient:
 [[-7.5218268e-06  2.1209707e-06  5.4205302e-06]]

Weights and biases updated successfully


In [None]:
print(" Training with SGD (Momentum = 0.9)")

# Fresh data set and a fresh 2 -> 64 -> 3 network for this experiment.
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer(learning_rate=1.0, decay=1e-3, momentum=0.9)

epochs = 1000

for epoch in range(1, epochs + 1):
    # Forward pass: each stage consumes the previous stage's output.
    signal = X
    for stage in (dense1, activation1, dense2):
        stage.forward(signal)
        signal = stage.output

    # Fused softmax + loss, then accuracy on the softmax probabilities.
    loss = loss_activation.forward(signal, y)
    acc = accuracy(loss_activation.output, y)

    # Backward pass in reverse stage order.
    loss_activation.backward(loss_activation.output, y)
    upstream_grad = loss_activation.dinputs
    for stage in (dense2, activation1, dense1):
        stage.backward(upstream_grad)
        upstream_grad = stage.dinputs

    # Momentum SGD step on both trainable layers.
    optimizer.pre_update()
    for trainable in (dense1, dense2):
        optimizer.update_sgd(trainable, use_momentum=True)
    optimizer.post_update()

    # Report the first epoch and every 100th epoch.
    if epoch == 1 or epoch % 100 == 0:
        print(f"Epoch {epoch}/{epochs} - Loss: {loss:.4f} - "
              f"Accuracy: {acc*100:.2f}% - LR: {optimizer.current_learning_rate:.5f}")

=== Training with SGD (Momentum = 0.9) ===
Epoch 1/1000 - Loss: 1.0986 - Accuracy: 35.67% - LR: 1.00000
Epoch 100/1000 - Loss: 1.0382 - Accuracy: 41.00% - LR: 0.90992
Epoch 200/1000 - Loss: 0.9462 - Accuracy: 56.67% - LR: 0.83403
Epoch 300/1000 - Loss: 0.9234 - Accuracy: 49.33% - LR: 0.76982
Epoch 400/1000 - Loss: 0.7989 - Accuracy: 54.67% - LR: 0.71480
Epoch 500/1000 - Loss: 0.7077 - Accuracy: 65.00% - LR: 0.66711
Epoch 600/1000 - Loss: 0.6217 - Accuracy: 72.00% - LR: 0.62539
Epoch 700/1000 - Loss: 0.5440 - Accuracy: 76.67% - LR: 0.58858
Epoch 800/1000 - Loss: 0.4914 - Accuracy: 81.00% - LR: 0.55586
Epoch 900/1000 - Loss: 0.4616 - Accuracy: 83.33% - LR: 0.52659
Epoch 1000/1000 - Loss: 0.4403 - Accuracy: 83.00% - LR: 0.50025


In [None]:
# Training with Adagrad on the X, y created by the previous training cell.
print("\n Training with Adagrad")

# Fresh 2 -> 64 -> 3 network so the comparison starts from new parameters.
dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer_adagrad = Optimizer(learning_rate=1.0, decay=1e-4)

epochs = 1000

for epoch in range(1, epochs + 1):
    # Refresh the decayed learning rate before this epoch's updates.
    optimizer_adagrad.pre_update()

    # Forward pass: each stage consumes the previous stage's output.
    signal = X
    for stage in (dense1, activation1, dense2):
        stage.forward(signal)
        signal = stage.output

    # Loss and accuracy
    loss = loss_activation.forward(signal, y)
    acc = accuracy(loss_activation.output, y)

    # Backward pass in reverse stage order.
    loss_activation.backward(loss_activation.output, y)
    upstream_grad = loss_activation.dinputs
    for stage in (dense2, activation1, dense1):
        stage.backward(upstream_grad)
        upstream_grad = stage.dinputs

    # Adagrad step on both trainable layers.
    for trainable in (dense1, dense2):
        optimizer_adagrad.update_adagrad(trainable)
    optimizer_adagrad.post_update()

    # Report the first epoch and every 100th epoch.
    if epoch == 1 or epoch % 100 == 0:
        print(f"Epoch {epoch}/{epochs} - Loss: {loss:.4f} - "
              f"Accuracy: {acc*100:.2f}% - LR: {optimizer_adagrad.current_learning_rate:.5f}")


=== Training with Adagrad ===
Epoch 1/1000 - Loss: 1.0986 - Accuracy: 27.67% - LR: 1.00000
Epoch 100/1000 - Loss: 0.9910 - Accuracy: 45.67% - LR: 0.99020
Epoch 200/1000 - Loss: 0.9477 - Accuracy: 49.33% - LR: 0.98049
Epoch 300/1000 - Loss: 0.9098 - Accuracy: 50.00% - LR: 0.97097
Epoch 400/1000 - Loss: 0.8826 - Accuracy: 53.00% - LR: 0.96163
Epoch 500/1000 - Loss: 0.8354 - Accuracy: 53.33% - LR: 0.95247
Epoch 600/1000 - Loss: 0.7894 - Accuracy: 55.67% - LR: 0.94349
Epoch 700/1000 - Loss: 0.7237 - Accuracy: 64.67% - LR: 0.93467
Epoch 800/1000 - Loss: 0.7254 - Accuracy: 67.67% - LR: 0.92601
Epoch 900/1000 - Loss: 0.6883 - Accuracy: 70.33% - LR: 0.91752
Epoch 1000/1000 - Loss: 0.6529 - Accuracy: 72.33% - LR: 0.90917


In [2]:
# No. 4 Explanation

# When we compared the two optimizers, SGD with Momentum and Adagrad,
# we noticed some clear differences in how they behaved during training.
# The Adagrad optimizer made the loss decrease faster in the early epochs,
# and it stabilized around the 400th to 500th epoch. This means that the
# model quickly adjusted its parameters and reached a point where the loss
# didn’t change much anymore. On the other hand, SGD with Momentum took
# longer to stabilize, around the 700th to 800th epoch, because it updates
# parameters in a smoother and slower way. However, even though it took
# more time to stabilize, SGD with Momentum ended up giving a slightly
# better accuracy by the end of training.

# The difference happens because Adagrad keeps reducing its learning rate
# as training goes on, which helps it learn quickly at first but makes it 
# slow down later. Meanwhile, SGD with Momentum continues to push the model 
# toward a better minimum by keeping some of the previous updates (momentum), 
# which helps it reach higher accuracy. In short, Adagrad learns faster but 
# stops improving sooner, while SGD with Momentum takes longer but achieves 
# slightly better results in the end.
