INSTALL THE FOLLOWING PYTHON PACKAGES FIRST BEFORE RUNNING THE PROGRAM

1) Numpy
2) NNFS - for the Spiral dataset
3) scikit-learn - for the iris dataset

In [314]:
# Library imports
import numpy as np

Create classes for modularity

In [315]:
# Hidden Layers
# Dense
class Layer_Dense:
    # Layer initialization
    # randomly initialize weights and set biases to zero
    def __init__(self, n_inputs, n_neurons):
        self.weights = np.random.randn(n_inputs, n_neurons) * np.sqrt(2. / n_inputs)
        self.biases = np.zeros((1, n_neurons))


    # Forward pass
    def forward(self, inputs):
        # Remember the input values
        self.inputs = inputs
        # Calculate the output values from inputs, weight and biases
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward pass/Backpropagation
    def backward(self, dvalues):
        # Gradients on parameters:
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)


In [316]:
# Activation Functions
# Included here are the functions for both the forward and backward pass

# Linear
class ActivationLinear:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = inputs

    def backward(self, dvalues):
        self.dinputs = dvalues.copy()

# Sigmoid
class ActivationSigmoid:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = 1 / (1 + np.exp(-inputs))

    def backward(self, dvalues):
        self.dinputs = dvalues * (self.output * (1 - self.output))

# TanH
class ActivationTanH:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.tanh(inputs)

    def backward(self, dvalues):
        self.dinputs = dvalues * (1 - self.output ** 2)

# ReLU
class Activation_ReLU:
    # Forward pass
    def forward(self, inputs):
        # Remember the input values
        self.inputs = inputs
        # Calculate the output values from inputs
        self.output = np.maximum(0, inputs)

    # Backward pass
    def backward(self, dvalues):
        # Make a copy of the original values first
        self.dinputs = dvalues.copy()
    
        # Zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0

# Softmax
class Activation_Softmax:
    # Forward pass
    def forward(self, inputs):
        # Remember the inputs values
        self.inputs = inputs

        # Get the unnormalized probabilities
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))

        # Normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        self.output = probabilities

    # Backward pass
    def backward(self, dvalues):
        # Create uninitialized array
        self.dinputs = np.empty_like(dvalues)

        # Enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):

            # Flatten output array
            single_output = single_output.reshape(-1, 1)
            # Calculate Jacobian matrix of the output
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            # Calculate the sample-wise gradient
            # and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)


class Activation_Softmax_Loss_CategoricalCrossentropy:
    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossEntropy()

    def forward(self, inputs, y_true):
        self.activation.forward(inputs)
        return self.loss.forward(self.activation.output, y_true)

    def backward(self, dvalues, y_true):
        samples = len(dvalues)
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)
        self.dinputs = dvalues.copy()
        self.dinputs[range(samples), y_true] -= 1
        self.dinputs = self.dinputs / samples


In [317]:
# Base Loss class
class Loss:
    # Computes the average loss over all samples
    def calculate(self, output, y):
        sample_losses = self.forward(output, y)
        return np.mean(sample_losses)


# Mean Squared Error (MSE) Loss
class Loss_MSE:
    def forward(self, y_pred, y_true):
        # Compute squared difference and average per sample
        return np.mean((y_true - y_pred) ** 2, axis=-1)

    def backward(self, y_pred, y_true):
        samples = y_true.shape[0]
        outputs = y_true.shape[1]
        # Derivative of MSE with respect to predictions
        self.dinputs = -2 * (y_true - y_pred) / outputs
        # Normalize over number of samples
        self.dinputs /= samples


# Binary Cross-Entropy (BCE) Loss
class Loss_BinaryCrossEntropy:
    def forward(self, y_pred, y_true):
        # Clip predictions to avoid log(0)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        # Compute BCE per sample
        return -(y_true * np.log(y_pred_clipped) + (1 - y_true) * np.log(1 - y_pred_clipped))

    def backward(self, y_pred, y_true):
        samples = y_true.shape[0]
        # Clip again to avoid division by 0
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        # Derivative of BCE
        self.dinputs = -(y_true / y_pred_clipped - (1 - y_true) / (1 - y_pred_clipped))
        self.dinputs /= samples

class Loss_CategoricalCrossEntropy(Loss):
    def forward(self, y_pred, y_true):
        samples = y_pred.shape[0]
        # Clip predictions to avoid log(0)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Select the predicted probability for the true class
        if len(y_true.shape) == 1:  # Sparse labels
            correct_confidences = y_pred_clipped[range(samples), y_true]
        else:  # One-hot encoded labels
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)

        # Compute negative log likelihood
        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        samples = len(dvalues)
        labels = dvalues.shape[1]

        # Clip values to avoid division by zero
        dvalues_clipped = np.clip(dvalues, 1e-7, 1 - 1e-7)

        # Convert sparse labels to one-hot if needed
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Gradient
        self.dinputs = -y_true / dvalues_clipped
        self.dinputs /= samples


<!-- Star -->

In [318]:
class Optimizer_SGD:
    def __init__(self, learning_rate=1.0, decay=0.0, momentum=0.0, adaptive=False, epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_lr = learning_rate
        self.decay = decay
        self.momentum = momentum
        self.adaptive = adaptive
        self.epsilon = epsilon
        self.iterations = 0

    def pre_update_params(self):
        # Apply learning rate decay
        if self.decay:
            self.current_lr = self.learning_rate * (1.0 / (1.0 + self.decay * self.iterations))

    def update_params(self, layer):
        # ----- MOMENTUM -----
        if self.momentum:
            # Initialize momentum arrays if not present
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            # Update momentum with current gradients
            weight_updates = self.momentum * layer.weight_momentums - self.current_lr * layer.dweights
            bias_updates = self.momentum * layer.bias_momentums - self.current_lr * layer.dbiases

            # Store momentum for next iteration
            layer.weight_momentums = weight_updates
            layer.bias_momentums = bias_updates
        else:
            # Vanilla SGD update (no momentum)
            weight_updates = -self.current_lr * layer.dweights
            bias_updates = -self.current_lr * layer.dbiases

        # ----- ADAPTIVE GRADIENT (Adagrad) -----
        if self.adaptive:
            if not hasattr(layer, 'weight_cache'):
                layer.weight_cache = np.zeros_like(layer.weights)
                layer.bias_cache = np.zeros_like(layer.biases)

            # Accumulate squared gradients
            layer.weight_cache += layer.dweights**2
            layer.bias_cache += layer.dbiases**2

            # Adjust updates by cached values
            weight_updates /= (np.sqrt(layer.weight_cache) + self.epsilon)
            bias_updates /= (np.sqrt(layer.bias_cache) + self.epsilon)

        # Apply updates
        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        self.iterations += 1


Use most of the classes to create a functioning neural network, capable of performing a forward and backward pass

We can use a sample dataset from the Spiral module.  

We can also use the IRIS dataset.

In [319]:
# Spiral Data
import nnfs
from nnfs.datasets import spiral_data

# Create the dataset
X, y = spiral_data(samples = 100, classes = 3)

# print(X[:5])
# print(X.shape)
# print(y[:5])
# print(y.shape)

In [320]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

In [321]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

iris = load_iris()
X = iris.data  # shape (150, 4)
y = iris.target

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [322]:
# ----- FIRST HIDDEN LAYER -----
dense1 = Layer_Dense(4, 8)
# This creates the first dense (fully connected) layer.
# It takes 4 input features (from your dataset) and produces 8 outputs (neurons).
# Each neuron learns a weighted combination of the 4 input values.

activation1 = Activation_ReLU()
# Applies the ReLU (Rectified Linear Unit) activation function.
# ReLU introduces non-linearity, helping the model learn complex relationships
# that a simple linear model couldn’t capture.

# ----- SECOND HIDDEN LAYER -----
dense2 = Layer_Dense(8, 8)
# Takes the 8 outputs from the previous layer and produces another 8 features.
# Adding more dense layers helps the network learn deeper representations
# (hierarchical features).

activation2 = Activation_ReLU()
# Again using ReLU to keep the model non-linear and prevent vanishing gradients.

# ----- OUTPUT LAYER -----
dense3 = Layer_Dense(8, 3)
# The final dense layer produces 3 outputs — one for each class in a classification task.

# ----- COMBINED SOFTMAX + LOSS FUNCTION -----
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# This combines the Softmax activation (to convert the output scores into probabilities)
# with the Categorical Cross-Entropy loss function (to measure prediction error).
# Using this combined version is more numerically stable and efficient than
# running them separately.

# ----- OPTIMIZER CONFIGURATION -----
# --- First Run: Plain SGD ---
# optimizer = Optimizer_SGD(learning_rate=0.1, decay=1e-3, momentum=0.0)
# Stochastic Gradient Descent (SGD) updates weights in the direction of the gradient.
# With momentum=0.0, it's the “vanilla” version — can be slower and noisier,
# but it’s good for comparison.

# --- Second Run: SGD + Momentum ---
optimizer = Optimizer_SGD(learning_rate=0.1, decay=1e-3, momentum=0.9)
# The momentum version remembers past gradients and smooths updates.
# It helps escape local minima and typically converges faster with more stable loss.


PERFORM ONLY 1 PASS

In [323]:
EPOCHS = 1000

for i in range(EPOCHS):
    # Pre-update step (handles learning rate decay)
    optimizer.pre_update_params()

    # ----- Forward Pass -----
    dense1.forward(X)
    activation1.forward(dense1.output)

    dense2.forward(activation1.output)
    activation2.forward(dense2.output)

    dense3.forward(activation2.output)

    # Combined Softmax + Loss in one step
    loss = loss_activation.forward(dense3.output, y)
    predictions = np.argmax(loss_activation.activation.output, axis=1)
    accuracy = np.mean(predictions == y)

    # Log progress
    if i % 100 == 0:
        print(f"Epoch {i}, Loss: {np.mean(loss):.3f}, Acc: {accuracy:.3f}")

    # ----- Backward Pass -----
    # Backprop from Softmax + CCE combined
    loss_activation.backward(loss_activation.activation.output, y)
    dense3.backward(loss_activation.dinputs)
    activation2.backward(dense3.dinputs)
    dense2.backward(activation2.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # ----- Update Weights -----
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.update_params(dense3)
    optimizer.post_update_params()

# After training
print("\nFinal comparison:")
print("Predictions:", predictions[:10])
print("Actual:     ", y[:10])


Epoch 0, Loss: 1.200, Acc: 0.520
Epoch 100, Loss: 0.038, Acc: 0.987
Epoch 200, Loss: 0.027, Acc: 0.987
Epoch 300, Loss: 0.019, Acc: 0.993
Epoch 400, Loss: 0.013, Acc: 1.000
Epoch 500, Loss: 0.009, Acc: 1.000
Epoch 600, Loss: 0.007, Acc: 1.000
Epoch 700, Loss: 0.006, Acc: 1.000
Epoch 800, Loss: 0.005, Acc: 1.000
Epoch 900, Loss: 0.004, Acc: 1.000

Final comparison:
Predictions: [0 0 0 0 0 0 0 0 0 0]
Actual:      [0 0 0 0 0 0 0 0 0 0]


In [324]:
# ----- Backward Pass -----
# Combined Softmax + CCE backward (from final output layer)
loss_activation.backward(dense3.output, y)
dvalues = loss_activation.dinputs  # gradient ready for backprop

# Backprop through the layers
dense3.backward(dvalues)
activation2.backward(dense3.dinputs)
dense2.backward(activation2.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)


In [325]:
# Check the gradient values of the weights and biases of the established layers
print(dense1.dweights)
print(dense1.dbiases)
print(dense2.dweights)
print(dense2.dbiases)


# Update the weights and biases
optimizer.update_params(dense1)
optimizer.update_params(dense2)

[[ 32.35516878  -2.14284284 -62.67526064 -45.33322021  16.42893228
    8.7934406   50.72288929  -4.12104801]
 [ -3.04309566   2.65446158  29.11433096  39.15122448   2.26057737
   -4.13094683 -14.06868597  -3.30851075]
 [ 33.22751167  -3.13061338 -73.30332279 -58.09679251  11.85051792
   10.00907129  55.12085394  -3.32789895]
 [ 36.11447244  -2.99619215 -71.90831201 -56.05649133   9.21070013
    9.53531359  59.77485331  -3.52022175]]
[[28.24658459  2.39530611 69.80732913 44.90513704  7.82253556 -8.16675351
  46.83651128  5.50910333]]
[[ 0.00000000e+00  5.85260105e+01 -8.84296411e-01  0.00000000e+00
   0.00000000e+00  6.41398686e+01  0.00000000e+00 -5.65663913e+01]
 [ 0.00000000e+00  0.00000000e+00  5.07236215e+00  0.00000000e+00
   4.11465358e+00  0.00000000e+00  3.19198156e+00  7.02167277e+00]
 [-4.51325057e-03 -1.63625141e+00  5.60555756e+01  0.00000000e+00
   3.40814006e+01 -2.56530915e+00  2.53767100e+01  7.24214860e+01]
 [ 0.00000000e+00 -1.52856721e+00  1.18196126e+02  0.00000000e