In [1]:
%pip install numpy nnfs scikit-learn

Collecting nnfs
  Downloading nnfs-0.5.1-py3-none-any.whl.metadata (1.7 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading nnfs-0.5.1-py3-none-any.whl (9.1 kB)
Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   ---- ----------------------------------- 1.0/8.7 MB 3.9 MB/s eta 0:00:02
   ------------------- -------------------- 4.2/8.7 MB 9.0 MB/s eta 0:00:01
   ---------------------------------------- 8.7/8.7 MB 14.5 MB/s eta 0:00:00
D


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

In [3]:
class Layer_Dense:
    """Fully connected layer computing ``np.dot(inputs, weights) + biases``.

    ``forward`` caches its input so that ``backward`` can derive the
    gradients with respect to the weights, the biases and the input.
    """

    def __init__(self, n_inputs, n_neurons):
        """Create the parameters.

        Weights have shape ``(n_inputs, n_neurons)`` and are drawn from the
        global NumPy RNG (standard normal scaled by 0.01); biases are a
        ``(1, n_neurons)`` row of zeros.
        """
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Remember ``inputs`` and store the affine transform in ``self.output``."""
        self.inputs = inputs
        projected = np.dot(inputs, self.weights)
        self.output = projected + self.biases

    def backward(self, dvalues):
        """Store ``dweights``, ``dbiases`` and ``dinputs`` for upstream gradient ``dvalues``."""
        cached_inputs = self.inputs
        # Parameter gradients: inputs^T . dvalues, and dvalues summed over the batch axis.
        self.dweights = np.dot(cached_inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient handed to the previous layer.
        self.dinputs = np.dot(dvalues, self.weights.T)

In [4]:
class Activation_ReLU:
    """Rectified linear unit: keeps positive values and zeroes everything else."""

    def forward(self, inputs):
        """Cache ``inputs`` and store ``max(0, inputs)`` element-wise in ``self.output``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Store in ``self.dinputs`` a copy of ``dvalues`` zeroed where the input was <= 0."""
        blocked = self.inputs <= 0
        # Work on a copy so the caller's gradient array is left untouched.
        gradient = dvalues.copy()
        gradient[blocked] = 0
        self.dinputs = gradient

class Activation_Softmax:
    """Row-wise softmax activation."""

    def forward(self, inputs):
        """Cache ``inputs`` and store the per-row softmax in ``self.output``.

        The row maximum is subtracted before exponentiating so that large
        inputs do not overflow ``np.exp``; this does not change the result.
        """
        self.inputs = inputs
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(shifted)
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def backward(self, dvalues):
        """Store the gradient with respect to the softmax input in ``self.dinputs``.

        For one sample with output ``s`` and upstream gradient ``d`` the
        Jacobian is ``diag(s) - s s^T``, so the product with ``d`` collapses to
        ``s * (d - sum(s * d))``. Evaluating that closed form for the whole
        batch avoids building one Jacobian per sample in a Python loop, and the
        result is always floating point (integer ``dvalues`` are no longer
        truncated into an integer output buffer).
        """
        # Per-row dot product s . d, kept as a column so it broadcasts over classes.
        weighted_sum = np.sum(self.output * dvalues, axis=1, keepdims=True)
        self.dinputs = self.output * (dvalues - weighted_sum)

In [5]:
class Loss:
    """Base class for losses; subclasses provide ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses computed by ``self.forward``."""
        per_sample = self.forward(output, y)
        return np.mean(per_sample)

class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy for sparse (1-D) or one-hot (2-D) labels."""

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class for each sample.

        ``y_pred`` is clipped to ``[1e-7, 1 - 1e-7]`` so that ``log(0)`` cannot
        occur.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        samples = y_pred.shape[0]
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_rank = len(y_true.shape)
        if label_rank == 1:
            # Sparse labels: pick the predicted probability of the labelled class.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif label_rank == 2:
            # One-hot labels: the mask keeps only the true-class probability.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                f"y_true must be 1-D (class indices) or 2-D (one-hot), got {label_rank}-D"
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Store the gradient of the mean loss w.r.t. the predictions in ``self.dinputs``.

        ``dvalues`` holds the predictions that were passed to ``forward``. They
        are clipped to the same range as in ``forward`` before dividing, so a
        predicted probability of exactly 0 cannot produce inf/NaN gradients.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        samples = len(dvalues)
        labels = len(dvalues[0])

        label_rank = len(y_true.shape)
        if label_rank == 1:
            # Expand class indices to one-hot rows.
            y_true = np.eye(labels)[y_true]
        elif label_rank != 2:
            raise ValueError(
                f"y_true must be 1-D (class indices) or 2-D (one-hot), got {label_rank}-D"
            )

        safe_predictions = np.clip(dvalues, 1e-7, 1 - 1e-7)
        # Dividing by the batch size matches the mean taken in Loss.calculate.
        self.dinputs = (-y_true / safe_predictions) / samples

In [6]:
class Optimizer_SGD:
    """SGD optimizer with optional learning-rate decay, momentum, or AdaGrad scaling.

    Exactly one update rule is used per call to ``update_params``: momentum if
    ``momentum`` is truthy, otherwise AdaGrad if ``use_adagrad`` is set,
    otherwise plain SGD. When both are given, ``use_adagrad`` is not consulted.
    """

    def __init__(self, learning_rate=1.0, decay=0.0, momentum=0.0, use_adagrad=False):
        """Record the hyperparameters and reset the iteration counter."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum
        self.use_adagrad = use_adagrad

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` from the decay schedule (no-op without decay)."""
        if not self.decay:
            return
        self.current_learning_rate = self.learning_rate * (1.0 / (1.0 + self.decay * self.iterations))

    def update_params(self, layer):
        """Apply one in-place update to ``layer.weights`` and ``layer.biases``.

        Reads ``layer.dweights`` and ``layer.dbiases``; per-layer optimizer
        state (momentums or squared-gradient caches) is stored on ``layer``.
        """
        if self.momentum:
            weight_step, bias_step = self._momentum_steps(layer)
        elif self.use_adagrad:
            weight_step, bias_step = self._adagrad_steps(layer)
        else:
            weight_step, bias_step = self._plain_steps(layer)

        layer.weights += weight_step
        layer.biases += bias_step

    def _momentum_steps(self, layer):
        """Return (weight, bias) steps: previous step scaled by momentum minus lr * gradient."""
        if not hasattr(layer, 'weight_momentums'):
            # First update for this layer: start from zero velocity.
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)

        lr = self.current_learning_rate

        weight_step = self.momentum * layer.weight_momentums - lr * layer.dweights
        layer.weight_momentums = weight_step

        bias_step = self.momentum * layer.bias_momentums - lr * layer.dbiases
        layer.bias_momentums = bias_step

        return weight_step, bias_step

    def _adagrad_steps(self, layer):
        """Return (weight, bias) steps scaled by the root of the accumulated squared gradients."""
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_cache += layer.dweights**2
        layer.bias_cache += layer.dbiases**2

        lr = self.current_learning_rate
        # The 1e-7 term keeps the denominator non-zero while a cache entry is still 0.
        weight_step = -lr * layer.dweights / (np.sqrt(layer.weight_cache) + 1e-7)
        bias_step = -lr * layer.dbiases / (np.sqrt(layer.bias_cache) + 1e-7)
        return weight_step, bias_step

    def _plain_steps(self, layer):
        """Return (weight, bias) steps for vanilla SGD: -lr * gradient."""
        lr = self.current_learning_rate
        return -lr * layer.dweights, -lr * layer.dbiases

    def post_update_params(self):
        """Advance the iteration counter used by the decay schedule."""
        self.iterations += 1

In [7]:
def train_network(optimizer_name, learning_rate, decay=0.0, momentum=0.0, use_adagrad=False, epochs=1000,
                  hidden_neurons=3, seed=None):
    """Train a two-layer network on the spiral dataset and log its progress.

    The network is Dense -> ReLU -> Dense -> Softmax with categorical
    cross-entropy loss, trained full-batch with ``Optimizer_SGD``.

    Args:
        optimizer_name: Label used only in the printed banner.
        learning_rate: Initial learning rate passed to the optimizer.
        decay: Learning-rate decay factor passed to the optimizer.
        momentum: Momentum factor passed to the optimizer.
        use_adagrad: Whether the optimizer should use its AdaGrad rule.
        epochs: Number of full-batch training iterations.
        hidden_neurons: Width of the hidden layer (default 3, the original
            hard-coded value).
        seed: If not ``None``, the global NumPy RNG is seeded with it before
            the data and weights are created, so runs can be repeated and
            different optimizers compared from the same starting point. With
            ``None`` the RNG state is left as-is and every call differs.

    Returns:
        ``(loss_history, accuracy_history)``: two lists with one entry per
        epoch, measured on the forward pass *before* that epoch's update.
    """
    print(f"\n{'='*60}")
    print(f"Training with {optimizer_name}")
    print(f"Learning Rate: {learning_rate}, Decay: {decay}, Momentum: {momentum}, AdaGrad: {use_adagrad}")
    print(f"{'='*60}\n")

    if seed is not None:
        # Layer_Dense draws its weights from np.random; spiral_data presumably
        # does too (TODO confirm), so seeding here fixes both.
        np.random.seed(seed)

    # Load dataset
    X, y = spiral_data(samples=100, classes=3)

    # Initialize network (2 input features, 3 output classes)
    dense1 = Layer_Dense(2, hidden_neurons)
    activation1 = Activation_ReLU()
    dense2 = Layer_Dense(hidden_neurons, 3)
    activation2 = Activation_Softmax()
    loss_function = Loss_CategoricalCrossEntropy()

    # Initialize optimizer
    optimizer = Optimizer_SGD(learning_rate=learning_rate, decay=decay, momentum=momentum, use_adagrad=use_adagrad)

    # The labels never change, so convert one-hot targets to class indices once.
    y_labels = np.argmax(y, axis=1) if len(y.shape) == 2 else y

    loss_history = []
    accuracy_history = []

    for epoch in range(epochs):
        # Learning rate decay
        optimizer.pre_update_params()

        # Forward pass
        dense1.forward(X)
        activation1.forward(dense1.output)
        dense2.forward(activation1.output)
        activation2.forward(dense2.output)

        # Loss and accuracy for this epoch
        loss = loss_function.calculate(activation2.output, y)
        predictions = np.argmax(activation2.output, axis=1)
        accuracy = np.mean(predictions == y_labels)

        loss_history.append(loss)
        accuracy_history.append(accuracy)

        # Backpropagation, from the loss back to the first layer
        loss_function.backward(activation2.output, y)
        activation2.backward(loss_function.dinputs)
        dense2.backward(activation2.dinputs)
        activation1.backward(dense2.dinputs)
        dense1.backward(activation1.dinputs)

        # Update weights
        optimizer.update_params(dense1)
        optimizer.update_params(dense2)
        optimizer.post_update_params()

        # Print every 100 epochs
        if epoch % 100 == 0:
            print(f"Epoch {epoch:4d} | Loss: {loss:.4f} | Accuracy: {accuracy:.4f} | LR: {optimizer.current_learning_rate:.6f}")

    if loss_history:
        print(f"\nFinal Loss: {loss_history[-1]:.4f}")
        print(f"Final Accuracy: {accuracy_history[-1]:.4f}")
    else:
        # epochs <= 0: nothing was measured, so there is nothing to report.
        print("\nNo epochs were run; no final metrics to report.")

    return loss_history, accuracy_history

In [9]:
# Experiment 1: plain SGD; only the learning-rate decay schedule is enabled.
loss_sgd, acc_sgd = train_network(
    "Vanilla SGD + Learning Rate Decay", learning_rate=1.0, decay=0.01, epochs=1000
)


Training with Vanilla SGD + Learning Rate Decay
Learning Rate: 1.0, Decay: 0.01, Momentum: 0.0, AdaGrad: False

Epoch    0 | Loss: 1.0986 | Accuracy: 0.3733 | LR: 1.000000
Epoch  100 | Loss: 1.0980 | Accuracy: 0.4167 | LR: 0.500000
Epoch  200 | Loss: 1.0893 | Accuracy: 0.4167 | LR: 0.333333
Epoch  300 | Loss: 1.0762 | Accuracy: 0.4133 | LR: 0.250000
Epoch  400 | Loss: 1.0712 | Accuracy: 0.4267 | LR: 0.200000
Epoch  500 | Loss: 1.0688 | Accuracy: 0.4300 | LR: 0.166667
Epoch  600 | Loss: 1.0672 | Accuracy: 0.4267 | LR: 0.142857
Epoch  700 | Loss: 1.0661 | Accuracy: 0.4267 | LR: 0.125000
Epoch  800 | Loss: 1.0654 | Accuracy: 0.4200 | LR: 0.111111
Epoch  900 | Loss: 1.0649 | Accuracy: 0.4200 | LR: 0.100000

Final Loss: 1.0646
Final Accuracy: 0.4133


In [10]:
# Experiment 2: same decay schedule as experiment 1, with momentum 0.9 added.
loss_momentum, acc_momentum = train_network(
    "SGD + Momentum", learning_rate=1.0, decay=0.01, momentum=0.9, epochs=1000
)


Training with SGD + Momentum
Learning Rate: 1.0, Decay: 0.01, Momentum: 0.9, AdaGrad: False

Epoch    0 | Loss: 1.0986 | Accuracy: 0.3000 | LR: 1.000000
Epoch  100 | Loss: 1.0694 | Accuracy: 0.4267 | LR: 0.500000
Epoch  200 | Loss: 1.0694 | Accuracy: 0.4267 | LR: 0.333333
Epoch  300 | Loss: 1.0694 | Accuracy: 0.4267 | LR: 0.250000
Epoch  400 | Loss: 1.0694 | Accuracy: 0.4267 | LR: 0.200000
Epoch  500 | Loss: 1.0694 | Accuracy: 0.4267 | LR: 0.166667
Epoch  600 | Loss: 1.0694 | Accuracy: 0.4267 | LR: 0.142857
Epoch  700 | Loss: 1.0694 | Accuracy: 0.4267 | LR: 0.125000
Epoch  800 | Loss: 1.0694 | Accuracy: 0.4267 | LR: 0.111111
Epoch  900 | Loss: 1.0694 | Accuracy: 0.4267 | LR: 0.100000

Final Loss: 1.0694
Final Accuracy: 0.4267


In [11]:
# Experiment 3: AdaGrad per-parameter scaling, no learning-rate decay.
loss_adagrad, acc_adagrad = train_network(
    "SGD + AdaGrad", learning_rate=1.0, decay=0.0, use_adagrad=True, epochs=1000
)


Training with SGD + AdaGrad
Learning Rate: 1.0, Decay: 0.0, Momentum: 0.0, AdaGrad: True

Epoch    0 | Loss: 1.0986 | Accuracy: 0.3333 | LR: 1.000000
Epoch  100 | Loss: 1.0733 | Accuracy: 0.4333 | LR: 1.000000
Epoch  200 | Loss: 1.0733 | Accuracy: 0.4333 | LR: 1.000000
Epoch  300 | Loss: 1.0733 | Accuracy: 0.4300 | LR: 1.000000
Epoch  400 | Loss: 1.0733 | Accuracy: 0.4333 | LR: 1.000000
Epoch  500 | Loss: 1.0733 | Accuracy: 0.4333 | LR: 1.000000
Epoch  600 | Loss: 1.0733 | Accuracy: 0.4333 | LR: 1.000000
Epoch  700 | Loss: 1.0733 | Accuracy: 0.4333 | LR: 1.000000
Epoch  800 | Loss: 1.0733 | Accuracy: 0.4333 | LR: 1.000000
Epoch  900 | Loss: 1.0733 | Accuracy: 0.4333 | LR: 1.000000

Final Loss: 1.0733
Final Accuracy: 0.4333


In [None]:
# COMPARISON ANALYSIS

# 1. Vanilla SGD + Learning Rate Decay
#    - Accuracy settled into the 41-43% range after around 100-200 epochs
#    - Final accuracy: 41.33%
#    - The loss kept going down slowly the whole time (1.0986 -> 1.0646)

# 2. SGD + Momentum
#    - Stabilized at around 100 epochs
#    - Final accuracy: 42.67%
#    - Got stuck at epoch 100 and didn't change after that

# 3. SGD + AdaGrad
#    - Stabilized at around 100 epochs
#    - Final accuracy: 43.33%
#    - Also got stuck at epoch 100 and stayed the same

# Conclusion:
# AdaGrad had the best final accuracy at 43.33%.
# Momentum and AdaGrad both stabilized faster (by around epoch 100).
# Vanilla SGD kept improving slowly but ended up with the lowest accuracy (41.33%).
# Overall, the model stayed below 50% accuracy: all three optimizers ended at only about 41-43% correct.