In [None]:
!pip install nnfs



In [None]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

In [None]:


# ============= DENSE LAYER CLASS =============
class Layer_Dense:
    """Fully connected layer: ``output = inputs . weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        """Create the layer parameters.

        Weights are an (n_inputs, n_neurons) array of standard-normal draws
        scaled by 0.01; biases are a (1, n_neurons) array of zeros.
        """
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Store ``inputs`` for the backward pass and compute ``self.output``."""
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        """Compute gradients from the upstream gradient ``dvalues``.

        Sets ``self.dinputs`` (gradient w.r.t. the stored inputs),
        ``self.dweights`` and ``self.dbiases``. ``forward`` must have been
        called first because the cached inputs are used here.
        """
        # The three gradients are independent of each other.
        self.dinputs = np.dot(dvalues, self.weights.T)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dweights = np.dot(self.inputs.T, dvalues)

# ============= ACTIVATION FUNCTIONS =============
class Activation_ReLU:
    """Rectified linear activation: element-wise ``max(0, x)``."""

    def forward(self, inputs):
        """Store ``inputs`` and write the rectified values to ``self.output``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Pass ``dvalues`` through where the stored input was positive.

        Positions whose forward input was <= 0 get a zero gradient. The
        upstream array is copied first so ``dvalues`` itself is not modified.
        """
        inactive = self.inputs <= 0
        gradient = dvalues.copy()
        gradient[inactive] = 0
        self.dinputs = gradient

class Activation_Softmax:
    """Row-wise softmax activation."""

    def forward(self, inputs):
        """Store ``inputs`` and write per-row probabilities to ``self.output``.

        ``inputs`` is indexed along axis 1, so it must be at least 2-D
        (one row per sample).
        """
        self.inputs = inputs
        # Subtracting each row's maximum leaves the softmax result unchanged
        # but keeps np.exp from overflowing on large inputs.
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities

    def backward(self, dvalues):
        """Compute ``self.dinputs`` from the upstream gradient ``dvalues``.

        For one sample with softmax output ``s`` the Jacobian is
        ``diag(s) - s s^T``; applied to the upstream row ``d`` this equals
        ``s * (d - sum(s * d))``. Evaluating that closed form for all rows at
        once avoids building one Jacobian matrix per sample in a Python loop.

        The result takes its dtype from the arithmetic rather than from
        ``dvalues``, so an integer upstream gradient is not truncated.
        """
        dvalues = np.asarray(dvalues)
        # Per-row dot product <s, d>, kept as a column so it broadcasts.
        row_dot = np.sum(self.output * dvalues, axis=1, keepdims=True)
        self.dinputs = self.output * (dvalues - row_dot)

# ============= LOSS FUNCTIONS =============
class Loss:
    """Base class for losses; subclasses are expected to supply ``forward``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses given by ``self.forward``."""
        return np.mean(self.forward(output, y))

class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy for sparse (1-D) or one-hot (2-D) targets."""

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class per sample.

        Args:
            y_pred: 2-D array of predicted probabilities, one row per sample.
            y_true: either a 1-D array of class indices or a 2-D one-hot
                array with the same shape as ``y_pred``.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        samples = y_pred.shape[0]
        # Clip away exact 0 and 1 so np.log never sees 0.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        y_true = np.asarray(y_true)
        if y_true.ndim == 1:
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on correct_confidences.
            raise ValueError(
                f"y_true must be 1-D class indices or 2-D one-hot, got {y_true.ndim}-D"
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Store the gradient of the mean loss w.r.t. the predictions.

        Args:
            dvalues: 2-D array of predicted probabilities (the values that
                were passed to ``forward``).
            y_true: 1-D class indices or 2-D one-hot targets.

        The result is written to ``self.dinputs``.
        """
        dvalues = np.asarray(dvalues)
        samples = len(dvalues)
        labels = len(dvalues[0])

        y_true = np.asarray(y_true)
        if y_true.ndim == 1:
            y_true = np.eye(labels)[y_true]

        # Apply the same clip as forward(): a prediction that is exactly 0
        # would otherwise give 0/0 = nan (or inf for the true class) and
        # poison every weight on the next update.
        clipped = np.clip(dvalues, 1e-7, 1 - 1e-7)

        # Divide first, negate afterwards: negating an unsigned one-hot array
        # directly would wrap around instead of producing -1.
        self.dinputs = -(y_true / clipped)
        self.dinputs = self.dinputs / samples

# ============= OPTIMIZER =============
class Optimizer_SGD:
    """Gradient-descent optimizer with optional decay, momentum and
    accumulated-squared-gradient (adaptive) scaling.

    Per-layer state (momentum and cache arrays) is stored as attributes on
    the layer objects passed to ``update_params``.
    """

    def __init__(self, learning_rate=1.0, decay=0.0, momentum=0.0, adaptive=False, epsilon=1e-7):
        """Record the hyperparameters and reset the iteration counter."""
        self.initial_learning_rate = learning_rate
        self.learning_rate = learning_rate
        self.decay = decay
        self.momentum = momentum
        self.adaptive = adaptive
        self.epsilon = epsilon
        self.iterations = 0

    def update_learning_rate(self):
        """Update learning rate with decay BEFORE forward and backward passes"""
        base_rate = self.initial_learning_rate
        if not self.decay:
            self.learning_rate = base_rate
            return
        self.learning_rate = base_rate / (1 + self.decay * self.iterations)

    def update_params(self, layer):
        """Update layer parameters with momentum/vanilla SGD AFTER backward pass"""
        # Momentum buffers are created on first sight of a layer, regardless
        # of which update rule ends up being used.
        if not hasattr(layer, 'weight_momentums'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)

        if self.adaptive:
            self._adaptive_step(layer)
        elif self.momentum:
            self._momentum_step(layer)
        else:
            # Vanilla SGD
            layer.weights -= self.learning_rate * layer.dweights
            layer.biases -= self.learning_rate * layer.dbiases

    def _adaptive_step(self, layer):
        """Scale each step by the root of the accumulated squared gradients."""
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Running sum of squared gradients (never decays).
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2

        # epsilon keeps the denominator non-zero while the cache is still 0.
        weight_step = -(self.learning_rate * layer.dweights) / (np.sqrt(layer.weight_cache) + self.epsilon)
        bias_step = -(self.learning_rate * layer.dbiases) / (np.sqrt(layer.bias_cache) + self.epsilon)

        if self.momentum:
            # Blend the scaled step into the velocity and apply the velocity.
            layer.weight_momentums = self.momentum * layer.weight_momentums + weight_step
            layer.bias_momentums = self.momentum * layer.bias_momentums + bias_step
            weight_step = layer.weight_momentums
            bias_step = layer.bias_momentums

        layer.weights += weight_step
        layer.biases += bias_step

    def _momentum_step(self, layer):
        """Apply a classical momentum update and remember the new velocity."""
        weight_velocity = self.momentum * layer.weight_momentums - self.learning_rate * layer.dweights
        bias_velocity = self.momentum * layer.bias_momentums - self.learning_rate * layer.dbiases

        layer.weight_momentums = weight_velocity
        layer.bias_momentums = bias_velocity

        layer.weights += weight_velocity
        layer.biases += bias_velocity

    def post_update_params(self):
        """Increment iteration counter"""
        self.iterations += 1

# ============= TRAINING FUNCTION =============
def train_model(optimizer_name, optimizer, epochs=1000):
    """Train a small 2-3-3 ReLU/softmax network on spiral data.

    Args:
        optimizer_name: label used in the printed header and returned as-is.
        optimizer: object providing ``update_learning_rate``,
            ``update_params(layer)``, ``post_update_params`` and a
            ``learning_rate`` attribute.
        epochs: number of full-batch iterations. ``loss`` and ``accuracy``
            are only assigned inside the loop, so this must be at least 1.

    Returns:
        Tuple ``(optimizer_name, loss, accuracy)`` from the last epoch.
    """
    rule = "-"*60
    print("\n" + rule)
    print(f"Training results: {optimizer_name.upper()}")
    print(rule)

    # Create fresh dataset for each training run
    X, y = spiral_data(samples=100, classes=3)

    # Network: dense(2->3) -> ReLU -> dense(3->3) -> softmax
    hidden = Layer_Dense(2, 3)
    relu = Activation_ReLU()
    output_layer = Layer_Dense(3, 3)
    softmax = Activation_Softmax()
    criterion = Loss_CategoricalCrossEntropy()

    # Class indices for the accuracy check; one-hot labels are collapsed.
    # y does not change during training, so this is computed once.
    targets = np.argmax(y, axis=1) if len(y.shape) == 2 else y.copy()
    last_epoch = epochs - 1

    for epoch in range(epochs):
        # Learning-rate decay is applied before the forward/backward passes.
        optimizer.update_learning_rate()

        # Forward pass
        hidden.forward(X)
        relu.forward(hidden.output)
        output_layer.forward(relu.output)
        softmax.forward(output_layer.output)
        probabilities = softmax.output

        # Metrics on the current (pre-update) parameters
        loss = criterion.calculate(probabilities, y)
        accuracy = np.mean(np.argmax(probabilities, axis=1) == targets)

        # Backward pass, output to input
        criterion.backward(probabilities, y)
        softmax.backward(criterion.dinputs)
        output_layer.backward(softmax.dinputs)
        relu.backward(output_layer.dinputs)
        hidden.backward(relu.dinputs)

        # Parameter update, then advance the optimizer's iteration counter
        for trainable in (hidden, output_layer):
            optimizer.update_params(trainable)
        optimizer.post_update_params()

        # Report every 100 epochs and on the final epoch
        if epoch % 100 == 0 or epoch == last_epoch:
            print(f"Epoch: {epoch:4d} | Final Loss: {loss:.4f} | Final Accuracy: {accuracy:.4f} | LR: {optimizer.learning_rate:.6f}")

    return optimizer_name, loss, accuracy

# ============= CREATE DATASET =============
# Module-level dataset. train_model() generates its own data on every call,
# so the runs below do not read these two names.
X, y = spiral_data(samples=100, classes=3)

# ============= MAIN TRAINING EXECUTION =============
print("\n" + ">"*60)
print("EXERCISE UNIT 4")
print(">"*60)

# One optimizer instance per method being compared.
optimizer_vanilla = Optimizer_SGD(learning_rate=1.0)                 # no decay, no momentum
optimizer_decay = Optimizer_SGD(learning_rate=1.0, decay=0.001)      # learning rate decay
optimizer_momentum = Optimizer_SGD(learning_rate=1.0, momentum=0.9)  # momentum
optimizer_adaptive = Optimizer_SGD(learning_rate=0.5, adaptive=True) # accumulated squared gradients

# Train in a fixed order; each entry is (name, final loss, final accuracy).
results = []
results.extend(
    train_model(label, candidate)
    for label, candidate in (
        ("Vanilla", optimizer_vanilla),
        ("Learning Rate Decay", optimizer_decay),
        ("Momentum", optimizer_momentum),
        ("Adaptive Gradient", optimizer_adaptive),
    )
)

# ============= SUMMARY RESULTS =============
print("\n" + "-"*60)
print("SUMMARY RESULTS")
print("-"*60)
for method_name, final_loss, final_acc in results:
    print(f"{method_name.ljust(12)} - Loss: {final_loss:.4f}, Accuracy: {final_acc:.4f}")
print("-"*60)


>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
EXERCISE UNIT 4
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

------------------------------------------------------------
Training results: VANILLA
------------------------------------------------------------
Epoch:    0 | Final Loss: 1.0986 | Final Accuracy: 0.3333 | LR: 1.000000
Epoch:  100 | Final Loss: 1.0979 | Final Accuracy: 0.4067 | LR: 1.000000
Epoch:  200 | Final Loss: 1.0849 | Final Accuracy: 0.3867 | LR: 1.000000
Epoch:  300 | Final Loss: 1.0834 | Final Accuracy: 0.3667 | LR: 1.000000
Epoch:  400 | Final Loss: 1.0804 | Final Accuracy: 0.3800 | LR: 1.000000
Epoch:  500 | Final Loss: 1.0733 | Final Accuracy: 0.3867 | LR: 1.000000
Epoch:  600 | Final Loss: 1.0695 | Final Accuracy: 0.3700 | LR: 1.000000
Epoch:  700 | Final Loss: 1.0669 | Final Accuracy: 0.3733 | LR: 1.000000
Epoch:  800 | Final Loss: 1.0650 | Final Accuracy: 0.3933 | LR: 1.000000
Epoch:  900 | Final Loss: 1.0635 | Final Accuracy: 0.39