# Exercise 4 - Neural Network with Different Optimizers
Herald Kent Amolong - CS 3B

This exercise implements a neural network with backpropagation using 3 different optimizers and compares their performance.

In [54]:
import numpy as np

# Printed once the import above has run, as a quick sanity check for the cell.
# NOTE(review): the message probably meant "Imported libraries successfully!" -
# confirm before changing the text.
print("print libraries successfully!")

print libraries successfully!


## Layer Initialization

In [55]:
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``.

    Weights start as Gaussian noise scaled by 0.01; biases start at zero.
    """

    def __init__(self, n_inputs, n_neurons):
        # Biases are kept as a (1, n_neurons) row so they broadcast over a batch.
        self.biases = np.zeros(shape=(1, n_neurons))
        # Weights are stored as (n_inputs, n_neurons): no transpose needed
        # in the forward pass.
        self.weights = np.random.randn(n_inputs, n_neurons) * 0.01

    def forward(self, inputs):
        """Cache ``inputs`` and store the layer result in ``self.output``."""
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Store parameter and input gradients for the upstream ``dvalues``."""
        # Gradient flowing back to whatever produced the inputs
        self.dinputs = np.dot(dvalues, self.weights.T)
        # Gradients of the trainable parameters
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dweights = np.dot(self.inputs.T, dvalues)

## Activation Functions

In [56]:
# Activation Functions
# Included here are the functions for both the forward and backward pass

# Linear
class ActivationLinear:
    """Identity activation: the output is the input, the gradient passes through."""

    def forward(self, inputs):
        # Both attributes refer to the very same object that was passed in.
        self.inputs = self.output = inputs

    def backward(self, dvalues):
        # The derivative is 1 everywhere; hand back an independent copy.
        passthrough = dvalues.copy()
        self.dinputs = passthrough

# Sigmoid
class ActivationSigmoid:
    """Logistic sigmoid activation, ``1 / (1 + exp(-x))``."""

    def forward(self, inputs):
        """Cache ``inputs`` and store their sigmoid in ``self.output``."""
        self.inputs = inputs
        x = np.asarray(inputs)
        # exp(-|x|) always lies in (0, 1], so it can never overflow. The
        # direct form exp(-x) overflows (with a RuntimeWarning) for large
        # negative x.
        decay = np.exp(-np.abs(x))
        # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) - the same function.
        self.output = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    def backward(self, dvalues):
        """Store the gradient w.r.t. the inputs in ``self.dinputs``."""
        # d(sigmoid)/dx = sigmoid * (1 - sigmoid), taken from the cached output.
        self.dinputs = dvalues * (self.output * (1 - self.output))

# TanH
class ActivationTanH:
    """Hyperbolic tangent activation."""

    def forward(self, inputs):
        """Cache ``inputs`` and store ``tanh(inputs)`` in ``self.output``."""
        self.inputs = inputs
        squashed = np.tanh(inputs)
        self.output = squashed

    def backward(self, dvalues):
        """Store the gradient w.r.t. the inputs in ``self.dinputs``."""
        # d(tanh)/dx = 1 - tanh(x)^2, taken from the cached forward output.
        local_slope = 1 - self.output ** 2
        self.dinputs = dvalues * local_slope

# ReLU
class Activation_ReLU:
    """Rectified linear unit: ``max(0, x)`` element-wise."""

    def forward(self, inputs):
        """Cache ``inputs`` and store the rectified values in ``self.output``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Store the gradient w.r.t. the inputs in ``self.dinputs``."""
        # Work on a copy so the caller's gradient array is left untouched.
        gradient = dvalues.copy()
        # No gradient flows through units whose input was zero or negative.
        blocked = self.inputs <= 0
        gradient[blocked] = 0
        self.dinputs = gradient

# Softmax
class Activation_Softmax:
    """Softmax activation applied row-wise (axis 1), one row per sample."""

    def forward(self, inputs):
        """Cache ``inputs`` and store the row-wise probabilities in ``self.output``."""
        self.inputs = inputs

        # Subtracting each row's maximum keeps np.exp from overflowing and
        # does not change the normalized result.
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))

        # Normalize so every row sums to 1.
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def backward(self, dvalues):
        """Store the gradient w.r.t. the inputs in ``self.dinputs``.

        For one sample with output ``s`` and upstream gradient ``d`` the
        softmax Jacobian is ``diag(s) - s s^T``, so the product with ``d`` is
        ``s * (d - s.d)``. Computing that directly avoids building a Jacobian
        per sample, and the result always has a floating dtype. The earlier
        version wrote into ``np.empty_like(dvalues)``, which truncated the
        gradient whenever ``dvalues`` had an integer dtype.
        """
        dvalues = np.asarray(dvalues)
        # Per-sample dot product s.d, kept as a column for broadcasting.
        weighted = np.sum(self.output * dvalues, axis=1, keepdims=True)
        self.dinputs = self.output * (dvalues - weighted)

## Loss Functions

In [57]:
# Loss functions

class Loss:
    """Common base for the loss functions in this file.

    Subclasses implement ``forward(y_pred, y_true)`` returning the individual
    loss values; ``calculate`` reduces them to a single mean.
    """

    def forward(self, y_pred, y_true):
        """Return the individual loss values; subclasses must override this."""
        raise NotImplementedError

    def calculate(self, output, y):
        """Return the mean loss of model ``output`` against the ground truth ``y``."""
        sample_losses = self.forward(output, y)
        return np.mean(sample_losses)


# MSE
class Loss_MSE(Loss):
    """Mean squared error. Inherits ``calculate`` from ``Loss``."""

    def forward(self, y_pred, y_true):
        """Return the squared error averaged over the last axis (one value per sample)."""
        return np.mean((y_true - y_pred) ** 2, axis=-1)

    def backward(self, y_pred, y_true):
        """Store the gradient of the mean loss w.r.t. ``y_pred`` in ``self.dinputs``."""
        samples = y_true.shape[0]
        outputs = y_true.shape[1]
        # Derivative of the per-sample mean over the outputs ...
        self.dinputs = -2 * (y_true - y_pred) / outputs
        # ... then of the mean over the samples.
        self.dinputs = self.dinputs / samples


# Binary Cross-Entropy
class Loss_BinaryCrossEntropy(Loss):
    """Binary cross-entropy. Inherits ``calculate`` from ``Loss``."""

    def forward(self, y_pred, y_true):
        """Return the element-wise binary cross-entropy (same shape as the inputs)."""
        # Clip on both sides so neither log() below can receive 0.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        return -(y_true * np.log(y_pred_clipped) + (1 - y_true) * np.log(1 - y_pred_clipped))

    def backward(self, y_pred, y_true):
        """Store the gradient of the mean loss w.r.t. ``y_pred`` in ``self.dinputs``.

        The mean reported by ``calculate`` averages over every element, so the
        gradient is divided by the number of outputs per sample as well as by
        the number of samples. This matches ``Loss_MSE.backward``. For
        single-output (or 1-D) targets the result is unchanged.
        """
        samples = y_true.shape[0]
        # 1-D targets are treated as one output per sample.
        outputs = y_true.shape[1] if len(y_true.shape) > 1 else 1
        # Same clipping as forward(), here to avoid dividing by 0.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        self.dinputs = -(y_true / y_pred_clipped - (1 - y_true) / (1 - y_pred_clipped)) / outputs
        # Normalize gradients over samples
        self.dinputs = self.dinputs / samples


# Categorical Cross-Entropy
class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy for sparse (1-D) or one-hot (2-D) labels."""

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the correct class for each sample.

        Raises:
            ValueError: if ``y_true`` is neither 1-D (class indices) nor 2-D
                (one-hot rows).
        """
        samples = y_pred.shape[0]

        # Clip both sides: prevents log(0) without dragging the mean towards
        # either end.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_dims = len(y_true.shape)
        if label_dims == 1:
            # Sparse labels: pick the predicted probability of the true class.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif label_dims == 2:
            # One-hot labels: mask out everything but the true class.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # Previously this fell through and failed on an unbound local.
            raise ValueError(
                f"y_true must be 1-D (sparse) or 2-D (one-hot), got {label_dims} dimensions"
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Store the gradient w.r.t. the predictions ``dvalues`` in ``self.dinputs``."""
        samples = len(dvalues)
        # Number of labels in every sample
        labels = len(dvalues[0])

        # Sparse labels are expanded to one-hot rows.
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Clip the predictions so the division below cannot hit 0.
        dvalues_clipped = np.clip(dvalues, 1e-7, 1 - 1e-7)
        self.dinputs = -y_true / dvalues_clipped
        # Normalize gradients over samples
        self.dinputs = self.dinputs / samples

## Optimizer

In [58]:
# Three different optimizers

class Optimizer_SGD_Decay:
    """Plain gradient descent whose step size shrinks as ``1 / (1 + decay * t)``."""

    def __init__(self, learning_rate=1.0, decay=0.0):
        self.iterations = 0
        self.decay = decay
        self.learning_rate = learning_rate
        # Effective step size; equals learning_rate until decay kicks in.
        self.current_learning_rate = learning_rate

    def pre_update_params(self):
        """Refresh the effective learning rate; call once before each update."""
        if not self.decay:
            return
        shrink = 1.0 / (1.0 + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Move ``layer``'s weights and biases in place against their gradients."""
        step = self.current_learning_rate
        layer.weights -= step * layer.dweights
        layer.biases -= step * layer.dbiases

    def post_update_params(self):
        """Count one finished update step."""
        self.iterations = self.iterations + 1

class Optimizer_SGD_Momentum:
    """Gradient descent that carries over a fraction of the previous update."""

    def __init__(self, learning_rate=1.0, momentum=0.9):
        self.iterations = 0
        self.momentum = momentum
        self.learning_rate = learning_rate

    def pre_update_params(self):
        """Nothing to prepare; present so all optimizers share one interface."""
        return None

    def update_params(self, layer):
        """Update ``layer`` in place, storing the momentum arrays on the layer itself."""
        # The first call for a layer creates its (all-zero) momentum arrays.
        if not hasattr(layer, 'weight_momentums'):
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.weight_momentums = np.zeros_like(layer.weights)

        carry = self.momentum
        step = self.learning_rate

        # New update = retained share of the previous one minus the gradient step.
        weight_update = carry * layer.weight_momentums - step * layer.dweights
        bias_update = carry * layer.bias_momentums - step * layer.dbiases
        layer.weight_momentums = weight_update
        layer.bias_momentums = bias_update

        layer.weights += weight_update
        layer.biases += bias_update

    def post_update_params(self):
        """Count one finished update step."""
        self.iterations = self.iterations + 1

class Optimizer_AdaGrad:
    """AdaGrad: each parameter's step is scaled by its accumulated squared gradients."""

    def __init__(self, learning_rate=1.0, epsilon=1e-7):
        self.iterations = 0
        self.epsilon = epsilon
        self.learning_rate = learning_rate

    def pre_update_params(self):
        """Nothing to prepare; present so all optimizers share one interface."""
        return None

    def update_params(self, layer):
        """Update ``layer`` in place, storing the gradient caches on the layer itself."""
        # The first call for a layer creates its (all-zero) caches.
        if not hasattr(layer, 'weight_cache'):
            layer.bias_cache = np.zeros_like(layer.biases)
            layer.weight_cache = np.zeros_like(layer.weights)

        # Accumulate the squared gradients seen so far.
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2

        # epsilon keeps the denominators away from zero.
        weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
        bias_scale = np.sqrt(layer.bias_cache) + self.epsilon

        step = self.learning_rate
        layer.weights -= step * layer.dweights / weight_scale
        layer.biases -= step * layer.dbiases / bias_scale

    def post_update_params(self):
        """Count one finished update step."""
        self.iterations = self.iterations + 1

# Confirmation that this cell, which defines the three optimizer classes, ran to the end.
print("Optimizers created")

Optimizers created


## Create Dataset

In [59]:
# Create spiral dataset
def create_spiral_data(samples, classes):
    """Build a 2-D spiral dataset with one noisy arm per class.

    Returns ``(X, y)``: ``X`` has shape ``(samples * classes, 2)`` and ``y``
    holds the ``uint8`` class index of each row. The rows of a class are
    contiguous, and one batch of Gaussian noise is drawn from ``np.random``
    per class, in class order.
    """
    total = samples * classes
    X = np.zeros((total, 2))
    y = np.zeros(total, dtype='uint8')

    # The radius grows from 0 to 1 along every arm, so compute it once.
    radius = np.linspace(0.0, 1, samples)

    for class_number in range(classes):
        first_row = samples * class_number
        rows = slice(first_row, first_row + samples)
        # Each arm covers its own angular range, jittered with noise.
        jitter = np.random.randn(samples) * 0.2
        angle = np.linspace(class_number * 4, (class_number + 1) * 4, samples) + jitter
        X[rows] = np.column_stack((radius * np.sin(angle * 2.5), radius * np.cos(angle * 2.5)))
        y[rows] = class_number

    return X, y

# Generate data: 100 samples for each of 3 classes.
# NOTE(review): np.random is not seeded anywhere in the visible code, so the
# generated points differ from run to run.
X, y = create_spiral_data(samples=100, classes=3)
# Report the array shapes as a quick check.
print(f"Data shape: X={X.shape}, y={y.shape}")
print("Dataset created successfully")

Data shape: X=(300, 2), y=(300,)
Dataset created successfully


## Training Function

In [60]:
def train_network(X, y, optimizer, epochs=1000, print_interval=100):
    """Train a small ReLU/softmax classifier full-batch and report its progress.

    The network is Dense(64) -> ReLU -> Dense(32) -> ReLU -> Dense(classes)
    -> Softmax, trained with categorical cross-entropy on all of ``X`` in
    every epoch. A fresh, randomly initialized network is built on each call.

    Args:
        X: 2-D array of samples, one row per sample. The number of columns
            sets the input width of the first layer.
        y: Class labels, either sparse (1-D integer array) or one-hot (2-D).
            The number of classes is taken from it.
        optimizer: Object providing ``pre_update_params()``,
            ``update_params(layer)`` and ``post_update_params()``.
        epochs: Number of full-batch training steps.
        print_interval: Progress is printed and logged every this many
            epochs; a falsy value (0 or None) disables printing and logging.

    Returns:
        Tuple ``(accuracy_log, loss_log, epochs_to_stabilize, final_accuracy)``.
        The two logs hold one entry per printed epoch.
        ``epochs_to_stabilize`` is the first epoch (>= 50) at which the 50
        most recent losses spanned less than 0.01, or ``epochs`` if that never
        happened. A flat loss at the very start of training satisfies this
        test too. ``final_accuracy`` is the accuracy measured in the last
        epoch's forward pass (before that epoch's weight update), or NaN when
        ``epochs`` is 0.
    """
    stable_window = 50
    stable_tolerance = 0.01

    # Size the first and last layers from the data instead of assuming the
    # 2-feature / 3-class spiral set.
    n_inputs = X.shape[1]
    one_hot = len(y.shape) == 2
    n_classes = y.shape[1] if one_hot else int(np.max(y)) + 1

    # Create network layers (in this order, so the random weight draws
    # happen in the same sequence as before).
    dense1 = Layer_Dense(n_inputs, 64)
    activation1 = Activation_ReLU()
    dense2 = Layer_Dense(64, 32)
    activation2 = Activation_ReLU()
    dense3 = Layer_Dense(32, n_classes)
    activation3 = Activation_Softmax()
    loss_function = Loss_CategoricalCrossEntropy()

    # Forward order; the backward pass walks the same list in reverse.
    pipeline = [dense1, activation1, dense2, activation2, dense3, activation3]
    trainable_layers = [dense1, dense2, dense3]

    # Class indices used for the accuracy; they do not change between epochs.
    y_true = np.argmax(y, axis=1) if one_hot else y

    # Track results
    accuracy_log = []
    loss_log = []
    loss_history = []
    epochs_to_stabilize = None
    # Defined up front so that epochs=0 returns NaN instead of failing on an
    # unbound name.
    accuracy = float("nan")

    print(f"Training with {optimizer.__class__.__name__}")

    for epoch in range(epochs):
        # Learning rate decay before forward pass
        optimizer.pre_update_params()

        # Forward pass
        signal = X
        for stage in pipeline:
            stage.forward(signal)
            signal = stage.output
        probabilities = signal

        # Calculate loss
        loss = loss_function.calculate(probabilities, y)
        loss_history.append(loss)

        # Calculate accuracy
        predictions = np.argmax(probabilities, axis=1)
        accuracy = np.mean(predictions == y_true)

        # Backward pass
        loss_function.backward(probabilities, y)
        gradient = loss_function.dinputs
        for stage in reversed(pipeline):
            stage.backward(gradient)
            gradient = stage.dinputs

        # Update weights
        for layer in trainable_layers:
            optimizer.update_params(layer)
        optimizer.post_update_params()

        # Print progress (the guard also avoids a modulo-by-zero when
        # print_interval is 0).
        if print_interval and epoch % print_interval == 0:
            lr = getattr(optimizer, "current_learning_rate", getattr(optimizer, "learning_rate", None))
            # An optimizer without a learning rate is shown as N/A rather than
            # pushed through a float format spec.
            lr_text = f"{lr:.6f}" if lr is not None else "N/A"
            print(f'Epoch: {epoch:4d}, Acc: {accuracy:.4f}, Loss: {loss:.4f}, LR: {lr_text}')
            accuracy_log.append(accuracy)
            loss_log.append(loss)

        # Check if loss stabilized
        if epochs_to_stabilize is None and epoch >= stable_window:
            recent_losses = loss_history[-stable_window:]
            if max(recent_losses) - min(recent_losses) < stable_tolerance:
                epochs_to_stabilize = epoch

    final_accuracy = accuracy
    if epochs_to_stabilize is None:
        epochs_to_stabilize = epochs

    print(f"Final Accuracy: {final_accuracy:.4f}")
    print(f"Stabilized at epoch: {epochs_to_stabilize}")

    return accuracy_log, loss_log, epochs_to_stabilize, final_accuracy

## Run Experiments

In [61]:
# Test 1: Learning Rate Decay
# Plain SGD starting at a learning rate of 1.0 with decay=0.001, trained for 1000 epochs.
# NOTE(review): no random seed is set in the visible code, so every experiment
# starts from different random weights - keep that in mind when comparing runs.
print("Test 1: Learning Rate Decay")
optimizer1 = Optimizer_SGD_Decay(learning_rate=1.0, decay=0.001)
# The logs and summary numbers are kept for the results/comparison cells below.
acc_log1, loss_log1, stabilize1, final_acc1 = train_network(X, y, optimizer1, epochs=1000)
print()

Test 1: Learning Rate Decay
Training with Optimizer_SGD_Decay
Epoch:    0, Acc: 0.3567, Loss: 1.0986, LR: 1.000000
Epoch:  100, Acc: 0.4233, Loss: 1.0986, LR: 0.909091
Epoch:  100, Acc: 0.4233, Loss: 1.0986, LR: 0.909091
Epoch:  200, Acc: 0.4267, Loss: 1.0986, LR: 0.833333
Epoch:  200, Acc: 0.4267, Loss: 1.0986, LR: 0.833333
Epoch:  300, Acc: 0.4300, Loss: 1.0986, LR: 0.769231
Epoch:  300, Acc: 0.4300, Loss: 1.0986, LR: 0.769231
Epoch:  400, Acc: 0.4233, Loss: 1.0986, LR: 0.714286
Epoch:  400, Acc: 0.4233, Loss: 1.0986, LR: 0.714286
Epoch:  500, Acc: 0.4233, Loss: 1.0985, LR: 0.666667
Epoch:  500, Acc: 0.4233, Loss: 1.0985, LR: 0.666667
Epoch:  600, Acc: 0.4233, Loss: 1.0982, LR: 0.625000
Epoch:  600, Acc: 0.4233, Loss: 1.0982, LR: 0.625000
Epoch:  700, Acc: 0.4267, Loss: 1.0964, LR: 0.588235
Epoch:  700, Acc: 0.4267, Loss: 1.0964, LR: 0.588235
Epoch:  800, Acc: 0.4100, Loss: 1.0828, LR: 0.555556
Epoch:  800, Acc: 0.4100, Loss: 1.0828, LR: 0.555556
Epoch:  900, Acc: 0.4167, Loss: 1.076

In [62]:
# Test 2: Momentum
# SGD with momentum 0.9 at a fixed learning rate of 0.1, trained for 1000 epochs.
# NOTE(review): no random seed is set in the visible code, so every experiment
# starts from different random weights - keep that in mind when comparing runs.
print("Test 2: Momentum")
optimizer2 = Optimizer_SGD_Momentum(learning_rate=0.1, momentum=0.9)
# The logs and summary numbers are kept for the results/comparison cells below.
acc_log2, loss_log2, stabilize2, final_acc2 = train_network(X, y, optimizer2, epochs=1000)
print()

Test 2: Momentum
Training with Optimizer_SGD_Momentum
Epoch:    0, Acc: 0.3967, Loss: 1.0986, LR: 0.100000
Epoch:  100, Acc: 0.4267, Loss: 1.0986, LR: 0.100000
Epoch:  100, Acc: 0.4267, Loss: 1.0986, LR: 0.100000
Epoch:  200, Acc: 0.4133, Loss: 1.0986, LR: 0.100000
Epoch:  200, Acc: 0.4133, Loss: 1.0986, LR: 0.100000
Epoch:  300, Acc: 0.4367, Loss: 1.0985, LR: 0.100000
Epoch:  300, Acc: 0.4367, Loss: 1.0985, LR: 0.100000
Epoch:  400, Acc: 0.4233, Loss: 1.0983, LR: 0.100000
Epoch:  400, Acc: 0.4233, Loss: 1.0983, LR: 0.100000
Epoch:  500, Acc: 0.4167, Loss: 1.0954, LR: 0.100000
Epoch:  500, Acc: 0.4167, Loss: 1.0954, LR: 0.100000
Epoch:  600, Acc: 0.4067, Loss: 1.0765, LR: 0.100000
Epoch:  600, Acc: 0.4067, Loss: 1.0765, LR: 0.100000
Epoch:  700, Acc: 0.4133, Loss: 1.0759, LR: 0.100000
Epoch:  700, Acc: 0.4133, Loss: 1.0759, LR: 0.100000
Epoch:  800, Acc: 0.4200, Loss: 1.0750, LR: 0.100000
Epoch:  800, Acc: 0.4200, Loss: 1.0750, LR: 0.100000
Epoch:  900, Acc: 0.4433, Loss: 1.0697, LR: 0

In [63]:
# Test 3: AdaGrad
# AdaGrad at a learning rate of 1.0 (epsilon left at its default), trained for 1000 epochs.
# NOTE(review): no random seed is set in the visible code, so every experiment
# starts from different random weights - keep that in mind when comparing runs.
print("Test 3: AdaGrad")
optimizer3 = Optimizer_AdaGrad(learning_rate=1.0)
# The logs and summary numbers are kept for the results/comparison cells below.
acc_log3, loss_log3, stabilize3, final_acc3 = train_network(X, y, optimizer3, epochs=1000)
print()

Test 3: AdaGrad
Training with Optimizer_AdaGrad
Epoch:    0, Acc: 0.2867, Loss: 1.0986, LR: 1.000000
Epoch:  100, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  100, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  200, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  200, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  300, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  300, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  400, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  400, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  500, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  500, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  600, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  600, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  700, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  700, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  800, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  800, Acc: 0.3333, Loss: 10.7454, LR: 1.000000
Epoch:  900, Acc: 0.3333, Loss: 10.

## Results

In [64]:
# Show accuracy progression
# One table row per logged epoch, one column per optimizer.
print("Accuracy every 100 epochs:")
print("Epoch\t| LR Decay\t| Momentum\t| AdaGrad")
print("-" * 50)

# The logs are indexed by position: entry i was recorded at epoch i * 100.
# NOTE(review): the 1000/100 here mirror the epochs=1000 and default
# print_interval=100 used in the training cells; keep them in sync, otherwise
# the indexing below goes out of range or mislabels the rows.
for i, epoch in enumerate(range(0, 1000, 100)):
    print(f"{epoch:4d}\t| {acc_log1[i]:.4f}\t| {acc_log2[i]:.4f}\t| {acc_log3[i]:.4f}")

Accuracy every 100 epochs:
Epoch	| LR Decay	| Momentum	| AdaGrad
--------------------------------------------------
   0	| 0.3567	| 0.3967	| 0.2867
 100	| 0.4233	| 0.4267	| 0.3333
 200	| 0.4267	| 0.4133	| 0.3333
 300	| 0.4300	| 0.4367	| 0.3333
 400	| 0.4233	| 0.4233	| 0.3333
 500	| 0.4233	| 0.4167	| 0.3333
 600	| 0.4233	| 0.4067	| 0.3333
 700	| 0.4267	| 0.4133	| 0.3333
 800	| 0.4100	| 0.4200	| 0.3333
 900	| 0.4167	| 0.4433	| 0.3333


In [65]:
# Comparison summary
# One (name, epochs to stabilize, final accuracy) entry per optimizer.
results = [
    ("LR Decay", stabilize1, final_acc1),
    ("Momentum", stabilize2, final_acc2),
    ("AdaGrad", stabilize3, final_acc3)
]

print("\nFinal Results:")
print("\n".join(
    f"{label}: {stable_epoch} epochs to stabilize, {final_acc:.4f} accuracy"
    for label, stable_epoch, final_acc in results
))

# Find best performers (on a tie, the entry listed first wins)
most_accurate = max(results, key=lambda entry: entry[2])
fastest = min(results, key=lambda entry: entry[1])

print(f"\nFastest convergence: {fastest[0]} ({fastest[1]} epochs)")
print(f"Highest accuracy: {most_accurate[0]} ({most_accurate[2]:.4f})")


Final Results:
LR Decay: 50 epochs to stabilize, 0.4100 accuracy
Momentum: 50 epochs to stabilize, 0.4233 accuracy
AdaGrad: 50 epochs to stabilize, 0.3333 accuracy

Fastest convergence: LR Decay (50 epochs)
Highest accuracy: Momentum (0.4233)


## Analysis

In [66]:
# Compare the two most accurate optimizers
# `results` holds (name, epochs to stabilize, final accuracy) tuples; sorting
# by accuracy in descending order puts the best run first.
best_two = sorted(results, key=lambda x: x[2], reverse=True)[:2]
opt1_name, opt1_stab, opt1_acc = best_two[0]
opt2_name, opt2_stab, opt2_acc = best_two[1]

print(f"Comparing {opt1_name} vs {opt2_name}:")
print(f"{opt1_name}: {opt1_stab} epochs, {opt1_acc:.4f} accuracy")
print(f"{opt2_name}: {opt2_stab} epochs, {opt2_acc:.4f} accuracy")

print("\nConclusion:")
print(f"For this spiral dataset, {opt1_name} performed better with {opt1_acc:.4f} accuracy.")
print(f"It took {opt1_stab} epochs to stabilize vs {opt2_stab} for {opt2_name}.")

# A lead of more than one accuracy point is reported as a clear win.
if opt1_acc - opt2_acc > 0.01:
    print(f"{opt1_name} is clearly superior for this task.")
else:
    print(f"Both optimizers performed similarly, but {opt1_name} has a slight edge.")

# Only endorse momentum when the measurements actually put it on top; this
# remark used to be printed no matter which optimizer had won.
if opt1_name == "Momentum":
    print("The momentum optimizer generally works well for this type of classification problem.")

Comparing Momentum vs LR Decay:
Momentum: 50 epochs, 0.4233 accuracy
LR Decay: 50 epochs, 0.4100 accuracy

Conclusion:
For this spiral dataset, Momentum performed better with 0.4233 accuracy.
It took 50 epochs to stabilize vs 50 for LR Decay.
Momentum is clearly superior for this task.
The momentum optimizer generally works well for this type of classification problem.
