# Imports

In [1]:
import numpy as np

# Layer Classes

In [2]:
class Layer_Dense:
    def __init__(self, n_inputs, n_neurons):
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases
    def backward(self, dvalues):
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dinputs = np.dot(dvalues, self.weights.T)

# Activation Functions

In [3]:
class Activation_ReLU:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.maximum(0, inputs)
    def backward(self, dvalues):
        self.dinputs = dvalues.copy()
        self.dinputs[self.inputs <= 0] = 0

class Activation_Softmax:
    def forward(self, inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities
    def backward(self, dvalues):
        self.dinputs = np.empty_like(dvalues)
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            single_output = single_output.reshape(-1, 1)
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)

# Loss Function

In [4]:
class Loss:
    def calculate(self, output, y):
        sample_losses = self.forward(output, y)
        return np.mean(sample_losses)

class Loss_CategoricalCrossEntropy(Loss):
    def forward(self, y_pred, y_true):
        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[range(samples), y_true]
        else:
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        return -np.log(correct_confidences)
    def backward(self, dvalues, y_true):
        samples = len(dvalues)
        labels = len(dvalues[0])
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]
        self.dinputs = -y_true / dvalues
        self.dinputs = self.dinputs / samples

# Optimizer (Learning Rate Decay, Momentum, Adagrad)

In [5]:
class Optimizer:
    def __init__(self, learning_rate=1.0, decay=0.0, momentum=0.0, use_adagrad=False):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum
        self.use_adagrad = use_adagrad
        self.epsilon = 1e-7
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1.0 / (1.0 + self.decay * self.iterations))
    def update_params(self, layer):
        if self.use_adagrad:
            if not hasattr(layer, "weight_cache"):
                layer.weight_cache = np.zeros_like(layer.weights)
                layer.bias_cache = np.zeros_like(layer.biases)
            layer.weight_cache += layer.dweights ** 2
            layer.bias_cache += layer.dbiases ** 2
            layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
            layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)
        elif self.momentum:
            if not hasattr(layer, "weight_momentums"):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)
            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.weight_momentums = weight_updates
            layer.bias_momentums = bias_updates
            layer.weights += weight_updates
            layer.biases += bias_updates
        else:
            layer.weights += -self.current_learning_rate * layer.dweights
            layer.biases += -self.current_learning_rate * layer.dbiases
    def post_update_params(self):
        self.iterations += 1

#  Data Generation

In [6]:
def generate_data(samples=100):
    X = np.random.randn(samples, 2)
    y = np.random.randint(0, 3, samples)
    return X, y

X, y = generate_data(300)

#  Model Initialization

In [7]:
dense1 = Layer_Dense(2, 3)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(3, 3)
activation2 = Activation_Softmax()
loss_function = Loss_CategoricalCrossEntropy()

# Training Loop & Accuracy Display

In [8]:
def train_with_optimizer(opt, epochs=1000, print_every=100):
    for epoch in range(epochs):
        dense1.forward(X)
        activation1.forward(dense1.output)
        dense2.forward(activation1.output)
        activation2.forward(dense2.output)
        loss = loss_function.calculate(activation2.output, y)
        predictions = np.argmax(activation2.output, axis=1)
        accuracy = np.mean(predictions == y)
        loss_function.backward(activation2.output, y)
        activation2.backward(loss_function.dinputs)
        dense2.backward(activation2.dinputs)
        activation1.backward(dense2.dinputs)
        dense1.backward(activation1.dinputs)
        opt.pre_update_params()
        opt.update_params(dense1)
        opt.update_params(dense2)
        opt.post_update_params()
        if epoch % print_every == 0:
            print(f"Epoch {epoch}, Loss {loss:.3f}, Accuracy {accuracy:.3f}, LR {opt.current_learning_rate:.5f}")
    return accuracy

# Optimizer Comparison

In [9]:
opt_decay = Optimizer(learning_rate=1.0, decay=1e-3)
acc_decay = train_with_optimizer(opt_decay)
opt_momentum = Optimizer(learning_rate=1.0, momentum=0.9)
acc_momentum = train_with_optimizer(opt_momentum)
opt_adagrad = Optimizer(learning_rate=1.0, use_adagrad=True)
acc_adagrad = train_with_optimizer(opt_adagrad)

print("Decay Accuracy", acc_decay)
print("Momentum Accuracy", acc_momentum)
print("Adagrad Accuracy", acc_adagrad)

Epoch 0, Loss 1.099, Accuracy 0.347, LR 1.00000
Epoch 100, Loss 1.091, Accuracy 0.377, LR 0.90909
Epoch 200, Loss 1.083, Accuracy 0.407, LR 0.83333
Epoch 300, Loss 1.081, Accuracy 0.427, LR 0.76923
Epoch 400, Loss 1.081, Accuracy 0.433, LR 0.71429
Epoch 500, Loss 1.080, Accuracy 0.430, LR 0.66667
Epoch 600, Loss 1.080, Accuracy 0.437, LR 0.62500
Epoch 700, Loss 1.080, Accuracy 0.430, LR 0.58824
Epoch 800, Loss 1.080, Accuracy 0.420, LR 0.55556
Epoch 900, Loss 1.079, Accuracy 0.413, LR 0.52632
Epoch 0, Loss 1.079, Accuracy 0.410, LR 1.00000
Epoch 100, Loss 1.072, Accuracy 0.430, LR 1.00000
Epoch 200, Loss 1.072, Accuracy 0.420, LR 1.00000
Epoch 300, Loss 1.072, Accuracy 0.420, LR 1.00000
Epoch 400, Loss 1.072, Accuracy 0.420, LR 1.00000
Epoch 500, Loss 1.072, Accuracy 0.420, LR 1.00000
Epoch 600, Loss 1.072, Accuracy 0.420, LR 1.00000
Epoch 700, Loss 1.072, Accuracy 0.417, LR 1.00000
Epoch 800, Loss 1.072, Accuracy 0.427, LR 1.00000
Epoch 900, Loss 1.072, Accuracy 0.427, LR 1.00000
Epoc