In [15]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

In [16]:
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``.

    Besides its trainable parameters, the layer carries zero-initialised
    per-parameter buffers (momentums and caches) that an optimizer can use
    for momentum and Adagrad updates.
    """

    def __init__(self, n_inputs, n_neurons):
        """Create the parameters and the optimizer buffers.

        Args:
            n_inputs: Number of features in each input sample.
            n_neurons: Number of output units of this layer.
        """
        # Small Gaussian weights of shape (n_inputs, n_neurons); biases are a
        # single zero row so they broadcast over the batch.
        self.weights = np.random.randn(n_inputs, n_neurons) * 0.01
        self.biases = np.zeros((1, n_neurons))

        # Optimizer state for the weights ...
        self.weight_momentums = np.zeros_like(self.weights)
        self.weight_cache = np.zeros_like(self.weights)
        # ... and for the biases.
        self.bias_momentums = np.zeros_like(self.biases)
        self.bias_cache = np.zeros_like(self.biases)

    def forward(self, inputs):
        """Store ``inputs`` and write the affine transform to ``self.output``."""
        self.inputs = inputs  # needed again in backward()
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Compute gradients from ``dvalues``, the gradient w.r.t. this layer's output.

        Sets ``dweights`` and ``dbiases`` (gradients of the parameters) and
        ``dinputs`` (gradient to hand to the previous layer).
        """
        inputs_transposed = self.inputs.T
        self.dweights = np.dot(inputs_transposed, dvalues)
        # Each bias is shared by all samples, so its gradient is the column sum.
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        weights_transposed = self.weights.T
        self.dinputs = np.dot(dvalues, weights_transposed)


In [17]:
class Activation_ReLU:
    """Rectified linear unit: passes positive values, zeroes the rest."""

    def forward(self, inputs):
        """Store ``inputs`` and write ``max(0, inputs)`` element-wise to ``self.output``."""
        self.inputs = inputs  # kept so backward() knows which units were active
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Set ``self.dinputs`` to ``dvalues`` with inactive positions zeroed.

        ``dvalues`` itself is left untouched; the gradient is written to a copy.
        """
        inactive = self.inputs <= 0
        grad = dvalues.copy()
        # No gradient flows through units whose input was not positive.
        grad[inactive] = 0
        self.dinputs = grad

class Activation_Softmax:
    """Row-wise softmax turning scores into probabilities."""

    def forward(self, inputs):
        """Write the softmax of each row of ``inputs`` to ``self.output``."""
        # Shift every row by its own maximum before exponentiating so the
        # largest exponent is exp(0); the normalised result is unaffected.
        row_max = np.max(inputs, axis=1, keepdims=True)
        unnormalized = np.exp(inputs - row_max)
        row_totals = np.sum(unnormalized, axis=1, keepdims=True)
        self.output = unnormalized / row_totals


In [18]:
class Loss:
    """Base class for losses; subclasses provide ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses produced by ``self.forward``.

        Args:
            output: Model predictions, passed through to ``forward``.
            y: Targets, passed through to ``forward``.
        """
        return np.mean(self.forward(output, y))

class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy: ``-log`` of the confidence given to the true class."""

    def forward(self, y_pred, y_true):
        """Return one loss value per sample.

        Args:
            y_pred: Predicted class confidences, one row per sample.
            y_true: Either a 1-D array of class indices, or an array with the
                same layout as ``y_pred`` (e.g. one-hot) that is multiplied
                with the predictions and summed per row.
        """
        n_rows = y_pred.shape[0]
        # Keep confidences away from exactly 0 and 1 so log() stays finite.
        clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if len(y_true.shape) != 1:
            # Mask-style labels: weight each confidence and sum per row.
            picked = np.sum(clipped * y_true, axis=1)
        else:
            # Index-style labels: pick one confidence per row.
            picked = clipped[range(n_rows), y_true]

        return -np.log(picked)

# Faster Softmax + CCE backward
class Activation_Softmax_Loss_CCE:
    """Combined backward step for softmax followed by categorical cross-entropy."""

    def backward(self, y_pred, y_true):
        """Set ``self.dinputs`` to ``(y_pred - one_hot(y_true)) / n_samples``.

        Args:
            y_pred: Softmax outputs, one row per sample. Not modified.
            y_true: 1-D class indices, or a 2-D array that is reduced to
                indices with ``argmax`` along axis 1.
        """
        batch_size = y_pred.shape[0]
        labels = np.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true

        grad = y_pred.copy()
        # Subtract 1 at the true-class position of every row.
        grad[range(batch_size), labels] -= 1
        # Dividing by the batch size matches a loss that is averaged over samples.
        self.dinputs = grad / batch_size


In [19]:
class Optimizer_SGD:
    """Gradient-descent optimizer with optional decay, momentum, or Adagrad.

    Exactly one update rule is applied per ``update_params`` call:

    * Adagrad when ``adagrad`` is true (``momentum`` is then ignored),
    * otherwise momentum when ``momentum`` is non-zero,
    * otherwise plain SGD.

    Per-layer state (momentums / caches) is stored on the layer object. Layers
    that already define those arrays keep them; for any other layer they are
    created as zeros the first time they are needed.

    Expected call order per training step:
    ``pre_update_params()`` -> ``update_params(layer)`` for each layer ->
    ``post_update_params()``.

    Args:
        learning_rate: Initial step size.
        decay: Learning-rate decay factor; ``0`` disables decay.
        momentum: Fraction of the previous update carried over; ``0`` disables it.
        adagrad: Use Adagrad per-parameter scaling instead of momentum/plain SGD.
        epsilon: Added to the Adagrad denominator to avoid division by zero.
    """

    def __init__(self, learning_rate=1.0, decay=0.0, momentum=0.0, adagrad=False, epsilon=1e-7):
        self.learning_rate = learning_rate
        # Rate actually used for updates; differs from learning_rate once decay kicks in.
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum
        self.use_adagrad = adagrad
        self.epsilon = epsilon

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` as ``lr / (1 + decay * iterations)``."""
        if self.decay:
            self.current_learning_rate = self.learning_rate / (1 + self.decay * self.iterations)

    @staticmethod
    def _ensure_buffers(layer, weight_attr, bias_attr):
        """Create zeroed state arrays on ``layer`` if it does not have them yet."""
        if not hasattr(layer, weight_attr):
            setattr(layer, weight_attr, np.zeros_like(layer.weights))
        if not hasattr(layer, bias_attr):
            setattr(layer, bias_attr, np.zeros_like(layer.biases))

    def update_params(self, layer):
        """Apply one update to ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` and ``layer.dbiases``, so the layer's backward
        pass must have run first.
        """
        if self.use_adagrad:
            # Adagrad: accumulate squared gradients and scale each step by
            # the square root of that running sum.
            self._ensure_buffers(layer, "weight_cache", "bias_cache")
            layer.weight_cache += layer.dweights ** 2
            layer.bias_cache += layer.dbiases ** 2
            layer.weights -= self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
            layer.biases  -= self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)
        elif self.momentum:
            # Momentum: blend the previous update with the current gradient step.
            self._ensure_buffers(layer, "weight_momentums", "bias_momentums")
            layer.weight_momentums = self.momentum * layer.weight_momentums + self.current_learning_rate * layer.dweights
            layer.bias_momentums   = self.momentum * layer.bias_momentums + self.current_learning_rate * layer.dbiases
            layer.weights -= layer.weight_momentums
            layer.biases  -= layer.bias_momentums
        else:
            # Vanilla SGD
            layer.weights -= self.current_learning_rate * layer.dweights
            layer.biases  -= self.current_learning_rate * layer.dbiases

    def post_update_params(self):
        """Advance the step counter used by the decay schedule."""
        self.iterations += 1


In [24]:
# Initialise nnfs before any data or layers are created.
# NOTE(review): presumably this fixes the random seed / default dtype so runs
# are repeatable — confirm against the nnfs documentation.
nnfs.init()
# Training data for a 3-class problem.
# NOTE(review): assumes X has 2 features per sample (dense1 takes 2 inputs)
# and y is a 1-D array of class indices — confirm against spiral_data.
X, y = spiral_data(samples=100, classes=3)

# Network: 2 inputs -> 64 hidden units (ReLU) -> 3 outputs (softmax).
dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
activation2 = Activation_Softmax()

# loss_function reports the loss value; combo supplies the combined
# softmax + cross-entropy gradient that starts the backward pass.
loss_function = Loss_CategoricalCrossEntropy()
combo = Activation_Softmax_Loss_CCE()

# Choose an optimizer: keep exactly one of the lines below uncommented.
# optimizer = Optimizer_SGD(learning_rate=1.0, decay=1e-3)          # Decay
# optimizer = Optimizer_SGD(learning_rate=0.1, momentum=0.9)        # Momentum
optimizer = Optimizer_SGD(learning_rate=1.0, adagrad=True, decay=1e-3)  # Adagrad with learning-rate decay


In [25]:
# Full-batch training: every epoch runs one forward/backward pass over all of
# X and applies one optimizer step per dense layer.
epochs = 1000
for epoch in range(1, epochs+1):
    # Apply learning-rate decay (if enabled) before this epoch's update.
    optimizer.pre_update_params()

    # Forward pass: dense1 -> ReLU -> dense2 -> softmax
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    activation2.forward(dense2.output)

    # Loss + accuracy, measured with the parameters as they were BEFORE this
    # epoch's update.
    loss = loss_function.calculate(activation2.output, y)
    predictions = np.argmax(activation2.output, axis=1)
    # NOTE(review): comparing against y directly assumes y holds class indices
    # (not one-hot rows) — confirm against spiral_data.
    acc = np.mean(predictions == y)

    # Backward pass, in reverse layer order. The combined softmax/CCE step
    # produces the gradient with respect to dense2's output directly.
    combo.backward(activation2.output, y)
    dense2.backward(combo.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update both layers, then advance the optimizer's step counter (which
    # drives the decay schedule).
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

    # Log every 100 epochs; the printed lr is the rate used for this epoch's update.
    if epoch % 100 == 0:
        print(f"epoch {epoch:4d} | lr={optimizer.current_learning_rate:.4f} | loss={loss:.4f} | acc={acc:.4f}")


epoch  100 | lr=0.9099 | loss=1.0215 | acc=0.4633
epoch  200 | lr=0.8340 | loss=0.9497 | acc=0.5233
epoch  300 | lr=0.7698 | loss=0.8843 | acc=0.5767
epoch  400 | lr=0.7148 | loss=0.8335 | acc=0.6033
epoch  500 | lr=0.6671 | loss=0.8037 | acc=0.6233
epoch  600 | lr=0.6254 | loss=0.7789 | acc=0.6333
epoch  700 | lr=0.5886 | loss=0.7624 | acc=0.6300
epoch  800 | lr=0.5559 | loss=0.7466 | acc=0.6233
epoch  900 | lr=0.5266 | loss=0.7228 | acc=0.6400
epoch 1000 | lr=0.5003 | loss=0.7046 | acc=0.6633
