In [1]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()

In [2]:
# Dense layer 
class Layer_Dense:
    """Fully connected layer: output = inputs . weights + biases."""

    def __init__(self, n_inputs, n_neurons):
        """Create the layer's trainable parameters.

        Biases start at zero with shape (1, n_neurons). Weights have shape
        (n_inputs, n_neurons) and are standard-normal samples scaled by 0.01.
        """
        self.biases = np.zeros((1, n_neurons))
        self.weights = np.random.randn(n_inputs, n_neurons) * 0.01

    def forward(self, inputs):
        """Compute the layer output for `inputs` and store it on `self.output`."""
        # The inputs are needed again in backward() for the weight gradient
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Compute gradients given `dvalues`, the gradient w.r.t. the output.

        Stores `dinputs` (gradient w.r.t. the inputs) plus `dweights` and
        `dbiases` (gradients w.r.t. the parameters) on the layer.
        """
        # Gradient passed on to whatever produced this layer's inputs
        self.dinputs = np.dot(dvalues, self.weights.T)
        # Parameter gradients; biases sum over the sample axis
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dweights = np.dot(self.inputs.T, dvalues)
 
 
# ReLU activation 
class Activation_ReLU:
    """Rectified linear activation: keeps positive values, zeroes the rest."""

    def forward(self, inputs):
        """Store `inputs` and set `self.output` to max(0, inputs) element-wise."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Set `self.dinputs` to `dvalues` with blocked positions zeroed.

        `dvalues` itself is left unmodified; the gradient is built on a copy.
        """
        grad = dvalues.copy()
        # No gradient flows where the forward input was zero or negative
        blocked = self.inputs <= 0
        grad[blocked] = 0
        self.dinputs = grad
 
 
# Softmax activation 
class Activation_Softmax:
    """Softmax activation applied independently to each row (sample)."""

    def forward(self, inputs):
        """Store `inputs` and set `self.output` to the row-wise softmax."""
        self.inputs = inputs

        # Subtracting each row's maximum before exponentiating keeps the
        # exponentials from overflowing; the normalized result is unchanged
        row_max = np.max(inputs, axis=1, keepdims=True)
        unnormalized = np.exp(inputs - row_max)

        # Divide every row by its own total so it sums to 1
        row_totals = np.sum(unnormalized, axis=1, keepdims=True)
        self.output = unnormalized / row_totals

    def backward(self, dvalues):
        """Set `self.dinputs` to the gradient w.r.t. the softmax inputs.

        Each sample's gradient is its softmax Jacobian multiplied by the
        matching row of `dvalues`.
        """
        # Every row is overwritten in the loop below, so no initialization
        self.dinputs = np.empty_like(dvalues)

        for row, (probs, upstream) in enumerate(zip(self.output, dvalues)):
            # Column vector of this sample's softmax outputs
            column = probs.reshape(-1, 1)
            # Jacobian of the softmax output w.r.t. its input:
            # diag(s) - s . s^T
            jacobian = np.diagflat(column) - np.dot(column, column.T)
            # Chain rule for this sample
            self.dinputs[row] = np.dot(jacobian, upstream)
 
 
# Common loss class 
class Loss:
    """Base class for losses.

    Subclasses implement `forward(output, y)` returning one loss value per
    sample; `calculate` reduces those values to a single mean.
    """

    def calculate(self, output, y):
        """Return the mean per-sample loss for `output` against targets `y`."""
        # Per-sample losses come from the subclass
        sample_losses = self.forward(output, y)

        # Reduce to one scalar for the whole batch
        data_loss = np.mean(sample_losses)
        return data_loss


# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy for sparse or one-hot encoded targets."""

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class per sample.

        `y_true` may be 1-D (one class index per sample) or 2-D (one-hot
        rows). Any other rank raises ValueError.
        """
        # Number of samples in the batch
        samples = len(y_pred)

        # Clip so that log() never receives exactly 0; the upper bound is
        # clipped symmetrically so the mean is not dragged to either side
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_rank = len(y_true.shape)
        if label_rank == 1:
            # Sparse labels: pick the predicted confidence of the true class
            correct_confidences = y_pred_clipped[
                range(samples),
                y_true
            ]
        elif label_rank == 2:
            # One-hot labels: mask out everything but the true class
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis=1
            )
        else:
            # Fail with a clear message instead of an unbound local below
            raise ValueError(
                'y_true must be 1-D (class indices) or 2-D (one-hot), '
                f'got an array with {label_rank} dimension(s)'
            )

        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    def backward(self, dvalues, y_true):
        """Set `self.dinputs` to the loss gradient w.r.t. `dvalues`.

        `dvalues` holds the predicted values the loss was computed from.
        The gradient is divided by the number of samples.
        """
        # Number of samples
        samples = len(dvalues)
        # Number of labels per sample, counted on the first sample
        labels = len(dvalues[0])

        # Sparse labels are expanded to one-hot rows
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Derivative of -sum(y_true * log(dvalues)) w.r.t. dvalues
        self.dinputs = -y_true / dvalues
        # Normalize by the batch size
        self.dinputs = self.dinputs / samples
 
 
# Softmax classifier - combined Softmax activation 
# and cross-entropy loss for faster backward step 
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation fused with categorical cross-entropy loss.

    Combining the two allows a simpler backward step than chaining the
    separate softmax and loss gradients.
    """

    def __init__(self):
        """Create the softmax activation and cross-entropy loss objects."""
        self.loss = Loss_CategoricalCrossentropy()
        self.activation = Activation_Softmax()

    def forward(self, inputs, y_true):
        """Apply softmax to `inputs`, store it as `self.output`, return the loss."""
        softmax = self.activation
        softmax.forward(inputs)
        # Expose the probabilities so callers can derive predictions
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Set `self.dinputs` to the gradient of the combined softmax + loss.

        `dvalues` are the softmax outputs; `y_true` may be class indices
        (1-D) or one-hot rows (2-D).
        """
        n_samples = len(dvalues)

        # One-hot targets are reduced to class indices for the indexing below
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)

        # Work on a copy so the caller's array is not modified
        grad = dvalues.copy()
        # Subtract 1 at each sample's true class
        grad[range(n_samples), y_true] -= 1
        # Normalize by the batch size
        self.dinputs = grad / n_samples


In [3]:
class Optimiser_SGD:
    """Plain stochastic gradient descent optimiser."""

    def __init__(self, learning_rate=1.0):
        """Store the optimiser settings.

        A learning rate of 1.0 is the default for this optimiser.
        """
        self.learning_rate = learning_rate

    def update_params(self, layer):
        """Move `layer`'s weights and biases one step against their gradients.

        Reads `layer.dweights` and `layer.dbiases` and updates
        `layer.weights` and `layer.biases` in place.
        """
        step = self.learning_rate
        layer.weights -= step * layer.dweights
        layer.biases -= step * layer.dbiases

Recall that the layer object contains its parameters (weights and biases) and also, at this stage, the 
gradient that is calculated during backpropagation. We store these in the layer’s properties so that 
the optimizer can make use of them. In our main neural network code, we’d bring the 
optimization in after backpropagation. Let’s make a 1x64 densely-connected neural network (1 
hidden layer with 64 neurons) and use the same dataset as before: 

In [4]:
# Spiral dataset: 100 samples for each of 3 classes
X, y = spiral_data(samples=100, classes=3)

In [5]:
# First dense layer: 2 input features in, 64 neuron outputs
dense1 = Layer_Dense(n_inputs=2, n_neurons=64)

In [6]:
# Create the ReLU activation that is applied to the first Dense layer's
# output (dense1.output) during the forward pass
activation1 = Activation_ReLU()

In [7]:
# Second dense layer: consumes the 64 values produced by the previous
# layer and emits 3 output values, one per class
dense2 = Layer_Dense(n_inputs=64, n_neurons=3)

In [8]:
# Create the Softmax classifier's combined activation and loss object;
# its forward() returns the loss and its output holds the class confidences
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

In [9]:
# The next step is to create the optimizer object.
# No learning rate is passed, so the class default (1.0) is used.
optimizer = Optimiser_SGD()

Now perform a forward pass of the sample data.

In [10]:
# Forward pass of the training data X through the first Dense layer;
# the result is stored on dense1.output
dense1.forward(X)

In [11]:
# Forward pass through the ReLU activation function.
# It takes the output of the first Dense layer as its input and stores
# the result on activation1.output.
activation1.forward(dense1.output)

In [12]:
# Forward pass through the second Dense layer.
# It takes the output of the first layer's activation function as its
# input and stores the result on dense2.output.
dense2.forward(activation1.output)

In [13]:
# Forward pass through the combined activation/loss object.
# It takes the output of the second Dense layer and the targets y,
# stores the softmax confidences on loss_activation.output,
# and returns the loss.
loss = loss_activation.forward(dense2.output, y)

In [14]:
# Print the loss value returned by the forward pass above
print('loss:', loss)

loss: 1.0985943


In [15]:
# Accuracy of the softmax output (loss_activation.output) against targets.
# The predicted class is the index of the largest confidence in each row.
predictions = np.argmax(loss_activation.output, axis=1)
# One-hot targets are collapsed to class indices so they can be compared
y = np.argmax(y, axis=1) if y.ndim == 2 else y
# Fraction of samples whose predicted class matches the target
accuracy = np.mean(predictions == y)
print('acc:', accuracy)

acc: 0.36


Now do the backward pass, which is called backpropagation.

In [16]:
# Backward pass: walk the network in reverse order of the forward pass.
# Each step receives the `dinputs` gradient stored by the step before it
# in this sequence, starting from the combined softmax/loss object.
loss_activation.backward(loss_activation.output,y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

Use the optimiser to update the weights and biases of both dense layers.

In [17]:
# Apply one SGD step to each Dense layer, using the dweights/dbiases
# gradients stored on the layers by the backward pass
optimizer.update_params(dense1)
optimizer.update_params(dense2)

**Each full pass through all of the training data is called an epoch.** <br>
In most deep learning tasks, a neural network will be trained for multiple epochs, though the ideal scenario would be to 
have a perfect model with ideal weights and biases after only one epoch.  To add multiple epochs 
of training into our code, we will initialize our model and run a loop around all the code 
performing the forward pass, backward pass, and optimization calculations:

In [18]:
# Fresh dataset: 100 samples for each of 3 classes
X, y = spiral_data(samples=100, classes=3)

# Hidden layer: 2 input features in, 64 neuron outputs
dense1 = Layer_Dense(n_inputs=2, n_neurons=64)

# ReLU activation used on the hidden layer's output
activation1 = Activation_ReLU()

# Output layer: takes the 64 hidden values and emits 3 values, one per class
dense2 = Layer_Dense(n_inputs=64, n_neurons=3)

# Combined softmax activation and cross-entropy loss
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# SGD optimiser with its default learning rate
optimiser = Optimiser_SGD()

In [19]:
# Targets as class indices, used only for the accuracy check.
# This does not change between epochs, so it is computed once here instead
# of on every iteration; `y` itself is left untouched, so the loss sees the
# same label representation on every epoch.
y_labels = np.argmax(y, axis=1) if len(y.shape) == 2 else y

# Train in loop
for epoch in range(10001):

    # Forward pass of the training data through the first Dense layer
    dense1.forward(X)

    # Forward pass through the ReLU activation,
    # which takes the output of the first Dense layer
    activation1.forward(dense1.output)

    # Forward pass through the second Dense layer,
    # which takes the output of the first layer's activation
    dense2.forward(activation1.output)

    # Forward pass through the combined activation/loss object,
    # which takes the output of the second Dense layer and returns the loss
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy: compare the most confident class of each softmax row
    # (loss_activation.output) with the target class indices
    predictions = np.argmax(loss_activation.output, axis=1)
    accuracy = np.mean(predictions == y_labels)

    # Report progress every 100 epochs
    if not epoch % 100:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}')

    # Backward pass, in reverse order of the forward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimiser.update_params(dense1)
    optimiser.update_params(dense2)

epoch: 0, acc: 0.343, loss: 1.099
epoch: 100, acc: 0.407, loss: 1.083
epoch: 200, acc: 0.397, loss: 1.071
epoch: 300, acc: 0.410, loss: 1.070
epoch: 400, acc: 0.410, loss: 1.069
epoch: 500, acc: 0.413, loss: 1.067
epoch: 600, acc: 0.410, loss: 1.064
epoch: 700, acc: 0.423, loss: 1.058
epoch: 800, acc: 0.450, loss: 1.047
epoch: 900, acc: 0.430, loss: 1.050
epoch: 1000, acc: 0.427, loss: 1.045
epoch: 1100, acc: 0.440, loss: 1.038
epoch: 1200, acc: 0.453, loss: 1.029
epoch: 1300, acc: 0.400, loss: 1.019
epoch: 1400, acc: 0.480, loss: 1.026
epoch: 1500, acc: 0.413, loss: 1.003
epoch: 1600, acc: 0.397, loss: 0.994
epoch: 1700, acc: 0.443, loss: 0.976
epoch: 1800, acc: 0.403, loss: 0.995
epoch: 1900, acc: 0.463, loss: 0.973
epoch: 2000, acc: 0.487, loss: 0.969
epoch: 2100, acc: 0.470, loss: 0.956
epoch: 2200, acc: 0.497, loss: 0.951
epoch: 2300, acc: 0.480, loss: 0.936
epoch: 2400, acc: 0.470, loss: 0.915
epoch: 2500, acc: 0.493, loss: 0.904
epoch: 2600, acc: 0.587, loss: 0.862
epoch: 2700, 

The output above reports the epoch, the model’s accuracy, and the loss every 100 epochs. The loss trends downward and the accuracy upward overall, although neither improves monotonically from one report to the next.