In [1]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()

In [2]:
# Dense layer 
class Layer_Dense: 
 
    # Layer initialization 
    def __init__(self, n_inputs, n_neurons): 
        # Initialize weights and biases 
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons) 
        self.biases = np.zeros((1, n_neurons)) 
 
    # Forward pass 
    def forward(self, inputs): 
        # Remember input values 
        self.inputs = inputs 
        # Calculate output values from inputs, weights and biases 
        self.output = np.dot(inputs, self.weights) + self.biases 
 
    # Backward pass 
    def backward(self, dvalues): 
        # Gradients on parameters 
        self.dweights = np.dot(self.inputs.T, dvalues) 
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True) 
        # Gradient on values 
        self.dinputs = np.dot(dvalues, self.weights.T) 
 
 
# ReLU activation 
class Activation_ReLU: 
 
    # Forward pass 
    def forward(self, inputs): 
          # Remember input values 
        self.inputs = inputs 
        # Calculate output values from inputs 
        self.output = np.maximum(0, inputs) 
 
    # Backward pass 
    def backward(self, dvalues): 
        # Since we need to modify original variable, 
        # let's make a copy of values first 
        self.dinputs = dvalues.copy() 
 
        # Zero gradient where input values were negative 
        self.dinputs[self.inputs <= 0] = 0 
 
 
# Softmax activation 
class Activation_Softmax: 
 
    # Forward pass 
    def forward(self, inputs): 
        # Remember input values 
        self.inputs = inputs 
 
        # Get unnormalized probabilities 
        exp_values = np.exp(inputs - np.max(inputs, axis=1, 
                                            keepdims=True)) 
        # Normalize them for each sample 
        probabilities = exp_values/np.sum(exp_values, axis=1, 
                                            keepdims=True) 
 
        self.output = probabilities 
 
    # Backward pass 
    def backward(self, dvalues): 
 
        # Create uninitialized array 
        self.dinputs = np.empty_like(dvalues) 
 
        # Enumerate outputs and gradients 
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)): 
            # Flatten output array 
            single_output = single_output.reshape(-1, 1) 
            # Calculate Jacobian matrix of the output and 
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T) 
            # Calculate sample-wise gradient 
            # and add it to the array of sample gradients 
            self.dinputs[index] = np.dot(jacobian_matrix, 
                                         single_dvalues) 
 
 
# Common loss class 
class Loss: 
 
    # Calculates the data and regularization losses 
    # given model output and ground truth values 
    def calculate(self, output, y): 
 
        # Calculate sample losses 
        sample_losses = self.forward(output, y) 
 
        # Calculate mean loss 
        data_loss = np.mean(sample_losses) 
 
        # Return loss 
        return data_loss 
 
 
# Cross-entropy loss 
class Loss_CategoricalCrossentropy(Loss): 
 
    # Forward pass 
    def forward(self, y_pred, y_true): 
 
        # Number of samples in a batch 
        samples = len(y_pred) 
 
        # Clip data to prevent division by 0 
        # Clip both sides to not drag mean towards any value 
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7) 
 
        # Probabilities for target values - 
        # only if categorical labels 
        if len(y_true.shape) == 1: 
            correct_confidences = y_pred_clipped[ 
                range(samples), 
                y_true 
            ] 
        # Mask values - only for one-hot encoded labels 
        elif len(y_true.shape) == 2: 
            correct_confidences = np.sum( 
                y_pred_clipped * y_true, 
                axis=1 
            ) 
 
        # Losses 
        negative_log_likelihoods = -np.log(correct_confidences) 
        return negative_log_likelihoods 
 
    # Backward pass 
    def backward(self, dvalues, y_true): 
 
        # Number of samples 
        samples = len(dvalues) 
        # Number of labels in every sample 
        # We'll use the first sample to count them 
        labels = len(dvalues[0]) 
 
        # If labels are sparse, turn them into one-hot vector 
        if len(y_true.shape) == 1: 
            y_true = np.eye(labels)[y_true] 
 
        # Calculate gradient 
        self.dinputs = -y_true / dvalues 
        # Normalize gradient 
        self.dinputs = self.dinputs/samples 
 
 
# Softmax classifier - combined Softmax activation 
# and cross-entropy loss for faster backward step 
class Activation_Softmax_Loss_CategoricalCrossentropy(): 
 
    # Creates activation and loss function objects 
    def __init__(self): 
        self.activation = Activation_Softmax() 
        self.loss = Loss_CategoricalCrossentropy() 
 
    # Forward pass 
    def forward(self, inputs, y_true): 
        # Output layer's activation function 
        self.activation.forward(inputs) 
        # Set the output 
        self.output = self.activation.output 
        # Calculate and return loss value 
        return self.loss.calculate(self.output, y_true) 
    
    # Backward pass 
    def backward(self, dvalues, y_true): 
 
        # Number of samples 
        samples = len(dvalues) 
 
        # If labels are one-hot encoded, 
        # turn them into discrete values 
        if len(y_true.shape) == 2: 
            y_true = np.argmax(y_true, axis=1) 
 
        # Copy so we can safely modify 
        self.dinputs = dvalues.copy() 
        # Calculate gradient 
        self.dinputs[range(samples), y_true] -= 1 
        # Normalize gradient 
        self.dinputs = self.dinputs/samples 


In [3]:
class Optimiser_SGD:

    # initialise optimiser -set settings
    # learning rate ofg 1.0 is deafult for this optimiser
    def __init__(self,learning_rate=1.0):
        self.learning_rate = learning_rate

    # update parameters
    def update_params(self,layer):
        layer.weights += -self.learning_rate*layer.dweights
        layer.biases += -self.learning_rate*layer.dbiases

Recall that the layer object contains its parameters (weights and biases) and also, at this stage, the 
gradient that is calculated during backpropagation. We store these in the layer’s properties so that 
the optimizer can make use of them. In our main neural network code, we’d bring the 
optimization in after backpropagation. Let’s make a 1x64 densely-connected neural network (1 
hidden layer with 64 neurons) and use the same dataset as before: 

In [4]:
X,y = spiral_data(samples=100,classes=3)

In [5]:
# Create Dense layer with 2 input features and 64 output values 
dense1 = Layer_Dense(2, 64) 

In [6]:
# Create ReLU activation (to be used with Dense layer): 
activation1 = Activation_ReLU() 

In [7]:
# Create second Dense layer with 64 input features (as we take output  
# of previous layer here) and 3 output values (output values) 
dense2 = Layer_Dense(64, 3)

In [8]:
# Create Softmax classifier's combined loss and activation 
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy() 

In [9]:
#The next step is to create the optimizer’s object: 
# Create optimizer 
optimizer = Optimiser_SGD()

Now peform a forward pass of the sample data

In [10]:
# forward pass through this layer
dense1.forward(X)

In [11]:
# Perform a forward pass through activation function 
# takes the output of first dense layer here 
activation1.forward(dense1.output) 

In [12]:
# Perform a forward pass through second Dense layer 
# takes outputs of activation function of first layer as inputs 
dense2.forward(activation1.output)

In [13]:
# Perform a forward pass through the activation/loss function 
# takes the output of second dense layer here and returns loss 
loss = loss_activation.forward(dense2.output, y) 

In [14]:
# let's print the loss
print('loss:', loss)

loss: 1.0985943


In [15]:
# Calculate accuracy from output of activation2 and targets 
# calculate values along first axis 
predictions = np.argmax(loss_activation.output, axis=1) 
if len(y.shape) == 2: 
    y = np.argmax(y, axis=1) 
accuracy = np.mean(predictions==y) 
print('acc:', accuracy) 

acc: 0.36


Now do backward pass,which is called backpropagation

In [16]:
# backward pass
loss_activation.backward(loss_activation.output,y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

Use the optimiser to update the weights

In [17]:
optimizer.update_params(dense1)
optimizer.update_params(dense2)

**Each full pass through all of the training data is called an epoch.** <br>
In most deep learning tasks, a neural network will be trained for multiple epochs, though the ideal scenario would be to 
have a perfect model with ideal weights and biases after only one epoch.  To add multiple epochs 
of training into our code, we will initialize our model and run a loop around all the code 
performing the forward pass, backward pass, and optimization calculations:

In [18]:
# Create dataset 
X, y = spiral_data(samples=100, classes=3) 
 
# Create Dense layer with 2 input features and 64 output values 
dense1 = Layer_Dense(2, 64) 
 
# Create ReLU activation (to be used with Dense layer): 
activation1 = Activation_ReLU() 
 
# Create second Dense layer with 64 input features (as we take output  
# of previous layer here) and 3 output values (output values) 
dense2 = Layer_Dense(64, 3) 
 
# Create Softmax classifier's combined loss and activation 
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy() 
 
# Create optimizer 
optimiser = Optimiser_SGD() 

In [19]:
# Train in loop 
for epoch in range(10001): 
 
    # Perform a forward pass of our training data through this layer 
    dense1.forward(X) 
 
    # Perform a forward pass through activation function 
    # takes the output of first dense layer here 
    activation1.forward(dense1.output) 
 
    # Perform a forward pass through second Dense layer 
    # takes outputs of activation function of first layer as inputs 
    dense2.forward(activation1.output) 
 
    # Perform a forward pass through the activation/loss function 
    # takes the output of second dense layer here and returns loss 
    loss = loss_activation.forward(dense2.output, y)

    # Calculate accuracy from output of activation2 and targets 
    # calculate values along first axis 
    predictions = np.argmax(loss_activation.output, axis=1) 
    if len(y.shape) == 2: 
        y = np.argmax(y, axis=1) 
    accuracy = np.mean(predictions==y) 
 
    if not epoch % 100: 
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}') 
 
    # Backward pass 
    loss_activation.backward(loss_activation.output, y) 
    dense2.backward(loss_activation.dinputs) 
    activation1.backward(dense2.dinputs) 
    dense1.backward(activation1.dinputs) 
 
    # Update weights and biases 
    optimiser.update_params(dense1) 
    optimiser.update_params(dense2)

epoch: 0, acc: 0.343, loss: 1.099
epoch: 100, acc: 0.407, loss: 1.083
epoch: 200, acc: 0.397, loss: 1.071
epoch: 300, acc: 0.410, loss: 1.070
epoch: 400, acc: 0.410, loss: 1.069
epoch: 500, acc: 0.413, loss: 1.067
epoch: 600, acc: 0.410, loss: 1.064
epoch: 700, acc: 0.423, loss: 1.058
epoch: 800, acc: 0.450, loss: 1.047
epoch: 900, acc: 0.430, loss: 1.050
epoch: 1000, acc: 0.427, loss: 1.045
epoch: 1100, acc: 0.440, loss: 1.038
epoch: 1200, acc: 0.453, loss: 1.029
epoch: 1300, acc: 0.400, loss: 1.019
epoch: 1400, acc: 0.480, loss: 1.026
epoch: 1500, acc: 0.413, loss: 1.003
epoch: 1600, acc: 0.397, loss: 0.994
epoch: 1700, acc: 0.443, loss: 0.976
epoch: 1800, acc: 0.403, loss: 0.995
epoch: 1900, acc: 0.463, loss: 0.973
epoch: 2000, acc: 0.487, loss: 0.969
epoch: 2100, acc: 0.470, loss: 0.956
epoch: 2200, acc: 0.497, loss: 0.951
epoch: 2300, acc: 0.480, loss: 0.936
epoch: 2400, acc: 0.470, loss: 0.915
epoch: 2500, acc: 0.493, loss: 0.904
epoch: 2600, acc: 0.587, loss: 0.862
epoch: 2700, 

Above gives us an update of where we are (epochs), the model’s accuracy, and loss every 100 epochs. We can see consistent improvements.

## Learning Rate

- If the learning rate is too small,then small updates to the parameters caused stagnation in the model’s learning — the model got stuck in a local minimum. <br>
![](img1.png)
-  With our example here, as well as with optimizing full neural networks, we do not know where the global minimum is. How do we know if we’ve reached the global minimum or at least gotten close? <br>
The loss function measures how far the model is with its predictions to the real target values, so, as long as the loss value is not 0​ or very close to 0​, and the model stopped learning, we’re at some local minimum.
- In reality, we almost never approach a loss of 0​ for various reasons:
 1. One reason for this may be imperfect neural network hyperparameters. 
 2. Another reason for this may be insufficient data. If you did reach a loss of 0 with a neural network, you should find it suspicious: **overfitting**

We can try modifying the learning rate: <br> <br>
![](img2.png) <br> <br>
This time, the model escaped this local minimum but got stuck at another one. Let’s see one more example after another learning rate change: <br> <br>
![](img3.png) <br> <br>
This time the model got stuck at a local minimum near the global minimum. The model was able to escape the “deeper” local minimums, so it might be counter-intuitive why it is stuck here. <br>
Remember, the model follows the direction of steepest descent of the loss function, no matter how large or slight the descent is. For this reason, we’ll introduce momentum and the other techniques to prevent such situations. 


## Momentum in gradient descent

Momentum, in an optimizer, adds to the gradient what, in the physical world, we could call inertia. <br> <br>
![](img4.png) <br><br>
In the above figure you can see we used a very small learning rate here with a large momentum. The color change from green, 
through orange to red presents the advancement of the gradient descent process, the steps. We can see that the model achieved the goal and found the global minimum, but this took many steps. <br>
**Can this be done better?** <br> <br>
![](img5.png) <br> <br>
And even better: <br> <br>
![](img6.png) <br><br>
With these examples, we were able to find the global minimum in about **200, 100, and 50 steps**, respectively, by modifying the learning rate and the momentum. It’s possible to significantly shorten the training time by adjusting the parameters of the optimizer.<br>
However, we have to be careful with these hyper-parameter adjustments, as this won’t necessarily always help the model: <br><br>
![](img7.png) <br><br>
In the above case,the learning rate is set too high, the model might not be able to find the global minimum. Even, at some point, if it does, further adjustments could cause it to jump out of this minimum. The model was “jumping” around some minimum and what this might mean is that we should try to:
- lower the learning rate
- raise the momentum, or 
- possibly apply a learning rate decay (lowering the learning rate during training)

Now if we set the learning rate far too high: <br><br>
![](img8.png) <br><br>
In the above situation, the model starts “jumping” around, and moves in what we might observe as random directions. This is an example of “**overshooting**,” with every step — the direction of a change is correct, but the amount of the gradient applied is too large. In an extreme situation, we could cause a **gradient explosion**: <br><br>
![](img9.png) <br><br>
**Note:** <br><br>
A **gradient explosion** is a situation where the parameter updates cause the function’s output to rise instead of fall, and, with each step, the loss value and gradient become larger. At some point, the floating-point variable limitation causes an overflow as it cannot hold values of this size anymore, and the model is no longer able to train.<br>
It’s crucial to recognize this situation forming during training, especially for large models, where the training can take days, weeks, or more. It is possible to tune the model’s hyper-parameters in time to save the model and to continue training. <br><br>
When we choose the learning rate and the other hyper-parameters correctly, the learning process can be relatively quick:
<img src="img10.png" style="width: 45%; display: inline-block; margin-right: 5%;" />
<img src="img11.png" style="width: 45%; display: inline-block;" />

The challenge is to choose the hyper-parameters correctly, and it is not always an easy task.<br>
**Few tips:**
-  It is usually best to start with the optimizer defaults, perform a few steps, and observe the training process when 
tuning different settings.
-  It is not always possible to see meaningful results in a short-enough period of time, and, in this case, it’s good to have the ability to update the optimizer’s settings during training.
- How you choose the learning rate, and other hyper-parameters, depends on the model, data, including the amount of data, the parameter initialization method, etc. There is no single, best way to set hyper-parameters, but experience usually helps.  

For a summary of learning rates — if we plot the loss along an axis of steps: <br> <br>
![](img12.png) <br><br>
We can see various examples of relative learning rates and what loss will ideally look like as a 
graph over time (steps) of training. <br> <br>


Knowing what the learning rate should be to get the most out of your training process isn’t possible, but a good rule is that your initial training will benefit from a larger learning rate to take initial steps faster. If you start with steps that are too small, you might get stuck in a local minimum and be unable to leave it due to not making large enough updates to the parameters. 
**For example, what if we make the learning rate 0.85 rather than 1.0 with the SGD optimizer?**

In [20]:
# Create dataset 
X, y = spiral_data(samples=100, classes=3) 
# Create Dense layer with 2 input features and 64 output values 
dense1 = Layer_Dense(2, 64) 
# Create ReLU activation (to be used with Dense layer): 
activation1 = Activation_ReLU() 

# Create second Dense layer with 64 input features (as we take output 
# of previous layer here) and 3 output values (output values) 
dense2 = Layer_Dense(64, 3) 
 
# Create Softmax classifier's combined loss and activation 
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy() 
 
# Create optimizer 
optimiser = Optimiser_SGD(learning_rate=.85) 
 
# Train in loop 
for epoch in range(10001): 
 
    # Perform a forward pass of our training data through this layer 
    dense1.forward(X) 
 
    # Perform a forward pass through activation function 
    # takes the output of first dense layer here 
    activation1.forward(dense1.output) 
 
    # Perform a forward pass through second Dense layer 
    # takes outputs of activation function of first layer as inputs 
    dense2.forward(activation1.output) 
 
    # Perform a forward pass through the activation/loss function 
    # takes the output of second dense layer here and returns loss 
    loss = loss_activation.forward(dense2.output, y) 
 
    # Calculate accuracy from output of activation2 and targets 
    # calculate values along first axis 
    predictions = np.argmax(loss_activation.output, axis=1) 
    if len(y.shape) == 2: 
        y = np.argmax(y, axis=1) 
    accuracy = np.mean(predictions==y) 
 
    if not epoch % 100: 
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}') 
 
    # Backward pass 
    loss_activation.backward(loss_activation.output, y) 
    dense2.backward(loss_activation.dinputs) 
    activation1.backward(dense2.dinputs) 
    dense1.backward(activation1.dinputs) 
 
    # Update weights and biases 
    optimiser.update_params(dense1) 
    optimiser.update_params(dense2)

epoch: 0, acc: 0.320, loss: 1.099
epoch: 100, acc: 0.390, loss: 1.095
epoch: 200, acc: 0.390, loss: 1.081
epoch: 300, acc: 0.387, loss: 1.078
epoch: 400, acc: 0.403, loss: 1.078
epoch: 500, acc: 0.417, loss: 1.077
epoch: 600, acc: 0.423, loss: 1.076
epoch: 700, acc: 0.423, loss: 1.074
epoch: 800, acc: 0.447, loss: 1.072
epoch: 900, acc: 0.447, loss: 1.070
epoch: 1000, acc: 0.440, loss: 1.068
epoch: 1100, acc: 0.443, loss: 1.065
epoch: 1200, acc: 0.467, loss: 1.062
epoch: 1300, acc: 0.453, loss: 1.058
epoch: 1400, acc: 0.463, loss: 1.053
epoch: 1500, acc: 0.477, loss: 1.046
epoch: 1600, acc: 0.487, loss: 1.038
epoch: 1700, acc: 0.467, loss: 1.029
epoch: 1800, acc: 0.437, loss: 1.034
epoch: 1900, acc: 0.470, loss: 1.022
epoch: 2000, acc: 0.440, loss: 1.031
epoch: 2100, acc: 0.453, loss: 1.021
epoch: 2200, acc: 0.430, loss: 1.020
epoch: 2300, acc: 0.453, loss: 1.023
epoch: 2400, acc: 0.517, loss: 1.013
epoch: 2500, acc: 0.477, loss: 1.000
epoch: 2600, acc: 0.497, loss: 1.004
epoch: 2700, 

If you'll compare with the above case where learning_rate = 1 , we have **less accuracy** and **high loss** .<br><br>
So, it is very much possible that we git stuck in a local minima and due to smaller updates,couldn't move out of it. 
<br><br>
**NOTE:** <br>
Lower accuracy isn't always associated with higher loss and vice-versa. <br>
**Why?** <br>
-  Remember, even if we desire the best accuracy out of our model, the optimizer’s task is to decrease loss, not raise accuracy 
directly. 
- . Loss is the mean value of all of the sample losses, and some of them could drop significantly, while others might rise just slightly, changing the prediction for them from a correct to an incorrect class at the same time. 
- This would cause a lower mean loss in general, but also more incorrectly predicted samples, which will, at the same time, lower the accuracy.


**Learning** <br>
In a direct comparison of these two models in training, different learning rates did not show that the lower this learning rate value is, the better. In most cases, we want to start with a larger learning rate and decrease the learning rate over time/steps. <br><br>
A commonly-used solution to keep initial updates large and explore various learning rates during 
training is to implement a **learning rate decay.**

## Learning Rate Decay

The idea of a learning rate decay is to start with a large learning rate, say 1.0 in our case, and 
then decrease it during training. Let's program a Decay Rate which steadily decays the learning rate per batch or epoch. <br>
We are going to update the learning rate decay each step by the reciprocal  of the step count fraction. This fraction is a new **hyperparameter** that we'll add to the optimiser called **learning rate decay**.   <br> <br>
How this decaying works is it takes the step and the decaying ratio and 
multiplies them. The further in training, the bigger the step is, and the bigger result of this 
multiplication is. We then take its reciprocal (the further in training, the lower the value) and 
multiply the initial learning rate by it. The added 1​ makes sure that the resulting algorithm never 
raises the learning rate. <br><br>
For example, for the first step, we might divide 1 by the learning rate, 0.001​ for example, which will result in a current learning rate of 1000​. That’s definitely not what we wanted. 1 divided by the 1+fraction ensures that the result, a fraction of the starting learning rate, will always be less than or equal to 1, decreasing over time. That’s the desired result — start with the current learning rate and make it smaller with time. The code for determining the current decay rate: 

In [22]:
start_Learning_rate = 1
learning_rate_decay = 0.1
step = 1
learning_rate = start_Learning_rate*(1./(1 + learning_rate_decay*step))
print(learning_rate)

0.9090909090909091


In [24]:
# on step = 20
step = 20
learning_rate = start_Learning_rate*(1./(1 + learning_rate_decay*step))
print(learning_rate)

0.3333333333333333


In practice 0.1 would be considered very aggressive decay rate.

In [25]:
start_Learning_rate = 1.0
learning_rate_decay = 0.1
for step in range(21):
    learning_rate = start_Learning_rate*(1./(1 + learning_rate_decay*step))
    print(learning_rate)

1.0
0.9090909090909091
0.8333333333333334
0.7692307692307692
0.7142857142857143
0.6666666666666666
0.625
0.588235294117647
0.5555555555555556
0.5263157894736842
0.5
0.47619047619047616
0.45454545454545453
0.4347826086956522
0.41666666666666663
0.4
0.3846153846153846
0.37037037037037035
0.35714285714285715
0.3448275862068965
0.3333333333333333


This learning rate decay scheme lowers the learning rate each step using the mentioned formula. 
Initially, the learning rate drops fast, but the change in the learning rate lowers each step, letting 
the model sit as close as possible to the minimum. The model needs small updates near the end of 
training to be able to get as close to this point as possible. 

In [27]:
#SGD Optimiser
class Optimiser_SGD:

    # Initialize optimizer - set settings, 
    # learning rate of 1. is default for this optimizer
    def __init__(self,learning_rate=1.0,decay = 0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0

    """This method will update the learning rate if decay is anything other than zero"""
    # call once before any updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate*(1./(1. + self.decay*self.iterations))
    
    #update parameters
    def update_params(self,layer):
        layer.weights += -self.current_learning_rate*layer.dweights
        layer.biases += -self.current_learning_rate*layer.dbiases

    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1


Now let's train our model with a decay rate of (1e-2) i.e. 0.01

In [31]:
# Create dataset 
X, y = spiral_data(samples=100, classes=3) 
# Create Dense layer with 2 input features and 64 output values 
dense1 = Layer_Dense(2, 64) 
# Create ReLU activation (to be used with Dense layer): 
activation1 = Activation_ReLU() 

# Create second Dense layer with 64 input features (as we take output 
# of previous layer here) and 3 output values (output values) 
dense2 = Layer_Dense(64, 3) 
 
# Create Softmax classifier's combined loss and activation 
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy() 
 
# Create optimizer 
optimiser = Optimiser_SGD(decay=1e-2) 
 
# Train in loop 
for epoch in range(10001): 
 
    # Perform a forward pass of our training data through this layer 
    dense1.forward(X) 
 
    # Perform a forward pass through activation function 
    # takes the output of first dense layer here 
    activation1.forward(dense1.output) 
 
    # Perform a forward pass through second Dense layer 
    # takes outputs of activation function of first layer as inputs 
    dense2.forward(activation1.output) 
 
    # Perform a forward pass through the activation/loss function 
    # takes the output of second dense layer here and returns loss 
    loss = loss_activation.forward(dense2.output, y) 
 
    # Calculate accuracy from output of activation2 and targets 
    # calculate values along first axis 
    predictions = np.argmax(loss_activation.output, axis=1) 
    if len(y.shape) == 2: 
        y = np.argmax(y, axis=1) 
    accuracy = np.mean(predictions==y) 
 
    if not epoch % 100: 
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f} ' +
              f'lr: {optimiser.current_learning_rate}')
 
    # Backward pass 
    loss_activation.backward(loss_activation.output, y) 
    dense2.backward(loss_activation.dinputs) 
    activation1.backward(dense2.dinputs) 
    dense1.backward(activation1.dinputs) 
 
    # Update weights and biases 
    optimiser.pre_update_params()
    optimiser.update_params(dense1) 
    optimiser.update_params(dense2)
    optimiser.post_update_params()

epoch: 0, acc: 0.413, loss: 1.099 lr: 1.0
epoch: 100, acc: 0.437, loss: 1.082 lr: 0.5025125628140703
epoch: 200, acc: 0.450, loss: 1.061 lr: 0.33444816053511706
epoch: 300, acc: 0.470, loss: 1.055 lr: 0.2506265664160401
epoch: 400, acc: 0.473, loss: 1.053 lr: 0.2004008016032064
epoch: 500, acc: 0.477, loss: 1.052 lr: 0.1669449081803005
epoch: 600, acc: 0.470, loss: 1.052 lr: 0.14306151645207438
epoch: 700, acc: 0.463, loss: 1.051 lr: 0.1251564455569462
epoch: 800, acc: 0.473, loss: 1.051 lr: 0.11123470522803114
epoch: 900, acc: 0.480, loss: 1.050 lr: 0.10010010010010009
epoch: 1000, acc: 0.477, loss: 1.050 lr: 0.09099181073703366
epoch: 1100, acc: 0.480, loss: 1.049 lr: 0.08340283569641367
epoch: 1200, acc: 0.477, loss: 1.049 lr: 0.07698229407236336
epoch: 1300, acc: 0.477, loss: 1.049 lr: 0.07147962830593281
epoch: 1400, acc: 0.477, loss: 1.049 lr: 0.066711140760507
epoch: 1500, acc: 0.477, loss: 1.048 lr: 0.06253908692933083
epoch: 1600, acc: 0.480, loss: 1.048 lr: 0.0588581518540317

**Observations**: 
Notice that from the last run, you got an even poorer accuracy and higher loss. <br><br>

So,**this definitely got stuck somwhere because your learning rate decayed far too quickly and became too small,trapping the model in some local minima. Notice how loss and accuracy stopped changing very much.** <br> <br>

We can try decaying slower, try decay = 1e-3 or 0.001 :




In [32]:
# Create dataset 
X, y = spiral_data(samples=100, classes=3) 
# Create Dense layer with 2 input features and 64 output values 
dense1 = Layer_Dense(2, 64) 
# Create ReLU activation (to be used with Dense layer): 
activation1 = Activation_ReLU() 

# Create second Dense layer with 64 input features (as we take output 
# of previous layer here) and 3 output values (output values) 
dense2 = Layer_Dense(64, 3) 
 
# Create Softmax classifier's combined loss and activation 
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy() 
 
# Create optimizer 
optimiser = Optimiser_SGD(decay=1e-3) 
 
# Train in loop 
for epoch in range(10001): 
 
    # Perform a forward pass of our training data through this layer 
    dense1.forward(X) 
 
    # Perform a forward pass through activation function 
    # takes the output of first dense layer here 
    activation1.forward(dense1.output) 
 
    # Perform a forward pass through second Dense layer 
    # takes outputs of activation function of first layer as inputs 
    dense2.forward(activation1.output) 
 
    # Perform a forward pass through the activation/loss function 
    # takes the output of second dense layer here and returns loss 
    loss = loss_activation.forward(dense2.output, y) 
 
    # Calculate accuracy from output of activation2 and targets 
    # calculate values along first axis 
    predictions = np.argmax(loss_activation.output, axis=1) 
    if len(y.shape) == 2: 
        y = np.argmax(y, axis=1) 
    accuracy = np.mean(predictions==y) 
 
    if not epoch % 100: 
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f} ' +
              f'lr: {optimiser.current_learning_rate}')
 
    # Backward pass 
    loss_activation.backward(loss_activation.output, y) 
    dense2.backward(loss_activation.dinputs) 
    activation1.backward(dense2.dinputs) 
    dense1.backward(activation1.dinputs) 
 
    # Update weights and biases 
    optimiser.pre_update_params()
    optimiser.update_params(dense1) 
    optimiser.update_params(dense2)
    optimiser.post_update_params()

epoch: 0, acc: 0.297, loss: 1.099 lr: 1.0
epoch: 100, acc: 0.427, loss: 1.081 lr: 0.9099181073703367
epoch: 200, acc: 0.407, loss: 1.070 lr: 0.8340283569641367
epoch: 300, acc: 0.397, loss: 1.067 lr: 0.7698229407236336
epoch: 400, acc: 0.400, loss: 1.066 lr: 0.7147962830593281
epoch: 500, acc: 0.413, loss: 1.065 lr: 0.66711140760507
epoch: 600, acc: 0.413, loss: 1.065 lr: 0.6253908692933083
epoch: 700, acc: 0.417, loss: 1.064 lr: 0.5885815185403178
epoch: 800, acc: 0.417, loss: 1.062 lr: 0.5558643690939411
epoch: 900, acc: 0.427, loss: 1.059 lr: 0.526592943654555
epoch: 1000, acc: 0.440, loss: 1.056 lr: 0.5002501250625312
epoch: 1100, acc: 0.453, loss: 1.054 lr: 0.4764173415912339
epoch: 1200, acc: 0.460, loss: 1.052 lr: 0.45475216007276037
epoch: 1300, acc: 0.450, loss: 1.050 lr: 0.43497172683775553
epoch: 1400, acc: 0.453, loss: 1.048 lr: 0.4168403501458941
epoch: 1500, acc: 0.450, loss: 1.046 lr: 0.4001600640256102
epoch: 1600, acc: 0.460, loss: 1.044 lr: 0.3847633705271258
epoch: 1

Can we get even better results? <br>
YES,remember you might think that your starting learning rate is too high. <br><br>

**Stochastic Gradient Descent with learning rate decay can do fairly well but is still a fairly basic optimization method that only follows a gradient without any additional logic that could potentially help the model find the global minimum to the loss function. One option for improving the SGD optimizer is to introduce** momentum.

## Stochastic Gradient Descent with Momentum 

Momentum creates a rolling average of gradients over some number of updates and uses this average with the unique gradient at each step. <br>
Another way of understanding this is to imagine a ball going down a hill — even if it finds a small hole or hill, momentum will let it go straight through it towards a lower minimum — the bottom of this hill. This can help in cases where you’re stuck in some local minimum (a hole), bouncing back and forth. With momentum, a model is more likely to pass through local minimums, further decreasing loss. Simply put, momentum may still point towards the global gradient descent direction. <br><br>
![](img13.png)<br><br>
Recall this above situation from the chapter. With regular updates, the SGD optimizer might determine that the next best step is one that keeps the model in a local minimum. Remember that the gradient descent points towards the current steepest  loss ascent for that step,it's negative towards steepest descent and which may not necessarily follow descent towards global minima. We may wind up with a gradient that points in one direction and then the opposite direction in next update;<br>
the gradient could continue to bounce back and forth around a local minimum like this, keeping the optimization 
of the loss stuck.<br>
**Instead, momentum uses the previous update’s direction to influence the next update’s direction, minimizing the chances of bouncing around and getting stuck.** <br>
Recall the example below:<br><br>

![](img14.png)<br><br>

**How to utilise momentum then?** <br><br>

Set a parameter between 0 & 1,representing the farction of previous update to retain,and subtracting (adding the negative) our actual gradient,multiplied by the learning rate(like before),from it.<br>
The update contains a portion of the gradient from preceding steps as our momentum (direction of previous changes) and only a portion of the current gradient;<br> <br>
**Caution:** <br>
- The bigger the role that momentum takes in the update, the slower the update can change the direction.
- When we set the momentum fraction too high, the model might stop learning at all since the direction of the updates won’t be able to follow the global gradient descent.



The code update is as follows:<br>
```
weight_updates = self.momentum*layer.weight_momentums - self.current_learning_rates*layer.dweights
```
self.momentum is a hyperparamter choosen at the start,layer.weight_momentums starts as all zeros and get updated during training.<br>
```
layer.weight_momentums = weight_updates
```
**The momentum is always the previous update to the parameters.** <br>
The updated SGD Optmiser's class looks like this:

In [38]:
#SGD + momentum Optimiser 
class Optimiser_SGD:

    # Initialize optimizer - set settings, 
    # learning rate of 1. is default for this optimizer
    def __init__(self,learning_rate=1.0,decay = 0.,momentum = 0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    """This method will update the learning rate if decay is anything other than zero"""
    # call once before any updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate*(1./(1. + self.decay*self.iterations))
    
    """Major changes are in this method wrt vanilla SGD"""
    #update parameters
    def update_params(self,layer):

        # if we use momentum
        if self.momentum:

            # If layer does not contain momentum arrays, create them filled with zeros 
            if not hasattr(layer, 'weight_momentums'): 
                layer.weight_momentums = np.zeros_like(layer.weights) 
                # If there is no momentum array for weights 
                # The array doesn't exist for biases yet either. 
                layer.bias_momentums = np.zeros_like(layer.biases)

            # Build weight updates with momentum - take previous updates multiplied by retain factor and update with 
            # current gradients
            weight_updates = self.momentum*layer.weight_momentums - self.current_learning_rate*layer.dweights
            layer.weight_momentums = weight_updates

            # build bias updates
            bias_updates = self.momentum*layer.bias_momentums - self.current_learning_rate*layer.biases
            layer.bias_momentums = bias_updates

        # Vanilla SGD updates (as before momentum update) 
        else:
            weight_updates = -self.current_learning_rate*layer.dweights
            bias_updates = -self.current_learning_rate*layer.dbiases

        # Update weights and biases using either 
        # vanilla or momentum updates 
        layer.weights += weight_updates
        layer.biases += bias_updates

    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1


Let’s show an example illustrating how adding momentum changes the learning process.<br>
Keeping the same starting learning rate (1) and decay (1e-3) from the previous training attempt and using a momentum of 0.5:

In [53]:
# Create dataset 
X, y = spiral_data(samples=100, classes=3)  # remember this produces 300 samples
 
# Create Dense layer with 2 input features and 64 output values 
dense1 = Layer_Dense(2, 64) 
 
# Create ReLU activation (to be used with Dense layer): 
activation1 = Activation_ReLU() 
 
# Create second Dense layer with 64 input features (as we take output 
# of previous layer here) and 3 output values (output values) 
dense2 = Layer_Dense(64, 3) 
 
# Create Softmax classifier's combined loss and activation 
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy() 
 
# Create optimizer 
optimiser = Optimiser_SGD(decay=1e-3, momentum=0.8)

# Train in loop 
for epoch in range(10001): 
 
    # Perform a forward pass of our training data through this layer 
    dense1.forward(X) 
 
    # Perform a forward pass through activation function 
    # takes the output of first dense layer here 
    activation1.forward(dense1.output) 
     # Perform a forward pass through second Dense layer 
    # takes outputs of activation function of first layer as inputs 
    dense2.forward(activation1.output) 
 
    # Perform a forward pass through the activation/loss function 
    # takes the output of second dense layer here and returns loss 
    loss = loss_activation.forward(dense2.output, y) 
 
    # Calculate accuracy from output of activation2 and targets 
    # calculate values along first axis 
    predictions = np.argmax(loss_activation.output, axis=1) 
    if len(y.shape) == 2: 
        y = np.argmax(y, axis=1) 
    accuracy = np.mean(predictions==y) 
 
    if not epoch % 100: 
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimiser.current_learning_rate}') 
 
    # Backward pass 
    loss_activation.backward(loss_activation.output, y) 
    dense2.backward(loss_activation.dinputs) 
    activation1.backward(dense2.dinputs) 
    dense1.backward(activation1.dinputs) 
 
    # Update weights and biases 
    optimiser.pre_update_params() 
    optimiser.update_params(dense1) 
    optimiser.update_params(dense2) 
    optimiser.post_update_params()

epoch: 0, acc: 0.310, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.460, loss: 1.053, lr: 0.9099181073703367
epoch: 200, acc: 0.457, loss: 1.050, lr: 0.8340283569641367
epoch: 300, acc: 0.453, loss: 1.048, lr: 0.7698229407236336
epoch: 400, acc: 0.450, loss: 1.047, lr: 0.7147962830593281
epoch: 500, acc: 0.447, loss: 1.046, lr: 0.66711140760507
epoch: 600, acc: 0.447, loss: 1.045, lr: 0.6253908692933083
epoch: 700, acc: 0.450, loss: 1.045, lr: 0.5885815185403178
epoch: 800, acc: 0.437, loss: 1.044, lr: 0.5558643690939411
epoch: 900, acc: 0.437, loss: 1.044, lr: 0.526592943654555
epoch: 1000, acc: 0.437, loss: 1.044, lr: 0.5002501250625312
epoch: 1100, acc: 0.437, loss: 1.043, lr: 0.4764173415912339
epoch: 1200, acc: 0.440, loss: 1.043, lr: 0.45475216007276037
epoch: 1300, acc: 0.440, loss: 1.043, lr: 0.43497172683775553
epoch: 1400, acc: 0.437, loss: 1.043, lr: 0.4168403501458941
epoch: 1500, acc: 0.437, loss: 1.043, lr: 0.4001600640256102
epoch: 1600, acc: 0.433, loss: 1.043, lr: 0.38476337

**Observation**:<br>
So,instead of decreasing the loss even further, we came down very little. Also,notice that the loss isn't changing much as itertaions increase. Recall the cautions: <br>
- The bigger the role that momentum takes in the update, the slower the update can change the direction.
- When we set the momentum fraction too high, the model might stop learning at all since the direction of the updates won’t be able to follow the global gradient descent.

Well,I searched on the internet and found that there are multiple reasons that SGD+momentum can perform poorer than Vanilla SGD. You can try hyperparameter tuning. There might be less training data(looks like the case here). <br>
Let's try decreasing the momentum to 0.5:   

In [57]:


nnfs.init()

# Create dataset 
X, y = spiral_data(samples=100, classes=3)  # remember this produces 300 samples
 
# Create Dense layer with 2 input features and 64 output values 
dense1 = Layer_Dense(2, 64) 
 
# Create ReLU activation (to be used with Dense layer): 
activation1 = Activation_ReLU() 
 
# Create second Dense layer with 64 input features (as we take output 
# of previous layer here) and 3 output values (output values) 
dense2 = Layer_Dense(64, 3) 
 
# Create Softmax classifier's combined loss and activation 
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy() 
 
# Create optimizer 
optimiser = Optimiser_SGD(decay=1e-3, momentum=0.5)

# Train in loop 
for epoch in range(10001): 
 
    # Perform a forward pass of our training data through this layer 
    dense1.forward(X) 
 
    # Perform a forward pass through activation function 
    # takes the output of first dense layer here 
    activation1.forward(dense1.output) 
     # Perform a forward pass through second Dense layer 
    # takes outputs of activation function of first layer as inputs 
    dense2.forward(activation1.output) 
 
    # Perform a forward pass through the activation/loss function 
    # takes the output of second dense layer here and returns loss 
    loss = loss_activation.forward(dense2.output, y) 
 
    # Calculate accuracy from output of activation2 and targets 
    # calculate values along first axis 
    predictions = np.argmax(loss_activation.output, axis=1) 
    if len(y.shape) == 2: 
        y = np.argmax(y, axis=1) 
    accuracy = np.mean(predictions==y) 
 
    if not epoch % 100: 
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimiser.current_learning_rate}') 
 
    # Backward pass 
    loss_activation.backward(loss_activation.output, y) 
    dense2.backward(loss_activation.dinputs) 
    activation1.backward(dense2.dinputs) 
    dense1.backward(activation1.dinputs) 
 
    # Update weights and biases 
    optimiser.pre_update_params() 
    optimiser.update_params(dense1) 
    optimiser.update_params(dense2) 
    optimiser.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.377, loss: 1.081, lr: 0.9099181073703367
epoch: 200, acc: 0.433, loss: 1.077, lr: 0.8340283569641367
epoch: 300, acc: 0.430, loss: 1.076, lr: 0.7698229407236336
epoch: 400, acc: 0.420, loss: 1.076, lr: 0.7147962830593281
epoch: 500, acc: 0.407, loss: 1.076, lr: 0.66711140760507
epoch: 600, acc: 0.393, loss: 1.076, lr: 0.6253908692933083
epoch: 700, acc: 0.397, loss: 1.075, lr: 0.5885815185403178
epoch: 800, acc: 0.393, loss: 1.075, lr: 0.5558643690939411
epoch: 900, acc: 0.393, loss: 1.075, lr: 0.526592943654555
epoch: 1000, acc: 0.397, loss: 1.075, lr: 0.5002501250625312
epoch: 1100, acc: 0.393, loss: 1.075, lr: 0.4764173415912339
epoch: 1200, acc: 0.400, loss: 1.075, lr: 0.45475216007276037
epoch: 1300, acc: 0.407, loss: 1.074, lr: 0.43497172683775553
epoch: 1400, acc: 0.407, loss: 1.074, lr: 0.4168403501458941
epoch: 1500, acc: 0.403, loss: 1.074, lr: 0.4001600640256102
epoch: 1600, acc: 0.403, loss: 1.073, lr: 0.38476337

Not much of a change in results. <br>
**Although it didn't give desired results here but SGD Optimiser with momentum is usually one of 2 main choices for an optimizer in practice next to the Adam optimizer.** <br>
But before that the next modification to Stochastic Gradient Descent is **AdaGrad**

## AdaGrad

- AdaGrad,short for adaptive gradient,institutes a per-parameter learning rate rather than a globally-shared rate. The idea here is to normalize updates made to the features.
- During the training process, some weights can rise significantly, while others tend to not change by much. It is usually better for weights to not rise too high compared to the other weights, and we’ll talk about this with regularization techniques.
- AdaGrad provides a way to normalize parameter updates by keeping a history of previous updates — the bigger the sum of the updates is, in either direction (positive or negative), the smaller updates are made further in training. 
- This lets less-frequently updated parameters to keep-up with changes, effectively utilizing more neurons for training. 

The concept of AdaGrad can be contained in the following two lines of code:
```
cache += parm_gradient**2
parm_updates = learning_rate*parm_gradient/(sqrt(cache) + eps)
```
- The cache holds a history of squared gradients, and the parm_updates is a function of the learning rate multiplied by the gradient (basic SGD so far) and then is divided by the square root of the cache plus some epsilon value.
- **The division operation performed with a constantly rising 
cache might also cause the learning to stall as updates become smaller with time, due to the 
monotonic nature of updates.That’s why this optimizer is not widely used, except for some 
specific applications.**
- The **epsilon** is a hyperparameter(pre-training control knob setting) preventing division by 0. The epsilon value is usually a small value, such as 1e-7​, which we’ll be defaulting to.
- 