# Chapter 9: Backpropagation

Let’s start with a simplified forward pass with just one neuron. Rather than backpropagating from the loss function for a full neural network, let’s backpropagate the ReLU function for a single neuron and act as if we intend to minimize the output for this single neuron. We’re first doing this only as a demonstration to simplify the explanation, since minimizing the output from a ReLU activated 
neuron doesn’t serve any purpose other than as an exercise.

In [None]:
# Example neuron with three inputs: one weight per input plus a single bias.
x = [1.0, -2.0, 3.0]   # input values
w = [-3.0, -1.0, 2.0]  # weights
b = 1.0                # bias

In [None]:
# Forward pass of the single neuron.
# Multiply each input by its matching weight.
xw0, xw1, xw2 = (x_i * w_i for x_i, w_i in zip(x, w))

# Add the weighted inputs and the bias to get the pre-activation value.
z = xw0 + xw1 + xw2 + b

# ReLU activation: anything below zero is clamped to zero.
y = max(z, 0)

In [None]:
# Backward pass.
# Gradient received from the next step of the chain; 1.0 in this example.
dvalue = 1.0

# Derivative of ReLU with respect to z (the weighted sum plus bias):
# 1 where z is positive, 0 otherwise. Multiplying by dvalue applies
# the chain rule.
drelu_dz = dvalue * float(z > 0)
print(drelu_dz)

In [None]:
# Partial derivatives of the sum, and the chain rule.
# The derivative of a sum with respect to any one of its terms is 1.
dsum_dxw0 = dsum_dxw1 = dsum_dxw2 = dsum_db = 1

# Chain rule: gradient of the ReLU output with respect to each term of
# the sum (the three weighted inputs and the bias).
drelu_dxw0 = drelu_dz * dsum_dxw0
drelu_dxw1 = drelu_dz * dsum_dxw1
drelu_dxw2 = drelu_dz * dsum_dxw2
drelu_db = drelu_dz * dsum_db
print(drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db)

In [None]:
# Partial derivatives of the multiplications, and the chain rule.
# For a product x*w, the derivative with respect to x is w and the
# derivative with respect to w is x.
dmul_dx0, dmul_dx1, dmul_dx2 = w[0], w[1], w[2]
dmul_dw0, dmul_dw1, dmul_dw2 = x[0], x[1], x[2]

# Chain rule: gradient of the ReLU output with respect to every input...
drelu_dx0 = drelu_dxw0 * dmul_dx0
drelu_dx1 = drelu_dxw1 * dmul_dx1
drelu_dx2 = drelu_dxw2 * dmul_dx2

# ...and with respect to every weight.
drelu_dw0 = drelu_dxw0 * dmul_dw0
drelu_dw1 = drelu_dxw1 * dmul_dw1
drelu_dw2 = drelu_dxw2 * dmul_dw2

print(drelu_dx0, drelu_dw0, drelu_dx1, drelu_dw1, drelu_dx2, drelu_dw2)

In [None]:
import numpy as np
# Gradient handed back by the next layer. For this example it is a
# single sample whose three values are all 1.
dvalues = np.array([[1., 1., 1.]])

# Three neurons with four weights each (one weight per input).
# The array is transposed so each row holds the weights of one input
# and each column holds the weights of one neuron.
weights = np.array([
    [0.2, 0.8, -0.5, 1],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]).T

# Manual form: for each input, multiply its weights by the incoming
# gradient of the matching neuron and sum the products.
# dx0 = sum(weights[0]*dvalues[0])
# dx1 = sum(weights[1]*dvalues[0])
# dx2 = sum(weights[2]*dvalues[0])
# dx3 = sum(weights[3]*dvalues[0])
# dinputs = np.array([dx0, dx1, dx2, dx3])

# The same four sums expressed as one matrix product.
dinputs = dvalues[0] @ weights.T

print(dinputs)

In [None]:
# Incoming gradient for a batch of three samples (rows) and three
# neurons (columns).
dvalues = np.array([
    [1., 1., 1.],
    [2., 2., 2.],
    [3., 3., 3.],
])

# Weights stored transposed: shape (4 inputs, 3 neurons).
weights = np.array([
    [0.2, 0.8, -0.5, 1],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]).T

# Gradient with respect to the inputs: one row per sample, one column
# per input.
dinputs = dvalues @ weights.T

print(dinputs)

In [None]:
"""For weights"""

# Incoming gradient: one row per sample, one column per neuron.
dvalues = np.array([
    [1., 1., 1.],
    [2., 2., 2.],
    [3., 3., 3.],
])

# Three samples with four input features each.
inputs = np.array([
    [1, 2, 3, 2.5],
    [2., 5., -1., 2],
    [-1.5, 2.7, 3.3, -0.8],
])

# Gradient with respect to the weights: for every (input, neuron) pair,
# multiply the input value by that neuron's incoming gradient and sum
# over the samples. The result has the same shape as the weights,
# (4 inputs, 3 neurons).
dweights = inputs.T @ dvalues

print(dweights)

In [None]:
# Gradient handed back by the next layer. The example uses values that
# grow by one from sample to sample.
dvalues = np.array([
    [1., 1., 1.],
    [2., 2., 2.],
    [3., 3., 3.],
])

# One bias per neuron, kept as a row vector of shape (1, neurons).
biases = np.array([[2, 3, 0.5]])

# Bias gradient: add the incoming gradients over the samples (axis 0).
# keepdims=True keeps the result two-dimensional, shape (1, neurons),
# so it matches the shape of the biases.
dbiases = np.sum(dvalues, axis=0, keepdims=True)

print(dbiases)

In [None]:
# Example pre-activation output of a layer (3 samples, 4 neurons).
z = np.array([
    [1, 2, -3, -4],
    [2, -7, -1, 3],
    [-1, 2, 5, -1],
])

# Example gradient arriving from the next layer, same shape as z.
dvalues = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
])

# Derivative of ReLU: 1 where z is positive, 0 everywhere else.
drelu = (z > 0).astype(z.dtype)

print(drelu)

# Chain rule: scale the local derivative by the incoming gradient.
drelu = drelu * dvalues

print(drelu)


In [None]:
weights = np.array([[0.2, 0.8, -0.5, 1], 
                    [0.5, -0.91, 0.26, -0.5], 
                    [-0.26, -0.27, 0.17, 0.87]]).T

print(weights)

In [None]:
# Sum of the weights attached to the first input (0.2 + 0.5 - 0.26).
sum(weights[0])

In [None]:
dvalue = np.array([[1.,1.,1.]])
dvalue[0]

In [None]:
weights[0]

In [None]:
0.2+0.5

In [None]:
0.7-0.26

In [None]:
# Dense Layer
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``."""

    # Layer initialisation
    def __init__(self, inputs, neurons):
        """Create the layer parameters.

        inputs  -- number of input features per sample
        neurons -- number of neurons (outputs) in the layer
        """
        # Weights have shape (inputs, neurons), so the forward pass needs
        # no transpose. Biases are a (1, neurons) row of zeros.
        self.weights = 0.01 * np.random.randn(inputs, neurons)
        self.biases = np.zeros((1, neurons))

    # Forward pass
    def forward(self, inputs):
        """Compute the layer output for a batch; result is in ``self.output``.

        The last axis of ``inputs`` must match the number of input
        features the layer was created with.
        """
        # Remember the batch: backward() needs it for the weight gradient.
        # (Previously only the integer input count from __init__ was
        # stored, so backward() failed on ``self.inputs.T``.)
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward pass
    def backward(self, dvalues):
        """Compute gradients from ``dvalues``, the gradient of the next layer.

        Must be called after forward(). Sets ``self.dweights``,
        ``self.dbiases`` and ``self.dinputs``.
        """
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        # Sum over samples; keepdims keeps the (1, neurons) bias shape.
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Gradient on the input values, passed on to the previous layer
        self.dinputs = np.dot(dvalues, self.weights.T)
        


In [None]:
# ReLU Activation
class Activation_ReLU:
    """Rectified linear activation: passes positive values, zeroes the rest."""

    def forward(self, inputs):
        # Keep the inputs; backward() uses them to decide where the
        # gradient is allowed through.
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        # Positions whose forward input was zero or negative get no gradient.
        blocked = self.inputs <= 0

        # Work on a copy so the caller's gradient array is left untouched.
        gradient = dvalues.copy()
        gradient[blocked] = 0
        self.dinputs = gradient

In [None]:
# Common loss class 
class Loss:
    """Base class for losses; subclasses provide ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses.

        output -- model output, passed unchanged to ``self.forward``
        y      -- ground-truth values, passed unchanged to ``self.forward``
        """
        # forward() yields the per-sample losses; the data loss is
        # their average.
        return np.mean(self.forward(output, y))
        
# cross entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss (only the backward pass is shown here)."""

    # Backward pass
    def backward(self, dvalues, y_true):
        """Store the loss gradient with respect to ``dvalues`` in ``self.dinputs``.

        dvalues -- values the loss was computed from, one row per sample
        y_true  -- labels, either class indices (1-D) or one-hot rows (2-D)
        """
        sample_count = len(dvalues)
        # The first sample tells us how many labels each sample has.
        label_count = len(dvalues[0])

        # Sparse class indices are expanded to one-hot rows.
        if y_true.ndim == 1:
            y_true = np.eye(label_count)[y_true]

        # Gradient of the loss, then normalised by the number of samples.
        gradient = -y_true / dvalues
        self.dinputs = gradient / sample_count


In [None]:
# Softmax activation 
class Activation_Softmax:
    """Softmax activation (the forward pass is elided in this excerpt)."""
    ...

    # Backward pass
    def backward(self, dvalues):
        """Store the gradient with respect to the softmax inputs in ``self.dinputs``.

        Reads ``self.output``, which this excerpt does not set; it is
        presumably written by the elided forward pass.
        """
        # Uninitialised buffer; every row is filled in by the loop below.
        self.dinputs = np.empty_like(dvalues)

        # Handle one sample at a time: its softmax output and its
        # incoming gradient.
        for row, (probs, upstream) in enumerate(zip(self.output, dvalues)):
            # Column vector of this sample's outputs.
            column = probs.reshape(-1, 1)
            # Jacobian of softmax for this sample: diag(s) - s s^T.
            jacobian = np.diagflat(column) - column @ column.T
            # Sample-wise gradient, stored in the matching row.
            self.dinputs[row] = jacobian @ upstream

In [None]:
# Softmax classifier - combined Softmax activation 
# and cross-entropy loss for faster backward step 
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation and cross-entropy loss combined.

    Combining the two gives a much simpler, faster backward step.
    """

    def __init__(self):
        # The combined object delegates the forward pass to these two.
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    # Forward pass
    def forward(self, inputs, y_true):
        """Run the activation on ``inputs`` and return the loss value."""
        self.activation.forward(inputs)
        # Expose the activation's output as this object's output.
        self.output = self.activation.output
        return self.loss.calculate(self.output, y_true)

    # Backward pass
    def backward(self, dvalues, y_true):
        """Store the combined gradient in ``self.dinputs``.

        dvalues -- one row per sample, one column per class
        y_true  -- labels, either class indices (1-D) or one-hot rows (2-D)
        """
        sample_count = len(dvalues)

        # One-hot labels are reduced to class indices.
        if y_true.ndim == 2:
            y_true = np.argmax(y_true, axis=1)

        # Work on a copy so the caller's array is not modified.
        gradient = dvalues.copy()
        # Subtract 1 at the true class of every sample...
        gradient[range(sample_count), y_true] -= 1
        # ...and normalise by the number of samples.
        self.dinputs = gradient / sample_count

In [None]:
import numpy as np

In [None]:
dvalues = np.array([[1.,1.,1.]])



In [None]:
dvalues[0]

In [None]:
weights = np.array([[0.2, 0.8, -0.5, 1], 
                    [0.5, -0.91, 0.26, -0.5], 
                    [-0.26, -0.27, 0.17, 0.87]]).T 

In [None]:
weights

In [None]:
weights[0]

In [None]:
sum(weights[0]*dvalues[0])

In [1]:
import numpy as np

In [28]:
dvalues = np.array([[1.,1.,1.],
                    [2.,2.,2.],
                    [3.,3.,3.]])

In [8]:
weights = np.array([[0.2, 0.8, -0.5, 1], 
                    [0.5, -0.91, 0.26, -0.5], 
                    [-0.26, -0.27, 0.17, 0.87]]).T

In [9]:
weights

array([[ 0.2 ,  0.5 , -0.26],
       [ 0.8 , -0.91, -0.27],
       [-0.5 ,  0.26,  0.17],
       [ 1.  , -0.5 ,  0.87]])

In [11]:
dinputs = np.dot(dvalues,weights.T)
dinputs

array([[ 0.44, -0.38, -0.07,  1.37],
       [ 0.88, -0.76, -0.14,  2.74],
       [ 1.32, -1.14, -0.21,  4.11]])

In [15]:
inputs = np.array([[1, 2, 3, 2.5], 
                   [2., 5., -1., 2], 
                   [-1.5, 2.7, 3.3, -0.8]])

In [16]:
dweights = np.dot(inputs.T,dvalues)

In [17]:
dweights

array([[ 0.5,  0.5,  0.5],
       [20.1, 20.1, 20.1],
       [10.9, 10.9, 10.9],
       [ 4.1,  4.1,  4.1]])

In [22]:
input1 = np.array([[1,2,3,2.5]])
dvalue1 = np.array([[1.,1.,1.]])
dweight1 = np.dot(input1.T,dvalue1)
dweight1

array([[1. , 1. , 1. ],
       [2. , 2. , 2. ],
       [3. , 3. , 3. ],
       [2.5, 2.5, 2.5]])

In [23]:
input2 = np.array([[2.,5.,-1.,2.]])
dvalue2 = np.array([[2.,2.,2.]])
dweight2 = np.dot(input2.T,dvalue2)
dweight2

array([[ 4.,  4.,  4.],
       [10., 10., 10.],
       [-2., -2., -2.],
       [ 4.,  4.,  4.]])

In [24]:
input3 = np.array([[-1.5, 2.7, 3.3, -0.8]])
dvalue3 = np.array([[3.,3.,3.]])
dweight3 = np.dot(input3.T,dvalue3)
dweight3

array([[-4.5, -4.5, -4.5],
       [ 8.1,  8.1,  8.1],
       [ 9.9,  9.9,  9.9],
       [-2.4, -2.4, -2.4]])

In [25]:
# Compare with a tolerance: exact == on floats can report False for sums
# that differ only by rounding error.
np.isclose(dweight1 + dweight2 + dweight3, dweights)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [29]:
dbiases = np.sum(dvalues,axis=0,keepdims=True)

In [30]:
dbiases

array([[6., 6., 6.]])

In [1]:
import numpy as np

In [2]:
# Example layer output 
z = np.array([[1, 2, -3, -4], 
              [2, -7, -1, 3], 
              [-1, 2, 5, -1]]) 

In [3]:

dvalues = np.array([[1, 2, 3, 4], 
                    [5, 6, 7, 8], 
                    [9, 10, 11, 12]]) 

In [5]:
# ReLU's activation's derivative
drelu = np.zeros_like(z)
drelu[z>0] = 1

In [6]:
drelu

array([[1, 1, 0, 0],
       [1, 0, 0, 1],
       [0, 1, 1, 0]])

In [7]:
# the chain rule 
drelu *= dvalues
drelu

array([[ 1,  2,  0,  0],
       [ 5,  0,  0,  8],
       [ 0, 10, 11,  0]])

In [1]:
import numpy as np

In [2]:
# Example layer output 
z = np.array([[1, 2, -3, -4], 
              [2, -7, -1, 3], 
              [-1, 2, 5, -1]]) 
 
dvalues = np.array([[1, 2, 3, 4], 
                    [5, 6, 7, 8], 
                    [9, 10, 11, 12]]) 

In [3]:
drelu = dvalues.copy()

In [4]:
# ReLU passes no gradient where z is zero or negative, so z == 0 must be
# masked as well (the same `<= 0` rule the Activation_ReLU class uses).
drelu[z <= 0] = 0

In [5]:
drelu

array([[ 1,  2,  0,  0],
       [ 5,  0,  0,  8],
       [ 0, 10, 11,  0]])

In [1]:
import numpy as np


# Gradient that would be handed back by the next layer. The example uses
# values that grow by one from sample to sample.
dvalues = np.array([
    [1., 1., 1.],
    [2., 2., 2.],
    [3., 3., 3.],
])

# Three samples with four input features each.
inputs = np.array([
    [1, 2, 3, 2.5],
    [2., 5., -1., 2],
    [-1.5, 2.7, 3.3, -0.8],
])

# Three neurons with four weights each, stored transposed so the shape
# is (4 inputs, 3 neurons).
weights = np.array([
    [0.2, 0.8, -0.5, 1],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]).T

# One bias per neuron, as a row vector of shape (1, neurons).
biases = np.array([[2, 3, 0.5]])

# Forward pass: dense layer followed by ReLU.
layer_outputs = inputs @ weights + biases
relu_outputs = np.maximum(0, layer_outputs)

# Backward pass through ReLU. This stands in for the gradient a next
# layer would send back: 1 where the ReLU output is positive, else 0.
drelu = np.where(relu_outputs > 0, 1, 0)

# Backward pass through the dense layer.
# Gradient on the inputs: multiply by the weights.
dinputs = drelu @ weights.T
# Gradient on the weights: multiply by the inputs.
dweights = inputs.T @ drelu
# Gradient on the biases: sum over the samples (axis 0); keepdims keeps
# the (1, neurons) shape.
dbiases = np.sum(drelu, axis=0, keepdims=True)

# Move the parameters a small step against their gradients.
weights -= 0.001 * dweights
biases -= 0.001 * dbiases

print(weights)
print(biases)



[[ 0.1985  0.5005 -0.2615]
 [ 0.7903 -0.9147 -0.2797]
 [-0.5053  0.2537  0.1647]
 [ 0.9963 -0.5017  0.8663]]
[[1.997 2.998 0.497]]


In [2]:
a = np.array([1,-2,-9,-7,5,6,-7])
a[a<0] = 0
a

array([1, 0, 0, 0, 5, 6, 0])

In [None]:
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``."""

    # Layer initialisation
    def __init__(self, n_inputs, n_neurons):
        """Create the layer parameters.

        n_inputs  -- number of input features per sample
        n_neurons -- number of neurons (outputs) in the layer
        """
        # Weights have shape (n_inputs, n_neurons); biases are a
        # (1, n_neurons) row of zeros.
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

    # Forward pass
    def forward(self, inputs):
        """Compute the layer output for a batch of inputs."""
        # Remember the batch: backward() needs it for the weight gradient.
        self.inputs = inputs
        self.outputs = np.dot(inputs, self.weights) + self.biases
        # The other layers in this file expose their result as `output`;
        # provide that name too and keep `outputs` for existing callers.
        self.output = self.outputs

    # Backward pass
    def backward(self, dvalues):
        """Compute gradients from ``dvalues``, the gradient of the next layer.

        Must be called after forward(). Sets ``self.dweights``,
        ``self.dbiases`` and ``self.dinputs``; the parameters themselves
        are not modified.
        """
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        # Store the bias gradient in dbiases. It was previously assigned
        # to self.biases, which overwrote the layer's actual biases.
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Gradients on inputs, passed on to the previous layer
        self.dinputs = np.dot(dvalues, self.weights.T)

class Activation_ReLU:
    """ReLU activation layer."""

    # Forward pass
    def forward(self, inputs):
        # The inputs are kept so backward() can see which ones were positive.
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    # Backward pass
    def backward(self, dvalues):
        # Mask of positions that were zero or negative on the way forward.
        non_positive = self.inputs <= 0

        # Copy first: the incoming gradient array must not be changed.
        masked = dvalues.copy()
        # No gradient flows through the masked positions.
        masked[non_positive] = 0
        self.dinputs = masked
        

class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy loss (backward pass only)."""

    # Backward pass
    def backward(self, dvalues, y_true):
        """Store the loss gradient with respect to ``dvalues`` in ``self.dinputs``.

        dvalues -- values the loss was computed from, one row per sample
        y_true  -- labels, either class indices (1-D) or one-hot rows (2-D)
        """
        n_samples = len(dvalues)
        # Label count is read from the first sample.
        n_labels = len(dvalues[0])

        # Expand sparse class indices into one-hot rows.
        if y_true.ndim == 1:
            y_true = np.eye(n_labels)[y_true]

        # Gradient of the loss, normalised by the number of samples.
        raw_gradient = -y_true / dvalues
        self.dinputs = raw_gradient / n_samples

class Activation_Softmax:
    """Softmax activation (only the backward pass is defined here)."""

    # Backward pass
    def backward(self, dvalues):
        """Store the gradient with respect to the softmax inputs in ``self.dinputs``.

        Reads ``self.output``, which this class does not set itself; the
        caller must assign the softmax outputs (one row per sample)
        before calling this method.
        """
        # Uninitialised buffer; every row is filled in by the loop below.
        self.dinputs = np.empty_like(dvalues)

        # Enumerate outputs and gradients, one sample at a time
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Reshape this sample's output into a column vector
            single_output = single_output.reshape(-1, 1)

            # Jacobian matrix of softmax for this sample: diag(s) - s s^T
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)

            # Sample-wise gradient, stored in this sample's row. This must
            # stay inside the loop: when it ran after the loop, only the
            # last sample was written and the other rows kept whatever
            # np.empty_like had left in memory.
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)
        
        