# Chapter 9 : Backpropagation

Let’s start with a simplified forward pass with just one neuron. Rather than backpropagating from the loss function for a full neural network, let’s backpropagate the ReLU function for a single neuron and act as if we intend to minimize the output for this single neuron. We’re first doing this only as a demonstration to simplify the explanation, since minimizing the output from a ReLU activated 
neuron doesn’t serve any purpose other than as an exercise.

In [1]:
# A single example neuron: three inputs, three weights, one bias
x = [1.0, -2.0, 3.0]   # values fed into the neuron
w = [-3.0, -1.0, 2.0]  # one weight per input value
b = 1.0                # bias term

In [2]:
# Forward pass, step 1: scale every input by its matching weight
xw0 = w[0] * x[0]
xw1 = w[1] * x[1]
xw2 = w[2] * x[2]

# Step 2: add the weighted inputs together with the bias
z = xw0 + xw1 + xw2 + b

# Step 3: ReLU activation - the larger of z and 0
y = max(z, 0)

In [3]:
# Backward pass
# Incoming gradient value used for this demonstration
dvalue = 1.0

# Chain rule through ReLU: its derivative with respect to z
# (z denotes the weighted sum only) is 1 when z > 0 and 0 otherwise
drelu_dz = dvalue * float(z > 0)
print(drelu_dz)

1.0


In [4]:
# Partial derivatives of the sum, combined through the chain rule.
# Each term of the sum is given a partial derivative of 1
dsum_dxw0 = dsum_dxw1 = dsum_dxw2 = dsum_db = 1

# Chain rule: multiply each by the gradient coming back through ReLU
drelu_dxw0 = dsum_dxw0 * drelu_dz
drelu_dxw1 = dsum_dxw1 * drelu_dz
drelu_dxw2 = dsum_dxw2 * drelu_dz
drelu_db = dsum_db * drelu_dz
print(drelu_dxw0,drelu_dxw1,drelu_dxw2,drelu_db)

1.0 1.0 1.0 1.0


In [6]:
# Partial derivatives of the multiplications, combined through the chain rule.
# For a product x*w the derivative with respect to x is w, and with respect to w is x
dmul_dx0, dmul_dx1, dmul_dx2 = w[0], w[1], w[2]
dmul_dw0, dmul_dw1, dmul_dw2 = x[0], x[1], x[2]

# Chain rule: one (input, weight) gradient pair per weighted input
drelu_dx0, drelu_dw0 = drelu_dxw0 * dmul_dx0, drelu_dxw0 * dmul_dw0
drelu_dx1, drelu_dw1 = drelu_dxw1 * dmul_dx1, drelu_dxw1 * dmul_dw1
drelu_dx2, drelu_dw2 = drelu_dxw2 * dmul_dx2, drelu_dxw2 * dmul_dw2
print(drelu_dx0, drelu_dw0, drelu_dx1, drelu_dw1, drelu_dx2, drelu_dw2)

-3.0 1.0 -1.0 -2.0 2.0 3.0


In [3]:
import numpy as np
# Gradient passed in from the next layer.
# For this example it is simply a vector of 1s
dvalues = np.array([[1.0, 1.0, 1.0]])

# 3 sets of weights (one set per neuron), 4 weights in each set
# (one per input). The array is kept transposed, giving shape (4, 3)
weights = np.transpose(np.array([
    [0.2, 0.8, -0.5, 1],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]))

# For each input: multiply its weights by the passed-in gradient of
# every neuron and sum the products. Written out by hand that is
#   dx0 = sum(weights[0]*dvalues[0])
#   dx1 = sum(weights[1]*dvalues[0])
#   dx2 = sum(weights[2]*dvalues[0])
#   dx3 = sum(weights[3]*dvalues[0])
#   dinputs = np.array([dx0, dx1, dx2, dx3])
# A single dot product yields the same four values in one call
dinputs = dvalues[0].dot(weights.T)

print(dinputs)

[ 0.44 -0.38 -0.07  1.37]


In [4]:
# Passed-in gradients, now three rows instead of one
dvalues = np.array([
    [1.0, 1.0, 1.0],
    [2.0, 2.0, 2.0],
    [3.0, 3.0, 3.0],
])

# Same transposed weight matrix as before, shape (4, 3)
weights = np.transpose(np.array([
    [0.2, 0.8, -0.5, 1],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]))

# One row of input gradients per row of dvalues
dinputs = dvalues.dot(weights.T)

print(dinputs)

[[ 0.44 -0.38 -0.07  1.37]
 [ 0.88 -0.76 -0.14  2.74]
 [ 1.32 -1.14 -0.21  4.11]]


In [5]:
"""For weights"""

# Passed-in gradient values
dvalues = np.array([
    [1.0, 1.0, 1.0],
    [2.0, 2.0, 2.0],
    [3.0, 3.0, 3.0],
])

# 3 sets of inputs (samples), 4 values in each
inputs = np.array([
    [1, 2, 3, 2.5],
    [2.0, 5.0, -1.0, 2],
    [-1.5, 2.7, 3.3, -0.8],
])

# Gradient with respect to the weights: transposed inputs dotted with
# the passed-in gradients, giving a (4, 3) array
dweights = inputs.T.dot(dvalues)

print(dweights)

[[ 0.5  0.5  0.5]
 [20.1 20.1 20.1]
 [10.9 10.9 10.9]
 [ 4.1  4.1  4.1]]


In [6]:
# Gradient passed in from the next layer.
# For this example: an array of incrementally growing gradient values
dvalues = np.array([
    [1.0, 1.0, 1.0],
    [2.0, 2.0, 2.0],
    [3.0, 3.0, 3.0],
])

# One bias per neuron, stored as a row vector of shape (1, neurons)
biases = np.array([[2, 3, 0.5]])

# Gradient with respect to the biases: sum the passed-in gradients over
# the samples (first axis). keepdims=True keeps the result two-dimensional
# instead of collapsing it to a flat 1-D array
dbiases = dvalues.sum(axis=0, keepdims=True)

print(dbiases)

[[6. 6. 6.]]


In [7]:
# Example layer output
z = np.array([
    [1, 2, -3, -4],
    [2, -7, -1, 3],
    [-1, 2, 5, -1],
])

# Example passed-in gradient values, same shape as z
dvalues = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
])

# Derivative of the ReLU activation: 1 where z > 0, 0 everywhere else,
# stored with the same dtype as z
drelu = (z > 0).astype(z.dtype)

print(drelu)

# The chain rule: scale the derivative in place by the passed-in gradient
np.multiply(drelu, dvalues, out=drelu)

print(drelu)


[[1 1 0 0]
 [1 0 0 1]
 [0 1 1 0]]
[[ 1  2  0  0]
 [ 5  0  0  8]
 [ 0 10 11  0]]


In [8]:
# The weight matrix again, transposed to shape (4, 3) for inspection
weights = np.transpose(np.array([
    [0.2, 0.8, -0.5, 1],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]))

print(weights)

[[ 0.2   0.5  -0.26]
 [ 0.8  -0.91 -0.27]
 [-0.5   0.26  0.17]
 [ 1.   -0.5   0.87]]


In [9]:
# Sum of the first row of `weights`. The cell was saved with a dangling
# `*`, which is a syntax error; the output recorded below it
# (0.43999999999999995) is exactly this sum.
sum(weights[0])

0.43999999999999995

In [13]:
# A 1x3 array of ones; indexing with [0] shows its single row
dvalue = np.ones((1, 3))
dvalue[0]

array([1., 1., 1.])

In [10]:
# Show the first row of the transposed weight matrix
weights[0, :]

array([ 0.2 ,  0.5 , -0.26])

In [11]:
# Add the first two values of that row by hand
0.2 + 0.5

0.7

In [12]:
# Subtracting 0.26 reproduces the floating-point result 0.43999999999999995
0.7 - 0.26

0.43999999999999995

In [2]:
# Dense Layer
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``.

    ``forward`` remembers the batch it was given so that ``backward`` can
    compute the weight gradient from it.
    """

    # Layer initialisation
    def __init__(self, inputs, neurons):
        """Create the layer's parameters.

        inputs  -- number of input values per sample
        neurons -- number of neurons in the layer
        """
        # Small random weights of shape (inputs, neurons), zero biases of
        # shape (1, neurons). The input *count* is deliberately not stored
        # in self.inputs: that attribute holds the forward-pass batch.
        self.weights = 0.01 * np.random.randn(inputs, neurons)
        self.biases = np.zeros((1, neurons))

    # Forward pass
    def forward(self, inputs):
        """Compute ``self.output`` for a batch of samples."""
        # Remember the batch - backward() needs it for the weight gradient
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward pass
    def backward(self, dvalues):
        """Compute parameter and input gradients from the passed-in gradient.

        dvalues -- gradient passed in from the next layer; call forward()
                   first so that the batch inputs are available
        Sets ``self.dweights``, ``self.dbiases`` and ``self.dinputs``.
        """
        # Gradients on params
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)
        


In [3]:
# ReLU Activation
class Activation_ReLU:
    """Rectified linear activation with its backward pass."""

    def forward(self, inputs):
        """Store the inputs and set ``self.output`` to ``max(0, inputs)`` element-wise."""
        # The raw inputs are kept because backward() builds its mask from them
        self.inputs = inputs
        self.output = np.maximum(inputs, 0)

    def backward(self, dvalues):
        """Set ``self.dinputs`` to ``dvalues`` with blocked positions zeroed."""
        # Work on a copy so the caller's gradient array is left untouched
        grad = dvalues.copy()

        # No gradient flows where the forward input was zero or negative
        blocked = self.inputs <= 0
        grad[blocked] = 0

        self.dinputs = grad

In [6]:
# Common loss class
class Loss:
    """Base class for losses; relies on ``self.forward(output, y)`` being available."""

    def calculate(self, output, y):
        """Return the mean data loss.

        output -- model output
        y      -- ground truth values
        """
        # forward() yields the sample losses; the data loss is their mean
        return np.mean(self.forward(output, y))
        
# cross entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss.

    ``forward`` supplies the sample losses that ``Loss.calculate`` averages;
    without it ``calculate`` fails with AttributeError.
    """

    # forward pass
    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of every sample.

        y_pred -- predicted values, one row per sample
        y_true -- sparse class indices (1-D) or one-hot rows (2-D), as a NumPy array
        Raises ValueError when y_true is neither 1-D nor 2-D.
        """
        samples = len(y_pred)

        # Clip both ends so log() never receives exactly 0 and the mean
        # is not dragged towards either side
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if len(y_true.shape) == 1:
            # Sparse labels: pick the predicted value of the target class
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif len(y_true.shape) == 2:
            # One-hot labels: mask the row and sum it
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError("y_true must be 1-D (sparse) or 2-D (one-hot)")

        return -np.log(correct_confidences)

    # backward pass
    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to the gradient of the loss.

        dvalues -- values the gradient is taken with respect to; the
                   gradient divides by them element-wise
        y_true  -- sparse class indices (1-D) or one-hot rows (2-D), as a NumPy array
        """
        # Number of samples
        samples = len(dvalues)
        # Number of labels in every sample
        # We'll use the first sample to count them
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vectors
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Calculate gradient
        self.dinputs = -y_true / dvalues
        # Normalize gradient by the number of samples
        self.dinputs = self.dinputs / samples


In [4]:
# Softmax activation
class Activation_Softmax:
    """Softmax activation with its backward pass.

    ``forward`` stores ``self.output``, which ``backward`` reads.
    """

    # Forward pass
    def forward(self, inputs):
        """Turn each row of ``inputs`` into values that sum to 1.

        inputs -- 2-D array, one row per sample
        """
        # Remember input values
        self.inputs = inputs

        # Subtract the row maximum before exponentiating so large inputs
        # do not overflow; this does not change the result
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))

        # Normalize every row
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    # Backward pass
    def backward(self, dvalues):
        """Set ``self.dinputs`` from the passed-in gradient, row by row.

        dvalues -- passed-in gradient, same layout as ``self.output``;
                   call forward() first so that ``self.output`` exists
        """
        # Create uninitialized array
        self.dinputs = np.empty_like(dvalues)

        # Enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Reshape the output row into a column vector
            single_output = single_output.reshape(-1, 1)
            # Jacobian matrix of the output: diag(s) - s . s^T
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            # Calculate sample-wise gradient
            # and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix,
                                         single_dvalues)

In [5]:
# Softmax classifier - Softmax activation and cross-entropy loss
# combined into one object for a faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation and categorical cross-entropy loss handled together."""

    def __init__(self):
        """Create the activation and loss objects this classifier wraps."""
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        """Run the activation on ``inputs`` and return the loss value.

        The activation's output is also exposed as ``self.output``.
        """
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to the combined gradient.

        dvalues -- array with one row per sample
        y_true  -- discrete class indices (1-D) or one-hot rows (2-D)
        """
        batch_size = len(dvalues)

        # One-hot encoded labels are reduced to discrete class indices
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)

        # Work on a copy so the caller's array is not modified
        grad = dvalues.copy()
        # Gradient: subtract 1 at each sample's target class
        grad[np.arange(batch_size), y_true] -= 1
        # Normalize by the number of samples
        self.dinputs = grad / batch_size