# Backpropagation
In backpropagation, the neural network adjusts its parameters in proportion to the error in its prediction. It does this by traversing backwards from the output, collecting the derivatives of the error with respect to the parameters of the function, and optimizing the parameters using `gradient descent`.

In [1]:
import platform
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Report the interpreter and library versions in use, one per line.
print(platform.python_version(), np.__version__, matplotlib.__version__, sep="\n")

# Versions noted by the author:
#   python     3.9.7
#   numpy      1.21.2
#   matplotlib 3.5.0
# NOTE(review): the output recorded for this cell (3.7.12 / 1.19.5 / 3.2.2)
# does not match the versions listed here — confirm which set is intended.

3.7.12
1.19.5
3.2.2


In [8]:
# Forward pass of a single neuron followed by a ReLU activation.
# One training example with three input features, hence three weights and
# one bias.
x = [1.0, -2.0, 3.0]   # input features
w = [-3.0, -1.0, 2.0]  # one weight per input feature
b = 1.0                # bias of the neuron

# pre-activation: weighted sum of the inputs plus the bias
z = b + np.dot(w, x)

# ReLU: negative pre-activations are clamped to 0, everything else passes through
a = 0 if z < 0 else z
print(a)

6.0


Loosely interpreted as: `ReLU[sum(inputs * weights) + bias]`\
Rewrite the equation in a form that makes the derivatives easier to work out: `y = ReLU(sum(mul(x0, w0), mul(x1, w1), mul(x2, w2), b))`

In [3]:
# Derivative of the ReLU function with respect to its input z:
# 1 when z is positive, 0 otherwise.
# Both branches are floats so the gradient has the same type whichever side
# of zero z falls on (previously int 1 on one side, float 0. on the other).
relu_dz = 1.0 if z > 0 else 0.0

In [9]:
# Backward pass through the single neuron: apply the chain rule one operation
# at a time, walking from the output back to the inputs (ReLU -> sum -> mul).

# gradient handed back by the next layer
dvalue = 1.0

# ReLU: scale the incoming gradient by the local ReLU derivative
drelu_dz = dvalue * relu_dz
print(drelu_dz)

# Sum: the partial derivative of a sum with respect to any of its operands
# is 1, so every product term and the bias receive the gradient unchanged.
dsum_dxw0 = dsum_dxw1 = dsum_dxw2 = dsum_db = 1
drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db = (
    drelu_dz * dsum_dxw0,
    drelu_dz * dsum_dxw1,
    drelu_dz * dsum_dxw2,
    drelu_dz * dsum_db,
)
print(drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db)

# Multiplication: d(x * w)/dx = w and d(x * w)/dw = x
dmul_dx0, dmul_dx1, dmul_dx2 = w
dmul_dw0, dmul_dw1, dmul_dw2 = x
drelu_dx0, drelu_dx1, drelu_dx2 = (
    drelu_dxw0 * dmul_dx0,
    drelu_dxw1 * dmul_dx1,
    drelu_dxw2 * dmul_dx2,
)
drelu_dw0, drelu_dw1, drelu_dw2 = (
    drelu_dxw0 * dmul_dw0,
    drelu_dxw1 * dmul_dw1,
    drelu_dxw2 * dmul_dw2,
)
print(drelu_dx0, drelu_dw0, drelu_dx1, drelu_dw1, drelu_dx2, drelu_dw2)

1.0
1.0 1.0 1.0 1.0
-3.0 1.0 -1.0 -2.0 2.0 3.0


In [11]:
# Backward pass for a whole layer of neurons at once.

# gradient handed back by the next layer: a 3x3 block of ones
dvalues = np.full((3, 3), 1.0)

# 3 sets of weights, one per neuron; 4 input features, thus 4 weights each.
# The array is therefore laid out as (neurons, inputs) = (3, 4).
weights = np.array([
    [0.2, 0.8, -0.5, 1.0],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
])

# For every input feature, multiply the weights attached to it by the
# incoming gradients and sum over the neurons.
# weights.T is (4, 3) and dvalues is (3, 3), so dX comes out as (4, 3).
# NOTE(review): this puts input features along the first axis, while the
# later cells keep samples along the first axis — confirm the intended layout.
dX = weights.T.dot(dvalues)
print(dX)

[[ 0.44  0.44  0.44]
 [-0.38 -0.38 -0.38]
 [-0.07 -0.07 -0.07]
 [ 1.37  1.37  1.37]]


In [12]:
# Data for a combined forward and backward pass through one dense layer.
# This cell only defines the arrays; nothing is computed here.

# 3x3 gradient values
# NOTE(review): the forward/backward cell that follows does not read
# `dvalues` — confirm whether it was meant to seed the ReLU gradient.
dvalues = np.array([
    [1.0, 1.0, 1.0],
    [2.0, 2.0, 2.0],
    [3.0, 3.0, 3.0],
])

# 3 sets of inputs (samples), 4 features each -> shape (3, 4)
inputs = np.array([
    [1.0, 2.0, 3.0, 2.5],
    [2.0, 5.0, -1.0, 2.0],
    [-1.5, 2.7, 3.3, -0.8],
])

# 3 sets of weights, one per neuron, 4 inputs each; written as
# (neurons, inputs) and transposed, so `weights` has shape (4, 3)
weights = np.array([
    [0.2, 0.8, -0.5, 1.0],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]).T

# one bias per neuron, kept as a row vector of shape (1, 3)
biases = np.array([[2.0, 3.0, 0.5]])

In [13]:
# One forward pass, one backward pass and one parameter update for the dense
# layer defined by `inputs`, `weights` and `biases`.
# NOTE: `z` and `a` are rebound here as arrays, replacing the scalar z / a of
# the single-neuron example earlier in the notebook.

# step size applied to the gradients in the update below
learning_rate = 0.001

# forward pass
z = inputs.dot(weights) + biases
a = np.maximum(z, 0)  # relu activation function

# back pass
# The ReLU output itself is used as the gradient passed back from the next
# layer; entries whose pre-activation was not positive are zeroed.
drelu = a.copy()
drelu[z <= 0] = 0

# dense layer
dweights = inputs.T.dot(drelu)                  # gradient w.r.t. the weights
dbiases = np.sum(drelu, axis=0, keepdims=True)  # summed over samples, stays 2-D

# Gradient-descent step. The update is in place, so re-running this cell
# applies another step on top of the already updated parameters.
weights -= learning_rate * dweights
biases -= learning_rate * dbiases

print(weights)
print(biases)

[[ 0.179515   0.5003665 -0.262746 ]
 [ 0.742093  -0.9152577 -0.2758402]
 [-0.510153   0.2529017  0.1629592]
 [ 0.971328  -0.5021842  0.8636583]]
[[1.98489  2.997739 0.497389]]
