# Automatic Differentiation and Backpropagation

## Import 

In [1]:
# Inject a CSS rule that sets the notebook's `.container` element to 90% width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import torch as t
import numpy as np

## Example 1


### Function
\
<font size="5">
$out(a,b) = (a+b)*(b+1)$
</font>

In [2]:
# Leaf tensors of the first example: scalar inputs tracked by autograd.
a, b = (t.tensor(value, requires_grad=True) for value in (2.0, 1.0))

# Parallel lists: the label of each leaf and the leaf tensor itself,
# used later when the gradients are reported.
leaf_nodes_name = ['a', 'b']
leaf_nodes = [a, b]

In [3]:
# Forward pass: e = (a + b) * (b + 1), built from two intermediate nodes.
c = a + b
d = b + 1.0
e = c * d

# Show every node in computation order; e is the output of the graph.
# The tensors are passed to print() as separate arguments so that their full
# repr (including grad_fn) is shown.
for label, node in ((' c:', c), (' d:', d), ('Output e:', e)):
    print(label, node)

 c: tensor(3., grad_fn=<AddBackward0>)
 d: tensor(2., grad_fn=<AddBackward0>)
Output e: tensor(6., grad_fn=<MulBackward0>)


![CG_examble_1.png](attachment:CG_examble_1.png)

In [4]:
# Backpropagate from the output e; afterwards the leaf tensors a and b expose
# their gradients through .grad (printed further below).
e.backward()

![CG_examble_1_PD.png](attachment:CG_examble_1_PD.png)

<font size="5">
Using the chain rule and the sum rule:

\
$\frac{\partial out}{\partial a} = \frac{\partial out}{\partial e}*\frac{\partial e}{\partial c}*\frac{\partial c}{\partial a} \\
\frac{\partial out}{\partial a} = 1*2*1 = 2$

\
$\frac{\partial out}{\partial b} = \frac{\partial out}{\partial e}*\frac{\partial e}{\partial c}*\frac{\partial c}{\partial b} + \frac{\partial out}{\partial e}*\frac{\partial e}{\partial d}*\frac{\partial d}{\partial b} \\
\frac{\partial out}{\partial b} = 1*2*1 + 1*3*1 = 5$
</font>



In [5]:
# Report each leaf tensor together with the gradient stored in its .grad.
print('', 'Gradients for all leaf nodes:', '-----------------------------', sep='\n')
for label, leaf in zip(leaf_nodes_name, leaf_nodes):
    print(label, ':')
    print(leaf)
    # The blank line after each entry comes from the doubled newline.
    print('Gradient:', leaf.grad, end='\n\n')


Gradients for all leaf nodes:
-----------------------------
a :
tensor(2., requires_grad=True)
Gradient: tensor(2.)

b :
tensor(1., requires_grad=True)
Gradient: tensor(5.)



## Example 2

### Function
\
<font size="5">
$out(x_0,w_0,x_1,w_1,w_2) = \frac{1}{-1*max(\exp((x_0*w_0)+(x_1*w_1)),w_2)}$
</font>

In [6]:
# Leaf tensors of the second example, all tracked by autograd:
# x0, x1 and w0, w1 feed the two products, w2 is the second argument of the max.
x0, w0, x1, w1, w2 = (
    t.tensor(value, requires_grad=True)
    for value in (4.0, 1.0, -2.0, 2.0, 0.5)
)

# Parallel lists: the label of each leaf and the leaf tensor itself,
# used later when the gradients are reported.
leaf_nodes_name = ['x0', 'w0', 'x1', 'w1', 'w2']
leaf_nodes = [x0, w0, x1, w1, w2]

In [7]:
# Forward pass, one elementary operation per node:
# y6 = 1 / (-1 * max(exp(x0*w0 + x1*w1), w2))
y0 = x0 * w0
y1 = x1 * w1
y2 = y0 + y1
y3 = t.exp(y2)
y4 = t.max(y3, w2)
y5 = -1.0 * y4
y6 = 1.0 / y5

# Show every node in computation order; y6 is the output of the graph.
# The tensors are passed to print() as separate arguments so that their full
# repr (including grad_fn) is shown.
nodes_in_order = (
    (' y0:', y0),
    (' y1:', y1),
    (' y2:', y2),
    (' y3:', y3),
    (' y4:', y4),
    (' y5:', y5),
    ('Output y6:', y6),
)
for label, node in nodes_in_order:
    print(label, node)

 y0: tensor(4., grad_fn=<MulBackward0>)
 y1: tensor(-4., grad_fn=<MulBackward0>)
 y2: tensor(0., grad_fn=<AddBackward0>)
 y3: tensor(1., grad_fn=<ExpBackward>)
 y4: tensor(1., grad_fn=<MaximumBackward>)
 y5: tensor(-1., grad_fn=<MulBackward0>)
Output y6: tensor(-1., grad_fn=<MulBackward0>)


**Computational Graph**

![CG_examble_2.png](attachment:CG_examble_2.png)

In [8]:
# Backpropagate from the output y6; afterwards the leaf tensors
# (x0, w0, x1, w1, w2) expose their gradients through .grad (printed further below).
y6.backward()

![CG_examble_2_PD-2.png](attachment:CG_examble_2_PD-2.png)

<font size="5">
Using the chain rule:

\
$\frac{\partial out}{\partial x_0} = \frac{\partial out}{\partial y_6}*\frac{\partial y_6}{\partial y_5}*\frac{\partial y_5}{\partial y_4}*\frac{\partial y_4}{\partial y_3}*\frac{\partial y_3}{\partial y_2} *\frac{\partial y_2}{\partial y_0} *\frac{\partial y_0}{\partial x_0} \\
\frac{\partial out}{\partial x_0} = 1*(-1)*(-1)*1*1*1*1 = 1$

\
$\frac{\partial out}{\partial w_0} = \frac{\partial out}{\partial y_6}*\frac{\partial y_6}{\partial y_5}*\frac{\partial y_5}{\partial y_4}*\frac{\partial y_4}{\partial y_3}*\frac{\partial y_3}{\partial y_2} *\frac{\partial y_2}{\partial y_0} *\frac{\partial y_0}{\partial w_0} \\
\frac{\partial out}{\partial w_0} = 1*(-1)*(-1)*1*1*1*4 = 4$

\
$\frac{\partial out}{\partial x_1} = \frac{\partial out}{\partial y_6}*\frac{\partial y_6}{\partial y_5}*\frac{\partial y_5}{\partial y_4}*\frac{\partial y_4}{\partial y_3}*\frac{\partial y_3}{\partial y_2} *\frac{\partial y_2}{\partial y_1} *\frac{\partial y_1}{\partial x_1} \\
\frac{\partial out}{\partial x_1} = 1*(-1)*(-1)*1*1*1*2 = 2$

\
$\frac{\partial out}{\partial w_1} = \frac{\partial out}{\partial y_6}*\frac{\partial y_6}{\partial y_5}*\frac{\partial y_5}{\partial y_4}*\frac{\partial y_4}{\partial y_3}*\frac{\partial y_3}{\partial y_2} *\frac{\partial y_2}{\partial y_1} *\frac{\partial y_1}{\partial w_1} \\
\frac{\partial out}{\partial w_1} = 1*(-1)*(-1)*1*1*1*(-2) = -2$

\
$\frac{\partial out}{\partial w_2} = \frac{\partial out}{\partial y_6}*\frac{\partial y_6}{\partial y_5}*\frac{\partial y_5}{\partial y_4}*\frac{\partial y_4}{\partial w_2} \\
\frac{\partial out}{\partial w_2} = 1*(-1)*(-1)*0 = 0$    

</font>



In [9]:
# Report each leaf tensor together with the gradient stored in its .grad.
print('', 'Gradients for all leaf nodes:', '-----------------------------', sep='\n')
for label, leaf in zip(leaf_nodes_name, leaf_nodes):
    print(label, ':')
    print(leaf)
    # The blank line after each entry comes from the doubled newline.
    print('Gradient:', leaf.grad, end='\n\n')


Gradients for all leaf nodes:
-----------------------------
x0 :
tensor(4., requires_grad=True)
Gradient: tensor(1.)

w0 :
tensor(1., requires_grad=True)
Gradient: tensor(4.)

x1 :
tensor(-2., requires_grad=True)
Gradient: tensor(2.)

w1 :
tensor(2., requires_grad=True)
Gradient: tensor(-2.)

w2 :
tensor(0.5000, requires_grad=True)
Gradient: tensor(0.)



## PyTorch Module example

In [10]:
import torch as t
import torch.nn as nn

### Typical Neuron in Neural Networks

![SimpleNeuron.png](attachment:SimpleNeuron.png)

### Activation Functions
[PyTorch Activations](https://pytorch.org/docs/stable/nn.html#non-linear-activations-other)

![activation_functions.png](attachment:activation_functions.png)

### Define Neuron/Layer and Input

In [11]:
# Fix the RNG seed before creating the layer: nn.Linear initialises its weight
# and bias randomly, so without a seed the values printed here (and the output
# and gradients derived from them) change on every Restart & Run All.
t.manual_seed(0)

# A single neuron: 4 inputs -> 1 output, with a bias term.
print('Layer Setup')
print('-----------')
linear_layer = nn.Linear(4,1, bias=True)
print('Layer weights w_i:',linear_layer.weight,'\n')
print('Layer bias b:',linear_layer.bias,'\n')

# No backward pass has run yet, so both .grad attributes are still None.
print('Gradients:')
print('----------')
print('Layer weights w_i gradient:',linear_layer.weight.grad,'\n')
print('Layer bias b gradient:',linear_layer.bias.grad,'\n')

# Non-linearity applied to the layer output.
print('Activation:')
print('-----------')
activation_function = nn.Tanh()
print('Activation Function:',activation_function,'\n')

# Plain input tensor: created without requires_grad, so no gradient is
# tracked for the input itself.
print('Input:')
print('------')
nn_input = t.tensor([1.0,2.0,3.0,4.0])
print('Network Input:',nn_input)
print('Network Input grad_flag:',nn_input.requires_grad,'\n')

Layer Setup
-----------
Layer weights w_i: Parameter containing:
tensor([[-0.2819, -0.1359,  0.4542,  0.1628]], requires_grad=True) 

Layer bias b: Parameter containing:
tensor([0.4950], requires_grad=True) 

Gradients:
----------
Layer weights w_i gradient: None 

Layer bias b gradient: None 

Activation:
-----------
Activation Function: Tanh() 

Input:
------
Network Input: tensor([1., 2., 3., 4.])
Network Input grad_flag: False 



### Forward Path

In [12]:
# Forward pass: apply the linear layer first, then the activation function.
pre_activation = linear_layer(nn_input)
nn_output = activation_function(pre_activation)
print('Network Output:', nn_output)

Network Output: tensor([0.9607], grad_fn=<TanhBackward>)


### Backward Path

In [13]:
# Backpropagate from the single-element network output; this fills .grad of
# the layer's weight and bias (printed in the next cell).
nn_output.backward()

In [14]:
# Show the parameter gradients that the backward pass stored on the layer.
print('Gradients:', '----------', sep='\n')
weight_grad = linear_layer.weight.grad
bias_grad = linear_layer.bias.grad
print('Layer weights w_i gradient:', weight_grad, '\n')
print('Layer bias b gradient:', bias_grad, '\n')

Gradients:
----------
Layer weights w_i gradient: tensor([[0.0770, 0.1540, 0.2310, 0.3080]]) 

Layer bias b gradient: tensor([0.0770]) 

