# Backpropagation on RNN

## Forward pass

In [1]:
import numpy as np

In [36]:
W = np.array([2.0, 1.0])
U = np.array([[1.0, 2.0], [3.0, 4.0]])

w = np.array([2.0, 4.0])

x = np.array([0.2, 0.3, 0.4])
y = 7.0

In [37]:
h1 = np.tanh(W.dot(x[0]))
h1

array([0.37994896, 0.19737532])

In [38]:
h2 = np.tanh(W.dot(x[1]) + U.dot(h1))
h2

array([0.87975882, 0.97711011])

In [39]:
h3 = np.tanh(W.dot(x[2]) + U.dot(h2))
h3

array([0.9986059 , 0.99999815])

In [40]:
y_ = w.dot(h3)
y_

5.99720442000804

## Backward pass

In [41]:

def dtanh(x):
    """Derivative of tanh-activation function."""
    return 1 - np.tanh(x)**2


$$
\frac{\partial loss}{\partial W} = \frac{\partial loss}{\partial \hat{y}} \frac{\partial \hat{y}}{\partial h_3} \frac{\partial h_3}{\partial h_2} \frac{\partial h_2}{\partial h_1} \frac{\partial h_1}{\partial W}
$$

with
\begin{align}
\frac{\partial loss}{\partial \hat{y}} &= y - \hat{y}, \\
\frac{\partial h_t}{\partial h_{t-1}} &= g^\prime \left( W x_t + U h_{t-1}\right) \cdot U, \\
\frac{\partial h_1}{\partial W} &= g^\prime \left(W x_1 \right) \cdot x_1
\end{align}

similarly
$$
\frac{\partial loss}{\partial U} = \frac{\partial loss}{\partial \hat{y}} \frac{\partial \hat{y}}{\partial h_3} \frac{\partial h_3}{\partial h_2} \frac{\partial h_2}{\partial U}
$$
with
$$
\frac{\partial h_2}{\partial U} = g^\prime \left(W x_2 + U h_1 \right) \cdot h_1
$$


In [42]:
dldy_ = y - y_
dy_dh3 = w
dh3dh2 = dtanh(W.dot(x[2]) + U.dot(h2)).dot(U)
dh2dh1 = dtanh(W.dot(x[1]) + U.dot(h1)).dot(U)
dh1dW = dtanh(W.dot(x[0])) * x[0]
dh2dU = dtanh(W.dot(x[1]) + U.dot(h1)).dot(h1)

In [43]:
dlossdW = dldy_ * dy_dh3 * dh3dh2 * dh2dh1 * dh1dW
dlossdU = dldy_ * dy_dh3 * dh3dh2 * dh2dU

In [44]:
dlossdW

array([0.00034735, 0.00272708])

In [45]:
dlossdU

array([0.00053191, 0.00212484])

## Gradient Descent

In [46]:
eta = 0.001
W -= eta * dlossdW
U -= eta * dlossdU

In [47]:
W

array([1.99999965, 0.99999727])

In [48]:
U

array([[0.99999947, 1.99999788],
       [2.99999947, 3.99999788]])