# Backpropagation on LSTM

In [1]:
import numpy as np
from numpy import tanh

In [2]:

# Define activation functions
def sigmoid(x):
    """Sigmoid activation function, 1 / (1 + exp(-x)), applied element-wise.

    Accepts a scalar or an array. The value is computed from exp(-|x|), which
    lies in (0, 1], so large negative inputs no longer overflow inside
    ``np.exp`` (the direct formula emits a RuntimeWarning there).
    """
    x = np.asarray(x)
    e = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + exp(-x)); for x < 0 the algebraically equal
    # exp(x) / (1 + exp(x)). Both branches only ever see e in (0, 1].
    out = np.where(x >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with an empty tuple turns a 0-d result back into a NumPy
    # scalar and leaves n-d arrays unchanged.
    return out[()]


def dsigmoid(x):
    """Return the sigmoid derivative at x: sigmoid(x) * (1 - sigmoid(x))."""
    s = sigmoid(x)
    return s * (1 - s)


def dtanh(x):
    """Return the tanh derivative at x: 1 - tanh(x)**2."""
    squashed = np.tanh(x)
    return 1 - squashed**2


## forward pass
$loss = ||y-\hat{y}||^2$

$\hat{y}_t = w^\prime h_t$, where $w$ is an $n$-dimensional vector of weights

$h_t = o_t * g(c_t)$, where $*$ denotes element-wise multiplication and $g = \tanh$

$c_t = f_t * c_{t-1} + i_t * \tilde{c}_t$

$\tilde{c}_t = g(W_c x_t + U_c h_{t-1})$

and for the gates:
\begin{align}
o_t &= \sigma\left( W_{o} x_t + U_{o} h_{t-1} \right)\\
f_t &= \sigma\left( W_{f} x_t + U_{f} h_{t-1} \right)\\
i_t &= \sigma\left( W_{i} x_t + U_{i} h_{t-1} \right)\\
\end{align}

In [3]:
# The sample: a sequence of three scalar inputs and one scalar target
x = np.array([0.2, 0.3, 0.4])
y = 7.0

# Initialize Weights
# Input weights: each x_t is a scalar, so these are vectors with one entry
# per hidden unit.
Wc = np.array([0.2, 0.4])  # candidate update
Wo = np.array([0.1, 3.1])  # output gate
Wf = np.array([2.3, 0.2])  # forget gate
Wi = np.array([3.1, 0.1])  # input gate

# Recurrent weights, applied to the previous hidden state
Uc = np.array([
    [1.8, 3.6],
    [4.7, 2.9],
])
Uo = np.array([
    [0.1, 0.9],
    [0.7, 4.3],
])
Uf = np.array([
    [3.6, 4.1],
    [1.0, 0.9],
])
Ui = np.array([
    [1.5, 2.6],
    [2.1, 0.2],
])

# Read-out weights mapping the hidden state to the prediction
w = np.array([2.0, 4.0])

In [11]:
# Forward pass
# h and c hold the all-zero initial state at index 0, so step t reads
# h[t] / c[t] and appends h[t+1] / c[t+1]. The other containers start empty
# and are indexed directly by t.
h, c = [np.zeros_like(Wc)], [np.zeros_like(Wc)]
o, f, i, c_, y_ = [], [], [], [], []

for t, xt in enumerate(x):
    h_prev, c_prev = h[t], c[t]

    # Gate activations
    ot = sigmoid(Wo.dot(xt) + Uo.dot(h_prev))
    ft = sigmoid(Wf.dot(xt) + Uf.dot(h_prev))
    it = sigmoid(Wi.dot(xt) + Ui.dot(h_prev))

    # Candidate update and the new cell state built from it
    c_t = tanh(Wc.dot(xt) + Uc.dot(h_prev))
    ct = ft * c_prev + it * c_t

    # New hidden state and the prediction read out from it at step t
    ht = ot * tanh(ct)
    y_t = w.dot(ht)

    # Save this step's values to their containers
    for store, value in (
        (h, ht),
        (o, ot),
        (f, ft),
        (i, it),
        (c_, c_t),
        (c, ct),
        (y_, y_t),
    ):
        store.append(value)


## backward pass



We need the following gradients


* candidate update: $\frac{\partial loss}{\partial W_c}$ & $\frac{\partial loss}{\partial U_c}$
* output gate update: $\frac{\partial loss}{\partial W_o}$ & $\frac{\partial loss}{\partial U_o}$
* forget gate update: $\frac{\partial loss}{\partial W_f}$ & $\frac{\partial loss}{\partial U_f}$
* input gate update: $\frac{\partial loss}{\partial W_i}$ & $\frac{\partial loss}{\partial U_i}$
* outer layer update: $\frac{\partial loss}{\partial w}$


### Gradients for the output gate:

\begin{align}
\frac{\partial {loss}}{\partial W_o} &= \sum\frac{\partial {loss}_t}{\partial \hat{y}_t} \frac{\partial \hat{y}_t}{\partial h_t} \frac{\partial h_t}{\partial o_t} \frac{\partial o_t}{\partial W_o}\\
                                   &= -2\sum(y - \hat{y}_t)\ w * g(c_t) * \sigma^\prime \left( W_o x_t + U_o h_{t-1} \right)\ x_t
\end{align}


In [5]:
# Gradient of the loss w.r.t. the output-gate input weights Wo, summed over
# all time steps. Only the direct dependence of o_t on Wo is differentiated;
# h[t] is treated as a constant.
dLossdWo = np.zeros_like(Wo)

for t, xt in enumerate(x):
    # d loss_t / d y_hat_t for loss_t = (y - y_hat_t)**2 is -2 * (y - y_hat_t);
    # the sign matters because the weights are later updated by subtraction.
    dLossdy_t = -2.0 * (y - y_[t])
    dotdWo = dsigmoid(Wo.dot(xt) + Uo.dot(h[t])) * xt
    # Note that c = (c0, c1, c2, c3), thus we need to index it differently
    dLossdWo += dLossdy_t * w * tanh(c[t+1]) * dotdWo

And for the gradient of the loss function with respect to $U_o$ we get

$$
\frac{\partial loss}{\partial U_o} = \sum\frac{\partial {loss}_t}{\partial \hat{y}_t} \frac{\partial \hat{y}_t}{\partial h_t} \frac{\partial h_t}{\partial o_t} \frac{\partial o_t}{\partial U_o}.
$$

Here we need to make a distinction: the first part is the same as before $\frac{\partial {loss}_t}{\partial \hat{y}_t} \frac{\partial \hat{y}_t}{\partial h_t} \frac{\partial h_t}{\partial o_t}$ which in our example is of dimension $2 \times 1$. However, the last part is of dimension $2 \times 2$ since it is the Jacobian of 

$$
U_o h_{t-1} = \begin{bmatrix} u_{11} h_1 + u_{12} h_2 \\ u_{21} h_1 + u_{22} h_2 \end{bmatrix} $$
which is given by 
$$
\frac{\partial U_o h_{t-1}}{\partial U_o} = \begin{bmatrix} h_1 & h_2 \\ h_1 & h_2 \end{bmatrix} = \begin{bmatrix} h^T_{t-1} \\ h^T_{t-1} \end{bmatrix}, \text{ where } h_{t-1} = \begin{bmatrix}h_1 \\ h_2 \end{bmatrix}
$$

In `numpy` this is not a problem, just a simple broadcasting operation. In mathematical notation we need to insert a fitting diagonal matrix in order to multiply the first row of $\frac{\partial {loss}_t}{\partial \hat{y}_t} \frac{\partial \hat{y}_t}{\partial h_t} \frac{\partial h_t}{\partial o_t}$ with all entries of $\frac{\partial U_o h_{t-1}}{\partial U_o}$'s first row and the second row of the vector with all elements in the matrix' second row.

\begin{align}
\frac{\partial loss}{\partial U_o} &= \sum\frac{\partial {loss}_t}{\partial \hat{y}_t} \frac{\partial \hat{y}_t}{\partial h_t} \frac{\partial h_t}{\partial o_t} \frac{\partial o_t} {\partial U_o} \\
                                   &= -2\sum(y-\hat{y}_t)\ \textbf{diag}\left[ w * g(c_t) * \sigma^\prime \left(W_o x_t + U_o h_{t-1} \right) \right] \cdot \begin{bmatrix} h^T_{t-1} \\ h^T_{t-1} \end{bmatrix}
\end{align}

where $*$ denotes element-wise multiplication.

In [64]:
# Gradient of the loss w.r.t. the output-gate recurrent weights Uo, summed
# over all time steps. h[t] (the previous hidden state) is treated as a
# constant.
dLossdUo = np.zeros_like(Uo)

for t, xt in enumerate(x):
    # d loss_t / d y_hat_t for loss_t = (y - y_hat_t)**2 is -2 * (y - y_hat_t);
    # the sign matters because the weights are later updated by subtraction.
    dLossdy_t = -2.0 * (y - y_[t])
    dy_dot = w * tanh(c[t+1]) * dsigmoid(Wo.dot(xt) + Uo.dot(h[t]))
    # Row k of the outer product is dy_dot[k] * h[t], i.e. every entry of
    # row k of Uo gets the k-th upstream factor times the matching h entry.
    dLossdUo += dLossdy_t * np.outer(dy_dot, h[t])

### Gradients for the input gate

\begin{align}
\frac{\partial loss}{\partial W_i} &= \sum\frac{\partial loss_t}{\partial \hat{y}_t}\frac{\partial \hat{y}_t}{\partial h_t}\frac{\partial h_t}{\partial c_t} \frac{\partial c_t}{\partial i_t}\frac{\partial i_t}{\partial W_i} \\
                                   &= -2\sum(y-\hat{y}_t)\ w * \left[o_t * g^\prime(c_t)\right] * \tilde{c}_t* \sigma^\prime \left( W_i x_t + U_i h_{t-1} \right)\ x_t
\end{align}


In [65]:
# Gradient of the loss w.r.t. the input-gate input weights Wi, summed over
# all time steps. h[t] is treated as a constant.
dLossdWi = np.zeros_like(Wi)

for t, xt in enumerate(x):
    # d loss_t / d y_hat_t for loss_t = (y - y_hat_t)**2 is -2 * (y - y_hat_t);
    # the sign matters because the weights are later updated by subtraction.
    dLossdy_t = -2.0 * (y - y_[t])
    ditdWi = dsigmoid(Wi.dot(xt) + Ui.dot(h[t])) * xt
    # c has the initial state at index 0, so the cell state of step t is c[t+1]
    dLossdWi += dLossdy_t * w * (o[t] * dtanh(c[t+1])) * c_[t] * ditdWi


\begin{align}
\frac{\partial loss}{\partial U_i} &= \sum\frac{\partial loss_t}{\partial \hat{y}_t} \frac{\partial \hat{y}_t}{\partial h_t} \frac{\partial h_t}{\partial c_t} \frac{\partial c_t}{\partial i_t} \frac{\partial i_t}{\partial U_i} \\
                                   &= -2\sum(y-\hat{y}_t)\ \textbf{diag} \left[w * [ o_t * g^\prime (c_t)] * \tilde{c}_t * \sigma^\prime \left( W_i x_t + U_i h_{t-1} \right)\right] \cdot \begin{bmatrix}h^T_{t-1} \\ h^T_{t-1}\end{bmatrix}
\end{align}

In [66]:
# Gradient of the loss w.r.t. the input-gate recurrent weights Ui, summed
# over all time steps. h[t] (the previous hidden state) is treated as a
# constant.
dLossdUi = np.zeros_like(Ui)

for t, xt in enumerate(x):
    # d loss_t / d y_hat_t for loss_t = (y - y_hat_t)**2 is -2 * (y - y_hat_t);
    # the sign matters because the weights are later updated by subtraction.
    dLossdy_t = -2.0 * (y - y_[t])
    # c has the initial state at index 0, so the cell state of step t is
    # c[t+1] (c[t] would be the previous step's cell state).
    dy_dit = w * (o[t] * dtanh(c[t+1])) * c_[t] * dsigmoid(Wi.dot(xt) + Ui.dot(h[t]))
    # Row k of the outer product is dy_dit[k] * h[t]
    dLossdUi += dLossdy_t * np.outer(dy_dit, h[t])

### Gradients for the forget gate

\begin{align}
\frac{\partial loss}{\partial W_f} &= \sum\frac{\partial loss_t}{\partial \hat{y}_t}\frac{\partial \hat{y}_t}{\partial h_t}\frac{\partial h_t}{\partial c_t} \frac{\partial c_t}{\partial f_t}\frac{\partial f_t}{\partial W_f} \\
                                   &= -2\sum(y-\hat{y}_t)\ w * \left[o_t * g^\prime(c_t)\right] * c_{t-1} * \sigma^\prime \left( W_f x_t + U_f h_{t-1} \right)\ x_t
\end{align}

In [67]:
# Gradient of the loss w.r.t. the forget-gate input weights Wf, summed over
# all time steps. h[t] and c[t] (previous hidden and cell state) are treated
# as constants.
dLossdWf = np.zeros_like(Wf)

for t, xt in enumerate(x):
    # d loss_t / d y_hat_t for loss_t = (y - y_hat_t)**2 is -2 * (y - y_hat_t);
    # the sign matters because the weights are later updated by subtraction.
    dLossdy_t = -2.0 * (y - y_[t])
    dftdWf = dsigmoid(Wf.dot(xt) + Uf.dot(h[t])) * xt
    # c[t+1] is the cell state of step t, c[t] the one it was built from
    dLossdWf += dLossdy_t * w * (o[t] * dtanh(c[t+1])) * c[t] * dftdWf


\begin{align}
\frac{\partial loss}{\partial U_f} &= \sum\frac{\partial loss_t}{\partial \hat{y}_t} \frac{\partial \hat{y}_t}{\partial h_t} \frac{\partial h_t}{\partial c_t} \frac{\partial c_t}{\partial f_t} \frac{\partial f_t}{\partial U_f} \\
                                   &= -2\sum(y-\hat{y}_t)\ \textbf{diag} \left[w * [ o_t * g^\prime (c_t)] * c_{t-1} * \sigma^\prime \left( W_f x_t + U_f h_{t-1} \right)\right] \cdot \begin{bmatrix}h^T_{t-1} \\ h^T_{t-1}\end{bmatrix}
\end{align}

In [68]:
# Gradient of the loss w.r.t. the forget-gate recurrent weights Uf, summed
# over all time steps. h[t] and c[t] (previous hidden and cell state) are
# treated as constants.
dLossdUf = np.zeros_like(Uf)

for t, xt in enumerate(x):
    # d loss_t / d y_hat_t for loss_t = (y - y_hat_t)**2 is -2 * (y - y_hat_t);
    # the sign matters because the weights are later updated by subtraction.
    dLossdy_t = -2.0 * (y - y_[t])
    # c[t+1] is the cell state of step t, c[t] the one it was built from
    dy_dft = w * (o[t] * dtanh(c[t+1])) * c[t] * dsigmoid(Wf.dot(xt) + Uf.dot(h[t]))
    # Row k of the outer product is dy_dft[k] * h[t]
    dLossdUf += dLossdy_t * np.outer(dy_dft, h[t])

### Gradients for the candidate update

\begin{align}
\frac{\partial loss}{\partial W_c} &= \sum\frac{\partial loss_t}{\partial \hat{y}_t}\frac{\partial \hat{y}_t}{\partial h_t}\frac{\partial h_t}{\partial c_t}\frac{\partial c_t}{\partial \tilde{c}_t} \frac{\partial \tilde{c}_t}{\partial W_c} \\
                                   &= -2\sum(y - \hat{y}_t)\ w * [o_t * g^\prime(c_t)] * i_t * g^\prime(W_c x_t + U_c h_{t-1})\ x_t
\end{align}

In [69]:
# Gradient of the loss w.r.t. the candidate-update input weights Wc, summed
# over all time steps. h[t] is treated as a constant.
dLossdWc = np.zeros_like(Wc)

for t, xt in enumerate(x):
    # d loss_t / d y_hat_t for loss_t = (y - y_hat_t)**2 is -2 * (y - y_hat_t);
    # the sign matters because the weights are later updated by subtraction.
    dLossdy_t = -2.0 * (y - y_[t])
    dc_tdWc = dtanh(Wc.dot(xt) + Uc.dot(h[t])) * xt
    # c has the initial state at index 0, so the cell state of step t is c[t+1]
    dLossdWc += dLossdy_t * w * (o[t] * dtanh(c[t+1])) * i[t] * dc_tdWc

\begin{align}
\frac{\partial loss}{\partial U_c} &= \sum\frac{\partial loss_t}{\partial \hat{y}_t}\frac{\partial \hat{y}_t}{\partial h_t}\frac{\partial h_t}{\partial c_t}\frac{\partial c_t}{\partial \tilde{c}_t}\frac{\partial \tilde{c}_t}{\partial U_c} \\
                          &= -2\sum(y - \hat{y}_t)\ \textbf{diag} \left[w * [o_t * g^\prime(c_t)] * i_t * g^\prime(W_c x_t + U_c h_{t-1})\right] \begin{bmatrix} h^T_{t-1} \\ h^T_{t-1} \end{bmatrix}
\end{align}

In [70]:
# Gradient of the loss w.r.t. the candidate-update recurrent weights Uc,
# summed over all time steps. h[t] (the previous hidden state) is treated as
# a constant.
dLossdUc = np.zeros_like(Uc)

for t, xt in enumerate(x):
    # d loss_t / d y_hat_t for loss_t = (y - y_hat_t)**2 is -2 * (y - y_hat_t);
    # the sign matters because the weights are later updated by subtraction.
    dLossdy_t = -2.0 * (y - y_[t])
    # Derivative of the prediction w.r.t. the candidate's pre-activation;
    # c[t+1] is the cell state of step t.
    dy_dc_t = w * (o[t] * dtanh(c[t+1])) * i[t] * dtanh(Wc.dot(xt) + Uc.dot(h[t]))
    # Row k of the outer product is dy_dc_t[k] * h[t]
    dLossdUc += dLossdy_t * np.outer(dy_dc_t, h[t])

### Weights of the outer layer

\begin{align}
\frac{\partial loss}{\partial w} &= \sum\frac{\partial loss_t}{\partial \hat{y}_t}\frac{\partial \hat{y}_t}{\partial w} \\
                                 &= -2\sum (y-\hat{y}_t)\, h_t
\end{align}

In [71]:
# Gradient of the loss w.r.t. the read-out weights w, summed over all
# time steps.
dLossdw = np.zeros_like(w)

for t in range(len(y_)):
    # d loss_t / d y_hat_t for loss_t = (y - y_hat_t)**2 is -2 * (y - y_hat_t),
    # and d y_hat_t / d w is the hidden state of step t. h has the initial
    # state at index 0, so that hidden state is h[t+1].
    dLossdw += -2.0 * (y - y_[t]) * h[t+1]

### Update the weights


In [72]:
# Step size for the single update below
eta = 0.01

# Subtract eta times the accumulated dLossd* array from every parameter.
# `-=` on a NumPy array writes into the existing array, so the module-level
# weights are modified in place (re-running this cell applies another step).
# NOTE(review): this lowers the loss only if each dLossd* array holds the
# actual derivative of the loss w.r.t. its parameter - confirm the sign.
for param, grad in (
    (Wc, dLossdWc),
    (Wo, dLossdWo),
    (Wf, dLossdWf),
    (Wi, dLossdWi),
    (Uc, dLossdUc),
    (Uo, dLossdUo),
    (Uf, dLossdUf),
    (Ui, dLossdUi),
    (w, dLossdw),
):
    param -= eta * grad