In [14]:
import numpy as np
import pandas as pd
from numpy import tanh

In [15]:
# Define activation functions
def sigmoid(x):
    """Sigmoid activation function."""
    return 1 / (1 + np.exp(-x))


def dsigmoid(x):
    """Derivative of sigmoid function."""
    return sigmoid(x) * (1 - sigmoid(x))


def dtanh(x):
    """Derivative of tanh function."""
    return 1 - np.tanh(x)**2


$
% Vector shortcuts
\newcommand{\Htil}{\color{green}{\tilde{h}_t}}
\newcommand{\H}{\color{green}{h_{t-1}}}
\newcommand{\R}{\color{blue}{r_t}}
\newcommand{\Z}{\color{red}{z_t}}
% Gates Matrix Notation
\newcommand{\Gfull}[1][]{\color{green}{g^{#1}\left( W_h x_t + U_h \cdot ( \R * \H)\right)}}
\newcommand{\Zfull}[1][]{\color{red}{\sigma^{#1} \left( W_z x_t + U_z \cdot h_{t-1} \right)}}
\newcommand{\Rfull}[1][]{\color{blue}{\sigma^{#1} \left( W_r x_t + U_r \cdot h_{t-1} \right)}}
% Vector Elements
\newcommand{\htil}[1]{\color{green}{\tilde{h}^t_{#1}}}
\newcommand{\h}[1]{\color{green}{h_{#1}^{t-1}}}
\newcommand{\z}[1]{\color{red}{z_{#1}^{t}}}
\newcommand{\r}[1]{\color{blue}{r_{#1}^{t}}}
% Explicit Vector Elements
\newcommand{\gfull}[2][]{\color{green}{g^{#1} \left( w^h_{#2} x_t + u^h_{#2 1} \r{1} \h{1} + u^h_{#2 2} \r{2} \h{2} \right)}}
\newcommand{\zfull}[2][]{\color{red}{\sigma^{#1} \left(w^z_{#2} x_t + u^z_{#2 1}h_1^{t-1}+u^z_{#2 2} h_2^{t-1}\right)}}
\newcommand{\rfull}[2][]{\color{blue}{\sigma^{#1} \left(w^r_{#2} x_t + u^r_{#2 1}h_1^{t-1}+u^r_{#2 2} h_2^{t-1}\right)}}
\newcommand{\gfullfull}[2][]{\color{green}{g^{#1} \left( w^h_{#2} x_t + u^h_{#2 1} \rfull{1} \h{1} + u^h_{#2 2} \rfull{2} \h{2} \right)}}
% Weight Matrices
\newcommand{\Wh}{\color{green}{W_h}}
\newcommand{\Wz}{\color{red}{W_z}}
\newcommand{\Wr}{\color{blue}{W_r}}
\newcommand{\Uh}{\color{green}{U_h}}
\newcommand{\Uz}{\color{red}{U_z}}
\newcommand{\Ur}{\color{blue}{U_r}}
% Weight Matrix Elements
\newcommand{\wh}[1]{\color{green}{w^h_{#1}}}
\newcommand{\wz}[1]{\color{red}{w^z_{#1}}}
\newcommand{\wr}[1]{\color{blue}{w^r_{#1}}}
\newcommand{\uh}[2]{\color{green}{u^h_{#1 #2}}}
\newcommand{\uz}[2]{\color{red}{u^z_{#1 #2}}}
\newcommand{\ur}[2]{\color{blue}{u^r_{#1 #2}}}
% Miscellaneous
\newcommand{\dxt}[1]{\color{#1}{x_t}}
\newcommand{\yhat}{\hat{y}_t}
\newcommand{\deriv}[2]{\frac{\partial #1}{\partial #2}}
\DeclareMathOperator{\diag}{diag}
$

# Backpropagation GRU

Consider again, a low-dimensional numerical example with only one case with features $x = \begin{pmatrix}x_1, & x_2, & x_3 \end{pmatrix}^T$ and label $y$. Additionally let's assume we only have two neurons in our network. This means for the weight matrices 

\begin{alignat}{2}
\color{green}{W_h} &= \begin{bmatrix} \wh{1} \\ \wh{2} \end{bmatrix}  \quad
\color{green}{U_h} &&= \begin{bmatrix} \uh{1}{1} & \uh{1}{2} \\ \uh{2}{1} & \uh{2}{2}\end{bmatrix}\\ 
\color{red}{W_z} &= \begin{bmatrix} \wz{1} \\ \wz{2} \end{bmatrix}  \quad
\color{red}{U_z} &&= \begin{bmatrix} \uz{1}{1} & \uz{1}{2} \\ \uz{2}{1} & \uz{2}{2} \end{bmatrix}\\
\color{blue}{W_r} &= \begin{bmatrix} \wr{1} \\ \wr{2} \end{bmatrix} \quad
\color{blue}{U_r} &&= \begin{bmatrix} \ur{1}{1} & \ur{1}{2} \\ \ur{2}{1} & \ur{2}{2} \end{bmatrix} 
\end{alignat}

Define the candidate as
\begin{align}
\Htil &= \Gfull \\ &= \begin{pmatrix} \gfull{1} \\ \gfull{2} \end{pmatrix} \\ &= \begin{pmatrix} \gfullfull{1} \\ \gfullfull{2} \end{pmatrix}
\end{align}

Define the gates as
\begin{equation}
\Z = \Zfull = \begin{pmatrix} \z{1} \\ \z{2} \end{pmatrix} = \begin{pmatrix} \zfull{1} \\ \zfull{2} \end{pmatrix}  \\
\R = \Rfull = \begin{pmatrix} \r{1} \\ \r{2} \end{pmatrix} = \begin{pmatrix} \rfull{1} \\ \rfull{2} \end{pmatrix},
\end{equation}

where $g$ is the $tanh$-function and $\sigma$ the sigmoid function.

And the final update of the hidden state as

\begin{align}
h_t &= \color{red}{1-\Z} * h_{t-1} + \Z * \Htil \\ 
    &= 
\begin{pmatrix}
\color{red}{(1-\z{1})} h^{t-1}_1 + \z{1} \gfull{1} \\
\color{red}{(1-\z{2})} h^{t-1}_2 + \z{2} \gfull{2}
\end{pmatrix} \\
    &= 
\begin{pmatrix}
\color{red}{(1-\zfull{1})} h^{t-1}_1 + \zfull{1} \gfullfull{1} \\
\color{red}{(1-\zfull{2})} h^{t-1}_2 + \zfull{2} \gfullfull{2}
\end{pmatrix}
\end{align}

For the actual prediction at a step $t$, we connect every output of our hidden state in a dense layer. Which means that we are taking a weighted sum of all two of them.

\begin{align}
\hat{y}_t &= W^T h_t = w_1 h^t_1 + w_2 h^t_2 \\ &= w_1 \color{red}{(1-\zfull{1})} h^{t-1}_1 \\&\quad + \zfull{1} \gfullfull{1} \\ &+ w_2
\color{red}{(1-\zfull{2})} h^{t-1}_2 \\&\quad+ \zfull{2} \gfullfull{2},
\end{align}

Where $W = \begin{pmatrix} w_1 \\ w_2 \end{pmatrix}$ is the matrix that contains the weights for the outer layer. In our case this is just a $2\times 1$ vector.

For a simple example containing just one sample $x \in \mathbb{R}^2$, suppose that the the weights at a certain point look like that:

In [16]:
# The sample
x = np.array([0.2, 0.3, 0.4])
y = 7.0

# Initialize Weights
Wh = np.array([0.2, 0.9])

Wz = np.array([0.1, 3.1])
Wr = np.array([2.3, 0.5])

Uh = np.array([[1.5, 2.6], [1.8, 3.6]])
Uz = np.array([[0.1, 4.1], [0.2, 1.0]])
Ur = np.array([[1.3, 7.1], [9.1, 4.5]])

w = np.array([2.0, 4.0])

We can implement the forward pass like that:

In [17]:
h = []
h_ = []
z = []
r = []
y_ = []

def GRU_forward():
    """Perform forward pass."""
    h.append(np.zeros_like(Wh))
    
    for t, xt in enumerate(x, start=len(y_)):
        # Calculate values of the gates
        zt = sigmoid(Wz.dot(xt) + Uz.dot(h[t]))
        rt = sigmoid(Wr.dot(xt) + Ur.dot(h[t]))

        # Calculate candidate update
        h_t = tanh(Wh.dot(xt) + Uh.dot(rt * h[t]))

        # Calculate hidden state
        ht = (1-zt) * h[t] + zt * h_t

        # Calculate prediction at step t
        y_t = w.dot(ht)

        # Save variables to container
        h.append(ht)
        h_.append(h_t)
        z.append(zt)
        r.append(rt)
        y_.append(y_t)
    
    return y_[-1]

yhat_before = GRU_forward()

Where we want to keep track of the prediction before updating the weights, such that we can compare it with the prediction we obtain after we update our weight matrices.

### Weights for the candidate $\tilde{h}_t$

Let's start with taking partial derivatives of $\hat{y}_t$ with respect to the elemens of $\Wh$:

\begin{equation}
\frac{\partial \hat{y}_t}{\partial \wh{1}} = w_1 \z{1} \gfull[\prime]{1} \color{green}{x_t}
\end{equation}

Which is just an application of the chain-rule for derivatives. Similarly we get

\begin{equation}
\frac{\partial \hat{y}_t}{\partial \wh{2}} = w_2 \z{2} \gfull[\prime]{2} \color{green}{x_t}.
\end{equation}

We can obtain the gradient by combining the two partial derivatives in the following manner

\begin{align}
\frac{\partial \hat{y}_t}{\partial \Wh} &= 
    \begin{bmatrix} 
        \frac{\partial \hat{y}_t}{\partial \wh{1}} \\ \frac{\partial \hat{y}_t}{\partial \wh{2}}  
    \end{bmatrix}                       =
    \begin{bmatrix} 
        w_1 \z{1} \gfull{1} \color{green}{x_t} \\
        w_2 \z{2} \gfull{2} \color{green}{x_t}
    \end{bmatrix} \\
                                        &= W * \Z * \Gfull[\prime] \color{green}{x_t}.
\end{align}

Additionally, we can make use of the element-wise product in order to obtain a formulation that has a straightforward `numpy` implementation. We get the same result if we deploy the logic of the backpropagation algorithm.

\begin{align}
\frac{\partial {loss}_t}{\partial W_h} &= \frac{\partial {loss}_t}{\partial \hat{y}_t}\frac{\partial \hat{y}_t}{\partial h_t}\frac{\partial h_t}{\partial \tilde{h}_t}\frac{\partial \tilde{h}_t}{\partial \Wh} \\
                                   &= -(y-\hat{y}_t) W * \Z * \Gfull[\prime] \color{green}{x_t}
\end{align}

In [18]:
dLossdWh = np.zeros_like(Wh)

for t, xt in enumerate(x):
    # Note that `h` has an entry at start, so indexing at t accesses h_{t-1}
    dh_tdWh = dtanh(Wh.dot(xt) + Uh.dot(r[t] * h[t])) * xt
    dLossdWh += -(y-y_[t]) * w * z[t] * dh_tdWh

We continue with the partial derivatives of $\hat{y}_t$ with respect to the elements of $\Uh$

\begin{align}
\frac{\partial \hat{y}_t}{\partial \uh{1}{1}} &= w_1 \z{1} \gfull[\prime]{1} \r{1} \h{1} \\
\frac{\partial \hat{y}_t}{\partial \uh{1}{2}} &= w_1 \z{1} \gfull[\prime]{1} \r{2} \h{2} \\
\frac{\partial \hat{y}_t}{\partial \uh{2}{1}} &= w_2 \z{2} \gfull[\prime]{2} \r{1} \h{1} \\
\frac{\partial \hat{y}_t}{\partial \uh{2}{2}} &= w_2 \z{2} \gfull[\prime]{2} \r{2} \h{2}.
\end{align}

These we can combine to form the jacobian of of $\yhat$ with respect to $\Uh$

\begin{align}
\deriv{\yhat}{\Uh} &= 
\begin{bmatrix} 
    \deriv{\yhat}{\uh{1}{1}} & \deriv{\yhat}{\uh{1}{2}} \\
    \deriv{\yhat}{\uh{2}{1}} & \deriv{\yhat}{\uh{2}{2}}
\end{bmatrix}      = 
\begin{bmatrix}
w_1 \z{1} \gfull[\prime]{1} \r{1} \h{1} &
w_1 \z{1} \gfull[\prime]{1} \r{2} \h{2} \\
w_2 \z{2} \gfull[\prime]{2} \r{1} \h{1} &
w_2 \z{2} \gfull[\prime]{2} \r{2} \h{2}.
\end{bmatrix} \\
                  &=
\begin{bmatrix} w_1 & w_1 \\ w_2 & w_2 \end{bmatrix} * 
\begin{bmatrix} \z{1} & \z{1} \\ \z{2} & \z{2} \end{bmatrix} * 
\begin{bmatrix} \gfull[\prime]{1} & \gfull[\prime]{1} \\ \gfull[\prime]{2} & \gfull[\prime]{2} \end{bmatrix} * 
\begin{bmatrix} \r{1} & \r{2} \\ \r{1} & \r{2} \end{bmatrix} * 
\begin{bmatrix} \h{1} & \h{2} \\ \h{1} & \h{2} \end{bmatrix} \\
                  &=
\begin{bmatrix} w & w \end{bmatrix} * \begin{bmatrix} \Z & \Z \end{bmatrix} * \begin{bmatrix} \Gfull[\prime] & \Gfull[\prime] \end{bmatrix} * \begin{bmatrix} \R^T \\ \R^T \end{bmatrix} * \begin{bmatrix} \H^T \\ \H^T \end{bmatrix} \\
&= \diag(w) \cdot \diag(\Z) \cdot \diag(\Gfull[\prime]) \cdot \mathbb{1} \mathbb{1}^T \cdot \diag(\R) \cdot \diag(\H)
\end{align}

Again, following the logic of the backpropagation algorithm, we obtain the same result

\begin{align}
\frac{\partial {loss}_t}{\partial \Uh} &= \frac{\partial {loss}_t}{\partial \hat{y}_t}\frac{\partial \hat{y}_t}{\partial h_t}\frac{\partial h_t}{\partial \tilde{h}_t}\frac{\partial \tilde{h}_t}{\partial \Uh} \\
                                    &= -(y-\hat{y}_t)\ \diag \left[ w * \Z * \Gfull[\prime]\right] \cdot  \begin{bmatrix} (\R * \H)^T \\ (\R * \H)^T \end{bmatrix}
\end{align}

In [19]:
dLossdUh = np.zeros_like(Uh)

for t, xt in enumerate(x):
    dy_dh_t = w * z[t] * dtanh(Wh.dot(xt) + Uh.dot(r[t] * h[t]))
    dLossdUh += -(y-y_[t]) * dy_dh_t.reshape(-1, 1) * h[t]

### Weights for the update gate $z_t$

Using the same logic, we calculate the partial derivatives of $\yhat$ with respect to the elements of $\Wz$, which gives 

\begin{equation}
\deriv{\yhat}{\wz{1}} = w_1 \left(\color{red}{-} \zfull[\prime]{1}\right) \h{1} + \htil{1} \zfull[\prime]{1} \dxt{red},
\end{equation}


\begin{equation}
\deriv{\yhat}{\wz{2}} = w_2 \left(\color{red}{-} \zfull[\prime]{2}\right) \h{1} + \htil{1} \zfull[\prime]{2} \dxt{red},
\end{equation}

and when combined leaves us the gradient of $\yhat$ with respect to $\Wz$ as

\begin{align}
\deriv{\yhat}{\Wz} &= \begin{bmatrix}
\deriv{\yhat}{\wz{1}} \\ \deriv{\yhat}{\wz{2}}
\end{bmatrix} = 
\begin{bmatrix}
    w_1 \left(\color{red}{-} \zfull[\prime]{1}\right) \h{1} + \htil{1} \zfull[\prime]{1} \dxt{red} \\
    w_2 \left(\color{red}{-} \zfull[\prime]{2}\right) \h{1} + \htil{1} \zfull[\prime]{2} \dxt{red}
\end{bmatrix} \\
&= w * \begin{bmatrix} -\H + \Htil \end{bmatrix} * \Zfull[\prime] \dxt{red}
\end{align}

With that knowledge we can implement the gradient of the loss function with respect to $\Wz$ as 

\begin{align}
\frac{\partial {loss}_t}{\partial W_z} &= \frac{\partial {loss}_t}{\partial \hat{y}_t}\frac{\partial \hat{y}_t}{\partial h_t}\frac{\partial h_t}{\partial z_t}\frac{\partial z_t}{\partial W_z} \\
                                   &= -(y-\hat{y}_t)\ w * \begin{bmatrix} -\H + \Htil \end{bmatrix} * \Zfull[\prime] \dxt{red}.
\end{align}

Which in turn we can translate into `Python` code.

In [20]:
dLossdWz = np.zeros_like(Wz)

for t, xt in enumerate(x):
    dztdWz = dsigmoid(Wz.dot(xt) + Uz.dot(h[t])) * xt
    dLossdWz += -(y-y_[t]) * w * (-h[t+1] - h_[t]) * dztdWz

By now this should be getting old. We take partial derivatives of $\yhat$ with respect to each element of $\Uz$. This leaves us with

\begin{align}
\deriv{\yhat}{\uz{1}{1}} &= w_1 \left[\color{red}{(- \zfull[\prime]{1} \h{1} )} \h{1} + \htil{1} \zfull[\prime]{1} \h{1} \right], \\
\deriv{\yhat}{\uz{1}{2}} &= w_1 \left[\color{red}{(- \zfull[\prime]{1} \h{2} )} \h{1} + \htil{1} \zfull[\prime]{1} \h{2} \right], \\
\deriv{\yhat}{\uz{2}{1}} &= w_2 \left[\color{red}{(- \zfull[\prime]{2} \h{1} )} \h{2} + \htil{2} \zfull[\prime]{2} \h{1} \right] \text{and} \\
\deriv{\yhat}{\uz{2}{2}} &= w_2 \left[\color{red}{(- \zfull[\prime]{2} \h{2} )} \h{2} + \htil{2} \zfull[\prime]{2} \h{2} \right]. 
\end{align}

We can combine these partial derivatives in order to form the jacobian of $\yhat$ with respect to $\Uz$ as

\begin{align}
\deriv{\yhat}{\Uz} &= 
\begin{bmatrix} 
    \deriv{\yhat}{\uz{1}{1}} & \deriv{\yhat}{\uz{1}{2}} \\
    \deriv{\yhat}{\uz{2}{1}} & \deriv{\yhat}{\uz{2}{2}}
\end{bmatrix} \\ &=
\begin{bmatrix}  
     w_1 \left[\color{red}{(- \zfull[\prime]{1} \h{1} )} \h{1} + \htil{1} \zfull[\prime]{1} \h{1} \right] &
     w_1 \left[\color{red}{(- \zfull[\prime]{1} \h{2} )} \h{1} + \htil{1} \zfull[\prime]{1} \h{2} \right] \\
     w_2 \left[\color{red}{(- \zfull[\prime]{2} \h{1} )} \h{2} + \htil{2} \zfull[\prime]{2} \h{1} \right] &
     w_2 \left[\color{red}{(- \zfull[\prime]{2} \h{2} )} \h{2} + \htil{2} \zfull[\prime]{2} \h{2} \right] 
\end{bmatrix} \\ &=
\begin{bmatrix} w & w \end{bmatrix} * 
\begin{bmatrix} \Htil - \H & \Htil - \H \end{bmatrix} * 
\begin{bmatrix} \Zfull[\prime] & \Zfull[\prime] \end{bmatrix} * 
\begin{bmatrix} \H^T \\ \H^T \end{bmatrix} \\
&= \diag(w) \cdot \diag(\Htil - \H) \cdot \Zfull[\prime] \cdot \H^T.
\end{align}

And by the same token, we need to use the previous derivations in order to find the derivative of the loss function with respect to $\Uz$, where now one can easily see that:

\begin{align}
\frac{\partial {loss}_t}{\partial \Uz} &= \frac{\partial loss}{\partial \hat{y}_t}\frac{\partial \hat{y}_t}{\partial h_t}\frac{\partial h_t}{\partial z_t}\frac{\partial z_t}{\partial \Uz} \\
                                   &= -(y-\hat{y}_t)\ \diag \left[ w * [-\H + \Htil] * \Zfull[\prime]\right] \cdot \begin{bmatrix} \H^T \\ \H^T \end{bmatrix}.
\end{align}

Having done that, we can implement it in the now too familiar way.

In [21]:
dLossdUz = np.zeros_like(Uz)

for t, xt in enumerate(x):
    dy_dzt = w * [-h[t+1] + h_[t] * dsigmoid(Wz.dot(xt) + Uz.dot(h[t]))]
    dLossdUz += -(y-y_[t]) * dy_dzt.reshape(-1, 1) * h[t]

### Weights for the reset gate $r_t$

Similarly, we can proceed with obtaining the gradients of $\yhat$ with respect to the elements of the reset gate.

\begin{align}
\deriv{\yhat}{\wr{1}} &= w_1 \z{1} \gfull[\prime]{1} \uh{1}{1} \h{1} \rfull[\prime]{1} \dxt{blue} \\ 
&\quad + w_2 \z{2} \gfull[\prime]{2} \uh{2}{1} \h{1} \rfull[\prime]{1} \dxt{blue}
\end{align}

and

\begin{align}
\deriv{\yhat}{\wr{2}} &= w_1 \z{1} \gfull[\prime]{1} \uh{1}{2} \h{2} \rfull[\prime]{2} \dxt{blue} \\ 
&\quad + w_2 \z{2} \gfull[\prime]{2} \uh{2}{2} \h{2} \rfull[\prime]{2} \dxt{blue}.
\end{align}

By collecting the partial derivatives into a vector we form the gradient of $\yhat$ with respect to $\Wr$

\begin{align}
\deriv{\yhat}{\Wr} &= \begin{bmatrix}\deriv{\yhat}{\wr{1}} \\ \deriv{\yhat}{\wr{2}} \end{bmatrix} \\ 
&= w^T \cdot \left(
\begin{bmatrix} \Z & \Z \end{bmatrix} * 
\begin{bmatrix} \Gfull[\prime] & \Gfull[\prime] \end{bmatrix} * 
\Uh *
\begin{bmatrix} \H^T \\ \H^T \end{bmatrix} *
\begin{bmatrix} \Rfull[\prime]^T \\ \Rfull[\prime]^T \end{bmatrix}
\right) \dxt{blue} \\
&= w^T \cdot \diag(\Z) \cdot \diag(\Gfull[\prime]) \cdot \Uh \cdot \diag(\H)\cdot \diag(\Rfull[\prime]) \dxt{blue}.
\end{align}

Subsequently, when looking at the loss function we see that it's gradient with respect to $\Wr$ can be computed as

\begin{align}
\frac{\partial {loss}_t}{\partial \Wr} &= \frac{\partial {loss}_t}{\partial \hat{y}_t}\frac{\partial \hat{y}_t}{\partial h_t}\frac{\partial h_t}{\partial \tilde{h}_t}\frac{\partial \tilde{h}_t}{\partial \Wr} \\
                                   &= -(y-\hat{y}_t)\ w^T \cdot \left(
\begin{bmatrix} \Z & \Z \end{bmatrix} * 
\begin{bmatrix} \Gfull[\prime] & \Gfull[\prime] \end{bmatrix} * 
\Uh *
\begin{bmatrix} \H^T \\ \H^T \end{bmatrix} *
\begin{bmatrix} \Rfull[\prime]^T \\ \Rfull[\prime]^T \end{bmatrix}
\right) \dxt{blue}.
\end{align}

Which we also implement in `Python`.

In [22]:
dLossdWr = np.zeros_like(Wr)

for t, xt in enumerate(x):
    z_dg = z[t] * dtanh(Wh.dot(xt) + Uh.dot(r[t] * h[t]))
    d_rt = dsigmoid(Wr.dot(xt) + Ur.dot(h[t]))
    dLossdWr += -(y-y_[t]) * w.dot(z_dg.reshape(-1, 1) * Uh * h[t] * d_rt) * xt

Finally, we compute the partial derivatives of $\yhat$ with respect to $\Ur$

\begin{align}
\deriv{\yhat}{\ur{1}{1}} &= w_1 \z{1} \gfull[\prime]{1} \uh{1}{1} \h{1} \rfull[\prime]{1} \h{1} \\ 
                         & \quad + w_2 \z{2} \gfull[\prime]{2} \uh{2}{1} \h{1} \rfull[\prime]{1} \h{1} \\
\deriv{\yhat}{\ur{1}{2}} &= w_1 \z{1} \gfull[\prime]{1} \uh{1}{1} \h{1} \rfull[\prime]{1} \h{2} \\
                         & \quad + w_2 \z{2} \gfull[\prime]{2} \uh{2}{1} \h{1} \rfull[\prime]{1} \h{2} \\
\deriv{\yhat}{\ur{2}{1}} &= w_1 \z{1} \gfull[\prime]{1} \uh{1}{2} \h{2} \rfull[\prime]{2} \h{1} \\
                         & \quad + w_2 \z{2} \gfull[\prime]{2} \uh{2}{2} \h{2} \rfull[\prime]{2} \h{1} \\
\deriv{\yhat}{\ur{2}{2}} &= w_1 \z{1} \gfull[\prime]{1} \uh{1}{2} \h{2} \rfull[\prime]{2} \h{2} \\
                         & \quad + w_2 \z{2} \gfull[\prime]{2} \uh{2}{2} \h{2} \rfull[\prime]{2} \h{2}.
\end{align}

And by combining all partial derivatives, we obtain the jacobian of $\yhat$ with respect to $\Ur$

\begin{align}
\deriv{\yhat}{\Ur} &=
\begin{bmatrix} 
    \deriv{\yhat}{\ur{1}{1}} & \deriv{\yhat}{\ur{1}{2}} \\
    \deriv{\yhat}{\ur{2}{1}} & \deriv{\yhat}{\ur{2}{2}}
\end{bmatrix} \\ &= 
\begin{bmatrix}(w * \Z * \Gfull[\prime])^T \\ (w * \Z * \Gfull[\prime])^T \end{bmatrix} \cdot \Uh^T  * 
\begin{bmatrix} \H \cdot \H^T \end{bmatrix} *
\begin{bmatrix} \Rfull[\prime] & \Rfull[\prime] \end{bmatrix} \\ &=
\diag(w) \cdot \diag(\Z) \cdot \diag(\Gfull[\prime]) \cdot \Uh^T \cdot \diag(\H) \cdot \diag(\Rfull[\prime]) \cdot \mathbb{1}\mathbb{1}^T \cdot \diag(\H).
\end{align}

We implement this gradient in the following way:

In [23]:
dLossdUr = np.zeros_like(Ur)

for t, xt in enumerate(x):
    w_z_dg = w * z[t] * dtanh(Wh.dot(xt) + Uh.dot(r[t] * h[t]))
    outer_h = np.outer(h[t], h[t])
    dr = dsigmoid(Wr.dot(xt)+Ur.dot(h[t])).reshape(-1, 1)

    dLossdUr += -(y - y_[t]) * w_z_dg.dot(Uh.T)*outer_h * dr


# Update the weights

Now that we have computed the gradients of the loss function with respect to any of the weight matrices, we can update the weights and then see if this did indeed improve the prediction of our toy example.

In [24]:
eta = 0.1
Wr -= eta * dLossdWr
Wz -= eta * dLossdWz
Wh -= eta * dLossdWh

Ur -= eta * dLossdUr
Uz -= eta * dLossdUz
Uh -= eta * dLossdUh

Now we perform the forward pass again with the updated weights.

In [25]:
yhat_after = GRU_forward()

In [26]:
pd.DataFrame({r"$\hat{y}_{before}$": yhat_before, r"$\hat{y}_{after}$": yhat_after, r"$y$": y}, index=[1])

Unnamed: 0,$\hat{y}_{before}$,$\hat{y}_{after}$,$y$
1,5.14414,5.986334,7.0


And indeed, we are better than before. If we were to reiterate these steps multiple times we could get arbitrarily close to the true value of $7$.