# Gated Recurrent Unit Networks

In this additional challenge, students will build their own GRU layer from scratch.  

---

In [1]:
import os
import sys
import numpy as np
import tensorflow as tf

from preprocess import get_data

In [2]:
data_path = "../../data"

## Toy Dataset

No spoilers for the preprocessing part of the homework here. 

In [3]:
example_sentence = "The word <_UNK> is not a common word but flower is a common word"

example_sentence_list = [
    'the', 'word', '<_unk>', 'is', 'not', 
    'a', 'common', 'word', 'but', 'flower', 
    'is', 'a', 'common', 'word']
example_unique_words = [
    '<_unk>', 'a', 'but', 'common', 
    'flower', 'is', 'not', 'the', 'word']
example_w2t_dict = {
    '<_unk>': 0, 'a': 1, 'but': 2, 'common': 3, 'flower': 4, 
    'is': 5, 'not': 6, 'the': 7, 'word': 8}
example_sentence_tokenized = [7, 8, 0, 5, 6, 1, 3, 8, 2, 4, 5, 1, 3, 8]

print(f"1. example_sentence_list        \n\t{example_sentence_list}\n")
print(f"2. example_unique_words         \n\t{example_unique_words}\n")
print(f"3. example_w2t_dict             \n\t{example_w2t_dict}\n")
print(f"4. example_sentence_tokenized   \n\t{example_sentence_tokenized}\n")

1. example_sentence_list        
	['the', 'word', '<_unk>', 'is', 'not', 'a', 'common', 'word', 'but', 'flower', 'is', 'a', 'common', 'word']

2. example_unique_words         
	['<_unk>', 'a', 'but', 'common', 'flower', 'is', 'not', 'the', 'word']

3. example_w2t_dict             
	{'<_unk>': 0, 'a': 1, 'but': 2, 'common': 3, 'flower': 4, 'is': 5, 'not': 6, 'the': 7, 'word': 8}

4. example_sentence_tokenized   
	[7, 8, 0, 5, 6, 1, 3, 8, 2, 4, 5, 1, 3, 8]



In [4]:
X_RNN = np.array([
    [7, 8, 0, 5],
    [6, 1, 3, 8],
    [2, 4, 5, 1]])
y_RNN = np.array([
    [8, 0, 5, 6],
    [1, 3, 8, 2],
    [4, 5, 1, 3]])

print(f"X_RNN shape = {X_RNN.shape}")
print(f"y_RNN shape = {y_RNN.shape}")

print(f"X_RNN     --> y_RNN")
for each_X, each_y in zip(X_RNN, y_RNN):
    print(f"{each_X} --> {each_y}")

X_RNN shape = (3, 4)
y_RNN shape = (3, 4)
X_RNN     --> y_RNN
[7 8 0 5] --> [8 0 5 6]
[6 1 3 8] --> [1 3 8 2]
[2 4 5 1] --> [4 5 1 3]


## Keras GRU Layer 

We've already looked at `tf.keras.layers.GRU`'s API. 

- The Keras GRU Layer expects the input shape to be in the **batch-major form**, `[batch, timesteps, embedding]`. 
- In our language model, `timesteps` is basically our `window`. 
  - That's because we treat a sequence of words as a time-series data.

Also, the most important keywards arguments are `units`, `return_state` and `return_sequences`. 
- `units` is the output embedding size, 
- `return_state` and `return_sequences` are the Boolean variables to return the final state and the sequences of outputs.

In [5]:
embedding_layer = tf.keras.layers.Embedding(input_dim  = 9, 
                                            output_dim = 2)

X_RNN_embedding = embedding_layer(X_RNN)
batch_size, window_size, embedding_size= X_RNN_embedding.shape ## (3, 4, 2)
print(f"RNN input tokens shape = {X_RNN.shape}")
print(f"RNN embeddings shape   = {X_RNN_embedding.shape}")

RNN input tokens shape = (3, 4)
RNN embeddings shape   = (3, 4, 2)


2022-11-06 15:19:09.882475: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


We also know that all Keras LSTM layers have the same weight structures, no matter the value of the Boolean flags.

In [6]:
gru           = tf.keras.layers.GRU(units=embedding_size, return_sequences=False, return_state=False)
gru_state     = tf.keras.layers.GRU(units=embedding_size, return_sequences=False, return_state=True )
gru_seq       = tf.keras.layers.GRU(units=embedding_size, return_sequences=True,  return_state=False)
gru_seq_state = tf.keras.layers.GRU(units=embedding_size, return_sequences=True,  return_state=True )

# the Keras GRU layers initialize their weight 
#   not when they are declared
#   but when they are complied
gru.build(X_RNN_embedding.shape)
gru_state.build(X_RNN_embedding.shape)
gru_seq.build(X_RNN_embedding.shape)
gru_seq_state.build(X_RNN_embedding.shape)

# Now all four layers have exact same weights
gru_weights = gru.get_weights()
gru_state.set_weights(gru_weights)
gru_seq.set_weights(gru_weights)
gru_seq_state.set_weights(gru_weights)

### Keras GRU Layer Weights

It's time to see how those weights work under the hood. 
- The GRU weights are in fact three trainable Tensor variables named `kernel`, `recurrent_kernel`, and `bias`.
- `kernel` is the array of weights for the input
- `recurrent_kernel` is the array of weights for the previous hidden state
- `bias` is the array of biases

In [7]:
for each_weight_tensor in gru_seq_state.weights:
    print(each_weight_tensor.name)
    print(each_weight_tensor, end = "\n\n")

gru_cell_3/kernel:0
<tf.Variable 'gru_cell_3/kernel:0' shape=(2, 6) dtype=float32, numpy=
array([[-0.7932513 , -0.08078247, -0.04787922,  0.3257944 ,  0.27332598,
        -0.45060182],
       [ 0.6973912 , -0.4646765 ,  0.74344295, -0.66043484,  0.29146868,
        -0.27759284]], dtype=float32)>

gru_cell_3/recurrent_kernel:0
<tf.Variable 'gru_cell_3/recurrent_kernel:0' shape=(2, 6) dtype=float32, numpy=
array([[ 0.2723887 , -0.46107632,  0.23672704,  0.14683235, -0.32230827,
         0.7291989 ],
       [ 0.02906279,  0.19178717,  0.5550917 , -0.39322656,  0.6429818 ,
         0.29358748]], dtype=float32)>

gru_cell_3/bias:0
<tf.Variable 'gru_cell_3/bias:0' shape=(2, 6) dtype=float32, numpy=
array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]], dtype=float32)>



At this point, you might be wondering 
> but wait a second. Shouldn't there be **three pairs of weights and biases**, <br>
> because there are three internal feed-forward netwroks in a GRU unit?

And, you are right. There are three pairs of weights and biases for each internal feed-forward network, but the developers of TensorFlow and Keras only decided to put the weights and biases together in a different way. We can reshape them to be make it easier for us. 

In [9]:
units = embedding_size
W, U, b = gru_weights

### kernel: weights for the input vector x_{t}
W_z, W_r, W_h = (W[:, :units], W[:, units:(2*units)], W[:, (2*units):])

### recurrent kernel: weights for the previous hidden state h_{t-1}
U_z, U_r, U_h = (U[:, :units], U[:, units:(2*units)], U[:, (2*units):])

### bias 
# Keras distinguishes between the input bias and recurrent bias for more flexibility
# but we can just add them together and treat them as a single bias
b = tf.reduce_sum(b, axis = 0)
b_z, b_r, b_h = (b[:units], b[units:(units*2)], b[(units*2):])

## Your Own Implementation of GRU

Now we can use the weights and biases $W$, $U$, and $b$ in the way that we've covered in the lecture. 

- $x_t$ is the current input at timestep $t$.
- $h_{t-1}$ is the previous hidden state. 

\begin{align*}
z_t &= \sigma \left( W_z x_t + U_z h_{t-1} + b_z \right) & \textsf{Update Gate Vector}\\
r_t &= \sigma \left( W_r x_t + U_r h_{t-1} + b_r \right) & \textsf{Reset Gate Vector}\\
\hat{h}_t &= \tanh \left( W_h x_t + r_t \odot ( U_h h_{t-1}) + b_h \right) & \textsf{Candidate Activation Vector}\\
h_t &= z_t \odot h_{t-1} + (1 - z_t) \odot \hat{h}_t  & \textsf{Output, Hidden State Update}\\
\end{align*}

There a a few peculiarities that you should be aware of.
+ Usage of the update gate vector $z_t$ 
  + Different versions of GRU uses $z_t$ differently: either as a proportion of the previous hidden state to be kept or to be forgotten. 
  + The Keras version uses $z_t$ as a proportion to be kept and not forgotten. 
  + i.e. $z_t$ is mutliplied to $h_{t-1}$ element-wisely, and not $(1- z_t)$.
+ Application of the reset gate vector $r_t$
  + Sometimes the reset gate vector $r_t$ is applied after or before the matrix multiplication when calculating the candidate activation vector. 
  + The Keras version applies it after the matrix multiplication like $r_t \odot ( U_h h_{t-1})$, but not before like $U_h (r_t \odot h_{t-1})$.
  + This Keras behavior can be toggled with the Boolean keyword argument `reset_after`.
  + The "before" version is more widely used as it is based on the [latest submission](https://arxiv.org/abs/1406.1078v3) of the Cho et al. paper.
  + The "after" version appears in the [first submitted draft](https://arxiv.org/abs/1406.1078v1) of the Cho et al. ppaper


Now, your job is to finish implementing the `call` method below. 
- The inputs need to be reshaped into the time-major form `[timesteps, batch, embedding]`.
  - This is because is parallelizing the recurrent operations through all timesteps is very difficult.
  - So, we will use the for-loop to advance in the timestep dimension.
  - Then, inside the for-loop, we wil use matrix multiplications for the batch of inputs in the same timestep.
- Remember that, in a single timestep, the input data is in the matrix form with shape `[batch, embedding]`.
  - So, do something like $Y = XW$ instead of $y_i = W x_i, i \in \{1, 2, 3, \cdots\}$.
  - Also, the hidden and cell states are also matrices with the same shape `[batch, embedding]`.
- You should return the whole sequence of outputs and the final hidden and cell states
  - Like when `return_sequences = True` and `return_state = True`.
- The outputs needs to be reshaped back into the batch-major form `[batch, timesteps, embedding]`.

In [18]:
import tensorflow as tf

class MyGRU(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        self.units = units
        super(MyGRU, self).__init__(**kwargs)
        
    def build(self, input_shape):
        kernel_shape = tf.TensorShape((input_shape[-1], 3*self.units))

        # Create trainable weight variables for this layer.
        self.kernel = self.add_weight(
            name="kernel",                shape=kernel_shape, dtype=tf.float32,
            initializer="glorot_uniform", trainable=True)
        
        self.recurrent_kernel = self.add_weight(
            name="recurrent_kernel",      shape=kernel_shape, dtype = tf.float32,
            initializer="orthogonal",     trainable=True)
        
        self.bias = self.add_weight(
            name = "bias",                shape=kernel_shape, dtype=tf.float32,
            initializer = "zeros",        trainable=True)
        
        # Make sure to call the `build` method at the end
        super(MyGRU, self).build(input_shape)
        
    def call(self, inputs, initial_state = None):
        ## Hidden state 
        if initial_state is None:
            ht = tf.zeros(shape=(inputs.shape[0], self.units), dtype=tf.float32)
        else:
            ht = tf.identity(initial_state)
        
        ## Weights and biases
        W, U, b, units = self.kernel, self.recurrent_kernel, self.bias, self.units
        W_z, W_r, W_h = (W[:, :units], W[:, units:(2*units)], W[:, (2*units):])
        U_z, U_r, U_h = (U[:, :units], U[:, units:(2*units)], U[:, (2*units):])
        b = tf.reduce_sum(b, axis=0)
        b_z, b_r, b_h = (b[:units], b[units:(units*2)], b[(units*2):])
        
        outputs = [] ## we need the whole sequence of outputs
        inputs_time_major = tf.transpose(inputs, perm = [1, 0, 2]) ## swap the batch and timestep axes

        ## TODO: complete this for-loop, hint: the LaTeX equation cell above
        for input_each_step in inputs_time_major:
            z = tf.sigmoid(tf.matmul(input_each_step, W_z) + tf.matmul(ht, U_z) + b_z)
            r = tf.sigmoid(tf.matmul(input_each_step, W_r) + tf.matmul(ht, U_r) + b_r)
            h_t = tf.tanh(tf.matmul(input_each_step, W_h) + tf.matmul(r * ht, U_h) + b_h)
            ht = z * ht + (1-z) * h_t
            outputs.append(ht)

        ## TODO: get the whole sequence of outputs, hint: tf.stack
        outputs = tf.stack(outputs, axis = 0)
        
        ## TODO: swap the batch and timestep axes again, hint: tf.transpose
        outputs = tf.transpose(outputs, perm = [1,0,2])
        
        return outputs, ht
    
    def compute_output_shape(self, input_shape):
        shape = tf.TensorShape(input_shape).as_list()
        shape[-1] = self.units
        return tf.TensorShape(shape)
    
    def get_config(self):
        base_config = super(MyGRU, self).get_config()
        base_config["units"] = self.units
        return base_config

## Compare with Keras GRU Layer

Now we have to see if your GRU layer returns the exact same outputs as the Keras GRU layer. 

So, we will initialize your GRU layer with **the same weights** from the other Keras GRU layers.

In [19]:
my_gru = MyGRU(units = 2)
my_gru.build(X_RNN_embedding.shape)
my_gru.set_weights(gru_weights)
my_gru.get_weights()

[array([[-0.7932513 , -0.08078247, -0.04787922,  0.3257944 ,  0.27332598,
         -0.45060182],
        [ 0.6973912 , -0.4646765 ,  0.74344295, -0.66043484,  0.29146868,
         -0.27759284]], dtype=float32),
 array([[ 0.2723887 , -0.46107632,  0.23672704,  0.14683235, -0.32230827,
          0.7291989 ],
        [ 0.02906279,  0.19178717,  0.5550917 , -0.39322656,  0.6429818 ,
          0.29358748]], dtype=float32),
 array([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]], dtype=float32)]

Then calculate the outputs and the states from your own GRU layer. 

In [20]:
# random initial states
tf_generator = tf.random.Generator.from_seed(42)
input_state_h = tf_generator.normal(shape=(1, units))

# initial states in the [batch, embedding] format
ht = tf.repeat(input_state_h, repeats=batch_size, axis=0)

my_output_seq, my_state_h = my_gru(X_RNN_embedding, initial_state = ht)
print(f"my output sequence, shape = {my_output_seq.shape} \n{my_output_seq}\n")
print(f"my final hidden state, shape = {my_state_h.shape} \n{my_state_h}\n")

my output sequence, shape = (3, 4, 2) 
[[[-0.2893883  -0.13847744]
  [-0.14664622 -0.12091866]
  [-0.06827874 -0.11017781]
  [-0.04329339 -0.07862912]]

 [[-0.3066318  -0.1320553 ]
  [-0.14222276 -0.13091716]
  [-0.07816978 -0.10284282]
  [-0.05463332 -0.06477179]]

 [[-0.2987355  -0.13594003]
  [-0.15116738 -0.11893713]
  [-0.07873564 -0.09824978]
  [-0.04567615 -0.07368217]]]

my final hidden state, shape = (3, 2) 
[[-0.04329339 -0.07862912]
 [-0.05463332 -0.06477179]
 [-0.04567615 -0.07368217]]



In [16]:
keras_output_seq, keras_state_h = gru_seq_state(X_RNN_embedding, initial_state = ht)

print(f"Keras output sequence, shape = {keras_output_seq.shape} \n{keras_output_seq}\n")
print(f"Keras final hidden state, shape = {keras_state_h.shape} \n{keras_state_h}\n")

Keras output sequence, shape = (3, 4, 2) 
[[[-0.28814057 -0.14925188]
  [-0.14620402 -0.13064463]
  [-0.06858468 -0.11712603]
  [-0.04374851 -0.08319197]]

 [[-0.30576056 -0.1399014 ]
  [-0.14172646 -0.13877474]
  [-0.07779704 -0.10896343]
  [-0.0546933  -0.06884565]]

 [[-0.29775515 -0.14463294]
  [-0.14979988 -0.12917906]
  [-0.07879876 -0.10531791]
  [-0.04621286 -0.07828905]]]

Keras final hidden state, shape = (3, 2) 
[[-0.04374851 -0.08319197]
 [-0.0546933  -0.06884565]
 [-0.04621286 -0.07828905]]



If you have implemented your GRU layer correctly, you will have the same outputs and states within reasonable error margins for the floating point representations.

In [21]:
print(np.allclose(my_output_seq.numpy(), keras_output_seq.numpy()))
print(np.allclose(my_state_h.numpy(), keras_state_h.numpy()))
# use np.isclose for element-wise comparison

False
False
