In [321]:
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [322]:
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.layers import Dense, Embedding, GRU, GRUCell
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

<IPython.core.display.Javascript object>

In [323]:
%%latex

$
\newcommand{\W}[1]{\mathbf{W}_{#1}}
\newcommand{\U}[1]{\mathbf{U}_{#1}}
\newcommand{\sigmoid}{\text{sigmoid}}
$

<IPython.core.display.Latex object>

<IPython.core.display.Javascript object>

# GRU

In [324]:
%%latex

$z^{\langle t\rangle} = \sigmoid\left(\W{z}\cdot x^{\langle t\rangle} + \U{z}\cdot h^{\langle t-1 \rangle} + b_z\right)\\[2em]

r^{\langle t\rangle} = \sigmoid\left(\W{r}\cdot x^{\langle t\rangle} + \U{r}\cdot h^{\langle t-1 \rangle} + b_r\right)\\[2em]

\tilde h^{\langle t\rangle} = \tanh\left(\W{h}\cdot x^{\langle t\rangle} + \U{h} \cdot (r^{\langle t\rangle}\odot h^{\langle t-1 \rangle}) + b_h\right)\\[2em]


h^{\langle t\rangle} = z^{\langle t\rangle}\cdot h^{\langle t-1\rangle} + (1-z^{\langle t\rangle})\cdot \tilde h^{\langle t \rangle}

$


<IPython.core.display.Latex object>

<IPython.core.display.Javascript object>

# Example

In [325]:
texts = ["the book was great super", "the book was awful no"]
y = np.array([[1.0], [0.0]])

<IPython.core.display.Javascript object>

In [326]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
vocab = tokenizer.index_word
vocab

{1: '<OOV>',
 2: 'the',
 3: 'book',
 4: 'was',
 5: 'great',
 6: 'super',
 7: 'awful',
 8: 'no'}

<IPython.core.display.Javascript object>

In [327]:
sequences = tokenizer.texts_to_sequences(texts)
sequences

[[2, 3, 4, 5, 6], [2, 3, 4, 7, 8]]

<IPython.core.display.Javascript object>

In [328]:
PADDING = "post"
TRUNCATING = "post"
MAXLEN = 5

padded_sequences = pad_sequences(
    sequences, padding=PADDING, truncating=TRUNCATING, maxlen=MAXLEN
)

padded_sequences

array([[2, 3, 4, 5, 6],
       [2, 3, 4, 7, 8]])

<IPython.core.display.Javascript object>

In [352]:
INPUT_DIM = len(vocab) + 1 # max_index in the vocab + 1
INPUT_LENGTH = MAXLEN
OUTPUT_DIM = 7

RETURN_SEQUENCES = True # If True return the hidden states of all the in-between timesteps else return only the hidden state
                         # of the last timestep



set_random_seed(42)

model = Sequential([
    Embedding(input_dim=INPUT_DIM, output_dim=OUTPUT_DIM, input_length=INPUT_LENGTH, trainable=True, mask_zero=False),
    GRU(units=4, return_state=False, use_bias=True, reset_after=True, return_sequences=RETURN_SEQUENCES),
])


OPTIMIZER = Adam()
LOSS = BinaryCrossentropy()
METRICS = ["accuracy"]

model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=METRICS)
model.summary()



Model: "sequential_41"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_32 (Embedding)    (None, 5, 7)              63        
                                                                 
 gru_41 (GRU)                (None, 5, 4)              156       
                                                                 
Total params: 219
Trainable params: 219
Non-trainable params: 0
_________________________________________________________________


<IPython.core.display.Javascript object>

In [353]:
model.predict(padded_sequences)



array([[[-0.02255583, -0.00166022, -0.002121  ,  0.01063325],
        [-0.01299257,  0.00261979, -0.01043404,  0.02199029],
        [-0.01740611, -0.009634  , -0.02022436,  0.003124  ],
        [-0.00431032,  0.01444161, -0.01362657,  0.01084094],
        [ 0.00090356,  0.01463092, -0.02333421, -0.01767044]],

       [[-0.02255583, -0.00166022, -0.002121  ,  0.01063325],
        [-0.01299257,  0.00261979, -0.01043404,  0.02199029],
        [-0.01740611, -0.009634  , -0.02022436,  0.003124  ],
        [-0.0052803 , -0.03340074,  0.01058613,  0.02608683],
        [-0.00959935, -0.0366431 ,  0.03656987,  0.02465998]]],
      dtype=float32)

<IPython.core.display.Javascript object>

# Validate

In [354]:
def sigmoid(z: np.ndarray | float) -> np.ndarray | float:
    return 1 / (1 + np.exp(-z))

<IPython.core.display.Javascript object>

In [355]:
embeddings = model.layers[0].get_weights()[0]

<IPython.core.display.Javascript object>

In [356]:
W = model.layers[1].get_weights()[0]
print(W.shape)

(7, 12)


<IPython.core.display.Javascript object>

In [357]:
W_z = W[:, :4]
W_r = W[:, 4:8]
W_h = W[:, 8:]


print(W_z.shape)

(7, 4)


<IPython.core.display.Javascript object>

In [358]:
U = model.layers[1].get_weights()[1]
print(U.shape)


(4, 12)


<IPython.core.display.Javascript object>

In [359]:
U_z = U[:, :4]
U_r = U[:, 4:8]
U_h = U[:, 8:]

print(U_z.shape)

(4, 4)


<IPython.core.display.Javascript object>

In [360]:
input_biases, recurrent_biases = model.layers[1].get_weights()[2]


b_z = input_biases[:4]
b_r = input_biases[4:8]
b_h = input_biases[8:]




<IPython.core.display.Javascript object>

In [361]:
# For timestep 1:

h_0 = np.array([0, 0, 0, 0])

x_embeddings = embeddings[2]

z = sigmoid(x_embeddings.dot(W_z) + U_z.T.dot(h_0) + b_z)
r = sigmoid(x_embeddings.dot(W_r) + U_r.T.dot(h_0) + b_r)
hc = np.tanh(x_embeddings.dot(W_h) + U_h.T.dot(r * h_0) + b_h)
h_1 = z * h_0 + (1 - z) * hc

h_1

array([-0.02255584, -0.00166022, -0.00212101,  0.01063325])

<IPython.core.display.Javascript object>

In [362]:
# For timestep 2:

x_embedding = embeddings[3]

z = sigmoid(x_embedding.dot(W_z) + U_z.T.dot(h_1) + b_z)
r = sigmoid(x_embedding.dot(W_r) + U_r.T.dot(h_1) + b_r)
hc = np.tanh(x_embedding.dot(W_h) + U_h.T.dot(r * h_1) + b_h)
h_2 = z * h_1 + (1 - z) * hc

h_2

array([-0.01298971,  0.00261219, -0.01043557,  0.02199294])

<IPython.core.display.Javascript object>