# Setup

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorrt
import tensorflow as tf
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow_datasets as tfds
import os
import tensorflow_hub as hub

tf.get_logger().setLevel('ERROR')

In [20]:
# Download (and cache) the Shakespeare corpus, then read it into memory.
shakespeare_url = 'https://homl.info/shakespeare'
filepath = tf.keras.utils.get_file('shakespeare.txt', shakespeare_url)
# Decode explicitly so the result does not depend on the platform's default encoding.
with open(filepath, encoding='utf-8') as f:
    shakespeare_text = f.read()

In [21]:
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


# Tokenization through TextVectorization

In [22]:
# Character-level tokenizer: split the text into single characters and lowercase it.
text_vec_layer = tf.keras.layers.TextVectorization(split='character', standardize='lower')
# Build the character vocabulary from the whole corpus.
text_vec_layer.adapt([shakespeare_text])
# Vectorize the corpus as a batch of one string and keep that single row of token IDs.
encoded = text_vec_layer([shakespeare_text])[0]


In [23]:
# Derive the shifted IDs from the source text rather than mutating `encoded` in
# place: an in-place `-= 2` would shift the IDs again every time this cell is re-run.
# The shift drops ID 0 (pad) and ID 1 (unknown) from the ID range.
encoded = text_vec_layer([shakespeare_text])[0] - 2
# Number of distinct characters once the two reserved IDs are removed.
n_tokens = text_vec_layer.vocabulary_size() - 2
# Total number of characters in the corpus.
dataset_size = len(encoded)

print(encoded[:80])
print(n_tokens)
print(dataset_size)

tf.Tensor(
[19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1
  0 22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1
  4  8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24
 17  0  7 22  1  4 24 26], shape=(80,), dtype=int64)
39
1115394


# Stateless RNN

## Preparing the datasets

In [104]:
# def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
#     dataset = tf.data.Dataset.from_tensor_slices(sequence)
#     dataset = dataset.window(2, shift=1).flat_map(lambda window: window.batch(2, drop_remainder=True))
#     dataset = dataset.window(length, shift=1).flat_map(lambda window: window.batch(length, drop_remainder=True))
#     if shuffle:
#         dataset = dataset.shuffle(10000, seed=seed)
#     return dataset.map(lambda x: (x[:, 0], x[:, 1])).batch(batch_size)

def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn a token sequence into batched (input, target) windows.

    Every window holds `length + 1` consecutive tokens and consecutive windows
    are shifted by one token. The input is the window without its last token,
    the target is the window without its first token.

    Args:
        sequence: 1-D sequence of token IDs.
        length: number of tokens in each input (and target) window.
        shuffle: when true, shuffle the windows with a 10000-element buffer.
        seed: seed forwarded to the shuffle.
        batch_size: number of windows per batch.

    Returns:
        A `tf.data.Dataset` of `(inputs, targets)` batches, prefetching 5 batches.
    """
    window_size = length + 1
    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=1, drop_remainder=True
    )
    # Each window is itself a dataset; batching it by its full size yields one tensor.
    chunks = windows.flat_map(lambda w: w.batch(window_size))
    if shuffle:
        chunks = chunks.shuffle(10000, seed=seed)
    pairs = chunks.map(lambda chunk: (chunk[:-1], chunk[1:]))
    return pairs.batch(batch_size).prefetch(5)

In [99]:
# Show the first window as two columns: input token | target token.
for x, y in to_dataset(encoded[:80], 5).take(1):
    first_inputs, first_targets = x[0].numpy(), y[0].numpy()
    print(np.column_stack((first_inputs, first_targets)))

[[19  5]
 [ 5  8]
 [ 8  7]
 [ 7  2]
 [ 2  0]]


In [348]:
# Window length (in characters) fed to the stateless RNN.
length = 100
# First 1M characters for training, next 60k for validation, the rest for testing.
# A fixed shuffle seed keeps the training order reproducible across kernel restarts.
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

## Training

In [106]:
# Character-level language model: embed each token ID, run a GRU that emits an
# output at every time step, and predict a distribution over the next character.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation='softmax')
])
model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy']
)
folder = Path() / 'data/01-vanilla-char-rnn'
checkpoints_folder = folder / 'checkpoints'
# Reuse previously saved weights when a checkpoint exists; otherwise train from scratch.
if checkpoints_folder.exists():
    model.load_weights(checkpoints_folder)
else: 
    # Only overwrite the checkpoint when validation accuracy improves.
    checkpoints_cb = tf.keras.callbacks.ModelCheckpoint(checkpoints_folder, save_best_only=True, monitor='val_accuracy')
    # NOTE: `history` is only bound on this branch, not when weights are loaded.
    history = model.fit(
        train_set, 
        epochs=10,
        validation_data=valid_set,
        callbacks=[checkpoints_cb]
    )

Epoch 1/10
  31244/Unknown - 473s 15ms/step - loss: 1.3058 - accuracy: 0.6095



INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


Epoch 2/10



INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


Epoch 3/10



INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


Epoch 4/10



INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


Epoch 5/10



INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


Epoch 6/10



INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


Epoch 7/10
Epoch 8/10



INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


Epoch 9/10



INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/01-vanilla-char-rnn/checkpoints/assets


Epoch 10/10


<keras.callbacks.History at 0x7f88ac8e4050>

## Inferring

In [108]:
# End-to-end model that accepts raw strings: tokenize into character IDs,
# shift the IDs down by 2 to match the IDs the model was trained on, then predict.
full_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda x: x - 2),  # no <PAD> or <UNK> tokens
    model
])

In [134]:
# Shape: (1, 17, 39) = (1, input tokens, output probabilities)
# Keep only the distribution predicted after the last input character.
y_proba = full_model.predict(['To be or not to b'], verbose=False)[0, -1]
# tf.random.categorical takes log-probabilities, hence the log.
y_logits = tf.math.log(y_proba)
# Sample one token ID from that distribution (this value is the cell's output).
tf.random.categorical([y_logits], 1)[0, 0]
# Greedy alternative, kept for reference:
# output_token = tf.argmax(y_proba) + 2
# text_vec_layer.get_vocabulary()[output_token]

<tf.Tensor: shape=(), dtype=int64, numpy=8>

In [331]:
def generate_text(input: str, length: int, temperature=1, top_k=None):
    """Extend `input` by `length` characters sampled from `full_model`.

    Args:
        input: prompt text; the generated characters are appended to it.
        length: number of characters to generate.
        temperature: divisor applied to the log-probabilities before sampling;
            values below 1 sharpen the distribution, values above 1 flatten it.
        top_k: sample only among the `top_k` most probable characters; when
            falsy, all `n_tokens` characters are candidates.

    Returns:
        The prompt followed by the generated characters.

    Raises:
        ValueError: if `temperature` is not strictly positive (dividing the
            log-probabilities by it would produce inf/NaN logits).
    """
    if temperature <= 0:
        raise ValueError(f'temperature must be > 0, got {temperature}')
    k = top_k if top_k else n_tokens
    # The vocabulary does not change while generating, so look it up once.
    vocabulary = text_vec_layer.get_vocabulary()
    output = input
    for _ in range(length):
        # Distribution over the next character, kept 2-D for tf.random.categorical.
        y_proba = full_model.predict([output], verbose=False)[0, -1:]
        top_values, top_indices = tf.math.top_k(y_proba, k)
        y_logits = tf.math.log(top_values) / temperature
        # `draw` is a position within the top-k candidates, not a token ID.
        draw = tf.random.categorical(y_logits, num_samples=1)[0, 0]
        # + 2 is specific to the use case of ignoring PAD and UNK tokens of text_vec_layer
        output_token = int(top_indices[0, draw]) + 2
        output += vocabulary[output_token]
    return output

In [332]:
tf.random.set_seed(42)
print(generate_text('To be or not to b', length=40, temperature=0.7))

To be or not to be so will withou't.
thou art a great the


In [166]:
# Two arbitrary logits.
a = 1.5634
b = 0.8764

# Two-class softmax: exponentiate each logit and divide by the shared normalizer.
exp_a, exp_b = np.exp(a), np.exp(b)
normalizer = exp_a + exp_b
softmax_a = exp_a / normalizer
softmax_b = exp_b / normalizer

print(softmax_a, softmax_b)
# The log-probabilities differ by exactly the same amount as the logits (a - b).
print(np.log(softmax_a), np.log(softmax_b))

0.6652992298681197 0.33470077013188027
-0.4075183698058058 -1.0945183698058054


# Stateful RNN

## Preparing the datasets

In [342]:
def to_dataset_for_stateful_rnn(sequence, length):
    """Build consecutive (input, target) windows for a stateful RNN.

    Windows hold `length + 1` tokens and are shifted by `length`, so the inputs
    of successive windows follow one another in the original text. Every batch
    contains exactly one window.

    Args:
        sequence: 1-D sequence of token IDs.
        length: number of tokens in each input (and target) window.

    Returns:
        A `tf.data.Dataset` of `(inputs, targets)` batches of size 1,
        prefetching one batch.
    """
    window_size = length + 1
    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=length, drop_remainder=True
    )
    # Each window is itself a dataset; batching it by its full size yields one tensor.
    chunks = windows.flat_map(lambda w: w.batch(window_size))
    batches = chunks.batch(1)
    split = batches.map(lambda batch: (batch[:, :-1], batch[:, 1:]))
    return split.prefetch(1)

In [344]:
print(encoded[:20])
for x, y in to_dataset_for_stateful_rnn(encoded[:80], 5).take(2):
    # print(x.shape, y.shape)
    print(x)
    print(y)

tf.Tensor([19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8], shape=(20,), dtype=int64)
tf.Tensor([[19  5  8  7  2]], shape=(1, 5), dtype=int64)
tf.Tensor([[5 8 7 2 0]], shape=(1, 5), dtype=int64)
tf.Tensor([[ 0 18  5  2  5]], shape=(1, 5), dtype=int64)
tf.Tensor([[18  5  2  5 35]], shape=(1, 5), dtype=int64)


In [347]:
# Window length (in characters) fed to the stateful RNN.
length = 100
# Same corpus split as the stateless datasets: 1M / 60k / remainder.
train_set_stateful = to_dataset_for_stateful_rnn(encoded[:1_000_000], length=length)
valid_set_stateful = to_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000], length=length)
test_set_stateful = to_dataset_for_stateful_rnn(encoded[1_060_000:], length=length)

## Training

In [351]:
# Same architecture as the stateless model, but the GRU is stateful, which
# requires a fixed batch size (1 here) declared on the first layer.
stateful_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16, batch_input_shape=[1, None]),
    tf.keras.layers.GRU(128, return_sequences=True, stateful=True),
    tf.keras.layers.Dense(n_tokens, activation='softmax')
])

# We still need to reset the states every epoch
class ResetStateCallback(tf.keras.callbacks.Callback):
    """Keras callback that clears the model's recurrent state at the start of each epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        self.model.reset_states()

stateful_model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy']
)

folder = Path() / 'data/02-stateful-char-rnn'
checkpoints_folder = folder / 'checkpoints'
# Reuse previously saved weights when a checkpoint exists; otherwise train from scratch.
if checkpoints_folder.exists():
    stateful_model.load_weights(checkpoints_folder)
else: 
    # Only overwrite the checkpoint when validation accuracy improves.
    checkpoints_cb = tf.keras.callbacks.ModelCheckpoint(checkpoints_folder, save_best_only=True, monitor='val_accuracy')
    # NOTE(review): states are reset only at epoch start, so validation presumably
    # begins from the state left by the last training window — confirm this is intended.
    # NOTE: `history` is only bound on this branch, not when weights are loaded.
    history = stateful_model.fit(
        train_set_stateful, 
        epochs=10,
        validation_data=valid_set_stateful,
        callbacks=[checkpoints_cb, ResetStateCallback()]
    )

Epoch 1/10
   9998/Unknown - 149s 15ms/step - loss: 1.8669 - accuracy: 0.4495



INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


Epoch 2/10



INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


Epoch 3/10



INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


Epoch 4/10



INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


Epoch 5/10



INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


Epoch 6/10



INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


Epoch 7/10



INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


Epoch 8/10



INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/02-stateful-char-rnn/checkpoints/assets


Epoch 9/10
Epoch 10/10


## Inferring

In [354]:
# End-to-end stateful model that accepts raw strings: tokenize into character
# IDs, shift the IDs down by 2 (no pad/unknown slots), then run the stateful RNN.
full_stateful_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda x: x - 2),
    stateful_model
])

def generate_stateful_text(input: str, length: int, temperature=1):
    """Extend `input` by `length` characters sampled from the stateful model.

    The GRU is stateful, so its hidden state survives between `predict` calls.
    The state is therefore cleared once, the prompt is fed once, and afterwards
    only the newly sampled character is fed. Re-feeding the whole text on every
    step would process it on top of the state left by the previous call.

    Args:
        input: prompt text; the generated characters are appended to it.
        length: number of characters to generate.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The prompt followed by the generated characters.
    """
    # The vocabulary does not change while generating, so look it up once.
    vocabulary = text_vec_layer.get_vocabulary()
    # Start from a clean state so earlier calls cannot leak into this one.
    stateful_model.reset_states()
    output = input
    pending = input  # text the model has not consumed yet
    for _ in range(length):
        # Distribution over the next character, kept 2-D for tf.random.categorical.
        y_proba = full_stateful_model.predict([pending], verbose=False)[0, -1:]
        y_logits = tf.math.log(y_proba) / temperature
        # + 2 maps the model's ID back to the vectorizer's vocabulary index.
        output_token = int(tf.random.categorical(y_logits, num_samples=1)[0, 0]) + 2
        next_char = vocabulary[output_token]
        output += next_char
        pending = next_char
    return output

In [360]:
print(generate_stateful_text('to be or not to b', 40, temperature=0.7))

to be or not to be
him for you was a given will i will ca


# Sentiment Analysis

## Preparing the datasets

In [366]:
[x for x in tfds.list_builders() if 'imdb' in x]

['imdb_reviews', 'huggingface:imdb', 'huggingface:imdb_urdu_reviews']

In [375]:
# IMDB reviews as (text, label) pairs: 90% of the official train split for
# training, the remaining 10% for validation, and the official test split.
raw_train_set, raw_valid_set, raw_test_set = tfds.load(
    name='imdb_reviews',
    split=['train[:90%]', 'train[90%:]', 'test'],
    as_supervised=True
)

# Training batches must come from the training split; building them from
# raw_test_set would fit the models on the data reserved for evaluation.
train_set = raw_train_set.shuffle(10000).batch(32).prefetch(1)
valid_set = raw_valid_set.batch(32).prefetch(1)
test_set = raw_test_set.batch(32).prefetch(1)

# Peek at the first three (review, label) pairs of the first test batch.
for x, y in test_set.take(1):
    print(x.shape, y.shape)
    for i in range(3):
        if i > 0:
            print('---')
        print(x[i])
        print(y[i])

(32,) (32,)
tf.Tensor(b"There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING-A-LING-LESS. Flawless film-making, and as assured and as professional as any of the aforementioned movies. I haven't laughed this hard since I saw THE FULL MONTY. (And, even then, I don't think I laughed quite this hard... So to speak.) Tukel's talent is considerable: DING-A-LING-LESS is so chock full of double entendres that one would have to sit down with a copy of this script and do a line-by-line examination of it to fully appreciate the, uh, breadth and width of it. Every shot is beautifully composed (a clear sign of a sure-handed director), and the performances all around are solid (there's none of the over-the-top scenery chewing one might've expected from a film like this). DING-A-LING-LESS is a film whose time has come.", shape=(), dtype=string)
tf.Tensor(

In [392]:
# Print the first (review, label) pair of each split side by side.
# Naming: x* comes from the train split, y* from the test split, z* from the
# validation split; the *1 variable is the review text and *2 its label.
for (x1, x2), (y1, y2), (z1, z2) in tf.data.Dataset.zip((raw_train_set, raw_test_set, raw_valid_set)).take(1):
    print(x1)
    print(x2)
    print(y1)
    print(y2)
    print(z1)
    print(z2)


tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(b"There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING-A-L

In [394]:
# Keep only the 1000 most frequent tokens (including padding and [UNK]).
vocab_size = 1000
# NOTE: this rebinds `text_vec_layer`; the character-level generate_text /
# generate_stateful_text helpers read that global and need the earlier layer.
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
# Review texts of all three splits, labels dropped. The splits are concatenated
# rather than zipped: Dataset.zip stops at the shortest input, which would
# silently leave most training and test reviews out of the vocabulary pass.
# NOTE(review): this includes validation/test text in the vocabulary statistics;
# adapt on raw_train_set only if that leakage matters.
full_dataset = (
    raw_train_set
    .concatenate(raw_valid_set)
    .concatenate(raw_test_set)
    .map(lambda review, label: review)
    .batch(32)
)
text_vec_layer.adapt(full_dataset)

In [419]:
print(text_vec_layer.get_vocabulary()[:10])
print(text_vec_layer(['the is in', 'the is']))
print(text_vec_layer.vocabulary_size())

['', '[UNK]', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'i']
tf.Tensor(
[[2 7 8]
 [2 7 0]], shape=(2, 3), dtype=int64)
1000


## Training

### Naive: without masking

In [399]:
# Dimensionality of the word embeddings.
embed_size = 128
# Baseline sentiment classifier: tokenize -> embed -> GRU (last output only) -> sigmoid.
# No masking is configured, so padding tokens are processed like real words.
imdb_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(input_dim=text_vec_layer.vocabulary_size(), output_dim=embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
imdb_model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=tf.keras.metrics.binary_accuracy
)
history = imdb_model.fit(
    train_set,
    epochs=2,
    validation_data=valid_set
)

Epoch 1/2
Epoch 2/2


In [410]:
# Why is there no learning? Reviews in a batch are zero-padded to a common
# length, so the GRU has to step through long runs of padding after the real
# tokens and forgets what the review was about.
for x, y in train_set.take(1):
    tokenized = text_vec_layer(x)
    print(tokenized.shape)
    for row in range(5):
        # Print last 100 tokens in the sequence
        print(tokenized[row][-100:])

(32, 736)
tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(100,), dtype=int64)
tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(100,), dtype=int64)
tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(100,), dtype=int64)
tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(100,), dtype=int64)
tf.Tensor(
[0 0 0 

### With masking: mask_zero=True

In [411]:
# Dimensionality of the word embeddings.
embed_size = 128
# Same classifier as the baseline, but the Embedding layer is created with
# mask_zero=True so token ID 0 (padding) is masked for the layers that follow.
masked_imdb_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(
        input_dim=text_vec_layer.vocabulary_size(), 
        output_dim=embed_size,
        mask_zero=True
    ),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
masked_imdb_model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=tf.keras.metrics.binary_accuracy
)
history = masked_imdb_model.fit(
    train_set,
    epochs=2,
    validation_data=valid_set
)

Epoch 1/2
Epoch 2/2


### With masking: tf.keras.layers.Masking

In [None]:
# An alternative approach with a Masking layer.
# Mask expression for reference: tf.reduce_any(tf.math.not_equal(X, 0), axis=-1)
# NOTE(review): this cell was never executed (In [None]) and the model is neither
# compiled nor trained. Masking is applied here to the 2-D integer token IDs, so a
# reduction over the last axis would presumably yield one flag per review rather
# than one per token, and the Embedding below has no mask_zero — confirm this
# variant really masks padding before relying on it.
# `embed_size` is taken from an earlier cell.
masked_imdb_model2 = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Masking(),
    tf.keras.layers.Embedding(
        input_dim=text_vec_layer.vocabulary_size(), 
        output_dim=embed_size,
    ),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

### With masking: Functional API

In [414]:
# Strings are considered atomic values and their length is not part of the shape
# So with batches, the shape of the input is (32,) 
inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
# Returns (32, sequence length)
token_ids = text_vec_layer(inputs)
# Returns (32, sequence length), so there is no need to do tf.reduce_any(..., axis=-1)
mask = tf.not_equal(token_ids, 0)
Z = tf.keras.layers.Embedding(
    input_dim=text_vec_layer.vocabulary_size(), 
    output_dim=embed_size,
)(token_ids)
Z = tf.keras.layers.GRU(128, dropout=0.2)(Z, mask=mask)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(Z)
masked_imdb_model3 = tf.keras.Model(inputs=[inputs], outputs=[outputs])


(None, None)


In [415]:
# Same optimizer, loss and metric as the other IMDB variants, for a like-for-like comparison.
masked_imdb_model3.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=tf.keras.metrics.binary_accuracy
)
history = masked_imdb_model3.fit(
    train_set,
    epochs=2,
    validation_data=valid_set
)

tensorboard --logdir=ch16/data/03-masked-char-rnn
Epoch 1/2
Epoch 2/2


### With Ragged tensors instead of masking

In [416]:
# Same vocabulary size as `text_vec_layer`, but with ragged=True the layer returns
# variable-length rows instead of zero-padded ones.
text_vec_layer_ragged = tf.keras.layers.TextVectorization(max_tokens=vocab_size, ragged=True)
text_vec_layer_ragged.adapt(full_dataset)

In [418]:
print(text_vec_layer_ragged.get_vocabulary()[:10])
print(text_vec_layer_ragged(['the is in', 'the is']))

['', '[UNK]', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'i']
<tf.RaggedTensor [[2, 7, 8], [2, 7]]>


In [424]:
# Output folder for this run's TensorBoard logs and checkpoints.
folder = Path() / 'data/03-masked-char-rnn'
tensorboard_cb = tf.keras.callbacks.TensorBoard(folder / 'tensorboard')
# Command to launch TensorBoard for this run.
print(f'tensorboard --logdir=ch16/{folder}')

# Saves the model after every epoch (no save_best_only filter).
checkpoints_cb = tf.keras.callbacks.ModelCheckpoint(folder / 'checkpoints')

# Classifier fed with ragged token sequences, so there is no padding to mask.
ragged_imdb_model = tf.keras.Sequential([
    text_vec_layer_ragged,
    tf.keras.layers.Embedding(
        # Size the embedding from the vectorizer this model actually uses.
        input_dim=text_vec_layer_ragged.vocabulary_size(),
        output_dim=embed_size,
    ),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
ragged_imdb_model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=['accuracy']
)
history = ragged_imdb_model.fit(
    train_set,
    epochs=2,
    validation_data=valid_set,
    callbacks=[tensorboard_cb, checkpoints_cb]
)

tensorboard --logdir=ch16/data/03-masked-char-rnn
Epoch 1/2



INFO:tensorflow:Assets written to: data/03-masked-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/03-masked-char-rnn/checkpoints/assets


Epoch 2/2



INFO:tensorflow:Assets written to: data/03-masked-char-rnn/checkpoints/assets


INFO:tensorflow:Assets written to: data/03-masked-char-rnn/checkpoints/assets




In [426]:
# Save the model trained in this section. The global `model` still refers to the
# character-level Shakespeare RNN, which does not belong in this run's folder.
ragged_imdb_model.save(folder / 'tensorboard' / 'model.ckpt')



INFO:tensorflow:Assets written to: data/03-masked-char-rnn/tensorboard/model.ckpt/assets


INFO:tensorflow:Assets written to: data/03-masked-char-rnn/tensorboard/model.ckpt/assets


# Pretrained embeddings

In [3]:
# IMDB reviews as (text, label) pairs: 90% of the official train split for
# training, the remaining 10% for validation, and the official test split.
raw_train_set, raw_valid_set, raw_test_set = tfds.load(
    name='imdb_reviews',
    split=['train[:90%]', 'train[90%:]', 'test'],
    as_supervised=True
)

# Training batches must come from the training split; building them from
# raw_test_set would fine-tune on the data reserved for evaluation.
train_set = raw_train_set.shuffle(5000).batch(16).prefetch(tf.data.experimental.AUTOTUNE)
valid_set = raw_valid_set.batch(16).prefetch(tf.data.experimental.AUTOTUNE)
test_set = raw_test_set.batch(16).prefetch(tf.data.experimental.AUTOTUNE)

# Cache TF Hub downloads under the current user's home directory instead of a
# hardcoded, machine-specific absolute path.
os.environ["TFHUB_CACHE_DIR"] = str(Path.home() / ".tfhub")

folder = Path() / 'data/04-pretrained-finetuning' / 'model'
if folder.exists():
    # Bind the result: load_model returns a new model, it does not fill in an existing one.
    model = tf.keras.models.load_model(folder)
else:
    # Fine-tune the Universal Sentence Encoder with a small classification head.
    model = tf.keras.Sequential([
        hub.KerasLayer(
            "https://tfhub.dev/google/universal-sentence-encoder/4",
            trainable=True,
            dtype=tf.string, input_shape=[]
        ),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])
    model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])
    model.fit(train_set, validation_data=valid_set, epochs=10)
    model.save(folder)

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

# Random zip trick

In [39]:
# TRICK: zip can be used to unzip
a = [1, 2, 3, 4]
b = ['a', 'b', 'c', 'd']

# After `zip`, the number of elements in each list becomes the length of the result
zipped_list = list(zip(a, b))
# => [(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]

# Since we now have N lists, each containing 2 elements, the length of the result is 2
list(zip(*zipped_list))
# => [(1, 2, 3, 4), ('a', 'b', 'c', 'd')]

[(1, 2, 3, 4), ('a', 'b', 'c', 'd')]

# NMT - Neural Machine Translation (Encoder-Decoder networks)

In [408]:
# Download and extract the English–Spanish sentence-pair corpus into ./datasets.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file('spa-eng.zip', origin=url, cache_dir='datasets', extract=True)
# The corpus contains accented Spanish text; decode it explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
text = (Path(path).with_name('spa-eng') / 'spa.txt').read_text(encoding='utf-8')

In [409]:
# Drop the inverted Spanish punctuation marks.
text = text.replace('¡', '').replace('¿', '')
# One [english, spanish] pair per line, tab-separated.
pairs = [pair.split('\t') for pair in text.splitlines()]
# Shuffle with a fixed seed: the train/validation split below is taken by
# position, so an unseeded shuffle would change it on every kernel restart.
np.random.default_rng(42).shuffle(pairs)
print(len(pairs))
# Unzip the pairs into two parallel tuples.
sentences_en, sentences_es = zip(*pairs)
for i in range(3):
    print(sentences_en[i], '=>', sentences_es[i])

118964
What time is it? => Qué hora es?
I need to know how to do this. => Necesito saber cómo hacer esto.
Please fill this bucket with water. => Por favor, llena este balde con agua.


In [410]:
# Keep only the 1000 most frequent tokens per language.
vocab_size = 1000
# Per the author's note, no sentence in the dataset exceeds 50 words, so fixing the
# output length at 50 standardizes the sequences without losing information.
max_length = 50
text_vec_layer_en = tf.keras.layers.TextVectorization(max_tokens=vocab_size, output_sequence_length=max_length)
text_vec_layer_es = tf.keras.layers.TextVectorization(max_tokens=vocab_size, output_sequence_length=max_length)
text_vec_layer_en.adapt(sentences_en)
# Wrap each Spanish sentence in start/end markers so both markers enter the vocabulary.
text_vec_layer_es.adapt([f'startofseq {s} endofseq' for s in sentences_es])

In [50]:

print(text_vec_layer_en.get_vocabulary()[:10])
print(text_vec_layer_es.get_vocabulary()[:10])

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']
['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']


In [414]:
# Index that separates training pairs from validation pairs.
# NOTE(review): this name shadows Python's built-in `breakpoint()` for the rest of
# the session; consider renaming it together with any later uses.
breakpoint = 100_000
# Tokenization and embedding will be part of the model
x_train = tf.constant(sentences_en[:breakpoint])
x_valid = tf.constant(sentences_en[breakpoint:])
# Decoder inputs: the Spanish sentence prefixed with the start marker.
x_train_decoder = tf.constant([f'startofseq {s}' for s in sentences_es[:breakpoint]])
x_valid_decoder = tf.constant([f'startofseq {s}' for s in sentences_es[breakpoint:]])

# But we tokenize to sparse categories the labels (the model will have softmax)
# Targets: the Spanish sentence followed by the end marker, i.e. the decoder
# input shifted by one token.
y_train = text_vec_layer_es([f'{s} endofseq' for s in sentences_es[:breakpoint]])
y_valid = text_vec_layer_es([f'{s} endofseq' for s in sentences_es[breakpoint:]])

In [421]:
print(y_train[0])
print(y_train[1])

tf.Tensor(
[ 25 140  12   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0], shape=(50,), dtype=int64)
tf.Tensor(
[151 195  90  52  57   3   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0], shape=(50,), dtype=int64)


In [59]:
# Encoder–decoder translation model built with the functional API.
embed_size = 128
# Both inputs are batches of raw strings (scalar string per example).
encoder_input = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_input = tf.keras.layers.Input(shape=[], dtype=tf.string)

# Tokenize each side with its own language-specific vectorizer.
encoder_tokens = text_vec_layer_en(encoder_input)
decoder_tokens = text_vec_layer_es(decoder_input)

# Separate embeddings per language; mask_zero=True masks the padding token (ID 0).
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_tokens)
decoder_embeddings = decoder_embedding_layer(decoder_tokens)

# return_state=True makes the LSTM also return its final states; only those
# states are used below, the encoder output itself is not.
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_output, *encoder_states = encoder(encoder_embeddings)

# The decoder starts from the encoder's final states and emits one output per time step.
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_output = decoder(decoder_embeddings, initial_state=encoder_states)

# Distribution over the Spanish vocabulary at every decoder time step.
output_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')
y_proba = output_layer(decoder_output)

nmt_model = tf.keras.models.Model(inputs=[encoder_input, decoder_input], outputs=[y_proba])

In [60]:
# Targets are integer token IDs, hence the sparse categorical cross-entropy.
nmt_model.compile(
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
    metrics=['accuracy']
)
# The model takes two inputs: the English sentence and the start-prefixed Spanish sentence.
nmt_model.fit(
    (x_train, x_train_decoder), y_train, 
    epochs=10, 
    validation_data=((x_valid, x_valid_decoder), y_valid)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fed8faee190>

In [101]:
def translate(en_input):
    """Greedily translate one English sentence with `nmt_model`.

    The translation is built one word at a time: the words produced so far,
    prefixed with 'startofseq', are fed back as the decoder input and the most
    probable word at the current position is appended. Decoding stops at
    'endofseq' or after `max_length` words.

    Args:
        en_input: the English sentence as a Python string.

    Returns:
        The translated sentence as a space-separated string.
    """
    # The vocabulary does not change while decoding, so look it up once.
    vocabulary = text_vec_layer_es.get_vocabulary()
    en_batch = tf.constant([en_input])
    words = []
    for token_index in range(max_length):
        es_input = tf.constant(['startofseq ' + ' '.join(words)])
        # The shape of predict() ~ [1, 50, 1000]
        # We are interested in the most recently predicted token, i.e. at token_index
        y_proba = nmt_model.predict([en_batch, es_input], verbose=False)[0, token_index]
        next_word = vocabulary[y_proba.argmax()]
        if next_word == 'endofseq':
            break
        words.append(next_word)
    return ' '.join(words).strip()

In [77]:
# Single decoding step: feed only the start marker as decoder input.
y_proba = nmt_model.predict([tf.constant(['how are you?']), tf.constant(['startofseq'])])
# Most probable token ID at each of the output positions.
next_token = y_proba.argmax(axis=-1)
# text_vec_layer_es.get_vocabulary()[next_token]
next_token



array([[90,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1]])

In [109]:
print(translate('I am going to bed'))
print(translate('I like soccer'))
print(translate('I like soccer and going to the beach'))

me voy a la cama
me gusta el fútbol
me gusta ir a [UNK] a las seis y media


# Bidirectional RNNs

In [110]:
# Encoder–decoder translation model with a bidirectional encoder.
# NOTE: this rebinds `nmt_model`, so `translate` uses this model from here on.
embed_size = 128
# Both inputs are batches of raw strings (scalar string per example).
encoder_input = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_input = tf.keras.layers.Input(shape=[], dtype=tf.string)

# Tokenize each side with its own language-specific vectorizer.
encoder_tokens = text_vec_layer_en(encoder_input)
decoder_tokens = text_vec_layer_es(decoder_input)

# Separate embeddings per language; mask_zero=True masks the padding token (ID 0).
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_tokens)
decoder_embeddings = decoder_embedding_layer(decoder_tokens)

# Note the decrease in number of units to 256, as the final output is actually 512
encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_state=True))
encoder_output, *encoder_states = encoder(encoder_embeddings)
# Four states come back (two per direction); concatenate the matching pairs so
# the decoder receives two 512-wide states.
encoder_states = [
    tf.concat(encoder_states[::2], axis=-1),  # short-term state of sub-RNNs (0 & 2)
    tf.concat(encoder_states[1::2], axis=-1)  # long-term state of sub-RNNs (1 & 3)
]

# The decoder starts from the concatenated encoder states and emits one output per time step.
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_output = decoder(decoder_embeddings, initial_state=encoder_states)

# Distribution over the Spanish vocabulary at every decoder time step.
output_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')
y_proba = output_layer(decoder_output)

nmt_model = tf.keras.models.Model(inputs=[encoder_input, decoder_input], outputs=[y_proba])

In [111]:
# Same training setup as the unidirectional model, for a like-for-like comparison.
# Targets are integer token IDs, hence the sparse categorical cross-entropy.
nmt_model.compile(
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
    metrics=['accuracy']
)
nmt_model.fit(
    (x_train, x_train_decoder), y_train, 
    epochs=10, 
    validation_data=((x_valid, x_valid_decoder), y_valid)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fec25ecf510>

In [112]:
print(translate('I am going to bed'))
print(translate('I like soccer'))
print(translate('I like soccer and going to the beach'))

me voy a la cama
me gusta el fútbol
me gusta el fútbol y a la playa


In [133]:
print(translate('He likes reading books'))

a él le gusta leer libros


In [131]:
# Check whether the phrase 'reading books' occurs anywhere in the English
# sentences, printing every sentence that contains it (case-insensitive).
found = False
for x in sentences_en:
    if 'reading books' not in x.lower():
        continue
    print(f'FOUND: {x}')
    found = True
if found is False:
    print('Did not find it')

FOUND: He often sits for many hours reading books.
FOUND: Reading books is interesting.
FOUND: I spent hours reading books.
FOUND: Reading books will make you smarter.
FOUND: Tom enjoys reading books in French.
FOUND: He devoted himself to reading books.
FOUND: I prefer reading books to watching television.
FOUND: I love reading books.
FOUND: I prefer reading books to watching television.
FOUND: Reading books is very interesting.
FOUND: I love reading books.
FOUND: I prefer reading books to watching television.
FOUND: We enjoy reading books.
FOUND: I like reading books.


In [286]:
def beam_search(input, beam_width=3):
    """Translate one English sentence to Spanish using beam search.

    The `beam_width` most likely partial translations are kept. At every step
    all of them are extended with a single batched `nmt_model.predict` call and
    the `beam_width` best candidates (by summed log-probability) survive.
    The search stops when every beam has produced 'endofseq' or when
    `max_length` tokens have been generated.

    Bookkeeping is done per *new* beam slot: each surviving candidate carries
    its own sentence and finished flag, so a finished translation keeps its
    text when it moves to another slot, and finished beams are never extended.

    Args:
        input: the English sentence to translate (a Python string).
        beam_width: number of candidate translations kept at each step.

    Returns:
        The highest-scoring translation as UTF-8 encoded `bytes`, without the
        'startofseq' / 'endofseq' markers. All final beams are printed first.
    """
    # Building the vocabulary list is expensive, so do it once.
    vocabulary = text_vec_layer_es.get_vocabulary()
    en_sentences = tf.constant([input] * beam_width)

    # First words: a single decoder step from 'startofseq' seeds the beams.
    first_probas = nmt_model.predict(
        (tf.constant([input]), tf.constant(['startofseq'])), verbose=False
    )[0, 0, :]
    top_k = tf.math.top_k(first_probas, k=beam_width)
    first_words = [vocabulary[token_id] for token_id in top_k.indices.numpy()]
    finished = np.array([word == 'endofseq' for word in first_words])
    es_sentences = ['' if done else word for word, done in zip(first_words, finished)]
    # Use log probabilities to avoid numerical problems with long multiplications.
    # So instead of P1 * P2, we do log(P1) + log(P2)
    with np.errstate(divide='ignore'):
        sentence_probas = np.log(top_k.values.numpy())  # shape (beam_width,)

    for token_index in range(1, max_length):
        if finished.all():
            break
        decoder_inputs = tf.constant(['startofseq ' + sentence for sentence in es_sentences])
        # predict ~ (beam_width, sequence length, vocabulary size); keep the current step only
        vocab_probas = nmt_model.predict(
            (en_sentences, decoder_inputs), verbose=False
        )[:, token_index]
        with np.errstate(divide='ignore'):
            extend_scores = sentence_probas[:, np.newaxis] + np.log(vocab_probas)
        # A finished beam must not grow any further...
        extend_scores[finished] = -np.inf
        # ...instead it competes through one extra "keep as is" column that
        # holds its unchanged score (-inf for beams that are still open).
        keep_scores = np.where(finished, sentence_probas, -np.inf)[:, np.newaxis]
        candidate_scores = np.concatenate([extend_scores, keep_scores], axis=-1)
        n_columns = candidate_scores.shape[-1]

        # Best `beam_width` candidates over the flattened (beam, column) grid.
        # The stable sort prefers the lower index on ties.
        best_flat = np.argsort(-candidate_scores, axis=None, kind='stable')[:beam_width]

        next_sentences, next_finished = [], []
        for flat_index in best_flat:
            beam_idx, token_id = divmod(int(flat_index), n_columns)
            if token_id == n_columns - 1:
                # "keep as is" column: carry the sentence over unchanged
                next_sentences.append(es_sentences[beam_idx])
                next_finished.append(True)
                continue
            word = vocabulary[token_id]
            if word == 'endofseq':
                next_sentences.append(es_sentences[beam_idx])
                next_finished.append(True)
            else:
                next_sentences.append(es_sentences[beam_idx] + ' ' + word)
                next_finished.append(False)

        sentence_probas = candidate_scores.reshape(-1)[best_flat]
        es_sentences = next_sentences
        finished = np.array(next_finished)

    # Debugging aid: show every surviving beam (as bytes, like a string tensor's .numpy())
    print(np.array([sentence.encode('utf-8') for sentence in es_sentences], dtype=object))
    best_sentence = int(np.argmax(sentence_probas))
    return es_sentences[best_sentence].strip().encode('utf-8')


In [287]:
# Try the beam search on a short sentence; the function prints the final beams
# and returns the best one.
beam_search('I like my car')

[b'me gusta mi coche' b'me gusta mi carro' b'me gusta mi auto']


b'me gusta mi coche'

In [265]:
# Sanity check: `+` on two string tensors concatenates them element-wise.
tf.constant(['hello', 'world']) + tf.constant([' hi', ' bye'])

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'hello hi', b'world bye'], dtype=object)>

In [284]:
def beam_search_book(sentence_en, beam_width, verbose=False):
    """Reference beam search: extends each beam with a separate predict() call.

    Args:
        sentence_en: the English sentence to translate.
        beam_width: number of candidate translations kept after each step.
        verbose: when True, print the candidates kept after every step.

    Returns:
        The best translation found, with the "endofseq" marker removed. If
        `max_length` is reached before every beam has emitted "endofseq", the
        best beam so far is returned (instead of falling through to None).
    """
    # Building the vocabulary list is expensive; fetch it once instead of once
    # per candidate word.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])  # encoder input (the same for every step)
    X_dec = np.array(["startofseq"])  # decoder input
    y_proba = nmt_model.predict((X, X_dec), verbose=False)[0, 0]  # first token's probas
    top_k = tf.math.top_k(y_proba, k=beam_width)
    top_translations = [  # list of best (log_proba, translation)
        (np.log(word_proba), vocabulary[word_id])
        for word_proba, word_id in zip(top_k.values, top_k.indices)
    ]

    # extra code – displays the top first words in verbose mode
    if verbose:
        print("Top first words:", top_translations)

    for idx in range(1, max_length):
        candidates = []
        for log_proba, translation in top_translations:
            if translation.endswith("endofseq"):
                candidates.append((log_proba, translation))
                continue  # translation is finished, so don't try to extend it
            X_dec = np.array(["startofseq " + translation])  # decoder input
            y_proba = nmt_model.predict((X, X_dec), verbose=False)[0, idx]  # last token's proba
            for word_id, word_proba in enumerate(y_proba):
                word = vocabulary[word_id]
                candidates.append((log_proba + np.log(word_proba),
                                   f"{translation} {word}"))
        # Tuples sort by log-probability first, so the best candidates come first
        top_translations = sorted(candidates, reverse=True)[:beam_width]

        # extra code – displays the top translation so far in verbose mode
        if verbose:
            print("Top translations so far:", top_translations)

        if all([tr.endswith("endofseq") for _, tr in top_translations]):
            return top_translations[0][1].replace("endofseq", "").strip()

    # max_length reached with at least one unfinished beam: return the best one so far
    return top_translations[0][1].replace("endofseq", "").strip()

In [285]:
# Run the reference implementation in verbose mode to print the candidates
# kept after each step.
beam_search_book('I like my car', 3, verbose=True)

Top first words: [(-0.0039384337, 'me'), (-6.8156004, 'mi'), (-7.1107097, 'a')]
Top translations so far: [(-0.005191154, 'me gusta'), (-6.997386, 'me [UNK]'), (-7.1742353, 'a mi')]
Top translations so far: [(-0.03985434, 'me gusta mi'), (-3.4733727, 'me gusta el'), (-6.407665, 'me gusta ese')]
Top translations so far: [(-0.62566996, 'me gusta mi coche'), (-1.7064795, 'me gusta mi carro'), (-1.795997, 'me gusta mi auto')]
Top translations so far: [(-0.62573564, 'me gusta mi coche endofseq'), (-1.706535, 'me gusta mi carro endofseq'), (-1.7960483, 'me gusta mi auto endofseq')]


'me gusta mi coche'

# Attention mechanisms

In [290]:
# Encoder-decoder translation model with an attention layer between the
# decoder outputs and the encoder outputs.
embed_size = 128
# Both inputs are raw strings (one scalar string per example).
encoder_input = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_input = tf.keras.layers.Input(shape=[], dtype=tf.string)

# Text -> token ids
encoder_tokens = text_vec_layer_en(encoder_input)
decoder_tokens = text_vec_layer_es(decoder_input)

# mask_zero=True: token id 0 is treated as padding and masked downstream
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_tokens)
decoder_embeddings = decoder_embedding_layer(decoder_tokens)

# Note that we now return sequences
encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_state=True, return_sequences=True))
# The first returned value is the per-step output; the remaining ones are the states.
encoder_output, *encoder_states = encoder(encoder_embeddings)
encoder_states = [
    tf.concat(encoder_states[::2], axis=-1),  # short-term state of sub-RNNs (0 & 2)
    tf.concat(encoder_states[1::2], axis=-1)  # long-term state of sub-RNNs (1 & 3)
]

# 512 units = 2 x 256, matching the size of the concatenated encoder states
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_output = decoder(decoder_embeddings, initial_state=encoder_states)

# Luong attention
# To send the full decoder's states (hidden + long term) we would need to write a custom layer.
# Instead, we simply send the outputs (= hidden states), which in practice works well.
attention_layer = tf.keras.layers.Attention()
# Inputs are [query, value]: the decoder outputs attend over the encoder outputs.
attention_output = attention_layer([decoder_output, encoder_output])

# The softmax is applied to the attention output, not to the raw decoder output.
output_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')
y_proba = output_layer(attention_output)

nmt_model = tf.keras.models.Model(inputs=[encoder_input, decoder_input], outputs=[y_proba])

In [291]:
# Same training recipe as for the plain encoder-decoder: sparse categorical
# cross-entropy, Nadam, 10 epochs, validation split monitored.
nmt_model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
nmt_model.fit(
    x=(x_train, x_train_decoder),
    y=y_train,
    validation_data=((x_valid, x_valid_decoder), y_valid),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fec1a5b6d90>

In [292]:
# Greedy translations with the attention model, on the same sample sentences.
for en_sentence in (
    'I am going to bed',
    'I like soccer',
    'I like soccer and going to the beach',
):
    print(translate(en_sentence))

me voy a la cama
me gusta fútbol
me gusta el fútbol y a a la playa


In [293]:
# Beam-search translations of the same sample sentences, for comparison.
for en_sentence in (
    'I am going to bed',
    'I like soccer',
    'I like soccer and going to the beach',
):
    print(beam_search(en_sentence))

[b'me voy a la cama' b'voy a la cama' b'me [UNK]']
b'me voy a la cama'
[b'me gusta f\xc3\xbatbol' b'me gusta el f\xc3\xbatbol'
 b'me gusta jugar el f\xc3\xbatbol']
b'me gusta f\xc3\xbatbol'
[b'me gusta el f\xc3\xbatbol y a a la playa'
 b'me gusta el f\xc3\xbatbol y a la playa'
 b'me gusta el f\xc3\xbatbol y a ir a la playa']
b'me gusta el f\xc3\xbatbol y a a la playa'


# Transformers

## Positional encodings: trainable embeddings approach (not the one used in the paper)

In [None]:
# Trainable positional embeddings: one learned vector per position index.
max_length = 50
embed_size = 128

# One embedding row per position, so at most `max_length` positions are supported.
pos_embed_layer = tf.keras.layers.Embedding(input_dim=max_length, output_dim=embed_size)
# This gets the sequence length of the encoder's input during training or inference
batch_max_len_enc = tf.shape(encoder_embeddings)[1]
# Embeds the positions 0..seqlen-1 and adds the result to the encoder embeddings.
# Presumably the positional tensor is broadcast across the batch axis - TODO confirm.
encoder_in = encoder_embeddings + pos_embed_layer(tf.range(batch_max_len_enc))

# Remember the decoder's sequence length can be different than the encoder's
batch_max_len_dec = tf.shape(decoder_embeddings)[1]
# The same layer (hence the same learned position vectors) is reused for the decoder's embeddings
decoder_in = decoder_embeddings + pos_embed_layer(tf.range(batch_max_len_dec))


## Positional encodings: fixed positional encodings

In [316]:
class FixedPositionalEncoding(tf.keras.layers.Layer):
    """Adds fixed sinusoidal positional encodings to its inputs.

    For position p and even feature index i, the encoding is
    sin(p / 10000^(i / embed_size)) at index i and
    cos(p / 10000^(i / embed_size)) at index i + 1.
    The table is precomputed once as a constant, so the layer has no weights.

    Args:
        max_length: number of positions to precompute.
        embed_size: size of the last axis of the inputs; must be even.
        dtype: dtype of the layer and of the precomputed table.
        **kwargs: forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        assert embed_size % 2 == 0, "embed_size must be even"
        # Kept so get_config() can describe how to rebuild the layer
        self.max_length = max_length
        self.embed_size = embed_size
        p, i = np.meshgrid(np.arange(max_length), 2 * np.arange(embed_size // 2))
        # Computed once and shared by sin and cos; shape (max_length, embed_size // 2)
        angles = (p / 10_000 ** (i / embed_size)).T
        # Shape of a tensor (1, max_length, embed_size) that will be broadcasted across batches
        pos_emb = np.empty((1, max_length, embed_size))
        pos_emb[0, :, ::2] = np.sin(angles)
        pos_emb[0, :, 1::2] = np.cos(angles)
        self.pos_encoding = tf.constant(pos_emb.astype(self.dtype))
        # Let the incoming mask pass through this layer unchanged
        self.supports_masking = True

    def call(self, inputs):
        """Add the encodings of the first `seq_len` positions to `inputs`."""
        batch_max_len = tf.shape(inputs)[1]
        return inputs + self.pos_encoding[:, :batch_max_len]

    def get_config(self):
        """Return the constructor arguments so a model using this layer can be saved or cloned."""
        config = super().get_config()
        config.update({'max_length': self.max_length, 'embed_size': self.embed_size})
        return config

# A single layer instance is applied to both the encoder and the decoder embeddings.
pos_embed_layer = FixedPositionalEncoding(max_length, embed_size)
encoder_in = pos_embed_layer(encoder_embeddings)
decoder_in = pos_embed_layer(decoder_embeddings)


In [315]:
# Scratch demo of np.meshgrid: `x` varies along columns, `y` along rows.
x, y = np.meshgrid(np.arange(5), 10 + np.arange(6))
for grid in (x, y):
    print(grid)
# Pair every cell as (y, x) on a new last axis, then look at a few cells.
z = np.stack((y, x), axis=-1)
for row, col in ((0, 0), (0, 1), (1, 0)):
    print(z[row, col])


[[0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]]
[[10 10 10 10 10]
 [11 11 11 11 11]
 [12 12 12 12 12]
 [13 13 13 13 13]
 [14 14 14 14 14]
 [15 15 15 15 15]]
[10  0]
[10  1]
[11  0]


In [374]:
# Hyperparameters shared by the whole Transformer cell
max_length = 50
embed_size = 128
vocab_size = 1000

# = The inputs

# Both inputs are raw strings (one scalar string per example).
encoder_input = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_input = tf.keras.layers.Input(shape=[], dtype=tf.string)

# Text -> token ids
encoder_tokens = text_vec_layer_en(encoder_input)
decoder_tokens = text_vec_layer_es(decoder_input)

# mask_zero=True: token id 0 is treated as padding and masked downstream
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_tokens)
decoder_embeddings = decoder_embedding_layer(decoder_tokens)

# Add the fixed positional encodings; the same layer instance serves both sides.
pos_embed_layer = FixedPositionalEncoding(max_length, embed_size)
encoder_in = pos_embed_layer(encoder_embeddings)
decoder_in = pos_embed_layer(decoder_embeddings)

# = The Encoder

# Number of stacked blocks: 2 here instead of 6
N = 2 
num_heads = 8
dropout_rate = 0.1
# For the first dense layer in each feedforward block
# In the paper this increased dimensionality of the embeddings by 4x
# (here it is kept equal to embed_size)
n_units = 128 
# This might not be needed anymore in new versions of Tensorflow
# The last bit transforms the mask from (batch size, encoder sequence length) to
# (batch size, 1, encoder sequence length)
encoder_pad_mask = tf.math.not_equal(encoder_tokens, 0)[:, tf.newaxis]
Z = encoder_in
for _ in range(N):
    # Input of the block, kept for the residual (skip) connection
    skip = Z
    # == Multi-head self-attention
    # NOTE(review): key_dim is the size of EACH head, so key_dim=embed_size gives
    # 8 heads of size 128 rather than embed_size / num_heads - confirm this is intended.
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, 
        key_dim=embed_size, 
        dropout=dropout_rate
    )
    # Since `key` is not provided, uses the value
    # attention_mask is of shape (batch size, query sequence length, keys sequence length)
    # The provided mask is of shape (batch size, 1, keys sequence length), so is broadcasted.
    # However, the layer computes output for all query tokens, which in this case are the same as the keys. 
    # I.e. the output will contain the results of padded encoder tokens, which should be ignored...
    Z = attn_layer(Z, value=Z, attention_mask=encoder_pad_mask)
    # ... the Add layer in Keras propagates the mask of the layers. If any mask cell is False, the 
    # aggregate mask is considered False. This solves the aforementioned issue.
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    skip = Z
    # == Feed-forward
    Z = tf.keras.layers.Dense(n_units, activation='relu')(Z)
    # Project back to embed_size so the residual addition below has matching shapes
    Z = tf.keras.layers.Dense(embed_size)(Z)
    Z = tf.keras.layers.Dropout(dropout_rate)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

# Output of the last encoder block; used as `value` by the decoder's cross-attention
encoder_output = Z

# = The Decoder

# The sequence length of the decoder
batch_max_len_dec = tf.shape(decoder_in)[1]

# True for real tokens, False for padding (token id 0); the inserted axis makes
# it (batch size, 1, decoder seqlen) so it can broadcast over the query axis.
decoder_pad_mask = tf.math.not_equal(decoder_tokens, 0)[:, tf.newaxis]
# The causal mask shape is (decoder seqlen, decoder seqlen).
# It will be broadcasted across all batches to (batch size, decoder seqlen, decoder seqlen)
# Lower-triangular: each position may attend only to itself and to earlier positions.
causal_mask = tf.linalg.band_part(
    tf.ones((batch_max_len_dec, batch_max_len_dec), tf.bool), 
    # Take all values below the diagonal
    num_lower=-1,
    # Take none of the values above the diagonal
    num_upper=0
)

Z = decoder_in
for _ in range(N):
    # Input of the block, kept for the residual (skip) connection
    skip = Z
    # == Masked multi-head self-attention
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, 
        key_dim=embed_size, 
        dropout=dropout_rate
    )
    # Combined mask: a position is attended to only if it is not padding AND not in the future
    Z = attn_layer(Z, value=Z, attention_mask=decoder_pad_mask & causal_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    skip = Z
    # == Multi-head cross-attention
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_size,
        dropout=dropout_rate
    )
    # Note we're using encoder_pad_mask, as this is the value space it returns
    # (queries come from the decoder, keys/values from the encoder output)
    Z = attn_layer(Z, value=encoder_output, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    skip = Z
    # == Feed-forward
    # NOTE(review): unlike the encoder's feed-forward block, no Dropout layer is
    # applied here - confirm this is intentional.
    Z = tf.keras.layers.Dense(n_units, activation='relu')(Z)
    Z = tf.keras.layers.Dense(embed_size)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

# Softmax over the vocabulary for every decoder position
y_proba = tf.keras.layers.Dense(vocab_size, activation='softmax')(Z)

transformer_model = tf.keras.models.Model(inputs=[encoder_input, decoder_input], outputs=[y_proba])
transformer_model.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_28 (InputLayer)          [(None,)]            0           []                               
                                                                                                  
 input_27 (InputLayer)          [(None,)]            0           []                               
                                                                                                  
 text_vectorization_3 (TextVect  (None, 50)          0           ['input_28[0][0]']               
 orization)                                                                                       
                                                                                                  
 text_vectorization_2 (TextVect  (None, 50)          0           ['input_27[0][0]']         

In [375]:
# Compile with sparse categorical cross-entropy and a default Nadam optimizer,
# then train for 10 epochs and keep the History object in `hist`.
transformer_model.compile(
    optimizer=tf.keras.optimizers.Nadam(),
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=[tf.keras.metrics.sparse_categorical_accuracy],
)
hist = transformer_model.fit(
    x=(x_train, x_train_decoder),
    y=y_train,
    validation_data=((x_valid, x_valid_decoder), y_valid),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [380]:
def translate(en_input, model=None):
    """Greedily translate one English sentence to Spanish.

    Starting from 'startofseq', the model is asked for the next word given the
    words produced so far, and the most likely word is appended. This repeats
    until 'endofseq' is predicted or `max_length` words have been produced.

    Args:
        en_input: the English sentence to translate (a Python string).
        model: the translation model to use; defaults to `transformer_model`.
            It must accept `[english, spanish_prefix]` string inputs.

    Returns:
        The Spanish translation, without the start/end markers.
    """
    if model is None:
        model = transformer_model
    # Building the vocabulary list is expensive; fetch it once, not once per word.
    vocabulary = text_vec_layer_es.get_vocabulary()
    es_output = ''
    en_input = tf.constant([en_input])
    for token_index in range(max_length):
        es_input = tf.constant([f'startofseq {es_output}'])
        # The shape of predict() ~ [1, 50, 1000]
        # We are interested in the most recently predicted token, i.e. at token_index
        y_proba = model.predict([en_input, es_input], verbose=False)[0, token_index]
        next_word = vocabulary[y_proba.argmax()]
        if next_word == 'endofseq':
            break
        es_output = f'{es_output} {next_word}'
    return es_output.strip()

# Spot-check the Transformer on the usual sample sentences.
for en_sentence in (
    'I am going to bed',
    'I like soccer',
    'I like soccer and going to the beach',
):
    print(translate(en_sentence))

voy a la cama
me gusta el fútbol
me gusta el fútbol y la playa


# Hugging Face

## Using the pipeline

In [381]:
from transformers import pipeline
# sentiment-analysis is the task. Since no model is specified, it downloads the default one.
# NOTE(review): the library warns (see the cell output) that an unpinned pipeline is not
# recommended in production; pass `model=` and `revision=` for reproducible results.
classifier = pipeline('sentiment-analysis')
# The pipeline is called directly on raw text.
result = classifier('The actors were very convincing')
result

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998143315315247}]

In [388]:
# Probe the classifier with sentences that differ only in the country name.
for probe in (
    'I am from USA',
    'I am from Israel',
    'I am from Austria',
    'I am from Germany',
    'I am from Iraq',
):
    print(classifier(probe))

[{'label': 'POSITIVE', 'score': 0.9769665598869324}]
[{'label': 'POSITIVE', 'score': 0.987899124622345}]
[{'label': 'POSITIVE', 'score': 0.9868885278701782}]
[{'label': 'POSITIVE', 'score': 0.8568997979164124}]
[{'label': 'NEGATIVE', 'score': 0.9706069231033325}]


In [389]:
# Here we specify the task & model.
model_name = 'huggingface/distilbert-base-uncased-finetuned-mnli'
classifier_mnli = pipeline('text-classification', model=model_name)
# The two sentences to compare are joined with the [SEP] token in a single string.
classifier_mnli('She loves me. [SEP] She loves me not.')

Downloading (…)lve/main/config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/58.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[{'label': 'contradiction', 'score': 0.9790192246437073}]

## Using classes

In [391]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
# Load the tokenizer and the TensorFlow model for the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# padding=True pads the shorter input (zeros in input_ids and attention_mask, see the output);
# return_tensors='tf' returns TensorFlow tensors.
token_ids = tokenizer(['I like soccer. [SEP] We all love soccer!',
    'Joe lived for a very long time. [SEP] Joe is old.'], 
    padding = True, return_tensors='tf'
)

token_ids

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


{'input_ids': <tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[ 101, 1045, 2066, 4715, 1012,  102, 2057, 2035, 2293, 4715,  999,
         102,    0,    0,    0],
       [ 101, 3533, 2973, 2005, 1037, 2200, 2146, 2051, 1012,  102, 3533,
        2003, 2214, 1012,  102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [392]:
# Calling the model on the tokenizer output returns raw logits:
# one row per input pair, one column per class.
output = model(token_ids)
output

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[-2.1110213 ,  1.178814  ,  1.4085091 ],
       [-0.01528974,  1.0974947 , -0.9926476 ]], dtype=float32)>, hidden_states=None, attentions=None)

In [395]:
# Turn the logits into class probabilities, then pick the most likely class per row.
y_probas = tf.keras.activations.softmax(output.logits)
print(y_probas)
y_pred = tf.argmax(y_probas, axis=1)
# NOTE(review): the id -> label mapping below is assumed; confirm against model.config.id2label
y_pred # 0 = contradiction, 1 = entailment, 2 = neutral

tf.Tensor(
[[0.01623192 0.43563944 0.54812866]
 [0.22628923 0.6885572  0.08515355]], shape=(2, 3), dtype=float32)


<tf.Tensor: shape=(2,), dtype=int64, numpy=array([2, 1])>

## Fine-tuning a pretrained model

In [406]:
# Each tuple is one sentence pair; the tokenizer joins the two sentences with a
# separator token (id 102 in the printed input_ids).
sentences = [('Sky is blue', 'Sky is red'), ('I love her', 'She loves me')]
# Without the `data` call, it returns BatchEncoding class, which cannot be used to train the model
# Calling `data` returns a dict
# NOTE(review): `x_train` / `y_train` reuse the names of the translation training data
# passed to nmt_model.fit / transformer_model.fit above; re-running those cells after
# this one would train on the wrong data.
x_train = tokenizer(sentences, padding=True, return_tensors='tf').data
print(x_train)
y_train = tf.constant([0, 2]) # Contradiction, Neutral
# The model outputs logits, not probabilities - so our loss function needs to know that
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer='nadam', metrics=['accuracy'])
history = model.fit(x_train, y_train, epochs=2)

{'input_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
array([[ 101, 3712, 2003, 2630,  102, 3712, 2003, 2417,  102],
       [ 101, 1045, 2293, 2014,  102, 2016, 7459, 2033,  102]],
      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}
Epoch 1/2
Epoch 2/2


---

# Ignore below

In [324]:
# ixed_positional_encoding_4 (F  (None, 50, 128)     0           ['embedding_20[0][0]']           
#  ixedPositionalEncoding)                                                                          
                                                                                                  
#  multi_head_attention_6 (MultiH  (None, 50, 128)     527488      ['fixed_positional_encoding_4[0][
#  eadAttention)                                                   0]',                             
#                                                                   'fixed_positional_encoding_4[0][

# Got it!
# We have 3 linear projections, applied to the queries, keys, and values.
# Each of the num_heads=8 heads gets its own key_dim=128 projection of the final axis (embed_size=128), so the projected width is 128 * 8 = 1024.
# So there is a normal Dense transformation from (32, 50, 128) => (32, 50, 1024) using a (128, 1024) matrix with 128*1024+1024 parameters.
# This is done 3 times. Then, after the scaled attention, we have the reverse transformation from (32, 50, 1024) to (32, 50, 128).
# This is done again using a normal Dense transformation using a (1024, 128) matrix with 1024*128+128 parameters.
# Total: 3 input projections + 1 output projection
(128 * (128 * 8) + (128 * 8)) * 3 + ((128 * 8) * 128 + 128)

527488

In [327]:
# Scratch demo: indexing with tf.newaxis inserts a length-1 axis at that position.
tensor = tf.constant([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [9, 8, 7]
])

print(tensor.shape)
# (4, 3) -> (4, 1, 3): the same indexing used to build the padding masks
tensor2 = tensor[:, tf.newaxis]
print(tensor2.shape)

(4, 3)
(4, 1, 3)


In [351]:
# Scratch demo: how the Add layer combines the masks of its inputs.
# Two token sequences with a different amount of zero padding.
input_a = tf.constant([[1, 2, 0, 0, 0]])
input_b = tf.constant([[1, 0, 0, 0, 0]])

print(input_a.shape, input_b.shape)

# mask_zero=True makes each Embedding layer attach a mask that is False where the token id is 0
embeddings_layer_a = tf.keras.layers.Embedding(input_dim=3, output_dim=5, mask_zero=True)
embeddings_a = embeddings_layer_a(input_a)

embeddings_layer_b = tf.keras.layers.Embedding(input_dim=3, output_dim=5, mask_zero=True)
embeddings_b = embeddings_layer_b(input_b)

# The resulting mask is True only where BOTH input masks are True (see the printed output)
output = tf.keras.layers.Add()([embeddings_a, embeddings_b])
print(output._keras_mask)

output.shape

(1, 5) (1, 5)
tf.Tensor([[ True False False False False]], shape=(1, 5), dtype=bool)


TensorShape([1, 5, 5])