In [114]:
import re
import string
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers, activations, optimizers, losses, metrics

## Text Vectorization

In [10]:
import string

class Vectorizer:
    '''A dummy text vectorizer'''

    def standardize(self, text:str) -> str:
        text = text.lower()
        text_without_punctuations = [char for char in text if char not in string.punctuation]
        return ''.join(text_without_punctuations)
    
    def tokenize(self, text:str) -> str:
        text = self.standardize(text)
        return text.split()
    
    def make_vocabulary(self, dataset):
        self.vocabulary = {'' : 0,
                           '[UNK]' : 1}
        for text in dataset:
            text = self.standardize(text)
            tokens = self.tokenize(text)

            for token in tokens:
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        
        self.inverse_vocabulary = {value: key for key, value in self.vocabulary.items()}
    
    def encode(self, text:str) -> list:
        text = self.standardize(text)
        tokens = self.tokenize(text)
        return [self.vocabulary.get(token, 1) for token in tokens]
    
    def decode(self, sequence:list) -> str:
        return ' '.join([self.inverse_vocabulary.get(item, '[UNK]') for item in sequence])

In [32]:
dataset = ['I write, erase, rewrite', 'Erase again and then', 'A poppy blooms']

vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)

vectorizer.vocabulary

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [12]:
sample = 'I write, rewrite, and still rewrite again'
encoded = vectorizer.encode(sample)
encoded

[2, 3, 5, 7, 1, 5, 6]

In [13]:
decoded = vectorizer.decode(encoded)
decoded

'i write rewrite and [UNK] rewrite again'

In [18]:
# The keras TextVectorization layer

text_vectorization = keras.layers.TextVectorization(output_mode='int')

In [23]:
# Custom standardizer and tokenizer for TextVectorization

def custom_standardization_function(text:str) -> tf.strings:
    text = tf.strings.lower(text)
    text_without_punctuations = tf.strings.regex_replace(text, f'[{re.escape(string.punctuation)}]', '')
    return text_without_punctuations

def custom_tokenizer(text:tf.strings) -> tf.Tensor:
    return tf.strings.split(text)

text_vectorization = keras.layers.TextVectorization(output_mode='int',
                                                    standardize=custom_standardization_function,
                                                    split=custom_tokenizer)

text_vectorization.adapt(dataset)
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [24]:
sample = 'I write, rewrite, and still rewrite again'
text_vectorization(sample)

<tf.Tensor: shape=(7,), dtype=int64, numpy=array([ 7,  3,  5,  9,  1,  5, 10])>

In [None]:
# Note: TextVectorization cannot utilize GPU as it is mostly a dictionary lookup operation

# Approach 1 (performant): Put it in the tf.data pipeline. This happens asynchronously. This enables CPU to perform vectorization on batch_n+1 while GPU is training a batch_n.
sequences = dataset.map(text_vectorization, num_parallel=4)

# Approach 2 (flexible): Put is as a layer in the model. The GPU has to wait for this operation for every batch. However, this is easier to deploy in production environments, otherwise you might have to incorporate the logic in Javascript into the deployed model.
text_input = keras.Input(shape=(), dtype='string')
vectorized_text = keras.layers.TextVectorization(text_input)
embedded_input = keras.layers.Embedding(...)(vectorized_text)
output = ...
model = keras.Model(text_input, output)

## Dataset preparation

In [44]:
# # Collect and uncompress the ACL IMDB Movie Reviews dataset

# !wget 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
# !tar -xf aclImdb_v1.tar.gz

# # Delete the Unsupervised training samples, we don't need them

# !rm -r aclImdb/train/unsup

# # Move it to data
# !mv aclImdb data/aclImdb

--2023-12-19 07:26:28--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2023-12-19 07:26:52 (3.29 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [45]:
import os, pathlib, shutil, random

base_dir = pathlib.Path('data/aclImdb')
val_dir = base_dir / 'val'
train_dir = base_dir / 'train'

for category in ('neg', 'pos'):
    os.makedirs(val_dir / category)
    files = os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname, 
                    val_dir / category / fname)

In [46]:
batch_size = 32

train_ds = keras.utils.text_dataset_from_directory('data/aclImdb/train', batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory('data/aclImdb/val', batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory('data/aclImdb/test', batch_size=batch_size)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [59]:
def snapshot(dataset):
    for inputs, targets in dataset:
        print('inputs.shape:', inputs.shape)
        print('inputs.dtype:', inputs.dtype)
        print('targets.shape:', targets.shape)
        print('targets.dtype:', targets.dtype)
        print('inputs[0]:', inputs[0])
        print('targets[0]:', targets[0])
        break

In [60]:
snapshot(train_ds)

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'Although I love this movie, I can barely watch it, it is so real. So, I put it on tonight and hid behind my bank of computers. I remembered it vividly, but just wanted to see if I could find something I hadn\'t seen before........I didn\'t: that\'s because it\'s so real to me.<br /><br />Another "user" wrote the ages of the commentators should be shown with their summary. I\'m all for that ! It\'s absolutely obvious that most of these people who\'ve made comments about "Midnight Cowboy" may not have been born when it was released. They are mentioning other movies Jon Voight and Dustin Hoffman have appeared in, at a later time. I\'ll be just as ruinously frank: I am 82-years-old. If you\'re familiar with some of my other comments, you\'ll be aware that I was a professional female-impersonator for 60 of those years, and also have appeared in film - you\'d never 

## Two text representations: Ordered Sequences (Sequence Models) and Unordered Sets (Bag of Words)

### Bag of Words

In [57]:
# Let's try the Bag of Words approach first, using unigrams

text_vectorization = TextVectorization(max_tokens=20000,                      # Limit vocabulary to 20000 most frequent words
                                       output_mode='multi_hot')               # Encode output tokens as multi-hot binary vectors

text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

binary_1gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

In [61]:
snapshot(binary_1gram_train_ds)

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


In [84]:
def get_bow_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation=activations.relu)(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation=activations.sigmoid)(x)

    model = keras.Model(inputs, outputs)

    model.compile(optimizer=optimizers.RMSprop(),
                loss=losses.BinaryCrossentropy(),
                metrics=[metrics.BinaryAccuracy()])
    
    return model


In [85]:
model = get_bow_model()
model.summary()



Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_6 (Dense)             (None, 16)                320016    
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [67]:
callbacks = [keras.callbacks.ModelCheckpoint('ckpts/binary_1gram.keras')]

model.fit(binary_1gram_train_ds.cache(),
          validation_data=binary_1gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x28de170d0>

In [76]:
model = keras.models.load_model('ckpts/binary_1gram.keras')
model.evaluate(binary_1gram_test_ds)[1]





0.8791600465774536

In [86]:
# Bag of words, using 2-grams

text_vectorization = TextVectorization(ngrams=2,
                                       max_tokens=20000,
                                       output_mode='multi_hot')

text_vectorization.adapt(text_only_train_ds)

binary_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

model = get_bow_model()

model.summary()



Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_8 (Dense)             (None, 16)                320016    
                                                                 
 dropout_4 (Dropout)         (None, 16)                0         
                                                                 
 dense_9 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [79]:
callbacks = [keras.callbacks.ModelCheckpoint('ckpts/binary_2gram.keras', save_best_only=True)]

model.fit(binary_2gram_train_ds.cache(),
          validation_data=binary_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2fcbd9910>

In [87]:
model = keras.models.load_model('ckpts/binary_2gram.keras')
model.evaluate(binary_2gram_test_ds)[1]





0.8946800827980042

In [88]:
# Bag of words, using 2-grams and TF-IDF encoding

text_vectorization = TextVectorization(ngrams=2,
                                       max_tokens=20000,
                                       output_mode='tf_idf')

text_vectorization.adapt(text_only_train_ds)

tfidf_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

model = get_bow_model()

callbacks = [keras.callbacks.ModelCheckpoint('ckpts/tfidf_2gram.keras', save_best_only=True)]

model.fit(tfidf_2gram_train_ds.cache(),
          validation_data=tfidf_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x28fd64b50>

In [89]:
model = keras.models.load_model('ckpts/tfidf_2gram.keras')
model.evaluate(tfidf_2gram_test_ds)[1]





0.8961600661277771

In [101]:
# Example of using text vectorization as part of a model training pipeline (Approach 2 as mentioned before)

inputs = keras.Input(shape=(1,), dtype='string')
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)

inference_model = keras.Model(inputs, outputs)

raw_text_data = tf.convert_to_tensor([['That was an excellent movie, I love it!'],])
predictions = inference_model(raw_text_data)

print(f'{float(predictions[0] * 100):.3f}% positive')

83.285% positive


### Sequence Models

In [102]:
max_length = 600
max_tokens = 20000

text_vectorization = layers.TextVectorization(max_tokens=max_tokens,
                                              output_mode='int',
                                              output_sequence_length=max_length)
text_vectorization.adapt(text_only_train_ds)

int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

snapshot(int_train_ds)

inputs.shape: (32, 600)
inputs.dtype: <dtype: 'int64'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(
[    4   244    37  4231  3335  2720 12922     3    40   512  1541 12923
    26  1602    76     4   378   492     3   217   825    17     4   373
  1708  3941     1    65  7866   611     4  1290   247     1 18668     3
    29   533     5     4   199     1    13   861    38  2420    11     7
     4    85    20    18    37  2305    30    38    53   252    13    13
  1126   181  1206  9262   603    81  4337    46    43  5711  3335   208
    85  2982   208   100    73  3361     6     1    13    38   409    23
    77   347    11     4   169     9   468   821    23  1195    23  2284
    23    41  4213    23     8    98    97    18     9    77   384    23
  2311     3  1092     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0 

In [103]:
inputs = keras.Input(shape=(None,), dtype='int64')
embedded = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation=activations.sigmoid)(x)
model = keras.Model(inputs, outputs)

model.compile(optimizer=optimizers.RMSprop(),
              loss=losses.BinaryCrossentropy(),
              metrics=[metrics.BinaryAccuracy()])
model.summary()



Model: "model_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_20 (InputLayer)       [(None, None)]            0         
                                                                 
 tf.one_hot (TFOpLambda)     (None, None, 20000)       0         
                                                                 
 bidirectional (Bidirection  (None, 64)                5128448   
 al)                                                             
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                                 
 dense_12 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5128513 (19.56 MB)
Trainable params: 5128513 (19.56 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [104]:
callbacks = [keras.callbacks.ModelCheckpoint('ckpts/one_hot_bidirectional_lstm.keras', save_best_only=True)]

model.fit(int_train_ds,
          epochs=10,
          validation_data=int_val_ds,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2de258290>

In [106]:
model = keras.models.load_model('ckpts/one_hot_bidirectional_lstm.keras')
model.evaluate(int_test_ds)[1]





0.8658800721168518

## Word Embeddings

In [117]:
inputs = keras.Input(shape=(None,), dtype='int64')
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation=activations.sigmoid)(x)
model = keras.Model(inputs, outputs)

model.compile(optimizer=optimizers.RMSprop(),
              loss=losses.BinaryCrossentropy(),
              metrics=[metrics.BinaryAccuracy()])
model.summary()



Model: "model_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_24 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 256)         5120000   
                                                                 
 bidirectional_4 (Bidirecti  (None, 64)                73984     
 onal)                                                           
                                                                 
 dropout_10 (Dropout)        (None, 64)                0         
                                                                 
 dense_16 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5194049 (19.81 MB)
Trainable params: 5194049 (19.81 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [118]:
callbacks = [keras.callbacks.ModelCheckpoint('ckpts/embeddings_bidirectional_lstm.keras', save_best_only=True)]

model.fit(int_train_ds,
          epochs=10,
          validation_data=int_val_ds,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0xb044db0d0>

In [109]:
model = keras.models.load_model('ckpts/embeddings_bidirectional_lstm.keras')
model.evaluate(int_test_ds)[1]





0.8593200445175171

In [168]:
# Masking (mask parameter in embedding layer) is used to mask out trailing zeros, stopping the model from learning false representations from them

inputs = keras.Input(shape=(None,), dtype='int64')
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation=activations.sigmoid)(x)
model = keras.Model(inputs, outputs)

model.compile(optimizer=optimizers.RMSprop(),
              loss=losses.BinaryCrossentropy(),
              metrics=[metrics.BinaryAccuracy()])
model.summary()



Model: "model_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_39 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_16 (Embedding)    (None, None, 256)         5120000   
                                                                 
 bidirectional_10 (Bidirect  (None, 64)                73984     
 ional)                                                          
                                                                 
 dropout_16 (Dropout)        (None, 64)                0         
                                                                 
 dense_22 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5194049 (19.81 MB)
Trainable params: 5194049 (19.81 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [169]:
callbacks = [keras.callbacks.ModelCheckpoint('ckpts/embeddings_bidirectional_lstm_masking.keras', save_best_only=True)]

model.fit(int_train_ds,
          epochs=10,
          validation_data=int_val_ds,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x9b7ab3f10>

In [121]:
model = keras.models.load_model('ckpts/embeddings_bidirectional_lstm_masking.keras')
model.evaluate(int_test_ds)[1]





0.5034400224685669

## Pretrained embeddings

In [131]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d data/glove
!rm glove.6B.zip
!rm data/glove/glove.6B.50d.txt
!rm data/glove/glove.6B.200d.txt
!rm data/glove/glove.6B.300d.txt

--2023-12-25 20:30:10--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-12-25 20:30:10--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-12-25 20:30:10--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [157]:
# We will use Glove embeddings

path_to_embeddings = './data/glove/glove.6B.100d.txt'

embeddings_index = {}

with open(path_to_embeddings) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

len(embeddings_index)

400000

In [158]:
embedding_dim = 100

vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))
embedding_matrix = np.zeros((max_tokens, embedding_dim))

for word, i in word_index.items():
    if i < max_tokens:
        embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [162]:
embedding_layer = layers.Embedding(max_tokens, 
                                   embedding_dim, 
                                   embeddings_initializer=keras.initializers.Constant(embedding_matrix), 
                                   trainable=False, 
                                   mask_zero=False)

In [163]:
inputs = keras.Input(shape=(None,), dtype='int64')
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation=activations.sigmoid)(x)
model = keras.Model(inputs, outputs)

model.compile(optimizer=optimizers.RMSprop(),
              loss=losses.BinaryCrossentropy(),
              metrics=[metrics.BinaryAccuracy()])
model.summary()



Model: "model_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_37 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_14 (Embedding)    (None, None, 100)         2000000   
                                                                 
 bidirectional_8 (Bidirecti  (None, 64)                34048     
 onal)                                                           
                                                                 
 dropout_14 (Dropout)        (None, 64)                0         
                                                                 
 dense_20 (Dense)            (None, 1)                 65        
                                                                 
Total params: 2034113 (7.76 MB)
Trainable params: 34113 (133.25 KB)
Non-trainable params: 2000000 (7.63 MB)
________________

In [164]:
callbacks = [keras.callbacks.ModelCheckpoint('ckpts/glove_embedding_sequence_model.keras', save_best_only=True)]

model.fit(int_train_ds,
          epochs=10,
          validation_data=int_val_ds,
          callbacks=callbacks)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x9c8bdc850>

In [165]:
model = keras.models.load_model('ckpts/glove_embedding_sequence_model.keras')
model.evaluate(int_test_ds)[1]





0.8658800721168518

## Self Attention

In [171]:
# Pseudocode

def self_attention(input_sequence):
    
    output = np.zeros_like(input_sequence)

    for i, pivot_vector in enumerate(input_sequence):
        scores = np.zeros(shape=(len(input_sequence),))

        for j, vector in enumerate(input_sequence):
            scores[j] = np.dot(pivot_vector, vector.T)
        
        scores /= np.sqrt(input_sequence.shape[1])
        scores = np.softmax(scores)

        new_pivot_representation = np.zeros_like(pivot_vector)

        for j, vector in enumerate(input_sequence):
            new_pivot_representation += vector * scores[j]
        
        output[i] = new_pivot_representation

    return output

In [None]:
# Keras way: the Multi-head Attention Layer

num_heads = 4
embed_dim = 256
mha_layer = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
outputs = mha_layer(inputs, inputs, inputs)

## Transformer

In [178]:
# The Transformer Encoder: A useful generic encoder that is useful for learning representations

@keras.saving.register_keras_serializable()
class TransformerEncoder(layers.Layer):

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([layers.Dense(dense_dim, activation=activations.relu),
                                            layers.Dense(embed_dim)])
        self.layernorm_1 = layers.LayerNormalization()                      
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)

        return self.layernorm_2(proj_output)
    
    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'dense_dim': self.dense_dim
        })
        return config

In [174]:
# We use layer normalization (each sequence in batch normalized independently for a layer) for sequence data because batch normalization does not work well for sequences.

def layer_normalization(batch_of_sequences):                          # Input shape: (batch_size, sequence_length, embedding_dim)
    mean = np.mean(batch_of_sequences, keepdims=True, axis=-1)
    variance = np.var(batch_of_sequences, keepdims=True, axis=-1)
    
    return (batch_of_sequences - mean) / variance

def batch_normalization(batch_of_images):                             # Input shape: (batch_size, height, width, channels)
    mean = np.mean(batch_of_images, keepdims=True, axis=(0, 1, 2))
    variance = np.var(batch_of_images, keepdims=True, axis=(0, 1, 2))
    
    return (batch_of_images - mean) / variance

In [180]:
vocab_size = 20000
embed_dim = 256
num_heads = 2
dense_dim = 32

inputs = keras.Input(shape=(None,), dtype='int64')
x = layers.Embedding(vocab_size, embed_dim)(inputs)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
x = layers.GlobalMaxPooling1D()(x)                            # TransformerEncoder returns full sequences. We need to reduce each sequence to a vector for classification.
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation=activations.sigmoid)(x)

model = keras.Model(inputs, outputs)

model.compile(optimizer=optimizers.RMSprop(),
              loss=losses.BinaryCrossentropy(),
              metrics=[metrics.BinaryAccuracy()])

model.summary()



Model: "model_30"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_41 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_18 (Embedding)    (None, None, 256)         5120000   
                                                                 
 transformer_encoder_1 (Tra  (None, None, 256)         543776    
 nsformerEncoder)                                                
                                                                 
 global_max_pooling1d_1 (Gl  (None, 256)               0         
 obalMaxPooling1D)                                               
                                                                 
 dropout_18 (Dropout)        (None, 256)               0         
                                                                 
 dense_28 (Dense)            (None, 1)                 257

In [181]:
callbacks = [keras.callbacks.ModelCheckpoint('ckpts/transformer_encoder.keras', save_best_only=True)]

model.fit(int_train_ds,
          epochs=20,
          validation_data=int_val_ds,
          callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x9c9c7bd50>

In [182]:
model = keras.models.load_model('ckpts/transformer_encoder.keras')
model.evaluate(int_test_ds)[1]





0.859760046005249

In [186]:
# Tranformer Encoder with glove embeddings

vocab_size = 20000
embed_dim = 100
num_heads = 2
dense_dim = 32

vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))
embedding_matrix = np.zeros((max_tokens, embed_dim))

for word, i in word_index.items():
    if i < max_tokens:
        embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

inputs = keras.Input(shape=(None,), dtype='int64')
x = layers.Embedding(vocab_size, 
                     embed_dim, 
                     embeddings_initializer=keras.initializers.Constant(embedding_matrix), 
                     trainable=False, 
                     mask_zero=False)(inputs)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
x = layers.GlobalMaxPooling1D()(x)                            # TransformerEncoder returns full sequences. We need to reduce each sequence to a vector for classification.
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation=activations.sigmoid)(x)

model = keras.Model(inputs, outputs)

model.compile(optimizer=optimizers.RMSprop(),
              loss=losses.BinaryCrossentropy(),
              metrics=[metrics.BinaryAccuracy()])

model.summary()



Model: "model_31"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_44 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_21 (Embedding)    (None, None, 100)         2000000   
                                                                 
 transformer_encoder_2 (Tra  (None, None, 100)         87632     
 nsformerEncoder)                                                
                                                                 
 global_max_pooling1d_2 (Gl  (None, 100)               0         
 obalMaxPooling1D)                                               
                                                                 
 dropout_19 (Dropout)        (None, 100)               0         
                                                                 
 dense_33 (Dense)            (None, 1)                 101

In [187]:
callbacks = [keras.callbacks.ModelCheckpoint('ckpts/transformer_encoder_glove.keras', save_best_only=True)]

model.fit(int_train_ds,
          epochs=20,
          validation_data=int_val_ds,
          callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x9ab00bd90>

In [188]:
model = keras.models.load_model('ckpts/transformer_encoder_glove.keras')
model.evaluate(int_test_ds)[1]





0.8664000630378723

## Positional Embeddings

In [219]:
# Positional Embeddings allow us to incorporate sequential structure in the self attention, which is a set operation.
# One downside is that sequence length needs to be known in advance.

@keras.saving.register_keras_serializable()
class PositionalEmbedding(layers.Layer):

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim=input_dim,
                                                 output_dim=output_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length,
                                                    output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0,
                             limit=length,
                             delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)

        return embedded_tokens + embedded_positions
    
    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)
    
    def get_config(self):
        config = super().get_config()
        config.update({'output_dim': self.output_dim,
                       'sequence_length': self.sequence_length,
                       'input_dim': self.input_dim}) 
        return config

In [220]:
vocab_size = 20000
sequence_length = 600
embed_dim = 256
num_heads = 2
dense_dim = 32

inputs = keras.Input(shape=(None,), dtype='int64')
x = PositionalEmbedding(sequence_length=sequence_length,
                        input_dim=vocab_size,
                        output_dim=embed_dim)(inputs)
x = TransformerEncoder(embed_dim=embed_dim,
                       dense_dim=dense_dim,
                       num_heads=num_heads)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation=activations.sigmoid)(x)

model = keras.Model(inputs, outputs)

model.compile(optimizer=optimizers.RMSprop(),
              loss=losses.BinaryCrossentropy(),
              metrics=[metrics.BinaryAccuracy()])

model.summary()



Model: "model_48"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_63 (InputLayer)       [(None, None)]            0         
                                                                 
 positional_embedding_18 (P  (None, None, 256)         5273600   
 ositionalEmbedding)                                             
                                                                 
 transformer_encoder_19 (Tr  (None, None, 256)         543776    
 ansformerEncoder)                                               
                                                                 
 global_average_pooling1d_1  (None, 256)               0         
 2 (GlobalAveragePooling1D)                                      
                                                                 
 dropout_36 (Dropout)        (None, 256)               0         
                                                          

In [221]:
callbacks = [keras.callbacks.ModelCheckpoint('ckpts/full_transformer_encoder.keras', save_best_only=True)]
model.fit(int_train_ds,
          epochs=20,
          validation_data=int_val_ds,
          callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x70638f1f10>

In [223]:
model = keras.models.load_model('ckpts/full_transformer_encoder.keras')
model.evaluate(int_test_ds)[1]





0.8667600750923157

## Sequence to Sequence: Translating English to Spanish

In [225]:
!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
!unzip -q spa-eng.zip -d data
!rm spa-eng.zip

--2023-12-30 18:59:10--  http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.65.251, 142.250.81.251, 142.250.72.123, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.65.251|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2638744 (2.5M) [application/zip]
Saving to: ‘spa-eng.zip’


2023-12-30 18:59:11 (2.35 MB/s) - ‘spa-eng.zip’ saved [2638744/2638744]



In [229]:
with open('data/spa-eng/spa.txt') as f:
    lines = f.read().split('\n')[:-1]

text_pairs = []

for line in lines:
    english, spanish = line.split('\t')
    spanish = '[start]' + spanish + '[end]'
    text_pairs.append((english, spanish))

text_pairs[90]

('Drop it!', '[start]Suéltalo.[end]')

In [230]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples: num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

len(train_pairs), len(val_pairs), len(test_pairs)

(83276, 17844, 17844)

In [231]:
# Customizing string standardization for text vectorization layer. We want to also eliminate '¿' but do not wish to eliminate '[', ']' due to their meaningful use in [start], [end] tokens
import re

strip_chars = string.punctuation + '¿'
strip_chars = strip_chars.replace('[', '')
strip_chars = strip_chars.replace(']', '')

In [238]:
def custom_standardization(input_string):

    lowercase = tf.strings.lower(input_string)
    output = tf.strings.regex_replace(lowercase, f'[{re.escape(strip_chars)}]', '')

    return output

In [240]:
vocab_size = 15000
sequence_length = 20

source_vectorization = layers.TextVectorization(max_tokens=vocab_size,
                                                output_mode='int',
                                                output_sequence_length=sequence_length)

target_vectorization = layers.TextVectorization(max_tokens=vocab_size,
                                                output_mode='int',
                                                output_sequence_length=sequence_length + 1,
                                                standardize=custom_standardization)


train_english_texts = [pair[0] for pair in train_pairs]
train_spanish_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

In [241]:
batch_size = 64

def format_dataset(eng, spa):

    eng = source_vectorization(eng)
    spa = target_vectorization(spa)

    return {'english': eng, 'spanish':spa[:, :-1]}, spa[:, 1:]

def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)

    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)

    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)
test_ds = make_dataset(test_pairs)

In [242]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["english"].shape: {inputs["english"].shape}')
    print(f'inputs["spanish"].shape: {inputs["spanish"].shape}')
    print(f'targets.shape: {targets.shape}')

inputs["english"].shape: (64, 20)
inputs["spanish"].shape: (64, 20)
targets.shape: (64, 20)


2023-12-30 20:17:36.278935: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
