In [None]:
vocabulary = {}
for text in dataset:
    text = standardize(text)
    tokens = tokenize(text)
    for token in tokens:
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)

In [None]:
def one_hot_encode_token(token):
    vector = np.zeros((len(vocabulary),))
    token_index = vocabulary[token]
    vector[token_index] = 1
    return vector

In [6]:
import string

class Vectorizer:
    
    def standardize(self, text):
        text = text.lower()
        return ''.join(char for char in text
                       if char not in string.punctuation)
    
    def tokenize(self, text):
        text = self.standardize(text)
        return text.split()
    
    def make_vocabulary(self, dataset):
        self.vocabulary = {'':0, '[UNK]':1}
        for text in dataset:
            text = self.standardize(text)
            tokens = self.tokenize(text)
            for token in tokens:
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        self.inverse_vocabulary = dict(
            (v, k) for k, v in self.vocabulary.items())
        
    def encode(self, text):
        text = self.standardize(text)
        tokens = self.tokenize(text)
        return [self.vocabulary.get(token, 1) for token in tokens]
    
    def decode(self, int_sequence):
        return ' '.join(self.inverse_vocabulary.get(i, ['UNK']) for i in int_sequence)
    
vectorizer = Vectorizer()
dataset = [
    'I write, erase, rewrite',
    'Erase again, and then',
    'A poppy blooms.',
]
vectorizer.make_vocabulary(dataset)

In [4]:
test_sentence = 'I write, rewrite, and still rewrite again'
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)

[2, 3, 5, 7, 1, 5, 6]


In [7]:
decoded_sentence = vectorizer.decode(encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


In [8]:
from tensorflow.keras.layers import TextVectorization
text_vectorization = TextVectorization(
    output_mode='int')

In [9]:
import re
import string
import tensorflow as tf

In [11]:
def custom_standardization_fn(string_tensor):
    lowercase_string = tf.strings.lower(string_tensor)
    return tf.strings.regex_replace(
        lowercase_string, f'[{re.escape(string.punctuation)}]','')

def custom_split_fn(string_tensor):
    return tf.strings.split(string_tensor)

text_vectorization = TextVectorization(
    output_mode='int',
    standardize=custom_standardization_fn,
    split=custom_split_fn,
)

In [12]:
text_vectorization.adapt(dataset)

In [13]:
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [14]:
vocabulary = text_vectorization.get_vocabulary()

In [15]:
test_sentence = 'I write, rewrite, and still rewrite again'

In [16]:
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)


In [17]:
inverse_vocab = dict(enumerate(vocabulary))

In [18]:
inverse_vocab

{0: '',
 1: '[UNK]',
 2: 'erase',
 3: 'write',
 4: 'then',
 5: 'rewrite',
 6: 'poppy',
 7: 'i',
 8: 'blooms',
 9: 'and',
 10: 'again',
 11: 'a'}

In [19]:
decoded_sentence = ' '.join(inverse_vocab[int(i)] for i in encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


In [None]:
int_sequence_dataset = string_dataset.map(
    text_vectorization,
    num_parallel_calls=4)

#parallelize the map() call across multiple CPU cores

In [None]:
text_input = keras.Input(shape=(), dtype='string')
vectorized_text = text_vectorization(text_input)
embedded_input = keras.layers.Embedding(...)(vectorized_text)
output = ...
model = keras.Model(text_input, output)

In [None]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [20]:
!tar -xf aclImdb_v1.tar.gz

In [1]:
!rm -r aclImdb/train/unsup/

In [2]:
!cat aclImdb/train/pos/4077_10.txt

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy

In [3]:
import os, pathlib, shutil, random

In [6]:
base_dir = pathlib.Path('aclImdb')
val_dir = base_dir/'val'
train_dir = base_dir/'train'
for category in ('neg', 'pos'):
    os.makedirs(val_dir/category)
    files = os.listdir(train_dir/category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2*len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir/category/fname,
                    val_dir/category/fname)

In [7]:
from tensorflow import keras

batch_size = 32

train_ds = keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory(
    'aclImdb/val', batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory(
    'aclImdb/test', batch_size=batch_size)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [8]:
for inputs, targets in train_ds:
    print('inputs.shape: ', inputs.shape)
    print('inputs.dtype: ', inputs.dtype)
    print('targets.shape: ', targets.shape)
    print('targets.dtype: ', targets.dtype)
    print('inputs[0]: ', inputs[0])
    print('targets[0]: ', targets[0])
    break

inputs.shape:  (32,)
inputs.dtype:  <dtype: 'string'>
targets.shape:  (32,)
targets.dtype:  <dtype: 'int32'>
inputs[0]:  tf.Tensor(b"LES CONVOYEURS ATTENDENT was the first film I saw in 2000 and I doubt I'll see a better one this year. This beautiful tragicomedy by Belgian filmmaker Beno\xc3\xaet Mariage is set in the industrial wastelands of Wallonia. Beno\xc3\xaet Poelvoorde plays a father who desperately wants his son to win a car (a Lada!) for him. To do this the son has to break the record opening doors. What the father actually wants his for his son to be someone, because he himself has never made it further as the reporter of local news for a newspaper ironically called L'Espoir (Hope). Of course nothing works out as planned. This film can best be compared to Aki Kaurism\xc3\xa4ki's DRIFTING CLOUDS, although it is more dramatic and the humour is darker. Just like in that film however the tone is more melancholic than depressing and the ending upbeat, without being unrealisticall

In [10]:
from tensorflow.keras.layers import TextVectorization

text_vectorization = TextVectorization(
    max_tokens=20000,
    output_mode='multi_hot',
)

In [11]:
text_only_train_ds = train_ds.map(lambda x,y : x)
text_vectorization.adapt(text_only_train_ds)

In [12]:
binary_1gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(
    lambda x,y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(
    lambda x,y: (text_vectorization(x),y),
    num_parallel_calls=4)

In [13]:
for inputs, targets in binary_1gram_train_ds:
    print('inputs.shape: ', inputs.shape)
    print('inputs.dtype: ', inputs.dtype)
    print('targets.shape: ', targets.shape)
    print('targets.dtype: ', targets.dtype)
    print('inputs[0]: ', inputs[0])
    print('targets[0]: ', targets[0])
    break

inputs.shape:  (32, 20000)
inputs.dtype:  <dtype: 'float32'>
targets.shape:  (32,)
targets.dtype:  <dtype: 'int32'>
inputs[0]:  tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]:  tf.Tensor(1, shape=(), dtype=int32)


In [14]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation='relu')(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [15]:
model = get_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


In [16]:
callbacks = [
    keras.callbacks.ModelCheckpoint('binary_1gram.keras', save_best_only=True)
]

model.fit(binary_1gram_train_ds.cache(), validation_data=binary_1gram_val_ds, epochs=10,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc90ca47490>

In [17]:
model = keras.models.load_model('binary_1gram.keras')
print(f'Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}')

Test acc: 0.889


In [18]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode='multi_hot',
)

In [19]:
text_vectorization.adapt(text_only_train_ds)
binary_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [20]:
model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


In [21]:
callbacks = [
    keras.callbacks.ModelCheckpoint('binary_2gram.keras', save_best_only=True)
]

model.fit(binary_2gram_train_ds.cache(), validation_data=binary_2gram_val_ds, epochs=10,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc88e2d4af0>

In [22]:
model = keras.models.load_model('binary_2gram.keras')
print(f'Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}')

Test acc: 0.898


In [23]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode='count')

In [24]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode='tf_idf')

In [25]:
text_vectorization.adapt(text_only_train_ds)

tfidf_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [26]:
model = get_model()
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_4 (Dense)             (None, 16)                320016    
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


In [27]:
callbacks = [
    keras.callbacks.ModelCheckpoint('tfidf_2gram.keras', save_best_only=True)
]

model.fit(tfidf_2gram_train_ds.cache(), validation_data=tfidf_2gram_val_ds.cache(),
          epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc8a2eff880>

In [28]:
model = keras.models.load_model('tfidf_2gram.keras')
print(f'Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}')

Test acc: 0.874


In [29]:
inputs = keras.Input(shape=(1,),dtype='string')
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)
inference_model = keras.Model(inputs, outputs)

In [30]:
import tensorflow as tf
raw_text_data = tf.convert_to_tensor([
    ['That was an excellent movie, I loved it.'],
])
predictions = inference_model(raw_text_data)
print(f'{float(predictions[0]*100):.2f} percent positive')

90.31 percent positive


In [31]:
from tensorflow.keras import layers

max_length = 600
max_tokens = 20000

text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=max_length)

text_vectorization.adapt(text_only_train_ds)

int_train_ds = train_ds.map(
    lambda x,y : (text_vectorization(x), y),
    num_parallel_calls=4)

int_val_ds = val_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls=4)

int_test_ds = test_ds.map(
    lambda x,y : (text_vectorization(x), y),
    num_parallel_calls=4)

In [32]:
import tensorflow as tf

inputs = keras.Input(shape=(None,), dtype='int64')
embedded = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot (TFOpLambda)     (None, None, 20000)       0         
                                                                 
 bidirectional (Bidirectiona  (None, 64)               5128448   
 l)                                                              
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,128,513
Trainable params: 5,128,513
Non-trainable params: 0
_________________________________________________

In [None]:
callbacks = [
    keras.callbacks.ModelCheckpoint('one_hot_bidir_lstm.keras', save_best_only=True)
]

model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

# not training this because every epoch takes about two and half hours ........

In [35]:
embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256)

In [36]:
inputs = keras.Input(shape=(None,),dtype='int64')
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

callbacks = [
    keras.callbacks.ModelCheckpoint('embeddings_bidir_lstm.keras', save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

In [40]:
embedding_layer = layers.Embedding(input_dim=10, output_dim=256, mask_zero=True)
some_input = [
    [4,3,2,1,0,0,0],
    [5,4,3,2,1,0,0],
    [2,1,0,0,0,0,0]
]

In [41]:
mask = embedding_layer.compute_mask(some_input)

In [42]:
mask

<tf.Tensor: shape=(3, 7), dtype=bool, numpy=
array([[ True,  True,  True,  True, False, False, False],
       [ True,  True,  True,  True,  True, False, False],
       [ True,  True, False, False, False, False, False]])>

In [43]:
inputs = keras.Input(shape=(None,),dtype='int64')
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

callbacks = [
    keras.callbacks.ModelCheckpoint('embeddings_bidir_lstm_with_masking.keras', save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
  9/625 [..............................] - ETA: 9:13 - loss: 0.2720 - accuracy: 0.8958

KeyboardInterrupt: 

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip -q glove.6B.zip

In [44]:
import numpy as np

In [None]:
path_to_glove_file = 'glove.6b.100d.txt'

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs
        
print(f'Found {len(embeddings_index)} word vectors')

In [None]:
embedding_dim = 100

vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    if i<max_tokens:
        embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [46]:
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim, 
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

In [None]:
inputs = keras.Input(shape=(None,), dtype='int64')
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
callbacks=[
    keras.callbacks.ModelCheckpoint('glove_embeddings_sequence_model.keras', save_best_only=True)
]

model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

In [None]:
def self_attention(input_sequence):
    output = np.zeros(shape=(input_sequence.shape))
    for i, pivot_vector in enumerate(input_sequence):
        scores = np.zeros(shape=(len(input_sequence),))
        for j, vector in enumerate(input_sequence):
            scores[j] = np.dot(pivot_vector, vector.T)
        scores /= np.sqrt(input_sequence.shape[1])
        scores = softmax(scores)
        new_pivot_representation = np.zeros(shape=pivot_vector.shape)
        for j, vector in enumerate(input_sequence):
            new_pivot_representation += vector * scores[j]
        output[i] = new_pivot_representation
    return output

In [None]:
num_heads = 4
embed_dim = 256
mha_layer = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
outputs = mha_layer(inputs, inputs, inputs)

In [47]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [48]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation='relu'),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        
    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)
    
    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'dense_dim': self.dense_dim,
        })
        return config

In [49]:
vocab_size = 20000
embed_dim = 256
num_heads = 2
dense_dim = 32

inputs = keras.Input(shape=(None,), dtype='int64')
x = layers.Embedding(vocab_size, embed_dim)(inputs)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_5 (Embedding)     (None, None, 256)         5120000   
                                                                 
 transformer_encoder (Transf  (None, None, 256)        543776    
 ormerEncoder)                                                   
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout_6 (Dropout)         (None, 256)               0         
                                                                 
 dense_11 (Dense)            (None, 1)                 257 

In [None]:
callbacks = [
    keras.callbacks.ModelCheckpoint('transformer_encoder.keras', save_best_only=True)
]

model.fit(int_train_ds, validation_data=int_val_ds, epochs=20, callbacks=callbacks)

In [None]:
model = keras.models.load_model('transformer_encoder.keras', custom_objects={'TransformerEncoder':
                                                                             TransforerEncoder})
print(f'Test acc: {model.evaluate(int_test_ds)[1]:.3f}')

In [50]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        
    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = slef.position_embeddings(positions)
        return embedded_tokens + embedded_positions
    
    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)
    
    def get_config(self):
        config = super().get_config()
        config.update({'output_dim':self.output_dim,
                       'sequence_length':self.sequence_length,
                       'input_dim':self.input_dim,
                      })
        return config

In [None]:
!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

In [51]:
!unzip -q spa-eng.zip

In [55]:
text_file = 'spa-eng/spa.txt'
with open(text_file) as f:
    lines = f.read().split('\n')[:-1]
text_pairs = []
for line in lines:
    english, spanish = line.split('\t')
    spanish = '[start]' + spanish + '[end]'
    text_pairs.append((english,spanish))

In [56]:
import random
print(random.choice(text_pairs))

('Do you feel better now?', '[start]¿Te sentís mejor ahora?[end]')


In [57]:
import random

random.shuffle(text_pairs)

num_val_samples = int(0.15 * len(text_pairs))
num_train_sameples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_sameples]
val_pairs = text_pairs[num_train_sameples:num_train_sameples+num_val_samples]
test_pairs = text_pairs[num_train_sameples+num_val_samples:]

In [59]:
import tensorflow as tf
import string
import re

In [61]:
strip_chars = string.punctuation + '¿'
strip_chars = strip_chars.replace('[', '')
strip_chars = strip_chars.replace(']', '')

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, f'[{re.escape(strip_chars)}]', '')

vocab_size = 15000
sequence_length = 20

source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length+1,
    standardize=custom_standardization)

train_english_texts = [pair[0] for pair in train_pairs]
train_spanish_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

In [62]:
batch_size = 64

def format_dataset(eng, spa):
    eng = source_vectorization(eng)
    spa = target_vectorization(spa)
    return ({'english': eng,
             'spanish': spa[:, :-1],
            }, spa[:, :-1])

def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [64]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['spanish'].shape: {inputs['spanish'].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (64, 20)
inputs['spanish'].shape: (64, 20)
targets.shape: (64, 20)


2022-02-14 15:38:21.721158: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [65]:
inputs = keras.Input(shape=(sequence_length,), dtype='int64')
x = layers.Embedding(input_dim=vocab_size, output_dim=128)(inputs)
x = layers.LSTM(32, return_sequences=True)(x)
outputs = layers.Dense(vocab_size, activation='softmax')(x)
model = keras.Model(inputs, outputs)

In [66]:
from tensorflow import keras
from tensorflow.keras import layers

embed_dim = 256
latent_dim = 1024

source = keras.Input(shape=(None,), dtype='int64', name='english')
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(source)
encoded_source = layers.Bidirectional(
    layers.GRU(latent_dim), merge_mode='sum')(x)

In [67]:
past_target = keras.Input(shape=(None,), dtype='int64', name='spanish')
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(past_target)
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
x = decoder_gru(x, initial_state=encoded_source)
x = layers.Dropout(0.5)(x)
target_next_step = layers.Dense(vocab_size, activation='softmax')(x)
seq2seq_rnn = keras.Model([source, past_target], target_next_step)

In [None]:
seq2seq_rnn.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
seq2seq_rnn.fit(train_ds, epochs=15, validation_data=val_ds)

# this takes way too long to train... forget it......

In [None]:
import numpy as np

spa_vocab = target_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = '[start]'
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        next_token_predictions = seq2seq_rnn.predict(
            [tokenized_input_sentence, tokenized_target_sentence])
        samepled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = spa_index_lookup[samepled_token_index]
        decoded_sentence += ' ' + sampled_token
        if sampled_token == '[end]':
            break
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print('-')
    print(input_sentence)
    print(decode_sequence(input_sentence))