# Seq2Seq models (Sequence-to-Sequence)

Sequence to sequence models are a variant of deep learning models that consists of an encoder and a decoder. They are used for problems that map an abitrarily long sequence to another arbitrarliy long sequence. For example, in machine translation, you convert a sequence of words in a source language to a sequence of words in a target language. Here we will see how we can use a seq2seq model to solve a machine translation task to convert English to German.


<table align="left">
    <td>
        <a target="_blank" href="https://colab.research.google.com/github/thushv89/manning_tf2_in_action/blob/master/Ch11/11.1_Seq2seq_machine_translation.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
    </td>
</table>



In [1]:
import tensorflow as tf
print(tf.__version__)
import numpy as np
import time

def fix_random_seed(seed):
    """ Setting the random seed of various libraries """
    try:
        np.random.seed(seed)
    except NameError:
        print("Warning: Numpy is not imported. Setting the seed for Numpy failed.")
    try:
        tf.random.set_seed(seed)
    except NameError:
        print("Warning: TensorFlow is not imported. Setting the seed for TensorFlow failed.")
    try:
        random.seed(seed)
    except NameError:
        print("Warning: random module is not imported. Setting the seed for random failed.")
 
# Fixing the random seed
random_seed=4321
fix_random_seed(random_seed)


2.4.1


http://www.manythings.org/anki/
    
german-english

In [2]:
# Not setting this led to the following error
# _Derived_]RecvAsync is cancelled.   
# [[{{node gradient_tape/model_1/embedding_1/embedding_lookup/Reshape/_172}}]] [Op:__inference_train_function_31985]

%env TF_FORCE_GPU_ALLOW_GROWTH=true

env: TF_FORCE_GPU_ALLOW_GROWTH=true


## Loading the data (Requires manual download)

Unfortunately, this dataset **must be manually downloaded** by clicking [this link](http://www.manythings.org/anki/deu-eng.zip). Then place the downloaded `deu-eng.zip` file in the `Ch11/data` folder before running the cell below.


In [3]:
import os
import requests
import zipfile

# Retrieve the data
if not os.path.exists(os.path.join('data','deu-eng.zip')):
    print("Uh oh! Did you download the deu-eng.zip from http://www.manythings.org/anki/deu-eng.zip manually and place it in the Ch11/data folder?")

else:
    if not os.path.exists(os.path.join('data', 'deu.txt')):
        with zipfile.ZipFile(os.path.join('data','deu-eng.zip'), 'r') as zip_ref:
            zip_ref.extractall('data')
    else:
        print("The extracted data already exists")

The extracted data already exists


## Reading the data in

In [39]:
import pandas as pd

df = pd.read_csv(os.path.join('data', 'deu.txt'), delimiter='\t', header=None)
df.columns = ["EN", "DE", "Attribution"]
df = df[["EN", "DE"]]
print('df.shape = {}'.format(df.shape))

df.shape = (227080, 2)


In [40]:
df.head()

Unnamed: 0,EN,DE
0,Go.,Geh.
1,Hi.,Hallo!
2,Hi.,GrÃ¼ÃŸ Gott!
3,Run!,Lauf!
4,Run.,Lauf!


## Use a smaller sample for computational speed

In [41]:
df = df.sample(n=50000, random_state=random_seed)

In [42]:
df["DE"] = '[SOS] ' + df["DE"] + ' [EOS]'

## Splitting training/validation/testing data

In [43]:
test_df = df.sample(n=5000, random_state=random_seed)
valid_df = df.loc[~df.index.isin(test_df.index)].sample(n=5000, random_state=random_seed)
train_df = df.loc[~(df.index.isin(test_df.index) | df.index.isin(valid_df.index))]

print('test_df.shape = {}'.format(test_df.shape))
print('valid_df.shape = {}'.format(valid_df.shape))
print('train_df.shape = {}'.format(train_df.shape))

test_df.shape = (5000, 2)
valid_df.shape = (5000, 2)
train_df.shape = (40000, 2)


## Vocabulary sizes (English-German)

In [44]:
from collections import Counter

en_words = train_df["EN"].str.split().sum()
de_words = train_df["DE"].str.split().sum()
n=10

def get_vocabulary_size_greater_than(words, n, verbose=True):
    counter = Counter(words)

    freq_df = pd.Series(list(counter.values()), index=list(counter.keys())).sort_values(ascending=False)
    
    if verbose:
        # Print most common words
        print(freq_df.head(n=10))

    # Count of words >= n frequent    
    n_vocab = (freq_df>=n).sum()
    
    if verbose:
        print("\nVocabulary size (>={} frequent): {}".format(n, n_vocab))
        
    return n_vocab

print("English corpus")
print('='*50)
en_vocab = get_vocabulary_size_greater_than(en_words, n)

print("\nGerman corpus")
print('='*50)
de_vocab = get_vocabulary_size_greater_than(de_words, n)

English corpus
Tom    9427
to     8673
I      8436
the    6999
you    6125
a      5680
is     4374
in     2664
of     2613
was    2298
dtype: int64

Vocabulary size (>=10 frequent): 2238

German corpus
[SOS]    40000
[EOS]    40000
Tom       9928
Ich       7749
ist       4753
nicht     4414
zu        3583
Sie       3465
du        3112
das       2909
dtype: int64

Vocabulary size (>=10 frequent): 2497


## Sequence length 

In [45]:
def print_sequence_length(str_ser):
    # Create a pd.Series, which contain the sequence length for each review
    seq_length_ser = str_ser.str.len()

    # Get the median as well as summary statistics of the sequence length
    print("\nSome summary statistics")
    print("Median length: {}\n".format(seq_length_ser.median()))
    print(seq_length_ser.describe())

    print("\nComputing the statistics between the 10% and 90% quantiles (to ignore outliers)")
    p_10 = seq_length_ser.quantile(0.1)
    p_90 = seq_length_ser.quantile(0.9)

    print(seq_length_ser[(seq_length_ser >= p_10) & (seq_length_ser < p_90)].describe(percentiles=[0.33, 0.66]))

print("English corpus")
print('='*50)
print_sequence_length(train_df["EN"])

print("\nGerman corpus")
print('='*50)
print_sequence_length(train_df["DE"])

English corpus

Some summary statistics
Median length: 29.0

count    40000.000000
mean        31.841100
std         13.496887
min          4.000000
25%         23.000000
50%         29.000000
75%         38.000000
max        537.000000
Name: EN, dtype: float64

Computing the statistics between the 10% and 90% quantiles (to ignore outliers)
count    32161.000000
mean        30.086658
std          7.659525
min         18.000000
33%         26.000000
50%         29.000000
66%         33.000000
max         47.000000
Name: EN, dtype: float64

German corpus

Some summary statistics
Median length: 46.0

count    40000.000000
mean        49.175300
std         16.145143
min         18.000000
25%         38.000000
50%         46.000000
75%         57.000000
max        493.000000
Name: DE, dtype: float64

Computing the statistics between the 10% and 90% quantiles (to ignore outliers)
count    31818.000000
mean        47.255453
std          9.185303
min         33.000000
33%         42.000000
50%

In [46]:
print("EN vocabulary size: {}".format(en_vocab))
print("DE vocabulary size: {}".format(de_vocab))
en_seq_length = 50
de_seq_length = 60
print("EN max sequence length: {}".format(en_seq_length))
print("DE max sequence length: {}".format(de_seq_length))

EN vocabulary size: 2238
DE vocabulary size: 2497
EN max sequence length: 50
DE max sequence length: 60


In [47]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

print("Defined the vectorization layer for English")
# Create the layer.
en_vectorize_layer = TextVectorization(
    max_tokens=en_vocab,
    output_mode='int',
    output_sequence_length=None
)

print("Fitting the EN vectorization layer on data")
# Now that the vocab layer has been created, call `adapt` on the text-only
# dataset to create the vocabulary. You don't have to batch, but for large
# datasets this means we're not keeping spare copies of the dataset.
en_vectorize_layer.adapt(np.array(train_df["EN"].tolist()).astype('str'))
print("\tDone")

print("\nDefined the vectorization layer for German")
# Create the layer.
de_vectorize_layer = TextVectorization(
    max_tokens=de_vocab,
    output_mode='int',
    output_sequence_length=de_seq_length,
    pad_to_max_tokens=False
)

print("Fitting the DE vectorization layer on data")
de_vectorize_layer.adapt(np.array(train_df["DE"].tolist()).astype('str'))
print("\tDone")

Defined the vectorization layer for English
Fitting the EN vectorization layer on data
	Done

Defined the vectorization layer for German
Fitting the DE vectorization layer on data
	Done


## Vectorization layer in action

In [48]:
import tensorflow.keras.backend as K
K.clear_session()

# Create the model that uses the vectorize text layer
toy_model = tf.keras.models.Sequential()

# Start by creating an explicit input layer. It needs to have a shape of
# (1,) (because we need to guarantee that there is exactly one string
# input per batch), and the dtype needs to be 'string'.
toy_model.add(tf.keras.Input(shape=(1,), dtype=tf.string))

# The first layer in our model is the vectorization layer. After this
# layer, we have a tensor of shape (batch_size, max_len) containing vocab
# indices.
toy_model.add(en_vectorize_layer)

# Now, the model can map strings to integers, and you can add an embedding
# layer to map these integers to learned embeddings.
input_data = [["run"], ["how are you"],["ectoplasmic residue"]]
pred = toy_model.predict(input_data)
print(pred)

[[427   0   0]
 [ 40  23   4]
 [  1   1   0]]


In [49]:
print(en_vectorize_layer.get_vocabulary()[:10])
print(len(en_vectorize_layer.get_vocabulary()))

['', '[UNK]', 'tom', 'to', 'you', 'the', 'i', 'a', 'is', 'that']
2238


## Defining the real model

In [78]:
import tensorflow.keras.backend as K
K.clear_session()

def get_vectorizer(list_of_strings, n_vocab, max_length=None, return_vocabulary=True, name=None):
    
    """ Return a text vectorization layer or a model """
        
    inp = tf.keras.Input(shape=(1,), dtype=tf.string, name='encoder_input')
    
    vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
        max_tokens=n_vocab+2,
        output_mode='int',
        output_sequence_length=max_length,        
        name=name
    )
    
    vectorize_layer.adapt(list_of_strings)
        
    vectorized_out = vectorize_layer(inp)
        
    if not return_vocabulary: 
        return tf.keras.models.Model(inputs=inp, outputs=vectorized_out)    
    else:
        return tf.keras.models.Model(inputs=inp, outputs=vectorized_out), vectorize_layer.get_vocabulary()        
    
        
def get_encoder_and_state(n_vocab, vectorizer):
    
    inp = tf.keras.Input(shape=(1,), dtype=tf.string, name='e_input')

    vectorized_out = vectorizer(inp)
    
    emb_layer = tf.keras.layers.Embedding(n_vocab+2, 128, mask_zero=True, name='e_embedding')
    emb_out = emb_layer(vectorized_out)
    
    gru_layer = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128))
    
    gru_out = gru_layer(emb_out)
    
    encoder = tf.keras.models.Model(inputs=inp, outputs=gru_out)
        
    return encoder, gru_out


def get_final_model_and_state(n_vocab, encoder, init_state, vectorizer):
        
    inp = tf.keras.Input(shape=(1,), dtype=tf.string, name='d_input')
    
    vectorized_out = vectorizer(inp)
    
    emb_layer = tf.keras.layers.Embedding(n_vocab+2, 128, mask_zero=True, name='d_embedding')
    emb_out = emb_layer(vectorized_out)
    
    gru_layer = tf.keras.layers.GRU(256, return_sequences=True)
    gru_out = gru_layer(emb_out, initial_state=init_state)
    
    dense_layer_1 = tf.keras.layers.Dense(512, activation='relu')
    dense1_out = dense_layer_1(gru_out)
    
    dense_layer_final = tf.keras.layers.Dense(n_vocab+2, activation='softmax')
    dense_final_out = dense_layer_final(dense1_out)
    
    decoder = tf.keras.models.Model(inputs=[encoder.input, inp], outputs=dense_final_out)
    
    return decoder, gru_out


en_vectorizer, en_vocabulary = get_vectorizer(np.array(train_df["EN"].tolist()), en_vocab, max_length=en_seq_length, name='en_vectorizer')
de_vectorizer, de_vocabulary = get_vectorizer(np.array(train_df["DE"].tolist()), de_vocab, max_length=de_seq_length-1, name='de_vectorizer')

encoder, enc_final_state = get_encoder_and_state(en_vocab, en_vectorizer)
final_model, _ = get_final_model_and_state(de_vocab, encoder, enc_final_state, de_vectorizer)


In [79]:
from tensorflow.keras.metrics import SparseCategoricalAccuracy
#final_model_text = tf.keras.models.Model(inputs=[en_vectorizer.input, de_vectorizer.input], outputs=final_model.output)

final_model.compile(
    loss='sparse_categorical_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy']
)
final_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
e_input (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
d_input (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
model (Functional)              (None, 50)           0           e_input[0][0]                    
__________________________________________________________________________________________________
model_1 (Functional)            (None, 59)           0           d_input[0][0]                    
____________________________________________________________________________________________

## use the following for BLEU

https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py

In [166]:
class BLEUMetric(object):
    
    def __init__(self, vocabulary, name='perplexity', **kwargs):
      super().__init__()
      self.vocab = vocabulary
      self.id_to_token_layer = StringLookup(vocabulary=self.vocab, invert=True)
    
    def calculate_bleu_from_predictions(self, real, pred):
      
        pred_argmax = tf.argmax(pred, axis=-1)  
        
        pred_tokens = self.id_to_token_layer(pred_argmax)
        real_tokens = self.id_to_token_layer(real)
        
        def clean_padding(tokens):
            """ If padding left in the sequence, they will count towards BLEU """
            t = tf.strings.strip(
                        tf.strings.regex_replace(
                            tf.strings.join(
                                tf.transpose(tokens), separator=' '
                            ),
                        "(?:sos|eos)", ""),
                   )
            
            t = np.char.split(np.char.decode(t.numpy().astype(np.bytes_), encoding='utf-8')).tolist()

            return t
        
        pred_tokens = clean_padding(pred_tokens)
        real_tokens = [[r] for r in clean_padding(real_tokens)]
        
        bleu, precisions, bp, ratio, translation_length, reference_length = compute_bleu(real_tokens, pred_tokens, smooth=False)

        return bleu

In [167]:
from bleu import compute_bleu

epochs = 25
batch_size = 128





def prepare_data(train_df, valid_df, test_df):
    
    data_dict = {}
    for label, df in zip(['train', 'valid', 'test'], [train_df, valid_df, test_df]):
        en_inputs = np.array(df["EN"].tolist())
        de_inputs = np.array(df["DE"].str.rsplit(n=1, expand=True).iloc[:,0].tolist())
        de_labels = np.array(df["DE"].str.split(n=1, expand=True).iloc[:,1].tolist())
        data_dict[label] = {'encoder_inputs': en_inputs, 'decoder_inputs': de_inputs, 'decoder_labels': de_labels}
    
    return data_dict


def shuffle_data(en_inputs, de_inputs, de_labels, shuffle_inds=None): 
    
    if shuffle_inds is None:
        shuffle_inds = np.random.permutation(np.arange(en_inputs_raw.shape[0]))
    else:
        shuffle_inds = np.random.permutation(shuffle_inds)
        
    return (en_inputs[shuffle_inds], de_inputs[shuffle_inds], de_labels[shuffle_inds]), shuffle_inds


bleu_metric = BLEUMetric(de_vocabulary)
data_dict = prepare_data(train_df, valid_df, test_df)


for epoch in range(epochs):
    bleu_log, val_bleu_log = [],[]
    accuracy_log, val_accuracy_log = [],[]
    loss_log, val_loss_log = [], []
        
    # =================================================================== #
    #                         Train Phase                                 #
    # =================================================================== #
    
    (en_inputs_raw,de_inputs_raw,de_labels_raw), shuffle_inds  = shuffle_data(
        data_dict['train']['encoder_inputs'],
        data_dict['train']['decoder_inputs'],
        data_dict['train']['decoder_labels'],
        shuffle_inds
    )
    
    n_train_batches = en_inputs_raw.shape[0]//batch_size
    for i in range(n_train_batches):
        print("Training batch {}/{}".format(i+1, n_train_batches), end='\r')
        x = [en_inputs_raw[i*batch_size:(i+1)*batch_size], de_inputs_raw[i*batch_size:(i+1)*batch_size]]
        y = de_vectorizer(de_labels_raw[i*batch_size:(i+1)*batch_size])
        final_model.train_on_batch(x, y)        
        loss, accuracy = final_model.evaluate(x, y, verbose=0)
        pred_y = final_model.predict(x)
        
        loss_log.append(loss)
        accuracy_log.append(accuracy)
        bleu_log.append(bleu_metric.calculate_bleu_from_predictions(y, pred_y))
    
    # =================================================================== #
    #                      Validation Phase                               #
    # =================================================================== #
    
    val_en_inputs_raw = data_dict['valid']['encoder_inputs']
    val_de_inputs_raw = data_dict['valid']['decoder_inputs']
    val_de_labels_raw = data_dict['valid']['decoder_labels']
    
    n_valid_batches = val_en_inputs_raw.shape[0]//batch_size
    for i in range(n_valid_batches):
        print("Evaluating batch {}/{}".format(i+1, n_valid_batches), end='\r')
        x = [val_en_inputs_raw[i*batch_size:(i+1)*batch_size], val_de_inputs_raw[i*batch_size:(i+1)*batch_size]]
        y = de_vectorizer(val_de_labels_raw[i*batch_size:(i+1)*batch_size])
        
        loss, accuracy = final_model.evaluate(x, y, verbose=0)
        pred_y = final_model.predict(x)
        
        val_loss_log.append(loss)
        val_accuracy_log.append(accuracy)
        val_bleu_log.append(bleu_metric.calculate_bleu_from_predictions(y, pred_y))
        
    print("\nEpoch {}/{}".format(epoch+1, epochs))
    print("\t(train) loss: {} - accuracy: {} - bleu: {}".format(np.mean(loss_log), np.mean(accuracy_log), np.mean(bleu_log)))
    print("\t(valid) loss: {} - accuracy: {} - bleu: {}".format(np.mean(val_loss_log), np.mean(val_accuracy_log), np.mean(val_bleu_log)))

Training batch 7/312

KeyboardInterrupt: 

In [168]:
print(bleu_log)

[0.3324246064568626, 0.31010147008330263, 0.3733264920176448, 0.399360658651022, 0.36397111223307255, 0.4438269905825874]


In [105]:
i=475
batch_size = 1
x = [en_inputs_raw[i*batch_size:(i+1)*batch_size], de_inputs_raw[i*batch_size:(i+1)*batch_size]]
y = de_vectorizer(de_labels_raw[i*batch_size:(i+1)*batch_size])

pred_y = final_model.predict(x)
print(y)
print(tf.constant(np.argmax(pred_y, axis=-1)))

tf.Tensor(
[[   1  157  138   20   17 1405  414   89   10  773    3    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]], shape=(1, 59), dtype=int64)
tf.Tensor(
[[  42  138  138   20   17 1405  414   89   10  773    3    3    3    3
     3    3    3    3    3    3    3    3    3    3    3    3    3    3
     3    3    3    3    3    3    3    3    3    3    3    3    3    3
     3    3    3    3    3    3    3    3    3    3    3    3    3    3
     3    3    3]], shape=(1, 59), dtype=int64)


In [165]:
bleu_metric = BLEUMetric(de_vocabulary)
bleu_metric.calculate_bleu_from_predictions(y, pred_y)

[['als', 'mÃ¼ssen', 'mÃ¼ssen', 'wir', 'in', 'erfahrung', 'bringen', 'wo', 'sie', 'wohnen']]
[['[UNK]', 'einmal', 'mÃ¼ssen', 'wir', 'in', 'erfahrung', 'bringen', 'wo', 'sie', 'wohnen']]
0.7598356856515925
[0.8, 0.7777777777777778, 0.75, 0.7142857142857143]
10 10


0.7598356856515925