# Baseline Text-to-Text Translation: English to French

This notebook trains a sequence-to-sequence (seq2seq) model for English-to-French translation. This model is our **baseline**, which we will later improve by adding attention and other features.

---

In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from tokenizers import ByteLevelBPETokenizer

from tensorflow.keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the 'train' split of the Global Voices English/French parallel corpus
# and convert it to a pandas DataFrame for the rest of the notebook.
dataset = load_dataset(
    path="Nicolas-BZRD/Parallel_Global_Voices_English_French",
    split='train',
).to_pandas()
dataset.head()

Unnamed: 0,en,fr
0,Jamaica: “I am HIV”,Jamaïque : J’ai le VIH
1,"It's widely acknowledged, in the Caribbean and...","Il est largement reconnu, dans les Caraïbes et..."
2,"For this woman, however, photographed in the s...","Pour cette femme, cependant, photographiée dan..."
3,As Bacon writes on her blog:,Comme Bacon écrit sur son blog:
4,"“When I asked to take her picture, I suggested...",“Quand je lui ai demandé de la prendre en phot...


## Pre-Processing the Text Data

In [3]:
# TODO : The preprocessing should be done inside the utils/preprocess.py file so that we can implement different preprocessing methods for our experiments

In [4]:
def create_tokenizer(lines, pad_token="<pad>"):
    """Train a byte-level BPE tokenizer on an iterable of text lines.

    The rest of this notebook uses id 0 as the padding value
    (``pad_sequences`` pads with 0, the embedding uses ``mask_zero=True`` and
    ``decode_sequences`` cuts at the first 0). Without a reserved token, id 0
    would belong to a real BPE token and be confused with padding, so a
    dedicated pad token is registered as the first special token.

    Args:
        lines: Iterable of strings used as the training corpus.
        pad_token: Text of the special token reserved for padding.

    Returns:
        The trained ``ByteLevelBPETokenizer``.

    Raises:
        ValueError: If the pad token did not receive id 0 after training.
    """
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(lines, special_tokens=[pad_token])

    # Guard the invariant the padding/masking code relies on.
    pad_id = tokenizer.token_to_id(pad_token)
    if pad_id != 0:
        raise ValueError(
            f"expected pad token {pad_token!r} to have id 0, got {pad_id!r}"
        )
    return tokenizer

In [5]:
def max_sequences_length(sequences):
    """Return the length of the longest sequence in ``sequences``.

    Raises ``ValueError`` when ``sequences`` is empty.
    """
    lengths = []
    for sequence in sequences:
        lengths.append(len(sequence))
    return max(lengths)

In [6]:
def padding_sequences(sequences, max_len):
    """Right-pad tokenizer encodings to a common length.

    Args:
        sequences: Iterable of objects exposing an ``ids`` attribute
            (the token ids of one encoded line).
        max_len: Length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The array produced by ``pad_sequences`` with ``padding='post'``.
    """
    token_ids = []
    for encoding in sequences:
        token_ids.append(encoding.ids)
    return pad_sequences(token_ids, maxlen=max_len, padding='post')

In [7]:
def encode_sequences(tokenizer, lines, max_length=None):
    """Tokenize ``lines`` and return them as one post-padded id array.

    Args:
        tokenizer: Tokenizer exposing ``encode_batch``.
        lines: Iterable of strings to encode (a list or a pandas Series).
        max_length: Width of the returned array. When ``None`` (the default,
            matching the previous behaviour) the longest encoded line in
            ``lines`` is used. Pass the training width when encoding
            validation/test data so every split has the same number of
            timesteps.

    Returns:
        The padded array produced by ``padding_sequences``.

    Raises:
        ValueError: If ``lines`` is empty and ``max_length`` is not given.
    """
    # Materialise the input so any iterable of strings reaches encode_batch
    # as a plain list.
    encoded_sequences = tokenizer.encode_batch(list(lines))
    if max_length is None:
        max_length = max(len(seq.ids) for seq in encoded_sequences)
    return padding_sequences(encoded_sequences, max_length)

In [8]:
def decode_sequences(tokenizer, sequences):
    """Decode padded id sequences back to text.

    Each sequence is cut at its first 0 (the padding value) before decoding.
    A sequence that contains no 0 is decoded in full; previously
    ``np.argmax(seq == 0)`` returned 0 in that case and the whole sequence was
    discarded.

    Args:
        tokenizer: Tokenizer exposing ``decode(ids)``.
        sequences: Iterable of 1-D id sequences (rows of a padded array).

    Returns:
        A list with one decoded string per input sequence.
    """
    decoded_sequences = []
    for seq in sequences:
        seq = np.asarray(seq)
        # Positions of padding zeros; empty when the row is unpadded.
        pad_positions = np.flatnonzero(seq == 0)
        if pad_positions.size:
            seq = seq[:pad_positions[0]]
        # Hand the tokenizer plain Python ints rather than NumPy scalars.
        decoded_sequences.append(tokenizer.decode(seq.tolist()))
    return decoded_sequences

In [9]:
# Hold out 20% of the sentence pairs; the fixed seed keeps the split
# reproducible across runs.
train_data, test_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

In [10]:
# One tokenizer per language, each fitted on the training split only.
tokenizer_en, tokenizer_fr = (
    create_tokenizer(train_data[column]) for column in ('en', 'fr')
)

In [11]:
# prepare training data
trainX = encode_sequences(tokenizer_en, train_data['fr'])
trainY = encode_sequences(tokenizer_fr, train_data['en'])

# prepare validation data
testX = encode_sequences(tokenizer_en, test_data['fr'])
testY = encode_sequences(tokenizer_fr, test_data['en'])

In [12]:
# Sanity check: decode the first encoded training pair back to text.
first_input_text = decode_sequences(tokenizer_en, trainX[:1])
first_target_text = decode_sequences(tokenizer_fr, trainY[:1])
print(first_input_text)
print(first_target_text)

['Et comment les deux forces concurrentes comparent-elles leur présence dans les médias sociaux ?']
['And how do the two competing forces compare in their social media presence?']


In [13]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the baseline encoder-decoder as a Keras ``Sequential`` model.

    Args:
        src_vocab: Input dimension of the embedding (source vocabulary size).
        tar_vocab: Number of units of the softmax output layer
            (target vocabulary size).
        src_timesteps: Length of the padded input sequences.
        tar_timesteps: Number of output timesteps the encoder state is
            repeated for.
        n_units: Embedding dimension, also used as the size of both LSTMs.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    # Encoder: embed the ids (0 is masked as padding) and compress the
    # sequence into a single vector.
    encoder_layers = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
    ]
    # Bridge: feed the same encoder vector to every decoder timestep.
    bridge_layers = [RepeatVector(tar_timesteps)]
    # Decoder: one softmax over the target vocabulary per timestep.
    decoder_layers = [
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(encoder_layers + bridge_layers + decoder_layers)

In [14]:
# trainX is the model input and trainY the model target (see model.fit), so
# the widths are labelled by role instead of by a hard-coded language name.
print("max length of source sequences (trainX): ", max_sequences_length(trainX))
print("max length of target sequences (trainY): ", max_sequences_length(trainY))

max length of english sequences:  740
max length of french sequences:  593


In [15]:
# Vocabulary sizes, read from the trained tokenizers.
eng_vocab_size = len(tokenizer_en.get_vocab())
fr_vocab_size = len(tokenizer_fr.get_vocab())

# Padded widths of the training arrays: fr_length comes from trainX (the
# model input) and eng_length from trainY (the model target).
fr_length, eng_length = (
    max_sequences_length(padded) for padded in (trainX, trainY)
)

# Passed to define_model as n_units (embedding dimension and LSTM size).
n_units = 12

In [16]:
# trainX is encoded with tokenizer_en and trainY with tokenizer_fr, so the
# embedding (source) vocabulary must be the English one and the softmax
# (target) vocabulary the French one. fr_length / eng_length hold the padded
# widths of trainX / trainY respectively, i.e. the source / target timesteps.
model = define_model(eng_vocab_size, fr_vocab_size, fr_length, eng_length, n_units)
optimizer = Adam()

# Targets are integer token ids, hence the sparse variant of cross-entropy.
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 740, 12)           360000    
                                                                 
 lstm (LSTM)                 (None, 12)                1200      
                                                                 
 repeat_vector (RepeatVector  (None, 593, 12)          0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 593, 12)           1200      
                                                                 
 time_distributed (TimeDistr  (None, 593, 30000)       390000    
 ibuted)                                                         
                                                                 
Total params: 752,400
Trainable params: 752,400
Non-trai

In [17]:
# Training schedule: number of passes over the data and mini-batch size.
epochs, batch_size = 30, 64

In [18]:
# Write the model to 'model.h5' only when the validation loss improves.
checkpoint = ModelCheckpoint(
    'model.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
model.fit(
    trainX,
    trainY,
    validation_data=(testX, testY),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[checkpoint],
    verbose=2,
)

Epoch 1/30
