# Baseline Text to Text Translation : English to French

This notebook trains a sequence to sequence (seq2seq) model for English to French translation. This model will be our **baseline** model, which we will then improve upon by adding attention and other features.

---

In [10]:
from datasets import load_dataset
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

from tokenizers import ByteLevelBPETokenizer

from tensorflow.keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [11]:
dataset = load_dataset("Nicolas-BZRD/Parallel_Global_Voices_English_French", split='train').to_pandas()
dataset.head()

Unnamed: 0,en,fr
0,Jamaica: “I am HIV”,Jamaïque : J’ai le VIH
1,"It's widely acknowledged, in the Caribbean and...","Il est largement reconnu, dans les Caraïbes et..."
2,"For this woman, however, photographed in the s...","Pour cette femme, cependant, photographiée dan..."
3,As Bacon writes on her blog:,Comme Bacon écrit sur son blog:
4,"“When I asked to take her picture, I suggested...",“Quand je lui ai demandé de la prendre en phot...


## Pre-Processing the Text Data

In [12]:
# TODO : The preprocessing should be done inside the utils/preprocess.py file so that we can implement different preprocessing methods for our experiments

In [13]:
def create_tokenizer(lines):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(lines)
    return tokenizer

def max_sequences_length(lines):
    return max(len(line.split()) for line in lines)

def padding_sequences(sequences, max_len):
    sequences_ids = [seq.ids for seq in sequences]
    padded_sequences = pad_sequences(sequences_ids, maxlen=max_len, padding='post')
    return padded_sequences

def encode_sequences(tokenizer, lines, max_length):
    encoded_sequences = tokenizer.encode_batch(lines)
    padded_sequences = padding_sequences(encoded_sequences, max_length)
    return padded_sequences

def encode_output_sparse(sequences, vocab_size):
    ylist = []
    for sequence in sequences:
        sequence_ids = sequence.ids

        # One-hot encode each token in the sequence
        encoded = [to_categorical(token_id, num_classes=vocab_size).astype('float32') for token_id in sequence_ids]
        ylist.append(encoded)

    y = csr_matrix(np.array(ylist, dtype='float32'))
    return y

def decode_sequences(tokenizer, sequences):
    decoded_sequences = []
    for seq in sequences:
        # Remove padding zeros
        seq = seq[:np.argmax(seq == 0)]
        decoded_seq = tokenizer.decode(seq)
        decoded_sequences.append(decoded_seq)
    return decoded_sequences

def vocab_size(dataset):
    return len(set([word for line in dataset for word in line.split()]))

In [14]:
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

In [15]:
tokenizer_en = create_tokenizer(train_data['en'])
tokenizer_fr = create_tokenizer(train_data['fr'])









In [16]:
max_length = max(max_sequences_length(train_data['en']), max_sequences_length(train_data['fr']), max_sequences_length(test_data['en']), max_sequences_length(test_data['fr']))
print('Max length English: %d' % max_length)

Max length English: 362


In [17]:
fr_vocab_size = vocab_size(train_data['fr'])
eng_vocab_size = vocab_size(train_data['en'])

print('English vocab size:', eng_vocab_size)
print('French vocab size:', fr_vocab_size)

English vocab size: 300928
French vocab size: 323844


In [18]:
# # take a sample of the dataset
# train_data = train_data.sample(1000)
# test_data = test_data.sample(100)

# trainY = encode_sequences(tokenizer_fr, train_data['fr'], max_length)

In [19]:
# prepare training data
trainX = encode_sequences(tokenizer_en, train_data['fr'], max_length)
trainY = encode_output_sparse(tokenizer_en.encode_batch(train_data['en']), max_length)

# prepare validation data
testX = encode_sequences(tokenizer_en, test_data['fr'], max_length)
testY = encode_output_sparse(tokenizer_en.encode_batch(test_data['en']), max_length)

: 

In [None]:
print(decode_sequences(tokenizer_en, trainX[:1]))
print(decode_sequences(tokenizer_fr, trainY[:1]))

In [None]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    
    model.add(RepeatVector(tar_timesteps))
    
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    
    return model

In [None]:
print("max length of english sequences: ", max_sequences_length(train_data['en']))
print("max length of french sequences: ", max_sequences_length(train_data['fr']))

print()

print("english vocab size: ", eng_vocab_size)
print("french vocab size: ", fr_vocab_size)

print()

print("trainX shape: ", trainX.shape)
print("trainY shape: ", trainY.shape)

In [None]:
fr_length = trainX.shape[1]
eng_length = trainY.shape[1]

n_units = 2

In [None]:
model = define_model(fr_vocab_size, eng_vocab_size, fr_length, eng_length, n_units)
optimizer = Adam()  

epochs = 10
batch_size = 64

model.compile(optimizer=optimizer, loss='categorical_crossentropy')
model.summary()

In [None]:
model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, validation_data=(testX, testY))