### Importing libraries

In [10]:
import collections
from collections import Counter

import helper
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, LSTM
from keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.callbacks import ModelCheckpoint

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split

from tabulate import tabulate

import gc

### Read Data

In [11]:
data = pd.read_csv('./eng_-french.csv', header='infer')

### EDA

In [4]:
data.shape[0]

175621

In [5]:
data.isna().sum()

English words/sentences    0
French words/sentences     0
dtype: int64

#### Rename columns of dataset

In [12]:
data = data.rename(columns={"English words/sentences":"Eng", "French words/sentences":"Frn"})
data

Unnamed: 0,Eng,Frn
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [13]:
def word_count(txt):
    return len(txt.split())

In [14]:
data['Eng_Count'] = data['Eng'].apply(lambda x:word_count(x))
data['Frn_Count'] = data['Frn'].apply(lambda x:word_count(x))


In [15]:
print( '{} English Words'.format(data['Eng_Count'].sum()) ) 
print('{} French Words'.format(data['Frn_Count'].sum()) )
      

1082098 English Words
1177832 French Words


In [17]:
fig = make_subplots(rows=1, cols=2, subplot_titles=("English","French"))

fig.add_trace(
    go.Histogram(x=data['Eng_Count'],histfunc='sum',opacity =0.8,showlegend=True,text='Eng'), row=1,col=1)

fig.add_trace(
    go.Histogram(x=data['Frn_Count'],histfunc='sum', opacity =0.8,showlegend=True,text='Frn'), row=1,col=2)

fig.update_layout(height=600, width=800, title_text="Words Distribution")
fig.show()


In [18]:
def tokenize(x):
    x_tk = Tokenizer(char_level = False)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk
    #return x_tk

In [19]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
        return pad_sequences(x, maxlen = length, padding='post')

In [12]:
eng_seq, eng_tok = tokenize(data['Eng'])
eng_vocab_size = len(eng_tok.word_index)+1
eng_vocab_size

14532

In [13]:
# eng_seq

In [14]:
frn_seq, frn_tok = tokenize(data['Frn'])
frn_vocab_size = len(frn_tok.word_index)+1
frn_vocab_size


30661

In [15]:
# Sequence length
eng_len = max([len(sentence) for sentence in eng_seq])
frn_len = max([len(sentence) for sentence in frn_seq])

print(f"English sequence length: {eng_len}\nFrench sequence length: {frn_len}")

English sequence length: 44
French sequence length: 55


### Data Split

In [16]:
train_data, test_data = train_test_split(data, test_size=0.1, random_state=0)

In [17]:
train_data = train_data.drop(columns=['Eng_Count', 'Frn_Count'], axis=1)
test_data = test_data.drop(columns=['Eng_Count', 'Frn_Count'], axis=1)

train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [18]:
# Tokenization

X_train_seq, X_train_tok = tokenize(train_data['Eng'])
y_train_seq, y_train_tok = tokenize(train_data['Frn'])

X_train_vocab = len(X_train_tok.word_index ) + 1
y_train_vocab = len(y_train_tok.word_index ) + 1

X_test_seq, X_test_tok = tokenize(test_data['Eng'])
y_test_seq, y_test_tok = tokenize(test_data['Frn'])

X_test_vocab = len(X_test_tok.word_index)+1
y_test_vocab = len(y_test_tok.word_index)+1


# Padding

X_train_seq = pad(X_train_seq)
y_train_seq = pad(y_train_seq)

X_test_seq = pad(X_test_seq)
y_test_seq = pad(y_test_seq)

In [19]:
#Tabulate the Vocab Size
tab_data = [["Train", X_train_vocab, y_train_vocab],["Test",X_test_vocab,y_test_vocab]]
print(tabulate(tab_data, headers=['Dataset','Eng Vocab Size','Frn Vocab Size'], tablefmt="pretty"))

+---------+----------------+----------------+
| Dataset | Eng Vocab Size | Frn Vocab Size |
+---------+----------------+----------------+
|  Train  |     14068      |     29389      |
|  Test   |      6166      |     10502      |
+---------+----------------+----------------+


## Model

In [20]:
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, btch_size):
    model = Sequential()
    model.add(Embedding(in_vocab, btch_size, input_length = in_timesteps, mask_zero=True))
    model.add(LSTM(btch_size))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(btch_size, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))

    return model

In [21]:
batch_size = 64
lr = 1e-3

model = define_model(eng_vocab_size, frn_vocab_size, eng_len, frn_len, batch_size)

model.compile(loss='sparse_categorical_crossentropy')
optimizer = Adam(lr)

In [30]:
fn = 'model.h1.MT'
epoch = 2
val_split = 0.1

checkpoint = ModelCheckpoint(fn, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

history = model.fit(X_train_seq, y_train_seq, epochs=epoch, batch_size=batch_size, validation_split=val_split, callbacks=[checkpoint], verbose=1)

Epoch 1/2
Epoch 1: val_loss improved from inf to 0.77731, saving model to model.h1.MT
INFO:tensorflow:Assets written to: model.h1.MT/assets


INFO:tensorflow:Assets written to: model.h1.MT/assets


Epoch 2/2
Epoch 2: val_loss improved from 0.77731 to 0.72341, saving model to model.h1.MT
INFO:tensorflow:Assets written to: model.h1.MT/assets


INFO:tensorflow:Assets written to: model.h1.MT/assets




In [22]:
plt.rcParams["figure.figsize"] = (10,8)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train','validation'])
plt.title("Train vs Validation - Loss", fontsize=15)
plt.show()

NameError: name 'history' is not defined

In [None]:
predictions = model.predict(X_test_seq)[0]



{'i': 1,
 'you': 2,
 'to': 3,
 'the': 4,
 'a': 5,
 'is': 6,
 'tom': 7,
 'that': 8,
 'he': 9,
 'do': 10,
 'of': 11,
 'it': 12,
 'in': 13,
 'this': 14,
 'have': 15,
 'me': 16,
 "don't": 17,
 'was': 18,
 'your': 19,
 'are': 20,
 'my': 21,
 'for': 22,
 'what': 23,
 'we': 24,
 "i'm": 25,
 'be': 26,
 'not': 27,
 'she': 28,
 'want': 29,
 'know': 30,
 'on': 31,
 'like': 32,
 'with': 33,
 'can': 34,
 "you're": 35,
 'all': 36,
 'his': 37,
 'at': 38,
 'how': 39,
 'go': 40,
 'did': 41,
 'think': 42,
 'they': 43,
 "it's": 44,
 'and': 45,
 'about': 46,
 'very': 47,
 "can't": 48,
 'time': 49,
 'him': 50,
 "didn't": 51,
 'here': 52,
 'her': 53,
 'will': 54,
 'were': 55,
 'no': 56,
 'had': 57,
 'up': 58,
 'get': 59,
 'there': 60,
 'one': 61,
 'as': 62,
 'just': 63,
 'out': 64,
 'going': 65,
 'why': 66,
 'has': 67,
 'if': 68,
 'would': 69,
 'so': 70,
 'from': 71,
 'us': 72,
 'tell': 73,
 'need': 74,
 'good': 75,
 'when': 76,
 'mary': 77,
 'by': 78,
 'come': 79,
 'could': 80,
 'really': 81,
 'an': 82,
 '

In [39]:
X_test_seq

array([[  24,   36,  348, ...,    0,    0,    0],
       [  25,   27,    8, ...,    0,    0,    0],
       [ 104,   32,    3, ...,    0,    0,    0],
       ...,
       [ 129,   55, 1092, ...,    0,    0,    0],
       [ 177, 6165,    0, ...,    0,    0,    0],
       [  53,  491,    6, ...,    0,    0,    0]], dtype=int32)

In [35]:
def to_text(logits, tokenizer):

    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = ''
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [36]:
print(to_text(predictions, frn_tok))

nous nous pas                                                    
