# LANGUAGE TRANSLATION PIPELINE

### Authors: Nikhith Theddu, Sumanth Bhargav Kanchi, Hemalekha Pillarishetty

In [1]:
# importing libraries
import string
import re
import numpy as np
import nltk
import collections
import string
import pandas as pd
#import matplotlib.pyplot as plt
from numpy import array, argmax, random, take
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

### Pre-processing

In [2]:
# extracting the data from txt file
file = open('fra.txt','r')
text = file.read()
# using .split to grab the sentences
sents = text.strip().split('\n')
sents = [i.split('\t') for i in sents]
for i in sents:
    i.pop(-1)

In [3]:
sentences = array(sents)
# considering only 50000 sentences
sentences = sentences[:50000,:]
english = []
french = []
englishlen = []
frenchlen = []
for sent in sentences[:,0]:
    # removing punctuations of English text
    temp = sent.translate(str.maketrans('','',string.punctuation))
    # converting text to lower
    english.append(temp.lower())
    englishlen.append(len(temp.split()))
for sent in sentences[:,1]:
    # removing punctuations of French text
    temp = sent.translate(str.maketrans('','',string.punctuation))
    # converting text to lower
    french.append(temp.lower())
    frenchlen.append(len(temp.split()))
english[:5],french[:5],englishlen[:5],frenchlen[:5]

(['go', 'hi', 'hi', 'run', 'run'],
 ['va ', 'salut ', 'salut', 'coursâ€¯', 'courezâ€¯'],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1])

In [4]:
# checking for nulls
count = 0
counts = 0
for i in englishlen:
    if i == 0:
        count += 1
for j in frenchlen:
    if j == 0:
        counts += 1
print('Nulls in English sentences is',count)
print('Nulls in French sentences is',counts)

Nulls in English sentences is 0
Nulls in French sentences is 0


In [5]:
max(englishlen),max(frenchlen)

(7, 14)

In [6]:
# Tokenizer function
def tokens(text):
      tokenizer = Tokenizer()
        # assigns index 
      tokenizer.fit_on_texts(text)
      return tokenizer

In [7]:
english_tokenizer = tokens(english)
eng_vocab = len(english_tokenizer.word_index) + 1
french_tokenizer = tokens(french)
french_vocab = len(french_tokenizer.word_index) + 1
eng_vocab,french_vocab

(6052, 14072)

In [8]:
# creating data frame
df = pd.DataFrame({'english':english, 'french':french})
df.head(15)

Unnamed: 0,english,french
0,go,va
1,hi,salut
2,hi,salut
3,run,coursâ€¯
4,run,courezâ€¯
5,who,qui
6,wow,ã‡a alorsâ€¯
7,fire,au feu
8,help,ã€ laideâ€¯
9,jump,saute


In [9]:
# train test split
train, test = train_test_split(df, test_size=0.2, random_state = 12)


In [10]:
# converting text to sequences for train English data
train_eng = english_tokenizer.texts_to_sequences(train['english'])
# padding of the sequences for train English data
train_eng = pad_sequences(train_eng, maxlen=max(englishlen), padding='pre')
test_eng = english_tokenizer.texts_to_sequences(test['english'])
test_eng = pad_sequences(test_eng, maxlen=max(englishlen), padding='pre')
train_french = french_tokenizer.texts_to_sequences(train['french'])
train_french = pad_sequences(train_french, maxlen=max(frenchlen), padding='post')
test_french = french_tokenizer.texts_to_sequences(test['french'])
test_french = pad_sequences(test_french, maxlen=max(frenchlen), padding='post')
print('Sample English padded sequence is',train_eng[1])
print('Sample French padded sequence is',train_french[1])

Sample English padded sequence is [  0   0   1 344 102 312 528]
Sample French padded sequence is [   1   24   89 1063   11  535  756    0    0    0    0    0    0    0]


### Model 1

In [11]:
# creation of a sequential model
model = Sequential()
model.add(Embedding(eng_vocab,512 , input_length=max(englishlen), mask_zero=True))
model.add(LSTM(512))
model.add(RepeatVector(max(frenchlen)))
model.add(LSTM(512, return_sequences=True))
model.add(Dense(french_vocab, activation='softmax'))
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x286cd726780>

In [None]:
# to generate the model plot, due to some errors in installs, google collab is used for this piece of code
#from keras.utils import plot_model
#plot_model(model,to_file = 'model_1.png',show_shapes=True, show_layer_names=True)

In [12]:
# building optimizer with learning rate 0.001
rms = optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [105]:
# saving model
filename = 'model.h1'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# train model
history = model.fit(train_eng, train_french.reshape(train_french.shape[0], train_french.shape[1], 1),
                    epochs=30, batch_size=512, validation_split = 0.2,callbacks=[checkpoint], 
                    verbose=1)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 2.04511, saving model to model.h1
INFO:tensorflow:Assets written to: model.h1\assets
Epoch 2/30
Epoch 00002: val_loss improved from 2.04511 to 2.02790, saving model to model.h1
INFO:tensorflow:Assets written to: model.h1\assets
Epoch 3/30
Epoch 00003: val_loss improved from 2.02790 to 1.96878, saving model to model.h1
INFO:tensorflow:Assets written to: model.h1\assets
Epoch 4/30
Epoch 00004: val_loss improved from 1.96878 to 1.87143, saving model to model.h1
INFO:tensorflow:Assets written to: model.h1\assets
Epoch 5/30
Epoch 00005: val_loss improved from 1.87143 to 1.79855, saving model to model.h1
INFO:tensorflow:Assets written to: model.h1\assets
Epoch 6/30
Epoch 00006: val_loss improved from 1.79855 to 1.71823, saving model to model.h1
INFO:tensorflow:Assets written to: model.h1\assets
Epoch 7/30
Epoch 00007: val_loss improved from 1.71823 to 1.65466, saving model to model.h1
INFO:tensorflow:Assets written to: model.h1\assets
Epo

In [19]:
# loading model
model = load_model('model.h1')
# predicting on test dataset
predicted = model.predict_classes(test_eng.reshape((test_eng.shape[0],test_eng.shape[1])))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [19]:
# word match function
def wor(n, tokenizer):
      for word, index in tokenizer.word_index.items():
          if index == n:
              return word
      return None

In [21]:
# extract words using the word index 
preds = []
for pred in predicted:
    temp = []
    for ele in range(len(pred)):
        # checking for the index match with french tokenizer
        words = wor(pred[ele], french_tokenizer)
        if ele > 0:
            if (words == wor(pred[ele-1], french_tokenizer)) or (words == None):
                temp.append('')
            else:
                temp.append(words)
        else:
            if(words == None):
                temp.append('')
            else:
                temp.append(words) 
    preds.append(' '.join(temp))

In [22]:
# creating a data frame with English text, actual French and predicted French columns
pred_df = pd.DataFrame({'English':test['english'],'actual' : test['french'], 'predicted' : preds})

In [24]:
# creating actual and predicted French text lists
french_test = list(test['french'])
actual = [] 
final = []
for fre in french_test:
    fre = nltk.word_tokenize(fre)
    actual.append(fre)
for pred in preds:
    pred = nltk.word_tokenize(pred)
    final.append(pred)

In [25]:
scores = []
# Bleu score calculation for each sentence
for i in range(len(actual)):
    temp = []
    temp.append(actual[i])
    score = sentence_bleu(temp, final[i],weights=(1, 0, 0, 0))
    scores.append(score)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [26]:
# average oF Bleu scores of all sentences
sum(scores)/len(scores)

0.4170232783948858

In [27]:
# random 15 rows of the data frame
pred_df.sample(15)

Unnamed: 0,English,actual,predicted
45835,they cant be ignored,ils ne peuvent ãªtre ignorã©s,ils ne peuvent pas virer
32973,theres no elevator,il ny a pas dascenseur,il ny a pas de
30800,i just want to help,je veux juste ãªtre utile,je veux juste que
8871,im devastated,je suis anã©antie,je suis
37537,ill not forget that,je ne loublierai pas,je noublierai pas
31266,i was happy for him,je fus heureux pour lui,jã©tais travaille pour
13381,tom is knocking,tom frappe ã la porte,tom mã¨ne
20671,ill wait outside,jattendrai dehors,jattendrai ã
43971,i work in the morning,je travaille le matin,je travaille dans au
14278,are you japanese,ãštesvous japonais,ãštesvous malchanceuxâ


### Model 2

In [27]:
# building a sequential model
final_model = Sequential()
final_model.add(Embedding(input_dim=eng_vocab,output_dim=512,input_length=max(englishlen),mask_zero = True))
final_model.add(Bidirectional(GRU(512,return_sequences=False)))
final_model.add(RepeatVector(max(frenchlen)))
final_model.add(Bidirectional(GRU(512,return_sequences=True)))
final_model.add(TimeDistributed(Dense(french_vocab,activation='softmax')))
learning_rate = 0.001
# compilation of the model
final_model.compile(loss = 'sparse_categorical_crossentropy', 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])

In [None]:
# to generate the model plot, due to some errors in installs, google collab is used for this piece of code
#from keras.utils import plot_model
#plot_model(final_model,to_file = 'model_2.png',show_shapes=True, show_layer_names=True)

In [28]:
# to save the model
filename = 'model.h8'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# train model
hist = final_model.fit(train_eng, train_french.reshape(train_french.shape[0], train_french.shape[1], 1), batch_size = 512, epochs = 30, validation_split = 0.2,callbacks=[checkpoint],verbose=1)
 


Epoch 1/30
Epoch 00001: val_loss improved from inf to 2.21833, saving model to model.h8
INFO:tensorflow:Assets written to: model.h8\assets
Epoch 2/30
Epoch 00002: val_loss improved from 2.21833 to 2.05698, saving model to model.h8
INFO:tensorflow:Assets written to: model.h8\assets
Epoch 3/30
Epoch 00003: val_loss improved from 2.05698 to 1.97029, saving model to model.h8
INFO:tensorflow:Assets written to: model.h8\assets
Epoch 4/30
Epoch 00004: val_loss improved from 1.97029 to 1.87185, saving model to model.h8
INFO:tensorflow:Assets written to: model.h8\assets
Epoch 5/30
Epoch 00005: val_loss improved from 1.87185 to 1.80039, saving model to model.h8
INFO:tensorflow:Assets written to: model.h8\assets
Epoch 6/30
Epoch 00006: val_loss improved from 1.80039 to 1.70455, saving model to model.h8
INFO:tensorflow:Assets written to: model.h8\assets
Epoch 7/30
Epoch 00007: val_loss improved from 1.70455 to 1.60979, saving model to model.h8
INFO:tensorflow:Assets written to: model.h8\assets
Epo

In [29]:
# loading model
final_model = load_model('model.h8')
# predicting the test data
final_predicted = final_model.predict_classes(test_eng)

In [30]:
# extract words using the word index 
preds = []
for pred in final_predicted:
    temp = []
    for ele in range(len(pred)):
        # checking for the index match with french tokenizer
        words = wor(pred[ele], french_tokenizer)
        if ele > 0:
            if (words == wor(pred[ele-1], french_tokenizer)) or (words == None):
                temp.append('')
            else:
                temp.append(words)
        else:
            if(words == None):
                temp.append('')
            else:
                temp.append(words) 
    preds.append(' '.join(temp))

In [31]:
# creating a data frame with English text, actual French and predicted French columns
pred_df = pd.DataFrame({'English':test['english'],'actual' : test['french'], 'predicted' : preds})

In [33]:
# creating actual and predicted French text lists
french_test = list(test['french'])
actual = [] 
final = []
for fre in french_test:
    fre = nltk.word_tokenize(fre)
    actual.append(fre)
for pred in preds:
    pred = nltk.word_tokenize(pred)
    final.append(pred)

In [34]:
# Bleu score calculation for each sentence
scores = []
for i in range(len(actual)):
    temp = []
    temp.append(actual[i])
    score = sentence_bleu(temp, final[i],weights=(1, 0, 0, 0))
    scores.append(score)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [35]:
# average of the scores
sum(scores)/len(scores)

0.4683055177131358

In [36]:
# random 15 rows of the data frame
pred_df.sample(15)

Unnamed: 0,English,actual,predicted
34196,who did you talk to,ã€ qui parlaistuâ€¯,ã€ qui parlaistuâ€¯ parlã©â€¯
8878,im farsighted,je vois loin,je suis affaiblie
38073,it was heartwarming,cã©tait rã©confortant,cã©tait fut
10990,give it to them,donnelaleur,faisle les
43522,i often make mistakes,je commets souvent des fautes,je nombreuses souvent nombreuses erreurs
4744,tom did well,tom sest bien dã©brouillã©,tom a bien
11399,i dont need it,je nen ai pas besoin,je nen ai besoin
18006,what a cute baby,quel mignon bã©bã©,quel est
33823,we want to help tom,nous voulons aider tom,nous voulons suivre
41525,come over to my place,venez chez moi,allez ã mon
