In [25]:
#import dependencies for defining the model
import string
import re
from numpy import array, argmax, random, take
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt

In [26]:
data_path = 'fra.txt'
with open(data_path, 'r', encoding='utf-8') as f:
  lines = f.read()
lines

In [27]:
def to_lines(text):
  sents = text.strip().split('\n')
  sents = [i.split('\t') for i in sents]
  return sents

In [49]:
fra_eng = to_lines(lines)
fra_eng[:5]

[['Go.',
  'Va !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'],
 ['Go.',
  'Marche.',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)'],
 ['Go.',
  'En route !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)'],
 ['Go.',
  'Bouge !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)'],
 ['Hi.',
  'Salut !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)']]

In [50]:
fra_eng = array(fra_eng)
fra_eng[:5]


array([['Go.', 'Va !',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'],
       ['Go.', 'Marche.',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)'],
       ['Go.', 'En route !',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)'],
       ['Go.', 'Bouge !',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)'],
       ['Hi.', 'Salut !',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)']],
      dtype='<U349')

In [51]:
fra_eng.shape

(227815, 3)

In [52]:
fra_eng = fra_eng[:150000,:]
fra_eng = fra_eng[:,[0,1]]

In [53]:
# Removing punctuations
fra_eng[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in fra_eng[:,0]]
fra_eng[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in fra_eng[:,1]]
fra_eng[:5]

array([['Go', 'Va '],
       ['Go', 'Marche'],
       ['Go', 'En route '],
       ['Go', 'Bouge '],
       ['Hi', 'Salut ']], dtype='<U349')

In [54]:
for i in range(len(fra_eng)):
  fra_eng[i,0] = fra_eng[i,0].lower()
  fra_eng[i,1] = fra_eng[i,1].lower()
fra_eng

array([['go', 'va '],
       ['go', 'marche'],
       ['go', 'en route '],
       ...,
       ['i have a daughter in high school', 'jai une fille au lycée'],
       ['i have a few tricks up my sleeve',
        'jai plus dun tour dans mon sac'],
       ['i have a firstaid kit in my car',
        'jai une trousse de premiers soins dans ma voiture']],
      dtype='<U349')

In [55]:
# function to build tokenizer
def tokenization(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

# english tokenizer
eng_tokenizer = tokenization(fra_eng[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index)+1

eng_length = 8
print(f"English vocabulary size {eng_vocab_size}")

English vocabulary size 11320


In [None]:
eng_tokenizer.word_index

In [57]:
for word, index in eng_tokenizer.word_index.items():
  if index==0:
    print(word)

In [58]:
# french tokenizer
fra_tokenizer = tokenization(fra_eng[:, 1])
fra_vocab_size = len(fra_tokenizer.word_index)+1

fra_length = 8
print(f"French vocabulary size {fra_vocab_size}")


French vocabulary size 26738


In [59]:
# encode and pad sequences,padding to a maximum sentence length as mentioned
def encode_sequences(tokenizer, length, lines):
  # integer encode sequences
  sequence = tokenizer.texts_to_sequences(lines)
  #pad sequences with 0 values
  sequence = pad_sequences(sequence, maxlen=length)
  return sequence

In [60]:
# splitting the data into training and testing
from sklearn.model_selection import train_test_split
train, test = train_test_split(fra_eng, test_size=0.2, random_state = 12)

In [61]:
# prepare train data
trainX = encode_sequences(fra_tokenizer, fra_length, train[:,1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:,0])

# prepare test data
testX = encode_sequences(fra_tokenizer, fra_length, test[:,1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:,0])

In [62]:
testX = testX[:1000, :]
trainX.shape, testX.shape

((120000, 8), (1000, 8))

In [63]:
# Define the model
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
  model = Sequential()
  model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
  model.add(LSTM(units))
  model.add(RepeatVector(out_timesteps))
  model.add(LSTM(units, return_sequences=True))
  model.add(Dense(out_vocab, activation='softmax'))
  return model

In [64]:
# model compilation
model = define_model(fra_vocab_size, eng_vocab_size, fra_length, eng_length, 512)
adam = optimizers.Adam(learning_rate=0.01)
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['acc'])

In [65]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 8, 512)            13689856  
                                                                 
 lstm_4 (LSTM)               (None, 512)               2099200   
                                                                 
 repeat_vector_2 (RepeatVec  (None, 8, 512)            0         
 tor)                                                            
                                                                 
 lstm_5 (LSTM)               (None, 8, 512)            2099200   
                                                                 
 dense_2 (Dense)             (None, 8, 11320)          5807160   
                                                                 
Total params: 23695416 (90.39 MB)
Trainable params: 23695416 (90.39 MB)
Non-trainable params: 0 (0.00 Byte)
____________

In [66]:
#initialize the callback for early stopping the training if there is not atleast 1% improvement in the accuracy
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(monitor = 'accuracy', min_delta=0.001)

In [67]:
trainX.shape, trainY.shape

((120000, 8), (120000, 8))

In [68]:
trainY.reshape(trainY.shape[0], trainY.shape[1], 1)
trainY.shape

(120000, 8)

In [69]:
# train model
history=model.fit(trainX, trainY,
                  epochs=50, batch_size=512, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [70]:
t = testX.shape
testX.reshape((testX.shape[0], testX.shape[1]))
testX.shape, t

((1000, 8), (1000, 8))

In [71]:
predictions = model.predict(testX[:1000])



In [72]:
predictions[0]

array([[9.99696374e-01, 3.00583633e-04, 6.60263288e-10, ...,
        1.10905417e-19, 8.90215588e-20, 6.28511216e-19],
       [9.74016607e-01, 2.53636204e-02, 1.09170169e-05, ...,
        5.01354912e-20, 8.94420057e-20, 5.11405189e-20],
       [1.47429064e-01, 8.31606627e-01, 2.02211668e-03, ...,
        7.21006498e-23, 4.36860920e-22, 6.37209659e-21],
       ...,
       [5.26983104e-06, 1.25346519e-02, 5.96789760e-04, ...,
        3.41173355e-22, 1.97199323e-22, 6.12691660e-24],
       [4.42881110e-08, 5.94114303e-04, 4.32074239e-06, ...,
        4.97127587e-19, 1.50087313e-22, 3.09994505e-23],
       [1.70968573e-08, 1.18704465e-05, 4.36898017e-06, ...,
        1.86142571e-16, 9.70327707e-20, 6.20276690e-21]], dtype=float32)

In [75]:

print(predictions[0][1])

[9.7401661e-01 2.5363620e-02 1.0917017e-05 ... 5.0135491e-20 8.9442006e-20
 5.1140519e-20]


In [76]:
# function to return the key word for the value
def get_word(n, tokenizer):
  for word, index in tokenizer.word_index.items():
    if (index==n):
      print(word)
      return word
  return None

In [None]:

preds_text = []
for y in predictions:
  temp = []
  for word in y:
    word = np.array(word)
    max_ = max(word)
    print(max_)
    print(word[0], word[1])
    for j in range(len(word)):
      if j>0:
        if max_ == word[j]:
          print(max_, word[j], j)
          t = get_word(j, eng_tokenizer)
          if t==None:
            temp.append('')
            #break
          else:
            temp.append(t)
            #break

  preds_text.append(' '.join(temp))

In [92]:
print(test[5,0])

do you have a shoehorn


In [93]:
preds_text[5]

'do you have a shoehorn'