In [None]:
import string
import re
import  numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option("display.max_colwidth", 200)


In [None]:
# Location of the tab-separated sentence-pair corpus.
data_path="/content/drive/MyDrive/eng-fra.txt"
with open(data_path,'r',encoding='utf_8') as f:
  lines=f.read()
# Show only a short preview: echoing the whole corpus floods the cell output
# and bloats the saved notebook.
lines[:500]

In [None]:
def to_lines(text):
  """Split raw corpus text into rows of tab-separated fields.

  Args:
    text: Whole file contents; one record per line, fields separated by tabs.

  Returns:
    A list with one entry per non-blank line, each entry being the list of
    that line's tab-separated fields. Empty input gives an empty list.
  """
  rows = []
  for line in text.strip().split("\n"):
    # Skip blank lines: they would yield a one-element row and make the
    # result ragged, which breaks a later conversion to a 2-D array.
    if not line.strip():
      continue
    rows.append(line.split('\t'))
  return rows

In [None]:
# Parse the raw text into a list of rows, one list of tab-separated fields per line.
fra_eng = to_lines(lines)

In [None]:
fra_eng[:9]

In [None]:
# Turn the list of rows into a NumPy array so whole columns can be sliced
# later (fra_eng[:,0], fra_eng[:,1]).
# NOTE(review): this only yields a 2-D array if every row has the same number
# of fields — confirm against the data file.
fra_eng=np.array(fra_eng)
fra_eng[:5]

In [None]:
fra_eng.shape


In [None]:
# Keep only the first 90000 rows (all columns).
# NOTE(review): no punctuation is removed here; that happens in a later cell.
fra_eng= fra_eng[:90000,:]

In [None]:
fra_eng[:5]

In [None]:
# Strip ASCII punctuation (string.punctuation) from columns 0 and 1.
# The translation table is loop-invariant, so it is built once instead of
# once per sentence.
# NOTE(review): string.punctuation is ASCII-only; non-ASCII marks are left in
# place — confirm that is acceptable for the French column.
punct_table = str.maketrans("", "", string.punctuation)
fra_eng[:,0]=[s.translate(punct_table) for s in fra_eng[:,0]]
fra_eng[:,1]=[s.translate(punct_table) for s in fra_eng[:,1]]


fra_eng[:5]

In [None]:
# Lower-case both text columns, one column at a time.
for col in (0, 1):
  fra_eng[:,col]=[sentence.lower() for sentence in fra_eng[:,col]]



In [None]:
fra_eng

**Text-to-sequence conversion (word-to-index mapping)**


1.   Convert sentences into numbers
2.   Every sentence should be of the same length



In [None]:
def tokenization(lines):
  """Create a ``Tokenizer`` and fit it on the given texts.

  Args:
    lines: Sentences passed to ``fit_on_texts``.

  Returns:
    The fitted ``Tokenizer`` instance.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

In [None]:
# Fit the English tokenizer on column 0.
eng_tokenizer = tokenization(fra_eng[:,0])
# Tokenizer word ids start at 1 and id 0 is used for padding, so the number of
# distinct ids (and hence the width of the softmax output layer) is
# len(word_index) + 1. Without the +1 the highest word id has no output class.
eng_vocab_size= len(eng_tokenizer.word_index) + 1

# Length passed as `maxlen` when English sequences are padded.
eng_length=8
print("English Vocabs size: %d"% eng_vocab_size)

In [None]:
# Prepare the French tokenizer (fitted on column 1).

fra_tokenizer = tokenization(fra_eng[:,1])
# Tokenizer word ids start at 1 and id 0 is used for padding, so the embedding
# layer's input_dim must be len(word_index) + 1. Without the +1 the highest
# word id falls outside the embedding table.
fra_vocab_size= len(fra_tokenizer.word_index) + 1

# Length passed as `maxlen` when French sequences are padded.
fra_length=8
print("French Vocabs size:%d"% fra_vocab_size)

In [None]:
# encode and pad sequences, padding to a maximum sentence length as mentioned above
def encode_sequences(tokenizer,length,lines):
  """Encode ``lines`` as integer sequences of a common length.

  Args:
    tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
    length: Value passed to ``pad_sequences`` as ``maxlen``.
    lines: Sentences to encode.

  Returns:
    The result of ``pad_sequences`` called with ``padding='post'``.
  """
  encoded = tokenizer.texts_to_sequences(lines)
  return pad_sequences(encoded, maxlen=length, padding='post')

It's time to encode the sentences. We will encode the French sentences as the input sequences and the English sentences as the target sequences. This has to be done for both the train and test datasets.

In [None]:
from sklearn.model_selection  import train_test_split
# Hold out 20% of the sentence pairs for testing. An integer test_size is an
# absolute sample count, so a value of 2 would leave only two test rows.
train, test= train_test_split(fra_eng,test_size=0.2,random_state=12)

In [None]:
# Training inputs come from column 1 (French), targets from column 0 (English).
TrainX = encode_sequences(tokenizer=fra_tokenizer, length=fra_length, lines=train[:,1])
TrainY = encode_sequences(tokenizer=eng_tokenizer, length=eng_length, lines=train[:,0])

In [None]:
# Encode the held-out test split the same way: column 1 as input, column 0 as target.
testX = encode_sequences(tokenizer=fra_tokenizer, length=fra_length, lines=test[:,1])
testY = encode_sequences(tokenizer=eng_tokenizer, length=eng_length, lines=test[:,0])

# ***Seq2seq model architecture***

In [None]:
def Model_seq2seq(in_vocab,out_vocab,in_timesteps,out_timesteps,units):
  """Assemble the (uncompiled) encoder-decoder ``Sequential`` model.

  Args:
    in_vocab: ``input_dim`` of the embedding layer.
    out_vocab: Width of the final softmax ``Dense`` layer.
    in_timesteps: ``input_length`` of the embedding layer.
    out_timesteps: Repeat count given to ``RepeatVector``.
    units: Embedding output size and unit count of both LSTM layers.

  Returns:
    The ``Sequential`` model; compiling is left to the caller.
  """
  encoder = [
    Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
    LSTM(units),
  ]
  decoder = [
    RepeatVector(out_timesteps),  # repeat the encoder output once per output step
    LSTM(units, return_sequences=True),
    Dense(out_vocab, activation='softmax'),
  ]
  return Sequential(encoder + decoder)

In [None]:
# Build the French -> English model with 512 units per layer.
model=Model_seq2seq(fra_vocab_size,eng_vocab_size,fra_length,eng_length,512)
# `lr` is a deprecated alias; `learning_rate` is the supported keyword.
rms = optimizers.RMSprop(learning_rate=0.001)
# Sparse loss: the targets are integer word ids rather than one-hot vectors.
model.compile(optimizer=rms,loss="sparse_categorical_crossentropy")

In [None]:
model.summary()

Train the model

In [None]:
# The targets are reshaped to carry a trailing axis of size 1 before fitting.
history = model.fit(
  TrainX,
  TrainY.reshape(TrainX.shape[0], TrainY.shape[1], 1),
  epochs=15,
  batch_size=512,
  validation_split=0.2,
)

In [None]:
# Sequential.predict_classes was removed from Keras (TensorFlow 2.6); taking
# the argmax over the class axis of predict() yields the same integer word ids.
preds=np.argmax(model.predict(testX.reshape((testX.shape[0],testX.shape[1]))),axis=-1)

In [None]:
#integer ------> words
def get_words(n,tokenizer):
  """Return the word whose tokenizer index is ``n``.

  Args:
    n: Integer word id to look up.
    tokenizer: Fitted tokenizer exposing ``word_index`` (word -> id) and,
      optionally, ``index_word`` (id -> word).

  Returns:
    The matching word, or ``None`` when no word has index ``n``.
  """
  # Fast path: a direct id -> word lookup when the tokenizer provides one,
  # instead of scanning the whole vocabulary on every call.
  index_word = getattr(tokenizer, "index_word", None)
  if index_word:
    return index_word.get(n)
  # Fallback scan. A dict has no ``item()`` method (AttributeError);
  # ``items()`` is the call that yields (word, index) pairs.
  for word, index in tokenizer.word_index.items():
    if index==n:
      return word
  return None


***Convert predictions into sentences (English):***

In [None]:
# Turn each predicted id sequence into a sentence string.
# An empty string is emitted for ids with no word and for a word that merely
# repeats the previous position. The first position is handled too: with the
# old `if j > 0` guard and no else branch, the first predicted word of every
# sentence was silently dropped.
pred_text=[]
for seq in preds :
  words=[]
  prev_word=None  # word decoded at the previous position (None before the first)
  for idx in seq:
    word=get_words(idx,eng_tokenizer)  # one lookup per position, reused as prev_word
    if word is None or word==prev_word:
      words.append('')
    else:
      words.append(word)
    prev_word=word
  pred_text.append(' '.join(words))
