<a href="https://colab.research.google.com/github/tanyaS121/English-DeutschTranslator/blob/main/LanguageTranslationEngDeu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
% matplotlib inline
pd.set_option('display.max_colwidth', 200)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Only names from nltk.tokenize are imported elsewhere in this notebook, so the
# top-level package has to be imported here before nltk.download can be called.
import nltk

# Fetch every NLTK data package (a large download).
nltk.download("all")

#####Reading raw data from text file

In [None]:
def read_text(filename):
  """Return the entire contents of a UTF-8 text file as a single string.

  Args:
    filename: Path of the file to read.

  Returns:
    The decoded text of the file.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  # The context manager closes the handle even when read() raises, which the
  # manual open()/close() pair did not guarantee.
  with open(filename, mode='rt', encoding='utf-8') as file:
    return file.read()

#####Splitting text into parts/sentences

In [None]:
def to_lines(text):
  """Split tab-separated text into rows of fields.

  Leading and trailing whitespace of the whole text is stripped first; the
  remainder is split into lines on '\n' and every line into fields on '\t'.

  Args:
    text: The raw text, one record per line, fields separated by tabs.

  Returns:
    A list with one list of field strings per line.
  """
  rows = []
  for line in text.strip().split('\n'):
    rows.append(line.split('\t'))
  return rows

#####Importing the data from the text file deu.txt

In [None]:
# Read the raw corpus, split it into rows of tab-separated fields and hold the
# result as a NumPy array so columns can be selected by slicing.
data = read_text("/content/sample_data/deu.txt")
deu_eng = array(to_lines(data))

#####Using only the first 20,000 sentence pairs to reduce the time needed to train the model

In [None]:
# Keep only the first 20000 rows (all columns) of the sentence-pair array.
deu_eng  = deu_eng[:20000,:]

In [None]:
#deu_eng

###Text to sequence conversion###
#####Feeding our data into a Seq2Seq model


In [None]:
# Number of whitespace-separated words in every sentence of column 0
# (eng_list) and column 1 (deu_list) of the sentence-pair array.
eng_list = [len(sentence.split()) for sentence in deu_eng[:,0]]
deu_list = [len(sentence.split()) for sentence in deu_eng[:,1]]

In [None]:
# One row per sentence pair, holding the word count of each side.
length_df = pd.DataFrame(
    {
        'eng': eng_list,
        'deu': deu_list,
    }
)

In [None]:
# Draw a 30-bin histogram of the word counts for each column of length_df.
length_df.hist(bins = 30)
plt.show()

#####Building of a tokenizer

In [None]:
 def tokenization(lines):
   """Fit a Keras Tokenizer on the given texts and return it.

   Args:
     lines: Iterable of sentences used to build the vocabulary.

   Returns:
     The fitted Tokenizer instance.
   """
   fitted = Tokenizer()
   fitted.fit_on_texts(lines)
   return fitted

 # The cells below call `tokenization(...)`; the function was originally
 # defined as `tokenizer`, so that name is kept as an alias.
 tokenizer = tokenization

#####English tokenizer

In [None]:
# Build the vocabulary from column 0 (the English sentences).
eng_tokenizer = tokenization(deu_eng[:,0])
# One more than the number of distinct words; presumably this leaves room for
# index 0, which is not assigned to any word (TODO confirm against Keras docs).
eng_vocab_size = len(eng_tokenizer.word_index)+1
# Fixed sequence length used when the English sentences are padded.
eng_length = 8
print('English Vocabulary Size: %d' % eng_vocab_size)

#####German Tokenizer

In [None]:
# Build the vocabulary from column 1 (the German sentences).
deu_tokenizer = tokenization(deu_eng[:,1])
# One more than the number of distinct words; presumably this leaves room for
# index 0, which is not assigned to any word (TODO confirm against Keras docs).
deu_vocab_size = len(deu_tokenizer.word_index) + 1
# Fixed sequence length used when the German sentences are padded.
deu_length = 8
print('Deutsch Vocabulary Size: %d' % deu_vocab_size)

In [None]:
def encode_sequences(tokenizer, length, lines):
  """Convert sentences to fixed-length integer sequences.

  Args:
    tokenizer: Fitted tokenizer providing `texts_to_sequences`.
    length: Length every sequence is padded to (`maxlen`).
    lines: The sentences to encode.

  Returns:
    The result of `pad_sequences`, padded at the end of each sequence
    (`padding='post'`).
  """
  return pad_sequences(
      tokenizer.texts_to_sequences(lines),
      maxlen=length,
      padding='post',
  )

#Training the model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence pairs for testing; the fixed random_state makes
# the split reproducible.
train, test = train_test_split(
    deu_eng,
    test_size=0.2,
    random_state=12,
)

#####Training Set

In [None]:
# Model input: column 1 of the training split, encoded with the German tokenizer.
trainX = encode_sequences(deu_tokenizer, deu_length, train[:, 1])
# Model target: column 0 of the training split, encoded with the English tokenizer.
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

#####Test Set

In [None]:
# Encode the held-out split the same way as the training split: column 1 with
# the German tokenizer as model input, column 0 with the English tokenizer as
# target. (Previously both lines encoded `train`, and testY used the German
# tokenizer and length on the English column.)
testX = encode_sequences(deu_tokenizer, deu_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

#####Embedding and LSTM

In [None]:
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
  """Assemble the (uncompiled) encoder-decoder Sequential model.

  Args:
    in_vocab: Input vocabulary size for the Embedding layer.
    out_vocab: Number of output classes of the final softmax layer.
    in_timesteps: Length of each input sequence.
    out_timesteps: Number of times the encoder output is repeated.
    units: Embedding dimension and LSTM width.

  Returns:
    The Sequential model; compiling it is left to the caller.
  """
  # Embeds the input sequence and reduces it to the first LSTM's final output.
  encoder = [
      Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
      LSTM(units),
  ]
  # Repeats that output once per output step and predicts a softmax over
  # out_vocab at every step.
  decoder = [
      RepeatVector(out_timesteps),
      LSTM(units, return_sequences=True),
      Dense(out_vocab, activation='softmax'),
  ]
  model = Sequential()
  for layer in encoder + decoder:
    model.add(layer)
  return model

#####The RMSprop optimizer is a good choice for recurrent neural networks

In [None]:
# German-to-English model with 512 units in the embedding and both LSTM layers.
model = build_model(deu_vocab_size, eng_vocab_size, deu_length, eng_length, 512)
# `learning_rate` is the current name of the argument formerly spelled `lr`.
rms = optimizers.RMSprop(learning_rate=0.001)
# compile() takes `optimizer` (singular), and the loss identifier is spelled
# 'sparse_categorical_crossentropy'; the earlier spellings are not recognised.
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

#sparse_categorical_crossentropy is used as the loss function because it allows 
#us to use the target sequence as it is instead of a one-hot encoded format.


#One-hot encoding might consume the entire system's memory. 

In [None]:
filename = 'ENG_GER_Translation'
# Save the model to `filename` whenever the validation loss improves.
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# The targets get a trailing axis of size 1. A stray ')' after the reshape
# previously closed fit() too early and made this statement a SyntaxError.
history = model.fit(
    trainX,
    trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
    epochs=5,
    batch_size=512,
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=1,
)

#Increasing the number of epochs might result in more accurate predictions

#####Training loss and validation loss

In [None]:
# Plot the per-epoch training loss followed by the validation loss; the legend
# labels are listed in the same order as the curves are drawn.
for series in ('loss', 'val_loss'):
  plt.plot(history.history[series])
plt.legend(['train','validation'])
plt.show()

#####Loading model to make predictions

In [None]:
# Reload the checkpoint written during training.
model = load_model('ENG_GER_Translation')
# Sequential.predict_classes is no longer available in current Keras; taking
# the argmax over the last (vocabulary) axis of predict() yields the same
# class ids for a softmax output.
preds = argmax(model.predict(testX.reshape((testX.shape[0], testX.shape[1]))), axis=-1)

In [None]:
def get_word(n, tokenizer):
  """Look up the word that a tokenizer maps to index `n`.

  Args:
    n: The integer index to look up.
    tokenizer: Object whose `word_index` attribute maps words to indices.

  Returns:
    The first word in `tokenizer.word_index` whose index equals `n`, or
    None when no word has that index.
  """
  matches = (word for word, index in tokenizer.word_index.items() if index == n)
  return next(matches, None)

#####Converting prediction into text

In [None]:
# Invert the English vocabulary once so every id lookup is a constant-time
# dict access instead of a scan over the whole vocabulary per token.
index_to_word = {index: word for word, index in eng_tokenizer.word_index.items()}

# Turn every predicted id sequence into a space-joined string.
preds_text = []
for sequence in preds:
  words = []
  previous = None
  for token_id in sequence:
    word = index_to_word.get(token_id)
    # Ids without a vocabulary entry and immediate repetitions of the previous
    # word contribute an empty slot, so each sequence still yields one item per
    # position.
    if word is None or word == previous:
      words.append('')
    else:
      words.append(word)
    previous = word
  preds_text.append(' '.join(words))

In [None]:
# Side-by-side table: column 0 of the test split ('actual') next to the decoded
# prediction strings ('predicted').
# NOTE(review): both inputs must have the same number of rows, so preds has to
# be computed from the test split.
pred_df = pd.DataFrame({'actual' : test[:,0], 'predicted' : preds_text})

In [None]:
# Allow up to 200 characters per column so the sentences are not cut off when
# the DataFrame is displayed.
pd.set_option('display.max_colwidth', 200)

In [None]:
pred_df.head(15) # show the first 15 rows of the actual-vs-predicted table