In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from builtins import range, input
import os, sys
import string
import pandas as pd
import re
import numpy as np
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, RepeatVector, Concatenate, Activation, Dot, Lambda
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
from keras.models import load_model
from tensorflow import keras
from keras import optimizers
from sklearn.model_selection import train_test_split

In [3]:
lines = pd.read_csv("/content/drive/MyDrive/NMT_Data/Hindi_English_Corpus.csv",encoding='utf-8')

In [4]:
lines=lines[lines['source']=='ted']

In [5]:
engSentences = lines['english_sentence']
hinSentences = lines['hindi_sentence']

In [6]:
engSentences = engSentences[:5000]
hinSentences = hinSentences[:5000]

In [7]:
X, y = engSentences, hinSentences
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1,random_state=42)
X_train.shape, X_test.shape

((4500,), (500,))

In [8]:
#For training Data
input_texts = []
target_texts = []
target_texts_inputs = []

#Converting to lowercase
en_train = [line.lower() for line in X_train]
hin_train = [line.lower() for line in y_train]

NUM_SAMPLES = len(en_train)
print("Sample train size:",NUM_SAMPLES)

Sample train size: 4500


In [9]:
for lines in hin_train:
    target_texts_inputs.append('<sos>'+" "+ lines)
    
for lines in hin_train:
    target_texts.append(lines+ " " +'<eos>')
    
for lines in en_train:
    input_texts.append(lines)

In [10]:
tokenizer_inputs = Tokenizer()
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

word2idx_inputs = tokenizer_inputs.word_index
max_len_input = max(len(s) for s in input_sequences)

tokenizer_outputs = Tokenizer(filters='')
tokenizer_outputs.fit_on_texts(target_texts + target_texts_inputs) 
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(target_texts_inputs)

word2idx_outputs = tokenizer_outputs.word_index

num_words_output = len(word2idx_outputs) + 1

max_len_target = max(len(s) for s in target_sequences)

encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=max_len_target, padding='post')
decoder_targets = pad_sequences(target_sequences, maxlen=max_len_target, padding='post')

In [11]:
#For Testing Data
input_texts_test = []
target_texts_test = [] 
target_texts_inputs_test = []

en_test = [line.lower() for line in X_test]
hin_test = [line.lower() for line in y_test]

In [12]:
for lines in hin_test:
    target_texts_inputs_test.append('<sos>'+" "+ lines)
    
for lines in hin_test:
    target_texts_test.append(lines+ " " +'<eos>')
    
for lines in en_test:
    input_texts_test.append(lines)

In [13]:
input_sequences_test = tokenizer_inputs.texts_to_sequences(input_texts_test)

target_sequences_test = tokenizer_outputs.texts_to_sequences(target_texts_test)
target_sequences_inputs_test = tokenizer_outputs.texts_to_sequences(target_texts_inputs_test)

encoder_inputs_test = pad_sequences(input_sequences_test, maxlen=max_len_input)
decoder_inputs_test = pad_sequences(target_sequences_inputs_test, maxlen=max_len_target, padding='post')
decoder_targets_test = pad_sequences(target_sequences_test, maxlen=max_len_target, padding='post')

In [14]:
BATCH_SIZE = 512
EPOCHS = 50
LATENT_DIM = 256
LATENT_DIM_DECODER = 256 
EMBEDDING_DIM = 300

In [15]:
embeddings_index = {}
with open(os.path.join("/content/drive/MyDrive/NMT_Data/gloveData.txt".format(EMBEDDING_DIM)), encoding="utf8") as f:
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = vec

In [16]:
num_words = len(word2idx_inputs) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
  if i < num_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector

In [17]:
def softmax_over_time(x):
  assert(K.ndim(x) > 2)
  e = K.exp(x - K.max(x, axis=1, keepdims=True))
  s = K.sum(e, axis=1, keepdims=True)
  return e / s

In [18]:
embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=max_len_input,
)

In [19]:
decoder_targets_one_hot = np.zeros(
  (
    len(input_texts),
    max_len_target,
    num_words_output
  ),
  dtype='float32'
)

for i, d in enumerate(decoder_targets):
  for t, word in enumerate(d):
    decoder_targets_one_hot[i, t, word] = 1

In [20]:
#encoder
encoder_inputs_placeholder = Input(shape=(max_len_input,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = Bidirectional(LSTM(
  LATENT_DIM,
  return_sequences=True, dropout=0.2
))
encoder_outputs = encoder(x)

In [21]:
#Decoder
decoder_inputs_placeholder = Input(shape=(max_len_target,))
decoder_embedding = Embedding(num_words_output, EMBEDDING_DIM)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

In [22]:
attn_repeat_layer = RepeatVector(max_len_input)
attn_concat_layer = Concatenate(axis=-1)
attn_dense1 = Dense(10, activation='tanh')
attn_dense2 = Dense(1, activation=softmax_over_time)
attn_dot = Dot(axes=1)

In [23]:
def one_step_attention(h, st_1):
  st_1 = attn_repeat_layer(st_1)
  x = attn_concat_layer([h, st_1])
  x = attn_dense1(x)
  alphas = attn_dense2(x)
  context = attn_dot([alphas, h])
  return context

In [24]:
decoder_lstm = LSTM(LATENT_DIM_DECODER, return_state=True)
decoder_dense = Dense(num_words_output, activation='softmax')

initial_s = Input(shape=(LATENT_DIM_DECODER,), name='s0')
initial_c = Input(shape=(LATENT_DIM_DECODER,), name='c0')
context_last_word_concat_layer = Concatenate(axis=2)

In [25]:
# s, c will be re-assigned in each iteration of the loop
s = initial_s
c = initial_c

# collect outputs in a list at first
outputs = []
for t in range(max_len_target): # Ty times
  # get the context using attention
  context = one_step_attention(encoder_outputs, s)

  # we need a different layer for each time step
  selector = Lambda(lambda x: x[:, t:t+1])
  xt = selector(decoder_inputs_x)
  
  # combine 
  decoder_lstm_input = context_last_word_concat_layer([context, xt])

  # pass the combined [context, last word] into the LSTM
  # along with [s, c]
  # get the new [s, c] and output
  o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[s, c])

  # final dense layer to get next word prediction
  decoder_outputs = decoder_dense(o)
  outputs.append(decoder_outputs)

In [26]:
def stack_and_transpose(x):
  # x is a list of length T, each element is a batch_size x output_vocab_size tensor
  x = K.stack(x) # is now T x batch_size x output_vocab_size tensor
  x = K.permute_dimensions(x, pattern=(1, 0, 2)) # is now batch_size x T x output_vocab_size
  return x

# make it a layerx``
stacker = Lambda(stack_and_transpose)
outputs = stacker(outputs)

In [27]:
model = Model(
  inputs=[
    encoder_inputs_placeholder,
    decoder_inputs_placeholder,
    initial_s, 
    initial_c,
  ],
  outputs=outputs
)

In [28]:
learning_rate=0.001

In [29]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate) ,loss='categorical_crossentropy', metrics=['accuracy'])

In [30]:
z = np.zeros((encoder_inputs.shape[0], LATENT_DIM_DECODER)) # initial [s, c]
r = model.fit(
  [encoder_inputs, decoder_inputs, z, z], decoder_targets_one_hot,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [31]:
model.save("/content/drive/MyDrive/NMT_Data/NMT_EngToHin_gloVe.h5")

In [32]:
#
#Modifying the model for Predictions
encoder_model = Model(encoder_inputs_placeholder, encoder_outputs)

# next we define a T=1 decoder model
encoder_outputs_as_input = Input(shape=(max_len_input, LATENT_DIM * 2,))
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

# no need to loop over attention steps this time because there is only one step
context = one_step_attention(encoder_outputs_as_input, initial_s)

# combine context with last word
decoder_lstm_input = context_last_word_concat_layer([context, decoder_inputs_single_x])

# lstm and final dense
o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[initial_s, initial_c])
decoder_outputs = decoder_dense(o)

In [33]:
# create the model object
decoder_model = Model(
  inputs=[
    decoder_inputs_single,
    encoder_outputs_as_input,
    initial_s, 
    initial_c
  ],
  outputs=[decoder_outputs, s, c]
)

In [34]:
decoder_model.save("/content/drive/MyDrive/NMT_Data/enghin_decoder_gloVeModel.h5")
encoder_model.save("/content/drive/MyDrive/NMT_Data/enghin_encoder_gloVeModel.h5")



In [35]:
dx2word_eng = {v:k for k, v in word2idx_inputs.items()}
idx2word_trans = {v:k for k, v in word2idx_outputs.items()}

In [36]:
def decode_sequence(input_seq):
  # Encode the input as state vectors.
  enc_out = encoder_model.predict(input_seq)

  # Generate empty target sequence of length 1.
  target_seq = np.zeros((1, 1))
  
  # Populate the first character of target sequence with the start character.
  # NOTE: tokenizer lower-cases all words
  target_seq[0, 0] = word2idx_outputs['<sos>']

  # if we get this we break
  eos = word2idx_outputs['<eos>']


  # [s, c] will be updated in each loop iteration
  s = np.zeros((1, LATENT_DIM_DECODER))
  c = np.zeros((1, LATENT_DIM_DECODER))


  # Create the translation
  output_sentence = []
  for _ in range(max_len_target):
    o, s, c = decoder_model.predict([target_seq, enc_out, s, c])
        

    # Get next word
    idx = np.argmax(o.flatten())

    # End sentence of EOS
    if eos == idx:
      break

    word = ''
    if idx > 0:
      word = idx2word_trans[idx]
      output_sentence.append(word)

    # Update the decoder input
    # which is just the word just generated
    target_seq[0, 0] = idx

  return ' '.join(output_sentence)

In [37]:
test_actual_sentence=[]
test_predicted_sentence=[]
for i in range(len(en_test)):
  
  input_seq = encoder_inputs_test[i:i+1]
  translation = decode_sequence(input_seq)

  test_actual_sentence.append(target_texts_test[i])
  test_predicted_sentence.append(translation)

In [38]:
for i in np.random.randint(0,100,5):
    print('-')
    print('Input sentence:', input_texts_test[i])
    print('Predicted translation:', test_predicted_sentence[i])
    print('Actual translation:', target_texts_test[i])

-
Input sentence: the dragon vis-a-vis the elephant.
Predicted translation: और और के
Actual translation: चीनी ड्रेगन और हिन्दुस्तानी हाथी का मुकाबला। <eos>
-
Input sentence: but if you're not a native speaker,
Predicted translation: और और के के
Actual translation: मगर यदि आप इंगलिश के मूल-वक्ता नहीं हैं, <eos>
-
Input sentence: and we come to work when we don't feel like it,
Predicted translation: और और और के के के के
Actual translation: और हम तब भी क्लास जाते हैं जब हमारा बिल्कुल मन नहीं होता, <eos>
-
Input sentence: for our entire lives.
Predicted translation: और और
Actual translation: अपनी पूरी ज़िंदगी. <eos>
-
Input sentence: well, i do.
Predicted translation: और और
Actual translation: खैर, मैं करता हूँ <eos>
