In [70]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import nltk.translate.bleu_score as bleu
import random
import string
from sklearn.model_selection import train_test_split
import os
import time

In [3]:
eng_hin=pd.read_csv('eng_hin.csv')
eng_hin.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [4]:
eng_hin.dropna(inplace=True)
eng_hin=eng_hin[:50000]
eng_hin.drop(['source'],axis=1,inplace=True)
eng_hin.shape

(50000, 2)

In [5]:
exclude = set(string.punctuation) # Set of all special characters
remove_digits = str.maketrans('', '', string.digits) # Set of all digits

In [6]:
def preprocess(text):
    '''Function to preprocess English sentence'''
    text = text.lower() # lower casing
    text = re.sub("'", '', text) # remove the quotation marks if any
    text = ''.join(ch for ch in text if ch not in exclude)
    text = text.translate(remove_digits) # remove the digits
    text = text.strip()
    text = re.sub(" +", " ", text) # remove extra spaces
    text = '<start> ' + text + ' <end>'
    return text

In [7]:
def preprocess_hin(text):
    '''Function to preprocess Marathi sentence'''
    text = re.sub("'", '', text) # remove the quotation marks if any
    text = ''.join(ch for ch in text if ch not in exclude)
    text = re.sub("[२३०८१५७९४६]", "", text) # remove the digits
    text = text.strip()
    text = re.sub(" +", " ", text) # remove extra spaces
    text = '<start> ' + text + ' <end>'
    return text

In [8]:
eng_hin['english_sentence'] = eng_hin['english_sentence'].apply(preprocess)
eng_hin['hindi_sentence'] = eng_hin['hindi_sentence'].apply(preprocess_hin)

eng_hin.rename(columns={"english_sentence": "english", "hindi_sentence": "hindi"},inplace=True)

eng_hin.head()

Unnamed: 0,english,hindi
0,<start> politicians do not have permission to ...,<start> राजनीतिज्ञों के पास जो कार्य करना चाहि...
1,<start> id like to tell you about one such chi...,<start> मई आपको ऐसे ही एक बच्चे के बारे में बत...
2,<start> this percentage is even greater than t...,<start> यह प्रतिशत भारत में हिन्दुओं प्रतिशत स...
3,<start> what we really mean is that theyre bad...,<start> हम ये नहीं कहना चाहते कि वो ध्यान नहीं...
4,<start> the ending portion of these vedas is c...,<start> इन्हीं वेदों का अंतिम भाग उपनिषद कहलात...


In [9]:
def tokenize(lang):

  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post',maxlen=20,dtype='int32')

  return tensor, lang_tokenizer

In [10]:
def load_dataset():

  input_tensor, inp_lang_tokenizer = tokenize(eng_hin['english'].values)
  target_tensor, targ_lang_tokenizer = tokenize(eng_hin['hindi'].values)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [11]:
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset()

In [12]:
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [13]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

40000 40000 10000 10000


In [14]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 1024
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE

vocab_inp_size =len(inp_lang.word_index.keys())
vocab_tar_size =len(targ_lang.word_index.keys())

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [16]:
embeddings_index = dict()
f = open('glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((vocab_inp_size+1, 300))
for word, i in inp_lang.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [17]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding_layer_encoder",trainable=False)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_activation='sigmoid', recurrent_initializer='glorot_uniform')
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [18]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_activation='sigmoid', recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

                # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):

        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))
        
        attention_weights = tf.nn.softmax(score, axis=1)
        
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        x = self.embedding(x)
        
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        output, state = self.gru(x)
        
        output = tf.reshape(output, (-1, output.shape[2]))
        
        x = self.fc(output)
        
        return x, state, attention_weights
        
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [19]:
tf.keras.backend.clear_session()

encoder = Encoder(vocab_inp_size+1, 300, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size+1, embedding_dim, units, BATCH_SIZE)

In [20]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')


def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [21]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [22]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    encoder.get_layer('embedding_layer_encoder').set_weights([embedding_matrix])
    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [23]:
EPOCHS = 15

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix=checkpoint_prefix)

  print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
  print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 1 Batch 0 Loss 7.3141
Epoch 1 Batch 100 Loss 5.1003
Epoch 1 Batch 200 Loss 4.6859
Epoch 1 Batch 300 Loss 4.5698
Epoch 1 Batch 400 Loss 4.3429
Epoch 1 Batch 500 Loss 4.8458
Epoch 1 Batch 600 Loss 4.0618
Epoch 1 Loss 4.6826
Time taken for 1 epoch 135.76 sec

Epoch 2 Batch 0 Loss 4.3634
Epoch 2 Batch 100 Loss 4.3127
Epoch 2 Batch 200 Loss 3.5064
Epoch 2 Batch 300 Loss 4.2723
Epoch 2 Batch 400 Loss 3.9200
Epoch 2 Batch 500 Loss 3.5746
Epoch 2 Batch 600 Loss 3.5531
Epoch 2 Loss 3.9863
Time taken for 1 epoch 119.40 sec

Epoch 3 Batch 0 Loss 3.5205
Epoch 3 Batch 100 Loss 3.8863
Epoch 3 Batch 200 Loss 3.3188
Epoch 3 Batch 300 Loss 3.5382
Epoch 3 Batch 400 Loss 3.2281
Epoch 3 Batch 500 Loss 3.2979
Epoch 3 Batch 600 Loss 3.1598
Epoch 3 Loss 3.4068
Time taken for 1 epoch 114.08 sec

Epoch 4 Batch 0 Loss 2.8931
Epoch 4 Batch 100 Loss 2.8633
Epoch 4 Batch 200 Loss 2.7618
Epoch 4 Batch 300 Loss 2.9703
Epoch 4 Batch 400 Loss 2.9389
Epoch 4 Batch 500 Loss 3.0428
Epoch 4 Batch 600 Loss 2.6770
Epo

In [29]:
for epoch in range(EPOCHS,20):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix=checkpoint_prefix)

  print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
  print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 16 Batch 0 Loss 0.1300
Epoch 16 Batch 100 Loss 0.1290
Epoch 16 Batch 200 Loss 0.1455
Epoch 16 Batch 300 Loss 0.1636
Epoch 16 Batch 400 Loss 0.2018
Epoch 16 Batch 500 Loss 0.1756
Epoch 16 Batch 600 Loss 0.2210
Epoch 16 Loss 0.1711
Time taken for 1 epoch 121.54 sec

Epoch 17 Batch 0 Loss 0.1267
Epoch 17 Batch 100 Loss 0.0918
Epoch 17 Batch 200 Loss 0.1483
Epoch 17 Batch 300 Loss 0.1260
Epoch 17 Batch 400 Loss 0.1401
Epoch 17 Batch 500 Loss 0.1921
Epoch 17 Batch 600 Loss 0.1564
Epoch 17 Loss 0.1470
Time taken for 1 epoch 115.51 sec

Epoch 18 Batch 0 Loss 0.0954
Epoch 18 Batch 100 Loss 0.1427
Epoch 18 Batch 200 Loss 0.1297
Epoch 18 Batch 300 Loss 0.1105
Epoch 18 Batch 400 Loss 0.1601
Epoch 18 Batch 500 Loss 0.1347
Epoch 18 Batch 600 Loss 0.1469
Epoch 18 Loss 0.1283
Time taken for 1 epoch 118.97 sec

Epoch 19 Batch 0 Loss 0.1313
Epoch 19 Batch 100 Loss 0.1307
Epoch 19 Batch 200 Loss 0.1305
Epoch 19 Batch 300 Loss 0.1055
Epoch 19 Batch 400 Loss 0.1299
Epoch 19 Batch 500 Loss 0.1442
Epo

In [67]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=20, padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)
    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()
    predicted_id = tf.argmax(predictions[0]).numpy()

    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result,attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result,attention_plot

In [94]:
input_sentence= 'please ensure that you use the appropriate form '
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in hindi : ',predicted_output)

Input sentence in english :  please ensure that you use the appropriate form 
Predicted sentence in hindi :  कृपया यह सुनिश्चित कर लें कि आप सही फॉर्म का प्रयोग कर रहें हैं <end> 


In [95]:
input_sentence='and do something with it to change the world '
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in hindi : ',predicted_output)

Input sentence in english :  and do something with it to change the world 
Predicted sentence in hindi :  और इस दुनिया को बेहतर बनाने के लिये कुछ करेंगे । <end> 
