# Text Preprocessing

In [None]:
import numpy as np 
import pandas as pd 

import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from tensorflow.math import confusion_matrix


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)


  import pandas.util.testing as tm


In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
lines=pd.read_csv("Hindi_English_Truncated_Corpus.csv",encoding='utf-8')

In [None]:
lines['source'].value_counts()

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [None]:
lines=lines[lines['source']=='ted']

In [None]:
lines.head(20)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what needs to be done.,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है ."
1,ted,"I'd like to tell you about one such child,","मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,"
3,ted,what we really mean is that they're bad at not paying attention.,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,So there is some sort of justice,तो वहाँ न्याय है
23,ted,This changed slowly,धीरे धीरे ये सब बदला
26,ted,were being produced.,उत्पन्न नहीं कि जाती थी.
30,ted,"And you can see, this LED is going to glow.","और जैसा आप देख रहे है, ये एल.ई.डी. जल उठेगी।"
32,ted,"to turn on the lights or to bring him a glass of water,","लाईट जलाने के लिए या उनके लिए पानी लाने के लिए,"
35,ted,Can you imagine saying that?,क्या आप ये कल्पना कर सकते है


In [None]:
pd.isnull(lines).sum()

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [None]:
lines=lines[~pd.isnull(lines['english_sentence'])]

In [None]:
lines.drop_duplicates(inplace=True)

In [None]:
lines=lines.sample(n=25000)
lines.shape

(25000, 3)

In [None]:
# Lowercase all characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.lower())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.lower())

In [None]:
# Remove quotes
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [None]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [None]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.strip())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.strip())
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))

In [None]:
# Add start and end tokens to target sequences
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [None]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापिता कौन हैं वह कौन है _END
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं _END
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ _END
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा _END
122330,ted,and its not as hard as you think integrate climate solutions into all of your innovations,START_ और जितना आपको लगता है यह उतना कठिन नहीं हैअपने सभी नवाचारों में जलवायु समाधान को एकीकृत करें _END


In [None]:
# Get English and Hindi Vocabulary
all_eng_words=set()
for eng in lines['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in lines['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [None]:
len(all_eng_words)

14030

In [None]:
len(all_hindi_words)

17540

In [None]:
lines['length_eng_sentence']=lines['english_sentence'].apply(lambda x:len(x.split(" ")))
lines['length_hin_sentence']=lines['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [None]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापिता कौन हैं वह कौन है _END,11,16
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं _END,2,5
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ _END,7,8
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा _END,4,6
122330,ted,and its not as hard as you think integrate climate solutions into all of your innovations,START_ और जितना आपको लगता है यह उतना कठिन नहीं हैअपने सभी नवाचारों में जलवायु समाधान को एकीकृत करें _END,16,20


In [None]:
lines[lines['length_eng_sentence']>30].shape

(0, 5)

In [None]:
# remove sentences length <= 20
lines=lines[lines['length_eng_sentence']<=20]
lines=lines[lines['length_hin_sentence']<=20]

In [None]:
lines.shape

(24774, 5)

In [None]:
print("maximum length of Hindi Sentence ",max(lines['length_hin_sentence']))
print("maximum length of English Sentence ",max(lines['length_eng_sentence']))

maximum length of Hindi Sentence  20
maximum length of English Sentence  20


In [None]:
max_length_src=max(lines['length_hin_sentence'])
max_length_tar=max(lines['length_eng_sentence'])

In [None]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words) + 1
num_decoder_tokens = len(all_hindi_words) + 1
num_encoder_tokens, num_decoder_tokens

(14030, 17540)

In [None]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [None]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [None]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
27649,ted,to be able to see these,START_ इन्हें देखने के लिए _END,6,6
42662,ted,itll take a few years to prove it experimentally,START_ मुझे इसे प्रायौगिक ढंग में साबित करने में कुछ साल लगेंगे _END,9,13
115017,ted,when my mother had time to read for me,START_ जब मेरी माँ के पास मेरे लिए पढ़ने का समय था _END,9,13
125156,ted,theyre as smart as chimpanzees in some respects,START_ कुछ मायनों में तो वो चिम्पान्ज़ी जितने बुद्धिमान होते हैं। _END,8,12
52536,ted,you had to tie really tight,START_ जिसे बहुत कस कर बांधना पड़ता था _END,6,9
34308,ted,only god knows the true meaning”,START_ बस ईश्वर ही है जो सच जानता है _END,6,10
30674,ted,and you felt in the presence of power of awe,START_ और तुमने स्वयं को शक्ति विस्मय के सानिध्य में पाया _END,10,12
70299,ted,i know that they were tougher for a lot of people,START_ न सिर्फ मेरे लिए बल्कि कुछ लोगों के लिए _END,11,11
99824,ted,what would you get done that youre waiting to get done,START_ आप वो क्या चीजे करेंगे जो आप अभी तक टालते आये है _END,11,14
57621,ted,and youd think that would be a pretty big problem for a law,START_ और आपको लग रहा होगा कि ये कानून के लिये ख़ासी बडी दिक्कत होगी _END,13,16


In [None]:
X, y = lines['english_sentence'], lines['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((19819,), (4955,))

In [None]:
X_train.head(10)

4842      actually the country with more telephones                              
57544     ive later come to know that to be                                      
32816     of course a lot of people said “maybe they dance better than they play”
11147     and theres a book john gartner                                         
117618    standing in a circle                                                   
64119     ive been making toys for the last years                                
117258    in settings in which we are supposed to learn something                
69618     africa can only be transformed by enlightened leaders                  
40894     laughter                                                               
35841     and russia and elsewhere                                               
Name: english_sentence, dtype: object

In [None]:
X_test.head(10)

60899     but rather it devotes this revenue                                  
43644     teach themselves biotechnology                                      
24164     i said “really we have to show the other classes                    
35113     she was given seven years of hard labor                             
72169     and hes in san quentin state prison                                 
45396     because it again sort of manages to pull itself up                  
34980     its the disease entities that have come in                          
62872     till they get english                                               
101928    and if you record from several grid cells                           
54630     the first level the weakest level of civil response against violence
Name: english_sentence, dtype: object

In [None]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] 
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] 
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

# Model Architecture

In [None]:
latent_dim=300

In [None]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)

encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

encoder_states = [state_h, state_c]

In [None]:
# Decoder
decoder_inputs = Input(shape=(None,))

dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,initial_state=encoder_states)

decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

new_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
new_model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
new_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    4209000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    5262300     input_2[0][0]                    
_______________________________________________________________________________________

In [None]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 100

In [None]:
"""
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)
"""                    

'\nmodel.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),\n                    steps_per_epoch = train_samples//batch_size,\n                    epochs=epochs,\n                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),\n                    validation_steps = val_samples//batch_size)\n'

In [None]:
new_model.load_weights('nmt_weights.h5')

In [None]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) 

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) 
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs2] + decoder_states2)

# Testing

In [None]:
 def decode_sequence(input_seq):

    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_token_index['START_']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        if (sampled_char == '_END' or len(decoded_sentence) > 50):
            stop_condition = True

        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence

In [None]:
def accuracy_calculation(input, output):
    prediction = new_model.predict(input)
    threshold = 1e-10
    prediction[prediction < threshold] = 0.0
    match = np.sum(prediction == output)
    mismatch = np.sum(prediction != output)
    accuracy = match/(match+mismatch)
    print('Accuracy is : ', accuracy)

In [None]:
data_gen = generate_batch(X_train, y_train, batch_size=1)

In [None]:
test_gen = generate_batch(X_test, y_test, batch_size = 1)
k = -1

In [None]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
def prediction_for_line(input_seq, actual_output):
    decoded_sentence = decode_sequence(input_seq)
    bleu_score = sentence_bleu(y_train[k:k+1].values[0][6:-1], decoded_sentence[:-4],weights=(0.25, 0.25, 0.25, 0.25))
    print('Input English sentence:', X_train[k:k+1].values[0])
    print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
    print('Predicted Hindi Translation:', decoded_sentence[:-4])
    print('Bleu Score : ', bleu_score)
    accuracy_calculation(input_seq, output)

In [None]:
k += 1
(input, output) = next(data_gen)
prediction_for_line(input, output)

Input English sentence: actually the country with more telephones
Actual Hindi Translation:  असल में ज्यादा टेलीफ़ोन वाला देश है 
Predicted Hindi Translation:  असल में ज्यादा टेलीफ़ोन वाला देश है 
Bleu Score :  0.8781227710041801
Accuracy is :  0.7286927769226383


In [None]:
k += 1
(input, output) = next(data_gen)
prediction_for_line(input, output)

Input English sentence: ive later come to know that to be
Actual Hindi Translation:  बाद में मैंने जाना कि 
Predicted Hindi Translation:  बाद में मैंने जाना कि 
Bleu Score :  0.8498912392268879
Accuracy is :  0.038464169659654525


In [None]:
k += 1
(input, output) = next(data_gen)
prediction_for_line(input, output)

Input English sentence: of course a lot of people said “maybe they dance better than they play”
Actual Hindi Translation:  बेशक बहुत लोगो ने कहा कि जितना अच्छा खेलते हैं उससे कहीं अच्छा नाचते हैं 
Predicted Hindi Translation:  बेशक बहुत लोगो ने भी कहा कि “यह बहुत अच्छा है” 
Bleu Score :  0.8034284189446518
Accuracy is :  0.2778632917165498


In [None]:
k += 1
(input, output) = next(data_gen)
prediction_for_line(input, output)

Input English sentence: and theres a book john gartner
Actual Hindi Translation:  एक किताब है जॉन गार्टनरjohn gartner 
Predicted Hindi Translation:  और यहाँ एक मां देख रहे हैं 
Bleu Score :  0.7071067811865476
Accuracy is :  0.5013254660509663


In [None]:
k += 1
(input, output) = next(data_gen)
prediction_for_line(input, output)

Input English sentence: ive been making toys for the last years
Actual Hindi Translation:  मैं पिछले सालों से खिलौने बना रहा हूँ। 
Predicted Hindi Translation:  मैं पिछले सालों से अभी तक आया 
Bleu Score :  0.7887781797427305
Accuracy is :  0.7032181745624537


In [None]:
for i in range(1,5):
    k += 1
    (input, output) = next(data_gen)
    prediction_for_line(input, output)

Input English sentence: in settings in which we are supposed to learn something
Actual Hindi Translation:  उन स्थितियों में जहाँ हमसे कुछ सीखने की अपेक्षा की जाती है। 
Predicted Hindi Translation:  उन स्थितियों में जहाँ हमसे कुछ सीखने की अपेक्ष
Bleu Score :  0.8540052444156726
Accuracy is :  0.5271535260247421
Input English sentence: africa can only be transformed by enlightened leaders
Actual Hindi Translation:  केवल जागरूक नेता ही अफ्रीका की काया पलट सकते हैं 
Predicted Hindi Translation:  केवल जागरूक नेता ही अफ्रीका की काया पलट सकते हैं 
Bleu Score :  0.8235490260528069
Accuracy is :  0.5408329057636395
Input English sentence: laughter
Actual Hindi Translation:  ठ्हाके 
Predicted Hindi Translation:  हँसी 
Bleu Score :  0.7598356856515925
Accuracy is :  0.9463400034205576
Input English sentence: and russia and elsewhere
Actual Hindi Translation:  और रूस और अन्य जगहों में 
Predicted Hindi Translation:  और रूस और अन्य जगहों में 
Bleu Score :  0.8857000285382948
Accuracy is :  0.7121999