In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pickle import load
from pickle import dump
import h5py
import math
import nltk
import re 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import io
import re
import collections 
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# English Training Data
english_data = 'parallel/IITB.en-hi.en'
eng_lines = io.open(english_data, encoding = "utf-8").read().split('\n')

# Hindi Training Data
hindi_data = 'parallel/IITB.en-hi.hi'
hin_lines = io.open(hindi_data, encoding = "utf-8").read().split('\n')

In [3]:
eng_lines=eng_lines[0:200000]
hin_lines=hin_lines[0:200000]

In [4]:
eng_lines = pd.DataFrame(eng_lines)
eng_lines.columns = ['English Sentence']
hin_lines = pd.DataFrame(hin_lines)
hin_lines.columns = ['Hindi Sentence']

In [5]:
eng_lines['len_english'] = eng_lines['English Sentence'].apply(lambda x:len(x.split()))
hin_lines['len_hindi'] = hin_lines['Hindi Sentence'].apply(lambda x:len(x.split()))

In [6]:
print(eng_lines['len_english'].max())
print(hin_lines['len_hindi'].max())

301
213


In [7]:
eng_lines['Removed Spaces'] = eng_lines['English Sentence'].apply(lambda x:x.replace(" ",""))
eng_lines['Non Chars'] = eng_lines['Removed Spaces'].apply(lambda x:x.isalpha())
eng_lines = eng_lines[eng_lines['Non Chars']==True]
eng_lines = eng_lines[eng_lines.len_english!=0]

In [8]:
def non_hindi(sentence):
    return bool(re.search('[a-zA-Z0-9-,:"+./।<?>}{!@#$%^&*(¬à®¾°¥\¶®¹¿¡¼µª²£¤⁵⁷⁸⁹⁶⁴³´µÃ¥]', sentence))

In [9]:
hin_lines['Non Hindi Chars'] = hin_lines['Hindi Sentence'].apply(non_hindi)
hin_lines = hin_lines[hin_lines['Non Hindi Chars']==False]
hin_lines.drop(['Non Hindi Chars'],axis = 1,inplace=True)
hin_lines = hin_lines[hin_lines.len_hindi!=0]

In [10]:
hin_lines = pd.merge(hin_lines, eng_lines, left_index=True, right_index=True)
hin_lines.drop(['English Sentence','len_english'],axis=1,inplace=True)

In [11]:
eng_lines = pd.merge(hin_lines,eng_lines,left_index=True,right_index=True)
eng_lines.drop(['Hindi Sentence','len_hindi'],axis=1,inplace=True)

In [12]:
len(eng_lines)
len(hin_lines)

75775

In [13]:
hindi_sentences = hin_lines['Hindi Sentence']
hindi_sentences.dropna(inplace=True)
hindi_sentences.reset_index(drop=True, inplace=True)

In [14]:
hindi_corpus = []
for i in range(0,len(hindi_sentences)):
    hindi_corpus.append(hindi_sentences[i])
hindi_sentences = hindi_corpus

In [15]:
corpus = []
english_sentences = eng_lines['English Sentence']
english_sentences.dropna(inplace=True)
english_sentences.reset_index(drop=True, inplace=True)
for i in range(0,len(english_sentences)):
    sen = re.sub('[^a-zA-Z]', ' ', english_sentences[i])
    sen = sen.lower()
    sen = sen.split()
    sen = ' '.join(sen)
    corpus.append(sen)

In [16]:
corpus_for_dictionary = ' '.join(english_sentences)
corpus_for_dictionary1=[]
corpus_for_dictionary1.append(corpus_for_dictionary)

In [17]:

eng_tokenizer = Tokenizer(oov_token='UNK')

#creating dictionary with word as key and index as value
eng_tokenizer.fit_on_texts(corpus_for_dictionary1)
 
print(eng_tokenizer.word_index)

# vocabulary size
eng_vocab_size = len(eng_tokenizer.word_index)+1

print(eng_vocab_size)


7818


In [18]:
hindi_sentences = [x + ' eol' for x in hindi_sentences ]

In [19]:
hindi_corpus_dictionary = ' '.join(hindi_sentences)
corpus_for_dictionary2=[]
corpus_for_dictionary2.append(hindi_corpus_dictionary)

In [20]:

hin_tokenizer = Tokenizer(oov_token='UNK')

#creating dictionary with word as key and index as value
hin_tokenizer.fit_on_texts(corpus_for_dictionary2)

print(hin_tokenizer.word_index)

hin_vocab_size = len(hin_tokenizer.word_index)+1

print(hin_vocab_size)


{'UNK': 1, 'eol': 2, 'करें': 3, 'के': 4, 'है': 5, 'में': 6, 'को': 7, 'नहीं': 8, 'का': 9, 'लिए': 10, 'फ़ाइल': 11, 'की': 12, 'संदेश': 13, 'से': 14, 'चयनित': 15, 'पर': 16, 'रहा': 17, 'एक': 18, 'फ़ोल्डर': 19, 'सूची': 20, 'छवि': 21, 'कोई': 22, 'नाम': 23, 'इस': 24, 'दिखाएँ': 25, 'चुनें': 26, 'कर': 27, 'सभी': 28, 'विंडो': 29, 'संपर्क': 30, 'पाठ': 31, 'करने': 32, 'नया': 33, 'गया': 34, 'किया': 35, 'पता': 36, 'कार्य': 37, 'जोड़ें': 38, 'खोलें': 39, 'त्रुटि': 40, 'रूप': 41, 'सहेजें': 42, 'डिस्क': 43, 'डाक': 44, 'और': 45, 'जा': 46, 'रंग': 47, 'या': 48, 'दृश्य': 49, 'स्थिति': 50, 'जाएँ': 51, 'बनाएँ': 52, 'आकार': 53, 'समय': 54, 'मिटाएँ': 55, 'विकल्प': 56, 'फाइल': 57, 'क्या': 58, 'यह': 59, 'खोज': 60, 'मौजूदा': 61, 'पंचांग': 62, 'परियोजना': 63, 'सूचना': 64, 'स्थान': 65, 'पृष्ठ': 66, 'एवोल्यूशन': 67, 'साथ': 68, 'सका': 69, 'बदलें': 70, 'उपयोग': 71, 'अभिलेख': 72, 'प्रमाणपत्र': 73, 'नक़ल': 74, 'अज्ञात': 75, 'सेट': 76, 'सर्वर': 77, 'पट्टी': 78, 'प्रोग्राम': 79, 'सक्रिय': 80, 'गुण': 81, 'खाता': 82, 'वर्तमान

In [22]:
hin_word_to_index= hin_tokenizer.word_index
hin_index_to_word = {v: k for k, v in hin_word_to_index.items()}
hin_index_to_word[0]=' '
hin_word_to_index[' ']=0

In [23]:
nb_samples = 5441
max_len_eng_sent = 13
max_len_hin_sent = 14

In [24]:
tokenized_eng_sentences = np.zeros(shape = (nb_samples,max_len_eng_sent, eng_vocab_size), dtype='float32')
tokenized_hin_sentences = np.zeros(shape = (nb_samples,max_len_hin_sent, hin_vocab_size), dtype='float32')
target_data = np.zeros((nb_samples, max_len_hin_sent, hin_vocab_size),dtype='float32')

In [25]:
eng_embedding = eng_tokenizer.texts_to_sequences(english_sentences)
# print(eng_embedding)
hin_embedding = hin_tokenizer.texts_to_sequences(hindi_sentences)
# print(hin_embedding)
eng_embedding_padded = pad_sequences(eng_embedding, maxlen=max_len_eng_sent, padding='post', value=0)
hin_embedding_padded = pad_sequences(hin_embedding, maxlen=max_len_hin_sent, padding='post', value=0)

In [26]:
for i in range(0,nb_samples):
    encoded_eng_sent = eng_embedding_padded[i]
    encoded_hin_sent = hin_embedding_padded[i]
    for j in range(0,max_len_eng_sent):
        tokenized_eng_sentences[i,j,:]=to_categorical(encoded_eng_sent[j],num_classes=eng_vocab_size) 
    for k in range(0,max_len_hin_sent):
        tokenized_hin_sentences[i,k,:]=to_categorical(encoded_hin_sent[k],num_classes=hin_vocab_size)

In [27]:
# Encoder model

encoder_input = Input(shape=(None,eng_vocab_size))
encoder_LSTM = LSTM(256,return_state = True)
encoder_outputs, encoder_h, encoder_c = encoder_LSTM (encoder_input)
encoder_states = [encoder_h, encoder_c]


In [28]:
# Decoder model

decoder_input = Input(shape=(None,hin_vocab_size))
decoder_LSTM = LSTM(256,return_sequences=True, return_state = True)
decoder_out, _ , _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
decoder_dense = Dense(hin_vocab_size,activation='softmax')
decoder_out = decoder_dense (decoder_out)


In [29]:
model = Model(inputs=[encoder_input, decoder_input],outputs=[decoder_out])

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit(x=[tokenized_eng_sentences,tokenized_hin_sentences], 
          y=tokenized_hin_sentences,
          batch_size=64,
          epochs=10,
          validation_split=0.2)

Train on 4352 samples, validate on 1089 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f08a764240>

In [32]:
# Inference models for testing

# Encoder inference model
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, 
                                                 initial_state=decoder_input_states)

decoder_states = [decoder_h , decoder_c]

decoder_out = decoder_dense(decoder_out)

decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                          outputs=[decoder_out] + decoder_states )

In [33]:
def decode_seq(inp_seq):
    
    # Initial states value is coming from the encoder 
    states_val = encoder_model_inf.predict(inp_seq)
    
    target_seq = np.zeros((1, 1, hin_vocab_size))
#     target_seq[0, 0, fra_char_to_index_dict['\t']] = 1
    
    translated_sent = ''
    stop_condition = False
    
    while not stop_condition:
        
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        
        max_val_index = np.argmax(decoder_out[0,-1,:])
        sampled_hin_word = hin_index_to_word[max_val_index]
        translated_sent += sampled_hin_word+" "
        
        if ( (sampled_hin_word == 'eol') or (len(translated_sent) > max_len_hin_sent)) :
            stop_condition = True
        
        target_seq = np.zeros((1, 1,hin_vocab_size))
        target_seq[0, 0, max_val_index] = 1
        
        states_val = [decoder_h, decoder_c]
        
    return translated_sent


In [34]:
for seq_index in range(20):
    inp_seq = tokenized_eng_sentences[seq_index:seq_index+1]
    translated_sent = decode_seq(inp_seq)
    print('-')
    print('Input sentence:', english_sentences[seq_index])
    print('Decoded sentence:', translated_sent)

-
Input sentence: Give your application an accessibility workout
Decoded sentence: अंतिम बार बार किए 
-
Input sentence: Accerciser Accessibility Explorer
Decoded sentence: वर्तमान वर्तमान 
-
Input sentence: Highlight duration
Decoded sentence: अवधि गुणों गुणों 
-
Input sentence: Highlight fill color
Decoded sentence: अंतिम अंतिम फोकस 
-
Input sentence: API Browser
Decoded sentence: टूटते तारे स्याने 
-
Input sentence: Hide private attributes
Decoded sentence: अंतिम अंतिम प्रविष्ट 
-
Input sentence: Method
Decoded sentence: चोर चोर चोर चोर 
-
Input sentence: Property
Decoded sentence: वर्णन रंग रंग रंग 
-
Input sentence: Value
Decoded sentence: आकार बन्द बन्द 
-
Input sentence: IPython Console
Decoded sentence: आईपाइथन कन्सोल 
-
Input sentence: Interactive console for manipulating currently selected accessible
Decoded sentence: इस इस चुने चुने 
-
Input sentence: Event monitor
Decoded sentence: अधिकतम मान मान 
-
Input sentence: Everything
Decoded sentence: त्रुटि त्रुटि   
-
Input senten