In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

import re
import string
from string import digits
import pickle

In [2]:
encoder_model = tf.keras.models.load_model("encoder_model")
decoder_model = tf.keras.models.load_model("decoder_model")



In [3]:
X_train = pickle.load(open("pickle_data/X_train.pkl", "rb"))
y_train = pickle.load(open("pickle_data/y_train.pkl", "rb"))

In [4]:
max_length_src = pickle.load(open("pickle_data/max_length_src.pkl", "rb"))
max_length_tar = pickle.load(open("pickle_data/max_length_tar.pkl", "rb"))
num_decoder_tokens = pickle.load(open("pickle_data/num_decoder_tokens.pkl", "rb"))
input_token_index = pickle.load(open("pickle_data/input_token_index.pkl", "rb"))
target_token_index = pickle.load(open("pickle_data/target_token_index.pkl", "rb"))
reverse_target_token_index = pickle.load(open("pickle_data/reverse_target_token_index.pkl", "rb"))

In [5]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [6]:
def decode_sequence(input_seq):
    # encode the input as state vectors
    states_value = encoder_model.predict(input_seq)
    # generate empty target sequence of length 1
    target_seq = np.zeros((1,1))
    # populate the first character of target sequence with the start character
    target_seq[0,0] = target_token_index['START_']
    
    # sampling loop for a batch of sequence
    # (to simplify, here we assume a batch of size 1)
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        # sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_token_index[sampled_token_index]
        decoded_sentence += ' ' + sampled_char
        
        # exit condition: either hit max length or find stop character
        if (sampled_char == '_END' or len(decoded_sentence) > 50):
            stop_condition = True
            
        # update the target sequence (of length 1)
        target_seq = np.zeros((1,1))
        target_seq[0,0] = sampled_token_index
        
        # update states
        states_value = [h, c]
        
    return decoded_sentence

In [7]:
train_gen = generate_batch(X_train, y_train, batch_size=1)
k = -1

In [8]:
for _ in range(5):
    k += 1
    (input_seq, actual_output), _ = next(train_gen)
    decoded_sentence = decode_sequence(input_seq)
    print('Input English sentence:', X_train[k:k+1].values[0])
    print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
    print('Predicted Hindi Translation:', decoded_sentence[:-4])
    print('\n\n')

Input English sentence: we print geometry where we can make industrial design objects
Actual Hindi Translation:  जहां हम औद्योगिक डिजाइन वस्तुओं को छाप सकते हैं 
Predicted Hindi Translation:  जहां हम औद्योगिक डिजाइन वस्तुओं और बिजली में सकते



Input English sentence: a few oxymorons in one sentence
Actual Hindi Translation:  और ये वाक्य विरोधाभास से भरा है। 
Predicted Hindi Translation:  और ये वाक्य वाक्य करने वाले मिलता है 



Input English sentence: but they would give her the broccoli if she liked the broccoli
Actual Hindi Translation:  और गोभी देते थे अगर उसे गोभी पसंद थी। 
Predicted Hindi Translation:  और उन्हें दुकान में थे वो कंप्यूटर के बाद 



Input English sentence: and on top of all off these rules
Actual Hindi Translation:  और अगर ये नियमों काफी नहीं हैं 
Predicted Hindi Translation:  और उन पर अधिकांश लोगों वे सच हैं 



Input English sentence: someday little robots will go
Actual Hindi Translation:  एक दिन छोटे रोबोट्स हमारी धमनियों 
Predicted Hindi Translation:  एक दिन छ

# Generate input sequence for new raw sentence

In [9]:
def generate_input_seq(input_str):
    X = pd.DataFrame({'english_sentence': [input_str]})
    X['english_sentence'] = X['english_sentence'].apply(lambda x: x.lower())
    X['english_sentence'] = X['english_sentence'].apply(lambda x: re.sub("'", '', x))
    X['english_sentence'] = X['english_sentence'].apply(lambda x: re.sub("“", '', x))
    X['english_sentence'] = X['english_sentence'].apply(lambda x: re.sub("”", '', x))
    
    exclude = set(string.punctuation)
    X['english_sentence'] = X['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    
    remove_digits = str.maketrans('', '', digits)
    X['english_sentence'] = X['english_sentence'].apply(lambda x: x.translate(remove_digits))
    
    X['english_sentence'] = X['english_sentence'].apply(lambda x: x.strip())
    X['english_sentence'] = X['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
    
    encoder_input_data = np.zeros((1, max_length_src),dtype='float32')
    for i, input_text in enumerate(X['english_sentence']):
        for t, word in enumerate(input_text.split()):
            encoder_input_data[i, t] = input_token_index[word] # encoder input seq

    return encoder_input_data

In [10]:
def pipeline(eng_str):
    input_seq = generate_input_seq(eng_str)
    decoded_sentence = decode_sequence(input_seq)
    print('Input English sentence:', eng_str)
    print('Predicted Hindi Translation:', decoded_sentence[:-4])

In [11]:
pipeline("I am from India")

Input English sentence: I am from India
Predicted Hindi Translation:  मैं भारत को नहीं दिया 
