In [1]:
#Importing dependencies
import numpy as np
import string
import random
import re
import tensorflow as tf
import pandas as pd
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import sklearn
from tensorflow.keras.models import Sequential
from numpy import array, argmax, random, take
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import RNN, SimpleRNN, LSTM,  Embedding, RepeatVector
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import urllib.request  
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
%matplotlib inline
#For plotting the matplotlib graphs in notebook

# English to Hindi Translation

## Data Importing

In [None]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Language_Translation/eng_hin/eng_hin.zip", "eng_hin.zip")
!unzip -qq 'eng_hin.zip'

In [None]:
# Load the sentence pairs: one pair per line, fields separated by a tab.
# Column 0 is used as language 1 and column 1 as language 2 by the cells below.
# The context manager closes the file handle as soon as the text has been read.
with open("eng_hin.txt", mode='rt', encoding='utf-8') as data_file:
    raw_data = data_file.read()
raw_data = raw_data.strip().split('\n')        # one entry per line
raw_data = [i.split('\t') for i in raw_data]   # split each line into its tab-separated fields
# NOTE(review): assumes every line has the same number of fields, otherwise
# the array is not 2-D and the column indexing below fails - confirm for new data.
lang1_lang2_data = array(raw_data)
print(lang1_lang2_data)
print(" \n Overall Sentence pairs", len(lang1_lang2_data))

## Data Pre-processing

In [None]:
# Strip the characters listed in string.punctuation from both language columns (in place).
punctuation_table = str.maketrans('', '', string.punctuation)
for column in (0, 1):
    lang1_lang2_data[:, column] = [
        sentence.translate(punctuation_table)
        for sentence in lang1_lang2_data[:, column]
    ]

print(lang1_lang2_data)

In [None]:
# Lower-case every sentence in both language columns (in place).
for column in (0, 1):
    lang1_lang2_data[:, column] = [
        sentence.lower() for sentence in lang1_lang2_data[:, column]
    ]
print(lang1_lang2_data)

In [None]:
# Build the language-1 vocabulary from the first column of the sentence pairs.
lang1_tokens = tokenizer = Tokenizer()
lang1_tokens.fit_on_texts(lang1_lang2_data[:, 0])
# +1 leaves room for index 0, which the padded sequences use as filler.
lang1_vocab_size = 1 + len(lang1_tokens.word_index)
print("lang1_vocab_size", lang1_vocab_size)

In [None]:
# Build the language-2 vocabulary from the second column of the sentence pairs.
lang2_tokens = tokenizer = Tokenizer()
lang2_tokens.fit_on_texts(lang1_lang2_data[:, 1])
# +1 leaves room for index 0, which the padded sequences use as filler.
lang2_vocab_size = 1 + len(lang2_tokens.word_index)
print("lang2_vocab_size", lang2_vocab_size)

## Train and Test Data

In [None]:
# Hold out 10% of the sentence pairs as a test set; the fixed seed keeps the
# split identical from run to run.
train, test = train_test_split(
    lang1_lang2_data,
    random_state=44,
    test_size=0.1,
)

In [None]:
# Every sentence is represented by exactly this many word indices.
lang1_seq_length = 15
lang2_seq_length = 15


def encode_and_pad(tokens, sentences, seq_length):
    """Encode sentences as word-index lists and pad them at the end to seq_length.

    Returns a tuple ``(sequences, padded)``: the raw index lists and the
    fixed-width array produced by ``pad_sequences``.
    NOTE(review): sentences longer than seq_length are truncated by
    pad_sequences' default truncation setting - confirm that is intended.
    """
    sequences = tokens.texts_to_sequences(sentences)
    return sequences, pad_sequences(sequences, seq_length, padding='post')


X_train_seq, X_train = encode_and_pad(lang1_tokens, train[:, 0], lang1_seq_length)
Y_train_seq, Y_train = encode_and_pad(lang2_tokens, train[:, 1], lang2_seq_length)
X_test_seq, X_test = encode_and_pad(lang1_tokens, test[:, 0], lang1_seq_length)
Y_test_seq, Y_test = encode_and_pad(lang2_tokens, test[:, 1], lang2_seq_length)

for label, padded in (
    ("X_train.shape", X_train),
    ("Y_train.shape", Y_train),
    ("X_test.shape", X_test),
    ("Y_test.shape", Y_test),
):
    print(label, padded.shape)

In [None]:
# Show one training sentence at each stage: raw text, index list, padded row.
sample_index = 5
print("Text data", [train[sample_index, 0]])
print('Numbers sequence', X_train_seq[sample_index])
print('Padded Sequence', X_train[sample_index])

## Model building

In [None]:
# Encoder-decoder network: embed the input words, encode them into a single
# vector, repeat that vector once per output position, decode it step by step
# and score every language-2 word at each step.
model = Sequential([
    Embedding(
        input_dim=lang1_vocab_size,      # size of the language-1 vocabulary
        output_dim=256,                  # length of each embedding vector
        input_length=lang1_seq_length,   # length of the input sequence
        mask_zero=True,                  # inputs are zero padded: ignore index 0
    ),
    # Encoder: only the final LSTM output is kept.
    LSTM(128),
    # Feed the encoder output to the decoder once per output time step.
    RepeatVector(lang2_seq_length),
    # Decoder: return_sequences=True yields an output at every time step
    # (False would return only the last time step).
    LSTM(128, return_sequences=True),
    # One softmax over the language-2 vocabulary per time step.
    Dense(lang2_vocab_size, activation='softmax'),
])
model.summary()

In [None]:
# The targets are integer word indices (not one-hot vectors), hence the sparse loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
)

In [None]:
# Add a trailing axis so the targets have shape (samples, time steps, 1).
sparse_targets = np.expand_dims(Y_train, axis=-1)
model.fit(
    X_train,
    sparse_targets,
    batch_size=1024,
    epochs=1,
    verbose=1,
)
# Keep the weights of this short training run on disk.
model.save_weights('Eng_Hin_model1.hdf5')

In [None]:
#Download Pre-trained model
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
downloaded = drive.CreateFile({'id':"1gplZOzfv9e_jb5c66BWWOAJCVMfuFR_I"})   
downloaded.GetContentFile('Eng_hin_model.zip') 
!unzip -qq 'Eng_hin_model.zip'

In [None]:
# Swap the freshly trained weights for the pre-trained ones
# (file presumably extracted from Eng_hin_model.zip - verify).
pretrained_weights_file = 'Eng_hin_model.hdf5'
model.load_weights(pretrained_weights_file)

## Prediction

In [None]:
# Define the prediction function; it involves five steps, explained below.
def one_line_prediction(text1):
    """Translate one language-1 sentence into language-2 text.

    Uses the notebook globals ``lang1_tokens``, ``lang2_tokens``,
    ``lang1_seq_length`` and ``model``.

    Parameters
    ----------
    text1 : str
        Sentence to translate. Only the first line, and only its first
        tab-separated field, is translated.

    Returns
    -------
    list of str
        A one-element list holding the predicted sentence. Positions where
        the model predicts index 0 / an unknown index, or repeats the
        previous word, are kept as empty strings, so the text may contain
        runs of spaces.
    """
    # 1. Pre-processing, mirroring what was done to the training data:
    #    strip punctuation and convert the text to lowercase.
    sentence = text1.strip().split('\n')[0].split('\t')[0]
    sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower()

    # 2. Encode and pad the sentence so it can be sent to the model.
    encoded = lang1_tokens.texts_to_sequences([sentence])
    padded = pad_sequences(encoded, lang1_seq_length, padding='post')

    # 3. Actual prediction from the model - the result is word indices.
    #    Sequential.predict_classes no longer exists in recent TensorFlow
    #    releases; taking the argmax over the softmax output gives the same ids.
    pred_seq = np.argmax(model.predict(padded), axis=-1)

    # 4. Reverse lookup (index -> word), built once per call instead of
    #    scanning word_index for every predicted token.
    index_to_word = {index: word for word, index in lang2_tokens.word_index.items()}

    # 5. Final language-2 text after converting all indices to words.
    Lang2_text = []
    for word_nums in pred_seq:
        words = [index_to_word.get(int(n)) for n in word_nums]
        sing_pred = []
        for i, t in enumerate(words):
            # Blank out padding/unknown indices and immediate repetitions.
            # The first position has no predecessor, so it is only blanked
            # when it is unknown; otherwise the first word is kept.
            repeated = i > 0 and t == words[i - 1]
            sing_pred.append('' if (t is None or repeated) else t)
        Lang2_text.append(' '.join(sing_pred))
    return Lang2_text

In [None]:
# Translate a batch of everyday English phrases.
Input_sentences = [
    "have a great Good day",
    "Do you speak English",
    "I do not know your language",
    "I need help",
    "Thank you very much",
    "Where can I get this",
    "How much does it cost",
    "Where is the bathroom",
    "Where is the ATM",
    "I am a visitor here",
    "Excuse me",
    "What do you do for living",
    "Here is my passport",
]

for english_text in Input_sentences:
    translation = one_line_prediction(english_text)
    print([english_text], " -->", translation)

In [None]:
# Translate a single short sentence.
Input_sentences = ["I want a dog"]

for english_text in Input_sentences:
    translation = one_line_prediction(english_text)
    print([english_text], " -->", translation)

In [None]:
# Translate a single, slightly longer sentence.
Input_sentences = ["I want to purchase a car"]

for english_text in Input_sentences:
    translation = one_line_prediction(english_text)
    print([english_text], " -->", translation)

# More Data

Tab-delimited Bilingual Sentence Pairs

http://www.manythings.org/anki/