# DATA

In [1]:
import numpy as np
import re
import string
import unicodedata
from unicodedata import normalize
import sys
import indicnlp
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [2]:
def load_doc(filename):
    file = open('ENG_HIN_SMALL_DATASET.txt' , mode ='rt' , encoding = 'utf-8')
    text = file.read()
    file.close()
    return text

In [3]:
def to_pairs(text):
    lines = text.strip().split('\n') #1d list of strings
    eng_hin_pairs = [line.split('\t') for line in lines] #list
    eng_hin_pairs = np.delete(eng_hin_pairs , -1 , axis = 1)
    eng_hin_pairs = eng_hin_pairs.tolist()
    return eng_hin_pairs    

In [4]:
doc = load_doc('ENG_HIN_SMALL_DATASET.txt')
hin_eng_pairs = to_pairs(doc)
english_sentences = []
hindi_sentences = []
for i in range(len(hin_eng_pairs)):
    english_sentences.append(hin_eng_pairs[i][0])
    hindi_sentences.append(hin_eng_pairs[i][1])

In [5]:
hin_eng_pairs

[['Wow!', 'वाह!'],
 ['Help!', 'बचाओ!'],
 ['Jump.', 'उछलो.'],
 ['Jump.', 'कूदो.'],
 ['Jump.', 'छलांग.'],
 ['Hello!', 'नमस्ते।'],
 ['Hello!', 'नमस्कार।'],
 ['Cheers!', 'वाह-वाह!'],
 ['Cheers!', 'चियर्स!'],
 ['Got it?', 'समझे कि नहीं?'],
 ["I'm OK.", 'मैं ठीक हूँ।'],
 ['Awesome!', 'बहुत बढ़िया!'],
 ['Come in.', 'अंदर आ जाओ।'],
 ['Get out!', 'बाहर निकल जाओ!'],
 ['Go away!', 'चले जाओ!'],
 ['Goodbye!', 'ख़ुदा हाफ़िज़।'],
 ['Perfect!', 'उत्तम!'],
 ['Perfect!', 'सही!'],
 ['Welcome.', 'आपका स्वागत है।'],
 ['Welcome.', 'स्वागतम्।'],
 ['Have fun.', 'मज़े करना।'],
 ['Have fun.', 'मौज करना।'],
 ['Have fun.', 'मज़े करो।'],
 ['I forgot.', 'मैं भूल गया।'],
 ['I forgot.', 'मैं भूल गई।'],
 ["I'll pay.", 'मैं पैसे दूंगा।'],
 ["I'm fine.", 'मैं ठीक हूँ।'],
 ["I'm full.", 'मेरा पेट भर गया है।'],
 ["Let's go!", 'चलो चलें!'],
 ['Answer me.', 'मुझे जवाब दो।'],
 ['Birds fly.', 'पंछी उड़ते हैं।'],
 ['Excuse me.', 'माफ़ कीजिए।'],
 ['Fantastic!', 'बहुत ख़ूब!'],
 ['I fainted.', 'मैं बेहोश हो गया।'],
 ['I fear so.'

### Cleaning english data


In [6]:
def clean_english_data(lines):
    re_print = re.compile('[^%s]' %re.escape(string.printable))
    table = str.maketrans('' , '' , string.punctuation)
    clean_eng_lines = []
    for line in lines:
        line = normalize('NFD' , line).encode('ascii' , 'ignore')
        line = line.decode('UTF-8')
        line = line.split()
        line = [word.lower() for word in line]
        line = [word.translate(table) for word in line]
        line = [re_print.sub('' , w) for w in line]
        line = [word for word in line if word.isalpha()]
        line = ' '.join(line)
        clean_eng_lines.append(line)
    return clean_eng_lines    

In [7]:
clean_eng_lines = clean_english_data(english_sentences)
print(type(clean_eng_lines))
print(english_sentences[0:10])
print(clean_eng_lines[0:10])

<class 'list'>
['Wow!', 'Help!', 'Jump.', 'Jump.', 'Jump.', 'Hello!', 'Hello!', 'Cheers!', 'Cheers!', 'Got it?']
['wow', 'help', 'jump', 'jump', 'jump', 'hello', 'hello', 'cheers', 'cheers', 'got it']


### Clean hindi data

###### SETTING UP PATHS FOR INDIC NLP

In [8]:
INDIC_NLP_LIB_HOME=r"C:\Users\sudha\Desktop\NMT_PROJECTS\Language_Translation_Chat_Bot\anoopkunchukuttan-indic_nlp_library-eccde81"
INDIC_NLP_RESOURCES=r"C:\Users\sudha\Desktop\NMT_PROJECTS\Language_Translation_Chat_Bot\indic_nlp_resources-master"
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

In [9]:
def clean_text(line):
    text = line
    text = text.replace(u',' ,'')
    text = text.replace(u'"' ,'')
    text = text.replace(u'(' ,'')
    text = text.replace(u')' ,'')
    text = text.replace(u'"' ,'')
    text = text.replace(u':' ,'')
    text = text.replace(u"'" ,'')
    text = text.replace(u"'" ,'')
    text=text.replace(u"‘‘",'')
    text=text.replace(u"’’",'')
    text=text.replace(u"''",'')
    text=text.replace(u".",'')
    text=text.replace(u"-",'')
    text=text.replace(u"।",'')
    text=text.replace(u"?",'')
    text=text.replace(u"\\",'')
    text=text.replace(u"_",'')
    text = re.sub('[a-zA-Z]' , '' , text)
    text = re.sub('[0-9+\-*/.%&!]' , '' , text)
    return text

In [10]:
def clean_hindi_data(lines):
    clean_hin_lines = []
    for line in lines:
        remove_nuktas = False
        factory = IndicNormalizerFactory()
        normalizer = factory.get_normalizer("hi" , remove_nuktas = False )
        line = clean_text(line)
        tokens = list()
        for t in indic_tokenize.trivial_tokenize(line):
            tokens.append(t)
        line = tokens
        line = [ word.lower() for word in line]
        line = [word for word in line if not re.search(r'\d', word)]
        line = ' '.join(line)
        clean_hin_lines.append(line)
    return clean_hin_lines    

In [11]:
clean_hindi_lines = clean_hindi_data(hindi_sentences)
print(hindi_sentences[133])
print(clean_hindi_lines[133])

इसे दोबारा पढ़ें।
इसे दोबारा पढ़ें


### Adding SOS and EOS ,  PREPARING INPUTS FOR ENCODER AND DECODER

In [12]:
num_sentences = 2200 #(about 80-20 ratio)

In [13]:
input_sentences = []
output_sentences = []
output_sentences_inputs = []

In [14]:
#input sentences
input_sentences = clean_eng_lines[0:num_sentences]

#output sentences
for line in clean_hindi_lines[0:num_sentences]:
    line = line + ' <eos>'
    output_sentences.append(line)

#output sentence input

for line in clean_hindi_lines[0:num_sentences]:
    line = '<sos> ' + line
    output_sentences_inputs.append(line)

In [15]:
len(input_sentences)

2200

In [16]:
output_sentences_inputs[0:10]

['<sos> वाह',
 '<sos> बचाओ',
 '<sos> उछलो',
 '<sos> कूदो',
 '<sos> छलांग',
 '<sos> नमस्ते',
 '<sos> नमस्कार',
 '<sos> वाहवाह',
 '<sos> चियर्स',
 '<sos> समझे कि नहीं']

In [17]:
print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 2200
num samples output: 2200
num samples output input: 2200


In [18]:
print(len(output_sentences + output_sentences_inputs))

4400


#### Tokenizing and padding  english and hindi

In [19]:
BATCH_SIZE = 64
EPOCHS = 20
LSTM_NODES =256
NUM_SENTENCES = 20000
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = 20000
EMBEDDING_SIZE = 100

In [20]:
# for vocabulary

In [21]:
class LanguageIndex():
  def __init__(self, lang):
    self.lang = lang
    self.word2idx = {}
    self.idx2word = {}
    self.vocab = set()
    
    self.create_index()
    
  def create_index(self):
    for phrase in self.lang:
      self.vocab.update(phrase.split(' '))
    
    self.vocab = sorted(self.vocab)
    
    self.word2idx['<sos>'] = 0
    self.word2idx['<eos>'] = 1
    
    for index, word in enumerate(self.vocab):
      self.word2idx[word] = index + 1
    
    for word, index in self.word2idx.items():
      self.idx2word[index] = word


In [22]:
pairs = []
for i in range(len(input_sentences)):
    pairs.append([])
    pairs[i].append(input_sentences[i])
    pairs[i].append(output_sentences[i])



In [23]:
def max_length(tensor):
    return max(len(t) for t in tensor)

def load_dataset(pairs):  
    inp_lang = LanguageIndex(en for en, hi in pairs)
    targ_lang = LanguageIndex(hi for en, hi in pairs)
    input_tensor = [[inp_lang.word2idx[s] for s in en.split(' ')] for en, hi in pairs]
    target_tensor = [[targ_lang.word2idx[s] for s in hi.split(' ')] for en, hi in pairs]
    max_length_inp, max_length_targ = max_length(input_tensor), max_length(target_tensor)
    return inp_lang, targ_lang, max_length_inp, max_length_targ, input_tensor, target_tensor




In [36]:
inp_lang, targ_lang, max_length_inp, max_length_targ , input_tensor , target_tensor = load_dataset(pairs)
inp_lang.word2idx

{'<sos>': 0,
 '<eos>': 1,
 'a': 1,
 'abandoned': 2,
 'ability': 3,
 'ablaze': 4,
 'able': 5,
 'about': 6,
 'abroad': 7,
 'absence': 8,
 'absent': 9,
 'absolute': 10,
 'absurd': 11,
 'abused': 12,
 'accepted': 13,
 'accident': 14,
 'accidental': 15,
 'accompanied': 16,
 'accompany': 17,
 'according': 18,
 'account': 19,
 'accused': 20,
 'accustomed': 21,
 'ache': 22,
 'acquainted': 23,
 'across': 24,
 'actor': 25,
 'actress': 26,
 'add': 27,
 'address': 28,
 'admit': 29,
 'adopted': 30,
 'advantage': 31,
 'advice': 32,
 'advise': 33,
 'advised': 34,
 'afraid': 35,
 'africa': 36,
 'after': 37,
 'afternoon': 38,
 'again': 39,
 'against': 40,
 'age': 41,
 'agree': 42,
 'agreement': 43,
 'air': 44,
 'airport': 45,
 'alcohol': 46,
 'alive': 47,
 'all': 48,
 'allergic': 49,
 'allow': 50,
 'allowed': 51,
 'almost': 52,
 'alone': 53,
 'along': 54,
 'aloud': 55,
 'already': 56,
 'also': 57,
 'always': 58,
 'am': 59,
 'amateur': 60,
 'amazed': 61,
 'ambulance': 62,
 'america': 63,
 'american': 64

In [25]:
# padding

In [26]:
def padded_tensor(max_length_inp, max_length_tar, inp_lang, targ_lang , input_tensor , target_tensor):
    
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=max_length_inp, padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_tar, padding='post')
    return input_tensor, target_tensor



In [27]:
input_tensor_train, target_tensor_train = padded_tensor(max_length_inp, max_length_targ, inp_lang, targ_lang ,input_tensor , target_tensor)


In [28]:
input_tensor_train

array([[1822,    0,    0, ...,    0,    0,    0],
       [ 750,    0,    0, ...,    0,    0,    0],
       [ 851,    0,    0, ...,    0,    0,    0],
       ...,
       [1835, 1031, 1079, ...,  320,    0,    0],
       [1845,  688,    1, ...,    1, 1606,    0],
       [1840,  536, 1296, ...,    0,    0,    0]])

In [29]:
target_tensor_train

array([[1929,    1,    0, ...,    0,    0,    0],
       [1394,    1,    0, ...,    0,    0,    0],
       [ 183,    1,    0, ...,    0,    0,    0],
       ...,
       [ 941,  449,  411, ...,    0,    0,    0],
       [ 939,  823,  422, ...,    0,    0,    0],
       [ 940,  153,  411, ...,    0,    0,    0]])

### word embeddings

In [31]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open('glove.6B.100d.txt', encoding="utf-8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [34]:
num_words = min(MAX_NUM_WORDS, len(input_tensor) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in input_tensor.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

AttributeError: 'list' object has no attribute 'items'

In [35]:
input_tensor

[[1822],
 [750],
 [851],
 [851],
 [851],
 [749],
 [749],
 [293],
 [293],
 [688, 834],
 [802, 1096],
 [128],
 [331, 808],
 [666, 1119],
 [677, 127],
 [685],
 [1153],
 [1153],
 [1752],
 [1752],
 [733, 648],
 [733, 648],
 [733, 648],
 [794, 631],
 [794, 631],
 [800, 1145],
 [802, 597],
 [802, 646],
 [910, 677],
 [75, 975],
 [189, 613],
 [540, 975],
 [567],
 [794, 559],
 [794, 581, 1448],
 [794, 892],
 [802, 205],
 [802, 225],
 [802, 1627],
 [836, 326],
 [1753, 460],
 [1771, 874],
 [1771, 874],
 [1771, 874],
 [1771, 874],
 [1801],
 [189, 1414],
 [331, 1099, 808],
 [419],
 [461, 1022],
 [602, 241],
 [616, 763],
 [794, 259, 1553],
 [794, 259, 1553],
 [794, 945, 1835],
 [794, 945, 1835],
 [794, 945, 1835],
 [794, 945, 1835],
 [794, 945, 1835],
 [794, 1780, 1669],
 [802, 334],
 [802, 788],
 [802, 788],
 [909, 763, 808],
 [909, 763, 808],
 [909, 975, 1119],
 [1100, 39],
 [1184, 1420],
 [1591, 1, 216],
 [1761, 1053],
 [1761, 1053],
 [1774, 1591],
 [461, 1396],
 [461, 1396],
 [737, 1510, 1695],
 