# DATA

In [28]:
import numpy as np
import re
import string
import unicodedata
from unicodedata import normalize
import sys
import indicnlp
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from numpy import array
from numpy import asarray
from numpy import zeros

In [2]:
def load_doc(filename):
    """Read a UTF-8 text file and return its whole contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.
    """
    # Open the path the caller passed in (a hard-coded dataset name used to be
    # opened here, so ``filename`` was silently ignored).  ``with`` closes the
    # handle even if reading raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [3]:
def to_pairs(text):
    """Split tab-separated corpus text into one list of fields per line.

    Every line is split on tab characters and its last field is discarded
    (presumably a trailing attribution/licence column -- verify against the
    dataset).

    Args:
        text: Full corpus text, one record per line.

    Returns:
        A list with one entry per line; each entry is the list of that line's
        fields without the last one.
    """
    rows = (line.split('\t') for line in text.strip().split('\n'))
    # Plain slicing replaces the former NumPy round-trip, which required every
    # line to have the same number of fields and failed on ragged input.
    return [fields[:-1] for fields in rows]

In [4]:
# Read the raw corpus and split every line into its tab-separated fields.
doc = load_doc('ENG_HIN_SMALL_DATASET.txt')
hin_eng_pairs = to_pairs(doc)

# Field 0 of each record is the English sentence, field 1 the Hindi one.
english_sentences = [pair[0] for pair in hin_eng_pairs]
hindi_sentences = [pair[1] for pair in hin_eng_pairs]

### Cleaning English data

In [5]:
def clean_english_data(lines):
    """Normalise English sentences to lower-case, ASCII, letters-only words.

    For every sentence: decompose accented characters (NFD) and drop anything
    that is not ASCII, split on whitespace, lower-case each word, delete ASCII
    punctuation and non-printable characters, keep only purely alphabetic
    words, and join the survivors with single spaces.

    Args:
        lines: Iterable of raw sentence strings.

    Returns:
        A list of cleaned sentences, one per input sentence, in order.
    """
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(sentence):
        # NFD splits accented letters into base letter + combining mark; the
        # ASCII encode with 'ignore' then discards the marks.
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(strip_punctuation))
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return [_clean(sentence) for sentence in lines]

In [6]:
# Clean the English side of the corpus, then print the result's type and the
# first ten sentences before and after cleaning for a visual sanity check.
clean_eng_lines = clean_english_data(english_sentences)
print(type(clean_eng_lines))
print(english_sentences[0:10])
print(clean_eng_lines[0:10])

<class 'list'>
['Wow!', 'Help!', 'Jump.', 'Jump.', 'Jump.', 'Hello!', 'Hello!', 'Cheers!', 'Cheers!', 'Got it?']
['wow', 'help', 'jump', 'jump', 'jump', 'hello', 'hello', 'cheers', 'cheers', 'got it']


### Cleaning Hindi data

###### SETTING UP PATHS FOR INDIC NLP

In [7]:
# Locations of the Indic NLP library checkout and of its resources folder.
# Both can be overridden through environment variables of the same name so the
# notebook is not tied to one machine; the defaults are the original
# machine-specific paths.
import os

INDIC_NLP_LIB_HOME = os.environ.get(
    "INDIC_NLP_LIB_HOME",
    r"C:\Users\sudha\Desktop\NMT_PROJECTS\Language_Translation_Chat_Bot\anoopkunchukuttan-indic_nlp_library-eccde81",
)
INDIC_NLP_RESOURCES = os.environ.get(
    "INDIC_NLP_RESOURCES",
    r"C:\Users\sudha\Desktop\NMT_PROJECTS\Language_Translation_Chat_Bot\indic_nlp_resources-master",
)

# Make the library importable; the membership check keeps sys.path from
# growing every time this cell is re-run.
# NOTE(review): the imports cell already runs `import indicnlp`, so the package
# must be importable before this path is added -- confirm the intended setup.
if INDIC_NLP_LIB_HOME not in sys.path:
    sys.path.append(INDIC_NLP_LIB_HOME)

from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

In [8]:
# Single characters deleted in the first pass: , " ( ) : '
_HINDI_STRIP_FIRST = str.maketrans('', '', ',"():\'')
# Characters deleted in the last pass: Latin letters, ASCII digits, the danda
# and assorted ASCII symbols.  A raw string keeps `\-` and `\\` valid escapes.
_HINDI_STRIP_LAST = re.compile(r'[a-zA-Z0-9+\-*/.%&!।?\\_]')


def clean_text(line):
    """Strip punctuation, Latin letters and ASCII digits from a Hindi sentence.

    The passes run in the same order as the original chain of ``replace``
    calls, so the result is unchanged:

    1. delete every ``, " ( ) : '``;
    2. delete doubled curly single quotes (``‘‘`` then ``’’``) -- a lone curly
       quote is left in place;
    3. delete ``. - । ? \\ _ + * / % & !``, ASCII letters and ASCII digits.

    Whitespace is never touched, so removed tokens can leave extra spaces.

    Args:
        line: Raw sentence string.

    Returns:
        The sentence with the characters above removed.
    """
    text = line.translate(_HINDI_STRIP_FIRST)
    # Must run after pass 1 and before pass 3: characters removed in pass 1
    # can make two curly quotes adjacent, characters removed in pass 3 cannot.
    text = text.replace('‘‘', '').replace('’’', '')
    return _HINDI_STRIP_LAST.sub('', text)

In [9]:
def clean_hindi_data(lines):
    """Clean and tokenise Hindi sentences.

    Each sentence is passed through ``clean_text``, tokenised with
    ``indic_tokenize.trivial_tokenize``, lower-cased token by token, stripped
    of every token that contains a digit (``\\d`` also matches non-ASCII
    digits for ``str`` patterns), and re-joined with single spaces.

    Args:
        lines: Iterable of raw Hindi sentence strings.

    Returns:
        A list of cleaned sentences, one per input sentence, in order.
    """
    # NOTE(review): the previous version built an IndicNormalizerFactory
    # normalizer ("hi", remove_nuktas=False) on every iteration but never
    # applied it, so it only cost time.  It is dropped here to keep the output
    # identical; if normalisation was intended, create the normalizer once
    # before the loop and call ``normalizer.normalize(line)`` -- TODO confirm.
    cleaned_sentences = []
    for sentence in lines:
        tokens = [token.lower()
                  for token in indic_tokenize.trivial_tokenize(clean_text(sentence))]
        kept = [token for token in tokens if not re.search(r'\d', token)]
        cleaned_sentences.append(' '.join(kept))
    return cleaned_sentences

In [10]:
# Clean the Hindi side of the corpus and spot-check one sentence (index 133)
# before and after cleaning.
clean_hindi_lines = clean_hindi_data(hindi_sentences)
print(hindi_sentences[133])
print(clean_hindi_lines[133])

इसे दोबारा पढ़ें।
इसे दोबारा पढ़ें


### Adding SOS and EOS, preparing inputs for encoder and decoder

In [11]:
# Number of sentence pairs taken from the start of the cleaned corpus.
# NOTE(review): "80-20 ratio" presumably refers to a train/held-out split of
# the full dataset -- confirm against the corpus size.
num_sentences = 2200 #(about 80-20 ratio)

In [12]:
# Containers for the three parallel sentence lists filled in the next cell:
# source sentences, targets with a trailing ' <eos>', and targets with a
# leading '<sos> '.
input_sentences = []
output_sentences = []
output_sentences_inputs = []

In [13]:
# Build the three parallel lists from the first `num_sentences` cleaned pairs.
# Each list is rebuilt from scratch (rather than appended to), so re-running
# this cell no longer duplicates its entries.

# Encoder input: cleaned Hindi sentences.
input_sentences = clean_hindi_lines[0:num_sentences]

# Decoder output: English sentence followed by the end-of-sentence marker.
output_sentences = [line + ' <eos>' for line in clean_eng_lines[0:num_sentences]]

# Decoder input: English sentence preceded by the start-of-sentence marker.
output_sentences_inputs = ['<sos> ' + line for line in clean_eng_lines[0:num_sentences]]

In [14]:
output_sentences_inputs[0:10]

['<sos> wow',
 '<sos> help',
 '<sos> jump',
 '<sos> jump',
 '<sos> jump',
 '<sos> hello',
 '<sos> hello',
 '<sos> cheers',
 '<sos> cheers',
 '<sos> got it']

In [15]:
# Sanity check: the three parallel lists should all have the same length.
print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 2200
num samples output: 2200
num samples output input: 2200


#### Tokenizing and padding English and Hindi

In [16]:
# Model / training configuration.
# NOTE(review): none of these constants is referenced in the cells visible
# here, so the per-line descriptions below are inferred from the names only --
# confirm against the model-building cells.
BATCH_SIZE = 64            # presumably the mini-batch size
EPOCHS = 20                # presumably the number of training epochs
LSTM_NODES =256            # presumably the LSTM hidden-state size
# NOTE(review): differs from `num_sentences = 2200` set earlier -- confirm
# which of the two is meant to be used.
NUM_SENTENCES = 20000
MAX_SENTENCE_LENGTH = 50   # presumably a cap on tokens per sentence
MAX_NUM_WORDS = 20000      # presumably a cap on vocabulary size
EMBEDDING_SIZE = 100       # presumably the word-embedding dimensionality

In [17]:
# for vocabulary

In [18]:
class LanguageIndex():
  """Word <-> integer lookup tables built from an iterable of phrases.

  Phrases are split on single spaces.  The distinct tokens are sorted and
  numbered from 1; ``'<sos>'`` is pre-registered with index 0, but if
  ``'<sos>'`` also occurs in the phrases its sorted position overwrites that
  entry (and index 0 is then left without a word).
  """
  def __init__(self, lang):
    # Any iterable of phrase strings; it is consumed once by create_index().
    self.lang = lang
    # token -> index and index -> token tables.
    self.word2idx, self.idx2word = {}, {}
    # Starts as a set; create_index() replaces it with a sorted list.
    self.vocab = set()

    self.create_index()

  def create_index(self):
    """Populate ``vocab``, ``word2idx`` and ``idx2word`` from ``self.lang``."""
    for sentence in self.lang:
      for token in sentence.split(' '):
        self.vocab.add(token)
    self.vocab = sorted(self.vocab)

    # Register the default index-0 entry first; real tokens start at 1.
    self.word2idx['<sos>'] = 0
    self.word2idx.update(
        (token, position) for position, token in enumerate(self.vocab, start=1))

    # The reverse table mirrors whatever ended up in word2idx.
    self.idx2word.update(
        (position, token) for token, position in self.word2idx.items())


In [19]:
# One [hindi, '<sos> english', 'english <eos>'] triple per sentence, aligned by
# position across the three prepared lists.
pairs = [
    [input_sentences[k], output_sentences_inputs[k], output_sentences[k]]
    for k in range(len(input_sentences))
]



In [21]:
output_sentences_inputs[0:10]

['<sos> wow',
 '<sos> help',
 '<sos> jump',
 '<sos> jump',
 '<sos> jump',
 '<sos> hello',
 '<sos> hello',
 '<sos> cheers',
 '<sos> cheers',
 '<sos> got it']

In [22]:
def max_length(tensor):
    """Return the length of the longest sequence in ``tensor``.

    Args:
        tensor: Iterable of sized sequences (e.g. a list of token-id lists).

    Returns:
        The largest ``len()`` among the sequences, or 0 when ``tensor`` is
        empty (previously an empty input raised ``ValueError``).
    """
    return max((len(t) for t in tensor), default=0)

def load_dataset(pairs):
    """Build vocabulary indexes for the corpus and encode sentences as word ids.

    Args:
        pairs: Iterable of ``(hindi, english_with_sos, english_with_eos)``
            string triples.

    Returns:
        A 9-tuple, in this order:
        ``inp_lang``, ``targ_lang``, ``targ_lang_decoder_output``
        (``LanguageIndex`` objects for the three columns),
        ``max_length_inp``, ``max_length_targ`` (longest encoded Hindi /
        decoder-input sentence),
        ``input_tensor``, ``target_tensor``, ``target_tensor_decoder_output``
        (lists of word-id lists for the three columns), and
        ``input_lang_vocab`` (sorted Hindi vocabulary list).
    """
    # Materialise once: `pairs` used to be traversed six times, which silently
    # produced empty indexes/tensors when a one-shot iterable was passed.
    pairs = list(pairs)
    hindi_sents = [hi for hi, en, en2 in pairs]
    decoder_in_sents = [en for hi, en, en2 in pairs]
    decoder_out_sents = [en2 for hi, en, en2 in pairs]

    inp_lang = LanguageIndex(hindi_sents)
    targ_lang = LanguageIndex(decoder_in_sents)
    # English sentences carrying <eos>.
    # NOTE(review): decoder input and decoder output use separate indexes, so
    # a word has the same id in both only while each column adds exactly one
    # special token that sorts before every word -- confirm this is intended.
    targ_lang_decoder_output = LanguageIndex(decoder_out_sents)

    def encode(index, sentences):
        # Same single-space split that LanguageIndex uses to build its vocab.
        return [[index.word2idx[token] for token in sentence.split(' ')]
                for sentence in sentences]

    input_tensor = encode(inp_lang, hindi_sents)
    target_tensor = encode(targ_lang, decoder_in_sents)
    target_tensor_decoder_output = encode(targ_lang_decoder_output, decoder_out_sents)

    max_length_inp = max_length(input_tensor)
    max_length_targ = max_length(target_tensor)
    input_lang_vocab = inp_lang.vocab
    return (inp_lang, targ_lang, targ_lang_decoder_output,
            max_length_inp, max_length_targ,
            input_tensor, target_tensor, target_tensor_decoder_output,
            input_lang_vocab)




In [24]:
# Index and encode the prepared sentence triples.
(
    inp_lang,
    targ_lang,
    targ_lang_decoder_output,
    max_length_inp,
    max_length_targ,
    input_tensor,
    target_tensor,
    target_tensor_decoder_output,
    input_lang_vocab,
) = load_dataset(pairs)

# Keep a direct handle on the Hindi word -> id table.
input_lang_word2idx = inp_lang.word2idx

# Inspect the word -> id table of the decoder-output (English + <eos>) side.
print(targ_lang_decoder_output.word2idx)

{'<sos>': 0, '<eos>': 1, 'a': 2, 'abandoned': 3, 'ability': 4, 'ablaze': 5, 'able': 6, 'about': 7, 'abroad': 8, 'absence': 9, 'absent': 10, 'absolute': 11, 'absurd': 12, 'abused': 13, 'accepted': 14, 'accident': 15, 'accidental': 16, 'accompanied': 17, 'accompany': 18, 'according': 19, 'account': 20, 'accused': 21, 'accustomed': 22, 'ache': 23, 'acquainted': 24, 'across': 25, 'actor': 26, 'actress': 27, 'add': 28, 'address': 29, 'admit': 30, 'adopted': 31, 'advantage': 32, 'advice': 33, 'advise': 34, 'advised': 35, 'afraid': 36, 'africa': 37, 'after': 38, 'afternoon': 39, 'again': 40, 'against': 41, 'age': 42, 'agree': 43, 'agreement': 44, 'air': 45, 'airport': 46, 'alcohol': 47, 'alive': 48, 'all': 49, 'allergic': 50, 'allow': 51, 'allowed': 52, 'almost': 53, 'alone': 54, 'along': 55, 'aloud': 56, 'already': 57, 'also': 58, 'always': 59, 'am': 60, 'amateur': 61, 'amazed': 62, 'ambulance': 63, 'america': 64, 'american': 65, 'among': 66, 'amount': 67, 'an': 68, 'anchorage': 69, 'and': 7

In [25]:
# padding

In [26]:
def padded_tensor(max_length_inp, max_length_tar, inp_lang, targ_lang , input_tensor , target_tensor):
    """Pad the encoded input and target sequences to fixed lengths.

    Input sequences are padded at the front up to ``max_length_inp``; target
    sequences are padded at the back up to ``max_length_tar``.

    Args:
        max_length_inp: Length every input sequence is padded to.
        max_length_tar: Length every target sequence is padded to.
        inp_lang: Not used by this function.
        targ_lang: Not used by this function.
        input_tensor: List of input word-id sequences.
        target_tensor: List of target word-id sequences.

    Returns:
        ``(padded_inputs, padded_targets)`` as returned by ``pad_sequences``.
    """
    pad = tf.keras.preprocessing.sequence.pad_sequences
    padded_inputs = pad(input_tensor, maxlen=max_length_inp, padding='pre')
    padded_targets = pad(target_tensor, maxlen=max_length_tar, padding='post')
    return padded_inputs, padded_targets


In [29]:
# Pad the encoded Hindi sequences (front-padded) and the encoded decoder-input
# English sequences (back-padded) to their respective maximum lengths.
input_tensor_train, target_tensor_train = padded_tensor(max_length_inp, max_length_targ, inp_lang, targ_lang ,input_tensor , target_tensor)


In [30]:
input_tensor_train

array([[   0,    0,    0, ...,    0,    0, 1928],
       [   0,    0,    0, ...,    0,    0, 1393],
       [   0,    0,    0, ...,    0,    0,  182],
       ...,
       [   0,    0,    0, ..., 1270,  314, 2241],
       [   0,    0,    0, ..., 1737, 1560,  395],
       [   0,    0,    0, ..., 1470,   26, 2234]])

In [31]:
target_tensor_train

array([[   1, 1823,    0, ...,    0,    0,    0],
       [   1,  751,    0, ...,    0,    0,    0],
       [   1,  852,    0, ...,    0,    0,    0],
       ...,
       [   1, 1836, 1032, ...,  321,    0,    0],
       [   1, 1846,  689, ...,    2, 1607,    0],
       [   1, 1841,  537, ...,    0,    0,    0]])

### word embeddings

In [33]:
# Load pre-trained word vectors into a {word: float32 vector} mapping.  Each
# line of the file is "<word> <v1> <v2> ..."; judging by the file name these
# are presumably 100-dimensional Hindi GloVe vectors.
# NOTE(review): the saved output of this cell shows a LookupError from an
# earlier attempt with encoding='Devanagari' (not a valid codec) and an empty
# dictionary afterwards -- re-run the cell to refresh it.
embeddings_dictionary = dict()
# `with` closes the file even if a malformed line raises part-way through.
with open('hi-d100-glove.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Blank line: nothing to index (records[0] would raise IndexError).
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

LookupError: unknown encoding: Devanagari

In [34]:
embeddings_dictionary

{}