# DATA

In [1]:
import numpy as np
import re
import string
import unicodedata
from unicodedata import normalize
import sys
import indicnlp

In [2]:
def load_doc(filename):
    """Read a UTF-8 text file and return its whole contents as one string.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    str
        The decoded text of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Open the path that was actually passed in (the previous version ignored
    # the argument); the context manager closes the handle even if read() fails.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [3]:
def to_pairs(text):
    """Split tab-separated corpus text into rows, dropping each row's last field.

    Parameters
    ----------
    text : str
        Corpus text with one record per line and tab-separated fields.

    Returns
    -------
    list[list[str]]
        One list of fields per line, without the trailing field
        (presumably an attribution/licence column -- confirm against the dataset).
    """
    rows = []
    for line in text.strip().split('\n'):
        fields = line.split('\t')
        # Slice per line instead of going through a NumPy array: this gives the
        # same result for uniform rows and no longer fails when lines carry a
        # different number of fields.
        rows.append(fields[:-1])
    return rows

In [4]:
# Read the raw corpus and break it into per-line field lists.
doc = load_doc('ENG_HIN_SMALL_DATASET.txt')
hin_eng_pairs = to_pairs(doc)

# Despite the variable name, field 0 of each row is used as the English
# sentence and field 1 as the Hindi sentence.
english_sentences = [pair[0] for pair in hin_eng_pairs]
hindi_sentences = [pair[1] for pair in hin_eng_pairs]

### Cleaning English data

In [5]:
def clean_english_data(lines):
    """Return a cleaned copy of each English sentence in ``lines``.

    For every sentence: decompose accented characters (NFD) and drop anything
    that is not ASCII, split on whitespace, lowercase each token, strip ASCII
    punctuation and non-printable characters, discard tokens that are not
    purely alphabetic, and re-join the survivors with single spaces.

    Parameters
    ----------
    lines : iterable of str

    Returns
    -------
    list of str
        One cleaned sentence per input sentence (possibly empty strings).
    """
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _clean_sentence(sentence):
        # NFD splits accents off their base letters so the ASCII encode below
        # keeps the base letter and discards only the combining mark.
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower().translate(strip_punctuation)
            token = non_printable.sub('', token)
            # Tokens left empty or containing digits are dropped entirely.
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return [_clean_sentence(sentence) for sentence in lines]

In [6]:
clean_eng_lines = clean_english_data(english_sentences)
print(type(clean_eng_lines))
# Show the first ten sentences before and after cleaning for a quick visual check.
for preview in (english_sentences, clean_eng_lines):
    print(preview[:10])

<class 'list'>
['Wow!', 'Help!', 'Jump.', 'Jump.', 'Jump.', 'Hello!', 'Hello!', 'Cheers!', 'Cheers!', 'Got it?']
['wow', 'help', 'jump', 'jump', 'jump', 'hello', 'hello', 'cheers', 'cheers', 'got it']


### Cleaning Hindi data

###### SETTING UP PATHS FOR INDIC NLP

In [7]:
import os

# Locations of the Indic NLP library checkout and its resources directory.
# The defaults are the original machine-specific paths; set the environment
# variables INDIC_NLP_LIB_HOME / INDIC_NLP_RESOURCES to run elsewhere
# without editing this cell.
INDIC_NLP_LIB_HOME = os.environ.get(
    "INDIC_NLP_LIB_HOME",
    r"C:\Users\sudha\Desktop\NMT_PROJECTS\Language_Translation_Chat_Bot\anoopkunchukuttan-indic_nlp_library-eccde81",
)
INDIC_NLP_RESOURCES = os.environ.get(
    "INDIC_NLP_RESOURCES",
    r"C:\Users\sudha\Desktop\NMT_PROJECTS\Language_Translation_Chat_Bot\indic_nlp_resources-master",
)

# Make the library importable; the guard keeps sys.path from growing each time
# this cell is re-run.
# NOTE(review): the first cell already executes `import indicnlp` before this
# path is added -- confirm the package is importable at that point.
if INDIC_NLP_LIB_HOME not in sys.path:
    sys.path.append(INDIC_NLP_LIB_HOME)

from indicnlp import common
# The resources path is set before loader.load() is called.
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

In [8]:
# Characters deleted in the first pass, before doubled curly quotes are handled.
_CLEAN_TEXT_FIRST_PASS = str.maketrans('', '', ',"():\'')

# Characters deleted in the final pass: remaining punctuation (including the
# Devanagari danda), ASCII letters, ASCII digits and arithmetic/symbol characters.
# Raw string: the previous non-raw literal relied on the invalid escape "\-".
_CLEAN_TEXT_FINAL_PASS = re.compile(r'[.\-।?\\_a-zA-Z0-9+*/%&!]')


def clean_text(line):
    """Strip punctuation, ASCII letters and ASCII digits from one line of text.

    The removals happen in three stages, in this order:

    1. delete every ``,`` ``"`` ``(`` ``)`` ``:`` and ``'``;
    2. delete *doubled* curly quotes (two U+2018 or two U+2019 in a row) --
       a single curly quote is left in place;
    3. delete ``.`` ``-`` the danda, ``?`` ``\\`` ``_`` ASCII letters,
       ASCII digits and ``+ * / % & !``.

    Whitespace is never touched, so removed words leave their spaces behind.

    Parameters
    ----------
    line : str

    Returns
    -------
    str
    """
    text = line.translate(_CLEAN_TEXT_FIRST_PASS)
    # Stage 2 must run before stage 3: curly quotes that only become adjacent
    # after later deletions are deliberately not removed (original behaviour).
    text = text.replace('\u2018\u2018', '')
    text = text.replace('\u2019\u2019', '')
    return _CLEAN_TEXT_FINAL_PASS.sub('', text)

In [9]:
def clean_hindi_data(lines, apply_normalizer=False):
    """Return a cleaned, re-tokenised copy of each Hindi sentence in ``lines``.

    Each sentence is passed through ``clean_text``, tokenised with
    ``indic_tokenize.trivial_tokenize``, lowercased token by token, stripped of
    any token containing a digit, and re-joined with single spaces.

    Parameters
    ----------
    lines : iterable of str
    apply_normalizer : bool, optional
        When True, each sentence is first run through the Indic NLP Hindi
        normalizer (``remove_nuktas=False``). Defaults to False, which
        reproduces the previous output: the earlier code built a normalizer
        for every line but never applied it.

    Returns
    -------
    list of str
        One cleaned sentence per input sentence.
    """
    # Build the normalizer once, and only if requested, instead of once per line.
    normalizer = None
    if apply_normalizer:
        normalizer = IndicNormalizerFactory().get_normalizer("hi", remove_nuktas=False)

    clean_hin_lines = []
    for line in lines:
        if normalizer is not None:
            line = normalizer.normalize(line)
        line = clean_text(line)
        tokens = [token.lower() for token in indic_tokenize.trivial_tokenize(line)]
        # r'\d' on str patterns matches any Unicode decimal digit, so tokens
        # with Devanagari digits are dropped here as well.
        tokens = [token for token in tokens if not re.search(r'\d', token)]
        clean_hin_lines.append(' '.join(tokens))
    return clean_hin_lines

In [10]:
clean_hindi_lines = clean_hindi_data(hindi_sentences)
# Spot-check one sentence (index 133) before and after cleaning.
for sample in (hindi_sentences, clean_hindi_lines):
    print(sample[133])

इसे दोबारा पढ़ें।
इसे दोबारा पढ़ें


### Adding SOS and EOS tokens, preparing inputs for the encoder and decoder

In [11]:
# Number of leading sentence pairs kept when building the encoder/decoder inputs below.
# NOTE(review): "80-20 ratio" presumably means ~80% of the corpus is kept here -- confirm.
num_sentences = 2200 #(about 80-20 ratio)

In [12]:
# Start each of the three sentence collections as its own empty list.
input_sentences, output_sentences, output_sentences_inputs = [], [], []

In [13]:
# Encoder inputs: the first `num_sentences` cleaned English sentences.
input_sentences = clean_eng_lines[0:num_sentences]

# The matching slice of cleaned Hindi sentences feeds both decoder-side lists.
hindi_subset = clean_hindi_lines[0:num_sentences]

# Both lists are rebuilt from scratch rather than appended to, so re-running
# this cell no longer duplicates entries.

# Decoder targets: each Hindi sentence followed by the end-of-sentence token.
output_sentences = [line + ' <eos>' for line in hindi_subset]

# Decoder inputs: each Hindi sentence preceded by the start-of-sentence token.
output_sentences_inputs = ['<sos> ' + line for line in hindi_subset]

In [14]:
# Display the first ten decoder inputs to confirm the <sos> prefix.
output_sentences_inputs[:10]

['<sos> वाह',
 '<sos> बचाओ',
 '<sos> उछलो',
 '<sos> कूदो',
 '<sos> छलांग',
 '<sos> नमस्ते',
 '<sos> नमस्कार',
 '<sos> वाहवाह',
 '<sos> चियर्स',
 '<sos> समझे कि नहीं']

In [15]:
# Report the size of each collection; all three are expected to match.
sample_counts = (
    ("num samples input:", input_sentences),
    ("num samples output:", output_sentences),
    ("num samples output input:", output_sentences_inputs),
)
for label, sentences in sample_counts:
    print(label, len(sentences))

num samples input: 2200
num samples output: 2200
num samples output input: 2200
