# Import the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import re
import string
from string import digits

# Loading the data

In [2]:
# read the English-Hindi parallel corpus and preview its first rows
df = pd.read_csv(
    'Hindi_English_Truncated_Corpus.csv',
    encoding='utf-8',
)
df.head(n=5)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


# Preprocess the data

In [3]:
# how many sentence pairs does each value of the source column contribute?
source_counts = df['source'].value_counts()
source_counts

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [4]:
# keep only the rows whose source is 'ted'
is_ted = df['source'] == 'ted'
df = df[is_ted]
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,So there is some sort of justice,तो वहाँ न्याय है


In [5]:
# filtering left gaps in the row labels; renumber from 0 and discard the old labels
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
3,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
4,ted,So there is some sort of justice,तो वहाँ न्याय है


In [6]:
# keep just the two sentence columns
required_columns = ['english_sentence', 'hindi_sentence']
df = df.loc[:, required_columns]

In [7]:
# preview the first five rows of the reduced frame
df.head(n=5)

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
3,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
4,So there is some sort of justice,तो वहाँ न्याय है


In [8]:
# number of missing entries in each column
df.isna().sum()

english_sentence    0
hindi_sentence      0
dtype: int64

In [9]:
# boolean mask: True where a row repeats an earlier (english, hindi) pair
duplicate_mask = df.duplicated()
duplicate_mask

0        False
1        False
2        False
3        False
4        False
         ...  
39876    False
39877    False
39878     True
39879    False
39880    False
Length: 39881, dtype: bool

In [10]:
# (rows, columns) of the dataframe at this point
n_rows, n_cols = df.shape
(n_rows, n_cols)

(39881, 2)

In [11]:
# discard repeated rows, keeping the first occurrence of each sentence pair
df = df.drop_duplicates(keep='first')

In [12]:
# (rows, columns) of the dataframe after removing the duplicates
n_rows, n_cols = df.shape
(n_rows, n_cols)

(38803, 2)

In [13]:
# names of the columns that are left
column_names = df.columns
column_names

Index(['english_sentence', 'hindi_sentence'], dtype='object')

In [14]:
# convert the text of both sentence columns to lower case
for column in ('english_sentence', 'hindi_sentence'):
    df[column] = df[column].str.lower()

In [15]:
# strip every apostrophe / single quote from both sentence columns
df['english_sentence'] = df['english_sentence'].map(lambda text: text.replace("'", ''))
df['hindi_sentence'] = df['hindi_sentence'].map(lambda text: text.replace("'", ''))

In [16]:
# set of punctuations (string.punctuation covers ASCII punctuation only)
exclude = set(string.punctuation)

# string.punctuation has no Devanagari characters, so the Hindi sentence
# terminators danda '।' (U+0964) and double danda '॥' (U+0965) would survive
# and stay attached to the last word (the raw preview shows endings like "है।").
# Strip them from the Hindi side in addition to the ASCII punctuation.
hindi_exclude = exclude | {'।', '॥'}


def _drop_chars(text, unwanted):
    """Return `text` with every character that is in `unwanted` removed."""
    return ''.join(ch for ch in text if ch not in unwanted)


# remove all the special characters
df['english_sentence'] = df['english_sentence'].apply(lambda x: _drop_chars(x, exclude))
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: _drop_chars(x, hindi_exclude))

In [17]:
# translation table that deletes the ASCII digits 0-9
remove_digits = str.maketrans('', '', digits)

# drop the ASCII digits from both sentence columns
df['english_sentence'] = df['english_sentence'].str.translate(remove_digits)
df['hindi_sentence'] = df['hindi_sentence'].str.translate(remove_digits)

# drop the Devanagari digits from the Hindi sentences as well
df['hindi_sentence'] = df['hindi_sentence'].str.replace("[२३०८१५७९४६]", "", regex=True)

In [18]:
# trim leading/trailing whitespace, then squeeze each run of spaces into one space
for column in ('english_sentence', 'hindi_sentence'):
    trimmed = df[column].str.strip()
    df[column] = trimmed.str.replace(" +", " ", regex=True)

In [19]:
# wrap every target (Hindi) sentence in the START_ / _END marker tokens
df['hindi_sentence'] = 'START_ ' + df['hindi_sentence'] + ' _END'

In [20]:
# preview the first five preprocessed sentence pairs
df.head(n=5)

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,START_ राजनीतिज्ञों के पास जो कार्य करना चाहिए...
1,id like to tell you about one such child,START_ मई आपको ऐसे ही एक बच्चे के बारे में बता...
2,what we really mean is that theyre bad at not ...,START_ हम ये नहीं कहना चाहते कि वो ध्यान नहीं ...
3,and who are we to say even that they are wrong,START_ और हम होते कौन हैं यह कहने भी वाले कि व...
4,so there is some sort of justice,START_ तो वहाँ न्याय है _END
