In [4]:
import pandas as pd 
import numpy as np 
import configparser
import os

# Load the data / output locations from the local env.ini file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so fail here with a clear message instead of a
# confusing KeyError on the lookups below.
if not config.read("env.ini"):
    raise FileNotFoundError("Could not read env.ini from the current working directory")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

# Index level names of the token table, from coarsest to finest.
OHCO = ['book_id','chap_num', 'para_num', 'sent_num', 'token_num']
# 1-based position (within the sorted listing) of the single book that the
# step-by-step cells below work on.
k = 18
# os.listdir() returns entries in arbitrary order; sort so that position k and
# the book ids derived from this list are the same on every run and machine.
all_dir_list = sorted(os.listdir(data_home))
text_file = f"{data_home}/{all_dir_list[k-1]}"

In [5]:
import preprocess
# Tokenise every book and stack the per-book token tables into one frame.
# book_dict maps a 1-based book id to whatever preprocess.create_tokendf
# returns for that file (presumably a mapping of tables; only its 'token_num'
# entry is used here -- TODO confirm against preprocess).
book_dict = {}
token_frames = []
for book_id, book_file in enumerate(all_dir_list, start=1):
    print(book_id, book_file)
    book = preprocess.create_tokendf(f"{data_home}/{book_file}")
    tokens = book['token_num']
    # Prepend the book id as the outermost index level (done in place, so the
    # table stored in book_dict carries the extended index as well).
    tokens.index = pd.MultiIndex.from_tuples(
        [(book_id,) + idx for idx in tokens.index],
        names=['book_id'] + list(tokens.index.names))
    book_dict[book_id] = book
    token_frames.append(tokens)

# Concatenate once at the end: growing a DataFrame with concat inside the loop
# re-copies all previously accumulated rows on every iteration.
all_TOKENS = pd.concat(token_frames) if token_frames else pd.DataFrame()


1 maha01.txt
2 maha02.txt
3 maha03.txt
4 maha04.txt
5 maha05.txt
6 maha06.txt
7 maha07.txt
8 maha08.txt
9 maha09.txt
10 maha10.txt
11 maha11.txt
12 maha12.txt
13 maha13.txt
14 maha14.txt
15 maha15.txt
16 maha16.txt
17 maha17.txt
18 maha18.txt


In [7]:
# Display the combined token table for all books (the notebook's rich display
# truncates the middle rows of a long frame).
all_TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,0,0,0,Om,om
1,1,0,1,0,Having,having
1,1,0,1,1,bowed,bowed
1,1,0,1,2,down,down
1,1,0,1,3,to,to
...,...,...,...,...,...,...
18,6,37,2,23,Vishnu,vishnu
18,6,37,2,24,like,like
18,6,37,2,25,Vishnu,vishnu
18,6,37,2,26,himself,himself


In [2]:
# Pick the k-th book. The listing is sorted because os.listdir() returns
# entries in arbitrary order, which would make position k non-deterministic.
all_dir_list = sorted(os.listdir(data_home))
text_file = f"{data_home}/{all_dir_list[k-1]}"

# Read the file one row per line into a 'line_str' column. 'utf-8-sig' strips a
# leading byte-order mark if the file has one. The context manager closes the
# file handle, which the bare open(...).readlines() call never did.
with open(text_file, 'r', encoding='utf-8-sig') as book_fh:
    LINES = pd.DataFrame(book_fh.readlines(), columns=['line_str'])
LINES.index.name = 'line_num'

# Replace the line terminators with a space and trim surrounding whitespace,
# so blank lines become empty strings.
LINES.line_str = LINES.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()
LINES.head(20)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
0,
1,
2,
3,
4,The Mahabharata
5,
6,of
7,
8,Krishna-Dwaipayana Vyasa
9,


In [219]:
#### Clip the Cruft

# Two regexes bracket the body of the text: a line that opens with the word
# "Om", and a line containing both the word "end" and a word starting with
# "parv" (both case-insensitive).
clip_pats = [
    r"(?i)^om\b",
    r"(?i)(?=.*\bend\b)(?=.*\bparv\w*)",
]
print(clip_pats)

# One boolean mask per pattern, aligned with LINES
pat_a, pat_b = (LINES['line_str'].str.match(pat) for pat in clip_pats)

# Start at the first opening match; stop two lines before the last closing match
line_a = LINES[pat_a].index[0]
line_b = LINES[pat_b].index[-1] - 2
line_a, line_b

['(?i)^om\\b', '(?i)(?=.*\\bend\\b)(?=.*\\bparv\\w*)']


(31, 881)

In [220]:
# Keep only the body of the text (label slice, both ends inclusive).
# .copy() makes the result an independent frame: later cells add and overwrite
# columns on LINES, which would otherwise be assignments into a slice of the
# original frame and raise SettingWithCopyWarning.
LINES = LINES.loc[line_a : line_b].copy()
LINES

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
31,"Om! Having bowed down into Narayana, and to Na..."
32,"also to the goddess Sarasvati, should the word..."
33,
34,"Janamejaya said, ""Having attained to Heaven, w..."
35,"respectively attained by my grandsires of old,..."
...,...
877,"preceptors bed, or even if he be a drinker of ..."
878,"other peoples wares, or even if he be born in ..."
879,Destroying all his sins like the maker of day ...
880,"man, without doubt, sports in felicity in the ..."


In [221]:
# A chapter heading is either a line starting with "SECTION" or a line that
# holds nothing but a number.
chap_pat = r"^\s*(?:SECTION)+"
num_pat = r"^\s*\d+\s*$"

is_section_line = LINES['line_str'].str.match(chap_pat, case=True)
is_number_line = LINES['line_str'].str.match(num_pat, case=True)
chap_lines = is_section_line | is_number_line

LINES.loc[chap_lines]

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
99,2
239,3
345,4
405,5
568,6


In [222]:
# Number the heading lines 2, 3, ... in order of appearance; the text before
# the first detected heading is labelled chapter 1 further down.
n_headers = LINES.loc[chap_lines].shape[0]
LINES.loc[chap_lines, 'chap_num'] = list(range(2, n_headers + 2))

# Carry each heading's number forward over the lines that follow it
LINES['chap_num'] = LINES['chap_num'].ffill()

# Everything up to and including the first heading line becomes chapter 1
# (the heading row itself is discarded below)
first_header = LINES.loc[chap_lines].index[0]
LINES.loc[:first_header, "chap_num"] = 1

# Drop any line that still has no chapter number
LINES = LINES.dropna(subset=['chap_num'])
# Discard the heading rows themselves
LINES = LINES.loc[~chap_lines]
# Chapter numbers were stored as floats because of the NaN gaps; make them ints
LINES['chap_num'] = LINES['chap_num'].astype('int')

LINES.head(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
31,"Om! Having bowed down into Narayana, and to Na...",1
32,"also to the goddess Sarasvati, should the word...",1
33,,1
34,"Janamejaya said, ""Having attained to Heaven, w...",1
35,"respectively attained by my grandsires of old,...",1
36,sons of Dhritarashtra? I desire to hear this. ...,1
37,"conversant with everything, having been taught...",1
38,of wonderful feats.,1
39,,1
40,"Vaishampayana said, ""Listen now to what thy gr...",1


In [223]:
# Glue the lines of every chapter back together, one newline between lines
chap_text = LINES.groupby(OHCO[1:2])['line_str'].agg('\n'.join)

# Trim leading/trailing whitespace and expose the text as column 'chap_str'
CHAPS = chap_text.str.strip().to_frame('chap_str')
CHAPS

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,"Om! Having bowed down into Narayana, and to Na..."
2,"""Yudhishthira said, Ye deities, I do not see h..."
3,"Vaishampayana said, ""King Yudhishthira the jus..."
4,"Vaishampayana said, ""King Yudhishthira, thus p..."
5,"Janamejaya said, ""Bhishma and Drona, those two..."
6,"Janamejaya said, ""O holy one, according to wha..."


In [224]:
# A paragraph break is a run of two or more newlines
para_pat = r'\n\n+'
# split(expand=True) pads shorter chapters with missing values. Older
# DataFrame.stack() silently dropped that padding, the newer implementation
# keeps it, so drop it explicitly to get the same rows on every pandas version.
PARAS = CHAPS['chap_str'].str.split(para_pat, expand=True)\
    .stack()\
    .dropna()\
    .to_frame('para_str')\
    .sort_index()

PARAS.index.names = OHCO[1:3]

# Unwrap the hard line breaks inside each paragraph and trim the ends
PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True)\
                                     .str.strip()

# Discard paragraphs that are empty or whitespace only
PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')]

# Fixed seed so the preview is reproducible; capped so that a text with fewer
# than 20 paragraphs does not make sample() raise.
PARAS.sample(min(20, len(PARAS)), random_state=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
6,37,"""I have thus, O chief of men, told everything ..."
6,3,"""One desirous of hearing the Bharata, should h..."
6,15,"""Thus accoutred, he receives high honours in H..."
6,26,"""During the recitation of the Gada-parva, brah..."
5,0,"Janamejaya said, ""Bhishma and Drona, those two..."
2,9,"""Thus addressed, they answered him from all si..."
6,27,"""During the recitation of the Stri-parva, fore..."
6,21,"""During the recitation of the Virata-parva div..."
1,0,"Om! Having bowed down into Narayana, and to Na..."
2,7,"""Hearing those words of beings in woe, Yudhish..."


In [225]:
# A sentence ends at a run of terminal punctuation marks
sent_pat = r'[.?!;:]+'
# split(expand=True) pads shorter paragraphs with missing values. Older
# DataFrame.stack() silently dropped that padding, the newer implementation
# keeps it, so drop it explicitly to get the same rows on every pandas version.
SENTS = PARAS['para_str'].str.split(sent_pat, expand=True).stack()\
    .dropna()\
    .to_frame('sent_str')
SENTS.index.names = OHCO[1:4]

# Trim first, then filter: assigning a column after the boolean filter would
# write into a filtered copy and raise SettingWithCopyWarning. The surviving
# rows are the same either way, since whitespace-only strings strip to ''.
SENTS['sent_str'] = SENTS['sent_str'].str.strip()
SENTS = SENTS[~SENTS['sent_str'].str.match(r'^\s*$')]

SENTS.head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,Om
1,0,1,"Having bowed down into Narayana, and to Nara, ..."
1,1,0,"Janamejaya said, ""Having attained to Heaven, w..."
1,1,1,", the Pandavas and the sons of Dhritarashtra"
1,1,2,I desire to hear this
1,1,3,I think that thou art conversant with everythi...
1,2,0,"Vaishampayana said, ""Listen now to what thy gr..."
1,2,1,"Arrived at Heaven, king Yudhishthira the just,..."
1,2,2,He blazed with effulgence like the sun and wor...
1,2,3,And he was in the company of many deities of b...


In [227]:
# Tokens are separated by runs of whitespace, apostrophes, commas or hyphens
token_pat = r"[\s',-]+"
# split(expand=True) pads shorter sentences with missing values. Older
# DataFrame.stack() silently dropped that padding, the newer implementation
# keeps it, so drop it explicitly to get the same rows on every pandas version.
token_strs = SENTS['sent_str'].str.split(token_pat, expand=True)\
    .stack()\
    .dropna()

# A sentence that begins or ends with a delimiter (e.g. ", the Pandavas ...")
# yields an empty string at that edge; drop those split artefacts, in line with
# the blank filtering applied to paragraphs and sentences. token_num keeps the
# original split positions, so numbering may have gaps.
token_strs = token_strs[token_strs != '']

TOKENS = token_strs.to_frame('token_str')
TOKENS.index.names = OHCO[1:5]

# Normalised term: strip every non-word character and underscore, lower-case
TOKENS['term_str'] = TOKENS['token_str'].str.replace(r'[\W_]+', '', regex=True).str.lower()

TOKENS.tail(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,term_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1
6,37,2,3,sins,sins
6,37,2,4,like,like
6,37,2,5,the,the
6,37,2,6,maker,maker
6,37,2,7,of,of
6,37,2,8,day,day
6,37,2,9,destroying,destroying
6,37,2,10,darkness,darkness
6,37,2,11,such,such
6,37,2,12,a,a
