In [1]:
import pandas as pd 
import numpy as np 
import configparser
import os

import nltk 

# Load machine-specific paths from env.ini (resolved against the current
# working directory). ConfigParser.read() silently skips files it cannot open
# and returns the list of files it actually parsed, so check that list here
# rather than letting a missing file surface below as an opaque KeyError.
# This matters on re-runs too: the cell ends by changing the working
# directory to base_path, which is where "env.ini" will be looked up next time.
config = configparser.ConfigParser()
if not config.read("env.ini"):
    raise FileNotFoundError(
        f"env.ini not found in {os.getcwd()}; it must define data_home, "
        "output_dir, base_path and code_dir under [DEFAULT]"
    )

data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
base_path = config['DEFAULT']['base_path']
code_dir = config['DEFAULT']['code_dir']

# Every artifact written by this notebook is named "<output_dir>/Maha-<TABLE>.csv".
data_prefix = 'Maha'
out_path = f'{output_dir}/{data_prefix}'


# Import the project-local `preprocess` module with code_dir as the working
# directory, then move to base_path. The `finally` guarantees the working
# directory ends up at base_path even if the import raises.
os.chdir(code_dir)
try:
    import preprocess
finally:
    os.chdir(base_path)

In [2]:
# Tokenize every .txt file in data_home and build two tables:
#   CORPUS - all per-book token tables stacked, with `book_id` prepended as
#            the outermost index level
#   LIB    - one row per book: source file name and the title reported by
#            preprocess.create_tokendf
book_dict = {}      # 1-based book number -> object returned by preprocess.create_tokendf
book_data = []      # (book number, file name, title) records for LIB
token_frames = []   # per-book token tables, concatenated once after the loop

# os.listdir returns names in arbitrary order; sort so that book numbers are
# assigned reproducibly across machines and runs.
all_dir_list = sorted(f for f in os.listdir(data_home) if f.endswith('.txt'))
for book_id, book_file in enumerate(all_dir_list):
    book_num = book_id + 1

    book = preprocess.create_tokendf(f"{data_home}/{book_file}")
    book_dict[book_num] = book

    # Prepend the book number to every index tuple of this book's token table.
    # The index is replaced in place, so book_dict sees the new index too.
    tokens = book['token_num']
    tokens.index = pd.MultiIndex.from_tuples(
        [(book_num,) + idx for idx in tokens.index],
        names=['book_id', *tokens.index.names])
    token_frames.append(tokens)

    print(f"{str(book_num).zfill(2)}] {book_file} - {book['book_id']}")

    book_data.append((book_num, book_file, book['book_id']))

# A single concat avoids re-copying the growing corpus on every iteration;
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
CORPUS = pd.concat(token_frames) if token_frames else pd.DataFrame()

LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','raw_title'])\
    .set_index('book_id').sort_index()

01] maha01.txt - Adi Parva
02] maha02.txt - Sabha Parva
03] maha03.txt - Vana Parva
04] maha04.txt - Virata Parva
05] maha05.txt - Udyoga Parva
06] maha06.txt - Bhishma Parva
07] maha07.txt - Drona Parva
08] maha08.txt - Karna Parva
09] maha09.txt - Shalya Parva
10] maha10.txt - Sauptika Parva
11] maha11.txt - Stri Parva
12] maha12.txt - Santi Parva
13] maha13.txt - Anusasana Parva
14] maha14.txt - Aswamedha Parva
15] maha15.txt - Asramavasika Parva
16] maha16.txt - Mausala Parva
17] maha17.txt - Mahaprasthanika Parva
18] maha18.txt - Svargarohanika Parva


In [3]:
# Drop rows whose term_str is the empty string. The explicit .copy() makes the
# filtered result an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
CORPUS = CORPUS[CORPUS.term_str != ''].copy()

# pos_group is the first two characters of the POS tag (e.g. VBG -> VB).
CORPUS['pos_group'] = CORPUS.pos.str[:2]

CORPUS.to_csv(f"{out_path}-CORPUS.csv")
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_num,para_num,sent_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,0,0,0,"(Om, NN)",NN,Om,om,NN
1,1,0,1,0,"(Having, VBG)",VBG,Having,having,VB
1,1,0,1,1,"(bowed, VBN)",VBN,bowed,bowed,VB
1,1,0,1,2,"(down, IN)",IN,down,down,IN
1,1,0,1,3,"(to, TO)",TO,to,to,TO
...,...,...,...,...,...,...,...,...,...
18,6,37,2,26,"(Vishnu, NNP)",NNP,Vishnu,vishnu,NN
18,6,37,2,27,"(like, IN)",IN,like,like,IN
18,6,37,2,28,"(Vishnu, NNP)",NNP,Vishnu,vishnu,NN
18,6,37,2,29,"(himself, PRP)",PRP,himself,himself,PR


In [4]:
# Per-book size statistics, aligned to LIB on book_id.

# book_len: number of non-null term_str values in each book.
tokens_per_book = CORPUS.groupby('book_id')['term_str'].count()
LIB['book_len'] = tokens_per_book

# n_chaps: number of distinct (book_id, chap_num) pairs in each book.
chapter_keys = CORPUS.reset_index()[['book_id', 'chap_num']].drop_duplicates()
chapters_per_book = chapter_keys.groupby('book_id')['chap_num'].count()
LIB['n_chaps'] = chapters_per_book

LIB.to_csv(f"{out_path}-LIB.csv")
LIB

Unnamed: 0_level_0,source_file_path,raw_title,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,maha01.txt,Adi Parva,259040,235
2,maha02.txt,Sabha Parva,84939,79
3,maha03.txt,Vana Parva,364892,313
4,maha04.txt,Virata Parva,69034,72
5,maha05.txt,Udyoga Parva,216285,199
6,maha06.txt,Bhishma Parva,173416,124
7,maha07.txt,Drona Parva,283835,200
8,maha08.txt,Karna Parva,158048,96
9,maha09.txt,Shalya Parva,104408,65
10,maha10.txt,Sauptika Parva,23893,18


In [5]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# VOCAB: one row per distinct term_str in CORPUS.
#   n        occurrence count
#   n_chars  length of the term in characters
#   p        relative frequency, n / sum(n)
#   i        -log2(p)
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)

# Stopword flag: 1 if the term is in NLTK's English stopword list, else 0.
# `sw` is a lookup frame indexed by term with a constant `dummy` column of 1.
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

# Stems from three NLTK stemmers. The term is the index value, so map the
# stemmer over the index directly instead of applying a lambda row by row.
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.index.map(stemmer1.stem)

stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.index.map(stemmer2.stem)

stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.index.map(stemmer3.stem)

# Count each (term, tag) pair once and reuse the result for every column
# below. The *_pairs Series are in long form (one entry per observed pair);
# the *_table frames are the same counts pivoted to terms x tags, with 0 for
# pairs that never occur.
pos_pairs = CORPUS[['term_str','pos']].value_counts()
pos_table = pos_pairs.unstack(fill_value=0)
pos_group_pairs = CORPUS[['term_str','pos_group']].value_counts()
pos_group_table = pos_group_pairs.unstack(fill_value=0)

# Most frequent PoS tag and PoS group for each term
VOCAB['max_pos'] = pos_table.idxmax(axis=1)
VOCAB['max_pos_group'] = pos_group_table.idxmax(axis=1)

# Number and set of distinct PoS groups observed for each term
VOCAB['n_pos_group'] = pos_group_pairs.groupby(level='term_str').size()
VOCAB['cat_pos_group'] = pos_group_pairs.to_frame('n').reset_index()\
    .groupby('term_str').pos_group.apply(set)

# Number and set of distinct PoS tags observed for each term
VOCAB['n_pos'] = pos_pairs.groupby(level='term_str').size()
VOCAB['cat_pos'] = pos_pairs.to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(set)

# The table is kept in term order: the former bare
# `VOCAB.sort_values("n_pos", ascending=0)` call discarded its result and had
# no effect, so it has been dropped.
VOCAB.to_csv(f"{out_path}-VOCAB.csv")
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,stop,stem_porter,stem_snowball,stem_lancaster,max_pos,max_pos_group,n_pos_group,cat_pos_group,n_pos,cat_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
&,17,1,6.217816e-06,17.295161,0,&,&,&,CC,CC,1,{CC},1,{CC}
',12078,1,4.417575e-03,7.822530,0,',',','','',3,"{'', CD, PO}",3,"{'', CD, POS}"
'',2055,2,7.516242e-04,10.377701,0,'','','','','',1,{''},1,{''}
'_i,1,3,3.657539e-07,21.382623,0,'_i,_i,'_i,'','',1,{''},1,{''}
'abandoning,1,11,3.657539e-07,21.382623,0,'abandon,abandon,'abandoning,VBG,VB,1,{VB},1,{VBG}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zeal,2,4,7.315078e-07,20.382623,0,zeal,zeal,zeal,NN,NN,1,{NN},1,{NN}
zealously,4,9,1.463016e-06,19.382623,0,zealous,zealous,zeal,RB,RB,1,{RB},1,{RB}
zenana,1,6,3.657539e-07,21.382623,0,zenana,zenana,zenan,NN,NN,1,{NN},1,{NN}
zodiac,2,6,7.315078e-07,20.382623,0,zodiac,zodiac,zodiac,NNP,NN,1,{NN},1,{NNP}
