In [1]:
import pandas as pd 
import numpy as np 
import configparser
import os

import nltk 

# Machine-specific locations are read from env.ini (section DEFAULT).
config = configparser.ConfigParser()
# ConfigParser.read() does not raise on a missing file; it returns the list of
# files it actually parsed. Fail here with a clear message instead of letting
# the lookups below die with an opaque KeyError.
if not config.read("env.ini"):
    raise FileNotFoundError(f"env.ini not found or unreadable in {os.getcwd()}")
data_home = config['DEFAULT']['data_home']    # directory holding the source .txt files
output_dir = config['DEFAULT']['output_dir']  # directory the CSV tables are written to
base_path = config['DEFAULT']['base_path']    # working directory for the rest of the notebook
code_dir = config['DEFAULT']['code_dir']      # directory containing preprocess / bow_analysis

# Every output file is written as "<output_dir>/<data_prefix>-<TABLE>.csv".
data_prefix = 'Maha'
out_path = f'{output_dir}/{data_prefix}'

# The project modules are imported while the working directory is code_dir
# (presumably relying on the current directory being on sys.path -- confirm).
# The finally clause guarantees we end up in base_path even if an import fails,
# so a failed import no longer leaves the kernel stranded in code_dir.
os.chdir(code_dir)
try:
    import preprocess
    import bow_analysis
finally:
    os.chdir(base_path)

In [2]:
# Build the token-level CORPUS table and the book-level LIB table from every
# .txt file in data_home.
book_dict = {}      # book_id -> object returned by preprocess.create_tokendf
book_data = []      # (book_id, file name, title) rows for LIB
token_frames = []   # per-book token tables, concatenated once after the loop

# os.listdir() returns entries in arbitrary order; sort so that the book_id
# given to each file is the same on every run and every machine.
all_dir_list = sorted(f for f in os.listdir(data_home) if f.endswith('.txt'))
for book_id, book_file in enumerate(all_dir_list, start=1):
    # NOTE(review): create_tokendf is assumed to return a mapping with a
    # 'token_num' DataFrame (tuple-valued index) and a 'book_id' title string,
    # as that is how the result is used below -- confirm against preprocess.
    book = preprocess.create_tokendf(f"{data_home}/{book_file}")
    tokens = book['token_num']

    # Prepend book_id as the outermost index level.
    tokens.index = pd.MultiIndex.from_tuples(
        [(book_id,) + idx for idx in tokens.index],
        names=['book_id'] + list(tokens.index.names))

    book_dict[book_id] = book
    token_frames.append(tokens)

    print(f"{str(book_id).zfill(2)}] {book_file} - {book['book_id']}")

    book_data.append((book_id, book_file, book['book_id']))

# One concat instead of growing CORPUS inside the loop (which re-copies all
# previous rows on every iteration). Keep the empty-frame result when no
# files were found, since pd.concat([]) raises.
CORPUS = pd.concat(token_frames) if token_frames else pd.DataFrame()

LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','raw_title'])\
    .set_index('book_id').sort_index()

01] maha01.txt - Adi Parva
02] maha02.txt - Sabha Parva
03] maha03.txt - Vana Parva
04] maha04.txt - Virata Parva
05] maha05.txt - Udyoga Parva
06] maha06.txt - Bhishma Parva
07] maha07.txt - Drona Parva
08] maha08.txt - Karna Parva
09] maha09.txt - Shalya Parva
10] maha10.txt - Sauptika Parva
11] maha11.txt - Stri Parva
12] maha12.txt - Santi Parva
13] maha13.txt - Anusasana Parva
14] maha14.txt - Aswamedha Parva
15] maha15.txt - Asramavasika Parva
16] maha16.txt - Mausala Parva
17] maha17.txt - Mahaprasthanika Parva
18] maha18.txt - Svargarohanika Parva


In [3]:
# Drop tokens whose term_str is empty or missing, then add pos_group (the first
# two characters of the POS tag). Using .assign on the filtered result builds a
# new frame instead of writing a column into a boolean-indexed slice, which is
# what triggers pandas' SettingWithCopyWarning.
CORPUS = (
    CORPUS[CORPUS.term_str.notna() & (CORPUS.term_str != '')]
    .assign(pos_group=lambda tokens: tokens.pos.str[:2])
)
CORPUS.to_csv(f"{out_path}-CORPUS.csv", sep = '|')
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,0,0,0,"(Om, NN)",NN,Om,om,NN
1,1,0,1,0,"(Having, VBG)",VBG,Having,having,VB
1,1,0,1,1,"(bowed, VBN)",VBN,bowed,bowed,VB
1,1,0,1,2,"(down, IN)",IN,down,down,IN
1,1,0,1,3,"(to, TO)",TO,to,to,TO
...,...,...,...,...,...,...,...,...,...
18,6,37,2,25,"(of, IN)",IN,of,of,IN
18,6,37,2,26,"(Vishnu, NNP)",NNP,Vishnu,vishnu,NN
18,6,37,2,27,"(like, IN)",IN,like,like,IN
18,6,37,2,28,"(Vishnu, NNP)",NNP,Vishnu,vishnu,NN


In [4]:
# Per-book size statistics, aligned to LIB on book_id.

# Number of (non-null) terms in each book.
tokens_per_book = CORPUS.groupby('book_id').term_str.count()

# Number of distinct chapters in each book: one row per (book, chapter) pair,
# then count the pairs per book.
book_chapter_pairs = CORPUS.reset_index()[['book_id','chap_num']].drop_duplicates()
chapters_per_book = book_chapter_pairs.groupby('book_id').chap_num.count()

LIB['book_len'] = tokens_per_book
LIB['n_chaps'] = chapters_per_book

LIB.to_csv(f"{out_path}-LIB.csv", sep = "|")
LIB

Unnamed: 0_level_0,source_file_path,raw_title,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,maha01.txt,Adi Parva,228680,235
2,maha02.txt,Sabha Parva,74951,79
3,maha03.txt,Vana Parva,323181,313
4,maha04.txt,Virata Parva,61157,72
5,maha05.txt,Udyoga Parva,188718,199
6,maha06.txt,Bhishma Parva,148378,124
7,maha07.txt,Drona Parva,245954,200
8,maha08.txt,Karna Parva,138242,96
9,maha09.txt,Shalya Parva,91541,65
10,maha10.txt,Sauptika Parva,21340,18


In [5]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer


def _pos_profile(corpus, tag_col):
    """Summarise, per term, the tags found in ``corpus[tag_col]``.

    Parameters
    ----------
    corpus : DataFrame with a ``term_str`` column and the column ``tag_col``.
    tag_col : name of the tag column to profile (e.g. ``'pos'``).

    Returns
    -------
    (most_frequent, n_distinct, tag_sets) : three Series indexed by term_str --
    the tag with the highest count (first column wins ties, as with idxmax),
    the number of distinct tags seen, and the set of those tags.
    """
    # Count every (term, tag) pair once and derive all three summaries from it,
    # rather than recomputing value_counts()/unstack() for each column.
    pair_counts = corpus[['term_str', tag_col]].value_counts()
    count_table = pair_counts.unstack(fill_value=0)
    most_frequent = count_table.idxmax(1)
    # value_counts only emits observed pairs, so "count > 0" is the same as
    # "present in the unfilled table".
    n_distinct = (count_table > 0).sum(1)
    tag_sets = pair_counts.to_frame('n').reset_index()\
        .groupby('term_str')[tag_col].apply(lambda tags: set(tags))
    return most_frequent, n_distinct, tag_sets


# One row per distinct term, ordered alphabetically.
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()    # relative frequency
VOCAB['i'] = -np.log2(VOCAB.p)          # self-information in bits
VOCAB['s'] = 1 / VOCAB['p']             # inverse relative frequency
VOCAB['h'] = VOCAB['p'] * VOCAB['i']    # term's contribution to corpus entropy

# 1 if the term is in NLTK's English stop-word list, else 0.
stop_words = set(nltk.corpus.stopwords.words('english'))
VOCAB['stop'] = VOCAB.index.isin(stop_words).astype('int')

# Stem each term directly from the index; a row-wise DataFrame.apply builds a
# Series per row only to read its name.
porter = PorterStemmer()
snowball = SnowballStemmer("english")
lancaster = LancasterStemmer()
VOCAB['stem_porter'] = [porter.stem(term) for term in VOCAB.index]
VOCAB['stem_snowball'] = [snowball.stem(term) for term in VOCAB.index]
VOCAB['stem_lancaster'] = [lancaster.stem(term) for term in VOCAB.index]

# Part-of-speech summaries, for the full tag and for the two-character group.
max_pos, n_pos, cat_pos = _pos_profile(CORPUS, 'pos')
max_pos_group, n_pos_group, cat_pos_group = _pos_profile(CORPUS, 'pos_group')

# Most frequent PoS / PoS group
VOCAB['max_pos'] = max_pos
VOCAB['max_pos_group'] = max_pos_group

# Number and set of all observed PoS groups
VOCAB['n_pos_group'] = n_pos_group
VOCAB['cat_pos_group'] = cat_pos_group

# Number and set of all observed PoS tags
VOCAB['n_pos'] = n_pos
VOCAB['cat_pos'] = cat_pos

# (The former bare `VOCAB.sort_values("n_pos", ascending=0)` was removed: its
# result was discarded, so it neither reordered VOCAB nor displayed anything.)

VOCAB.to_csv(f"{out_path}-VOCAB.csv", sep = "|")
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,s,h,stop,stem_porter,stem_snowball,stem_lancaster,max_pos,max_pos_group,n_pos_group,cat_pos_group,n_pos,cat_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
01,1,2,4.150513e-07,21.200207,2.409341e+06,0.000009,0,01,01,01,CD,CD,1,{CD},1,{CD}
1,15,1,6.225769e-06,17.293317,1.606227e+05,0.000108,0,1,1,1,CD,CD,1,{CD},1,{CD}
10,9,2,3.735461e-06,18.030282,2.677046e+05,0.000067,0,10,10,10,CD,CD,1,{CD},1,{CD}
100,9,3,3.735461e-06,18.030282,2.677046e+05,0.000067,0,100,100,100,CD,CD,1,{CD},1,{CD}
1000,41,4,1.701710e-05,15.842655,5.876441e+04,0.000270,0,1000,1000,1000,CD,CD,1,{CD},1,{CD}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zeal,2,4,8.301025e-07,20.200207,1.204670e+06,0.000017,0,zeal,zeal,zeal,NN,NN,1,{NN},1,{NN}
zealously,4,9,1.660205e-06,19.200207,6.023352e+05,0.000032,0,zealous,zealous,zeal,RB,RB,1,{RB},1,{RB}
zenana,1,6,4.150513e-07,21.200207,2.409341e+06,0.000009,0,zenana,zenana,zenan,NN,NN,1,{NN},1,{NN}
zodiac,2,6,8.301025e-07,20.200207,1.204670e+06,0.000017,0,zodiac,zodiac,zodiac,NNP,NN,1,{NN},1,{NNP}


In [6]:
# Show only the vocabulary rows flagged as stop words (stop == 1).
VOCAB.loc[VOCAB['stop'].eq(1)]

Unnamed: 0_level_0,n,n_chars,p,i,s,h,stop,stem_porter,stem_snowball,stem_lancaster,max_pos,max_pos_group,n_pos_group,cat_pos_group,n_pos,cat_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
a,28160,1,0.011688,6.418847,85.558984,0.075022,1,a,a,a,DT,DT,3,"{NN, VB, DT}",3,"{NNP, VB, DT}"
about,1547,5,0.000642,10.604950,1557.427925,0.006809,1,about,about,about,IN,IN,4,"{IN, RB, NN, RP}",4,"{IN, RB, NN, RP}"
above,269,5,0.000112,13.128745,8956.657993,0.001466,1,abov,abov,abov,IN,IN,5,"{IN, VB, NN, JJ, RB}",5,"{IN, VB, NN, JJ, RB}"
after,3486,5,0.001447,9.432850,691.147734,0.013648,1,after,after,aft,IN,IN,4,"{JJ, IN, VB, NN}",4,"{NNP, IN, JJ, VBD}"
again,1743,5,0.000723,10.432850,1382.295468,0.007547,1,again,again,again,RB,RB,3,"{RB, VB, NN}",3,"{NNP, RB, VB}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
you,1281,3,0.000532,10.877152,1880.828259,0.005783,1,you,you,you,PRP,PR,1,{PR},1,{PRP}
your,657,4,0.000273,11.840458,3667.185693,0.003229,1,your,your,yo,PRP$,PR,2,"{NN, PR}",2,"{NNP, PRP$}"
yours,51,5,0.000021,15.527782,47241.980392,0.000329,1,your,your,yo,NNS,NN,3,"{RB, VB, NN}",7,"{VBZ, VBN, NNS, NNP, NN, RB, VBP}"
yourself,11,8,0.000005,17.740776,219031.000000,0.000081,1,yourself,yourself,yourself,PRP,PR,1,{PR},1,{PRP}


In [7]:
# Inspect tokens whose pos_group is the two-character string '' (two single
# quotes). NOTE(review): the rows shown are ordinary words, so these look like
# tagger mislabels rather than real quotation marks -- confirm.
CORPUS.loc[CORPUS['pos_group'].eq("''")]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,176,1,7,8,"(wild, '')",'',wild,wild,''
3,1,3,6,7,"(whither, '')",'',whither,whither,''
3,32,0,64,3,"(becometh, '')",'',becometh,becometh,''
4,4,1,9,5,"(ye, '')",'',ye,ye,''
6,118,1,9,6,"(ye, '')",'',ye,ye,''
7,88,1,0,27,"(thou, '')",'',thou,thou,''
9,36,2,52,9,"(ye, '')",'',ye,ye,''
11,12,2,8,7,"(thou, '')",'',thou,thou,''
12,152,2,0,7,"(ye, '')",'',ye,ye,''
12,152,7,6,6,"(ye, '')",'',ye,ye,''
