In [83]:
import numpy as np 
import pandas as pd
import matplotlib as plt 

# 1. Data Cleaning

In [84]:
# Load the test and training tables from the CSV files in the working directory
# (test file first, then train, in that order).
data_test, data_train = (
    pd.read_csv(csv_name)
    for csv_name in ('religious_text_test.csv', 'religious_text_train.csv')
)

In [90]:
# Clean both frames by rebinding the names instead of mutating in place, so the
# cell gives the same result every time it is run on freshly loaded data.

# Missing values become 0. The header fixes rename the 'Unnamed: 0' column of the
# training frame to 'Chapters' and '# foolishness' in the test frame to
# 'foolishness'; rename() ignores a key that is not present.
data_train = data_train.fillna(0).rename(columns={'Unnamed: 0': 'Chapters'})
data_test = data_test.fillna(0).rename(columns={'# foolishness': 'foolishness'})

# Make sure every count is an integer.
# astype() replaces the columns, so the new dtype always sticks; assigning the
# converted values back through .iloc writes into the existing columns and can
# leave their dtype unchanged on recent pandas versions. 'int64' is spelled out
# because plain `int` is platform dependent and later cells compare against
# 'int64'.
data_test = data_test.astype('int64')

# The first column of the training frame is skipped: only the columns from
# position 1 onward are converted.
train_count_columns = data_train.columns[1:]
data_train = data_train.astype({column: 'int64' for column in train_count_columns})

In [187]:
# Data validation.
# A notebook cell only displays the value of its last expression, so every check
# is collected and asserted explicitly; a failing check now stops the run and
# names itself instead of being silently discarded.

# Every column of the training frame except the first one.
train_counts = data_train.iloc[:, 1:]

validation = {
    # All entries are integers
    'data_train count columns are int64': all(train_counts.dtypes == 'int64'),
    'data_test columns are int64': all(data_test.dtypes == 'int64'),
    # Each column is one word
    'data_train column names contain no space': all(' ' not in name for name in train_counts.columns),
    'data_test column names contain no space': all(' ' not in name for name in data_test.columns),
}

failed_checks = [label for label, passed in validation.items() if not passed]
assert not failed_checks, f'validation failed: {failed_checks}'

# Displayed as the cell output (True when every check passed).
all(validation.values())

True

In [238]:
# Book name of every row: the part of the chapter label before the first '_'
# (e.g. 'Buddhism_Ch1' -> 'Buddhism'). Computed once and compared for equality,
# so a book whose name happens to be contained in another label cannot pick up
# that label's rows.
chapter_book = data_train['Chapters'].str.split('_').str[0]

# Distinct book names, in order of first appearance in the frame
books = chapter_book.unique().tolist()

# dataframes segregated by book (one sub-frame per entry of `books`, original
# row index kept)
book_list = [data_train[chapter_book == book_name] for book_name in books]

# Per-book total of every word column (everything after the first column),
# sorted from most to least frequent. Despite the name, nothing is truncated
# here: each Series holds ALL words; take .head(20) for the actual top 20.
top_20_books = [
    book_frame.iloc[:, 1:].sum().sort_values(ascending=False)
    for book_frame in book_list
]

# total words in each book
total_words_book = dict(zip(books, [word_counts.sum() for word_counts in top_20_books]))

# total words in each chapter of each book (dictionary): book name -> Series of
# per-row word totals indexed by the first column of the sub-frame
total_words_chapter = dict(zip(
    books,
    [
        pd.Series(
            data=book_frame.iloc[:, 1:].sum(axis=1).tolist(),
            index=book_frame.iloc[:, 0],
        )
        for book_frame in book_list
    ],
))

# Sorted word counts for each book, picked by position in `books`
# (this assumes at least eight books, in the order they appear in the data)
buddhism_20 = top_20_books[0]
tao_20 = top_20_books[1]
upanishad_20 = top_20_books[2]
yoga_20 = top_20_books[3]
proverb_20 = top_20_books[4]
ecclesiastes_20 = top_20_books[5]
eccleasiasticus_20 = top_20_books[6]
wisdom_20 = top_20_books[7]

# Word counts over all books together, sorted descending (also not truncated)
all_20 = data_train.iloc[:, 1:].sum().sort_values(ascending=False)
# total words in the dataframe
total_words = all_20.sum()

In [240]:
#overlapping_words = len

{'Buddhism': Chapters
 Buddhism_Ch1      298
 Buddhism_Ch2      107
 Buddhism_Ch3      188
 Buddhism_Ch4      129
 Buddhism_Ch5       15
 Buddhism_Ch6      244
 Buddhism_Ch7      226
 Buddhism_Ch8       89
 Buddhism_Ch9      110
 Buddhism_Ch10      30
 Buddhism_Ch11     339
 Buddhism_Ch12      59
 Buddhism_Ch13      16
 Buddhism_Ch14       0
 Buddhism_Ch15      45
 Buddhism_Ch16      56
 Buddhism_Ch17      31
 Buddhism_Ch18     144
 Buddhism_Ch19      17
 Buddhism_Ch20       8
 Buddhism_Ch21     166
 Buddhism_Ch22      80
 Buddhism_Ch23     108
 Buddhism_Ch24     178
 Buddhism_Ch25      29
 Buddhism_Ch26      42
 Buddhism_Ch27      29
 Buddhism_Ch28      25
 Buddhism_Ch29      86
 Buddhism_Ch30     257
 Buddhism_Ch31     182
 Buddhism_Ch32     109
 Buddhism_Ch33     174
 Buddhism_Ch34     204
 Buddhism_Ch35      57
 Buddhism_Ch36      86
 Buddhism_Ch37    1194
 Buddhism_Ch38     272
 Buddhism_Ch39     139
 Buddhism_Ch40      30
 Buddhism_Ch41     220
 Buddhism_Ch42     522
 Buddhism_Ch

In [247]:
# Display the summary statistics pandas computes for data_train via describe().
data_train.describe()
                

Unnamed: 0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
count,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,...,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0
mean,0.00339,0.566102,0.00678,0.001695,0.032203,0.059322,0.001695,0.001695,0.00678,0.001695,...,0.00339,0.00678,0.00339,0.001695,0.001695,0.001695,0.079661,0.008475,0.005085,0.00339
std,0.058173,1.777487,0.082129,0.041169,0.488211,0.326851,0.041169,0.041169,0.082129,0.041169,...,0.058173,0.100701,0.058173,0.041169,0.041169,0.041169,0.366824,0.091744,0.071186,0.058173
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,14.0,1.0,1.0,11.0,4.0,1.0,1.0,1.0,1.0,...,1.0,2.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0


# 2. Visualizations

In [236]:
# Display the dictionary mapping each book name to its total word count.
total_words_book

{'Buddhism': 6624,
 'TaoTeChing': 4608,
 'Upanishad': 6579,
 'YogaSutra': 12443,
 'BookOfProverb': 7716,
 'BookOfEcclesiastes': 2773,
 'BookOfEccleasiasticus': 14839,
 'BookOfWisdom': 5027}

0           Buddhism_Ch1
1           Buddhism_Ch2
2           Buddhism_Ch3
3           Buddhism_Ch4
4           Buddhism_Ch5
             ...        
585    BookOfWisdom_Ch15
586    BookOfWisdom_Ch16
587    BookOfWisdom_Ch17
588    BookOfWisdom_Ch18
589    BookOfWisdom_Ch19
Name: Chapters, Length: 590, dtype: object