In [147]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
from PIL import Image
from wordcloud import WordCloud
from google.cloud import language_v1
import io
import os
# Tell the Google Cloud client libraries which service-account key file to use.
# setdefault() keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# configured in the environment (shell, CI, deployment) instead of overwriting it;
# the fallback file name is resolved relative to the working directory.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "lustrous-center-305403-9cd548f48b67.json")

# 1. Data Cleaning

In [148]:
# Load the test and train tables (in that order) from CSV files in the working directory.
data_test, data_train = (
    pd.read_csv(csv_name)
    for csv_name in ('religious_text_test.csv', 'religious_text_train.csv')
)

In [149]:
# Replace missing values with 0. Rebinding (instead of `df[:] = ...` / inplace=True)
# keeps this cell safe to re-run.
data_train = data_train.fillna(0)
data_test = data_test.fillna(0)

# Rename the train frame's 'Unnamed: 0' column to 'Chapters' and the test frame's
# '# foolishness' column to 'foolishness'.
data_train = data_train.rename(columns={'Unnamed: 0': 'Chapters'})
data_test = data_test.rename(columns={'# foolishness': 'foolishness'})

# Make sure every count is an int64 (the validation cell checks for exactly 'int64').
# - astype replaces the deprecated element-wise applymap(int).
# - The train frame is rebuilt with a per-column dtype mapping rather than assigned
#   through `.iloc[:, 1:] = ...`: that form writes into the existing columns and, on
#   recent pandas versions, can leave them with their previous (float) dtype.
# - Column 0 of the train frame ('Chapters') holds labels and is left untouched.
data_test = data_test.astype('int64')
data_train = data_train.astype({column: 'int64' for column in data_train.columns[1:]})

In [150]:
# Data validation: every check is evaluated AND enforced. Previously the four
# expressions were computed but only the last one was displayed, so a failing
# check in any of the first three went unnoticed.
validation_checks = {
    # all count columns of the train frame (everything after column 0) are int64
    'train counts are int64': all(data_train.iloc[:, 1:].dtypes == 'int64'),
    # all columns of the test frame are int64
    'test counts are int64': all(data_test.dtypes == 'int64'),
    # each column name is one word (contains no space)
    'train columns are single words': all(' ' not in column for column in data_train.iloc[:, 1:].columns),
    'test columns are single words': all(' ' not in column for column in data_test.columns),
}

failed_checks = [name for name, passed in validation_checks.items() if not passed]
assert not failed_checks, f'data validation failed: {failed_checks}'

# Last expression -> shown as the cell output (True when every check passed)
all(validation_checks.values())

True

In [151]:
# Data to work with

# Chapter labels are split on '_' into '<book>_<chapter>' (e.g. a first chapter is
# '<book>_Ch1'); the part before the first '_' identifies the book of each row.
chapter_labels = data_train['Chapters'].astype(str)
row_book = chapter_labels.str.split('_').str[0]

# Book names in order of appearance. As before, a book is listed only if it has a
# 'Ch1' chapter; labels without an '_' are skipped instead of raising IndexError.
books = [
    parts[0]
    for parts in (label.split('_') for label in chapter_labels.unique())
    if len(parts) > 1 and parts[1] == 'Ch1'
]

# dataframes segregated by book (one frame per entry of `books`).
# Rows are selected by exact book name rather than by substring containment, so a
# book whose name happens to be contained in another label cannot pick up its rows.
book_list = [data_train[row_book == book] for book in books]

# Per-book word totals, sorted from most to least frequent.
# NOTE: despite the name, each Series holds ALL words; slice the first 20 entries
# to get the actual top 20.
top_20_books = [frame.iloc[:, 1:].sum().sort_values(ascending=False) for frame in book_list]

# total words in each book
total_words_book = {book: word_counts.sum() for book, word_counts in zip(books, top_20_books)}

# total words in each chapter of each book (dictionary: book -> Series indexed by chapter label)
total_words_chapter = {
    book: pd.Series(data=frame.iloc[:, 1:].sum(axis=1).tolist(), index=frame.iloc[:, 0])
    for book, frame in zip(books, book_list)
}

# Word totals over the whole train frame, most frequent first (again: all words, sorted)
all_20 = data_train.iloc[:, 1:].sum().sort_values(ascending=False)

# total words in the dataframe
total_words = all_20.sum()

# proportion of words in the top 20 over total words in each book.
# Iterates over the books actually present (no hard-coded count of 8) and computes
# each book's total once instead of rebuilding the list of totals per book.
prop_20_book = {
    book: word_counts.iloc[:20].sum() / word_counts.sum()
    for book, word_counts in zip(books, top_20_books)
}

In [152]:
# Word cloud visualizations (ready for graphing)

# Load the mask image into an array. The context manager closes the image file
# as soon as the pixel data has been copied (it used to be left open).
with Image.open("book.png") as mask_image:
    mask = np.array(mask_image)

# for all books in total: every word of all_20, drawn with the mask
desc = all_20.index.tolist()
counts = all_20.tolist()
wordcloud_20 = WordCloud(mask=mask, max_words=len(all_20)).generate_from_frequencies(dict(zip(desc, counts)))  # stored here

# for each book (in a list, same order as top_20_books): its 20 most frequent words, no mask
word_cloud_each = []  # all in this list
for book_counts in top_20_books:
    top_words = book_counts[:20]
    desc = top_words.index.tolist()
    counts = top_words.tolist()
    word_cloud_each.append(WordCloud(max_words=20).generate_from_frequencies(dict(zip(desc, counts))))

In [201]:
# Parse the per-word sentiment results stored in 'sentiment_analysis.txt' into
# score / magnitude lookups, then aggregate them into count-weighted means per book.

# Read the file once and close it right away (the handle used to stay open and the
# file was re-read three times via seek(0)).
with open('sentiment_analysis.txt', 'r') as sentiments:
    sentiment_lines = sentiments.readlines()


def _document_values(lines, field):
    """Return, as floats, the 4th space-separated token of every line that
    contains both `field` and 'Document'."""
    return [float(line.split(' ')[3].strip()) for line in lines if field in line and 'Document' in line]


def _weighted_mean(word_counts, word_values):
    """Count-weighted mean of `word_values` over the words in `word_counts`.

    word_counts: pandas Series of counts, indexed by word.
    word_values: dict mapping each word to its value (score or magnitude).
    Returns NaN when the counts sum to zero.
    Raises KeyError when a word has no entry in `word_values`.
    """
    words = word_counts.index.tolist()
    counts = word_counts.tolist()
    total = sum(counts)
    if total == 0:
        return float('nan')
    # Words and counts are paired positionally; the old `series[i]` lookup relied on
    # integer keys falling back to positions on a word-labelled Series, which pandas
    # has deprecated. Plain left-to-right Python sums keep the results bit-identical.
    return sum([word_values[word] * count for word, count in zip(words, counts)]) / total


# The first matching entry of each kind is dropped, as before.
# NOTE(review): presumably that entry belongs to the document as a whole rather
# than to a single word -- confirm against the file.
scores = _document_values(sentiment_lines, 'score')[1:]
magnitude = _document_values(sentiment_lines, 'magnitude')[1:]

# Keys are the non-empty lines that contain no space
keys = [line.strip() for line in sentiment_lines if ' ' not in line and line.rstrip()]
scores_dict = dict(zip(keys, scores))
magnitude_dict = dict(zip(keys, magnitude))

# Sentiment per book: over all of its words, and over its 20 most frequent words only
book_scores = [_weighted_mean(book_counts, scores_dict) for book_counts in top_20_books]
magnitude_scores = [_weighted_mean(book_counts, magnitude_dict) for book_counts in top_20_books]
top_20_scores = [_weighted_mean(book_counts.iloc[:20], scores_dict) for book_counts in top_20_books]
top_20_magnitude = [_weighted_mean(book_counts.iloc[:20], magnitude_dict) for book_counts in top_20_books]

scores_20 = dict(zip(books, top_20_scores))
magnitude_20 = dict(zip(books, top_20_magnitude))
book_scores_dict = dict(zip(books, book_scores))
book_magnitude_dict = dict(zip(books, magnitude_scores))

In [200]:
# Count-weighted mean sentiment magnitude per book, over all of the book's words
book_magnitude_dict

{'Buddhism': 0.24219505243556763,
 'TaoTeChing': 0.2704861143008909,
 'Upanishad': 0.25052439940317295,
 'YogaSutra': 0.25580648164577946,
 'BookOfProverb': 0.2854328706238695,
 'BookOfEcclesiastes': 0.26505589974171584,
 'BookOfEccleasiasticus': 0.2743513750982972,
 'BookOfWisdom': 0.27575094851301896}

# 2. Visualizations

In [191]:
# Count-weighted mean sentiment score per book, over all of the book's words
book_scores_dict

{'Buddhism': 0.09615036416551377,
 'TaoTeChing': 0.10507812653344849,
 'Upanishad': 0.14579723597927322,
 'YogaSutra': 0.1536446225476219,
 'BookOfProverb': 0.11912908512633535,
 'BookOfEcclesiastes': 0.10775333822977229,
 'BookOfEccleasiasticus': 0.1323606737575101,
 'BookOfWisdom': 0.11163716152331121}

In [202]:
# Count-weighted mean sentiment score per book, restricted to its 20 most frequent words
scores_20

{'Buddhism': 0.12714171554562184,
 'TaoTeChing': 0.12188365487923583,
 'Upanishad': 0.185788385794185,
 'YogaSutra': 0.29939333488458447,
 'BookOfProverb': 0.2113649379536467,
 'BookOfEcclesiastes': 0.1680791009280641,
 'BookOfEccleasiasticus': 0.21494290865896964,
 'BookOfWisdom': 0.20057537840514245}

In [203]:
# Count-weighted mean sentiment magnitude per book, restricted to its 20 most frequent words
magnitude_20

{'Buddhism': 0.2459567718154149,
 'TaoTeChing': 0.24016620511823744,
 'Upanishad': 0.2505186763495454,
 'YogaSutra': 0.3062689669385055,
 'BookOfProverb': 0.291319248413849,
 'BookOfEcclesiastes': 0.25000000458821064,
 'BookOfEccleasiasticus': 0.24665579735795204,
 'BookOfWisdom': 0.20655926802250266}