In [11]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
from google.cloud import language_v1
import io
import os

# 1. Data Cleaning

In [12]:
# read data
data_test = pd.read_csv('religious_text_test.csv')
data_train = pd.read_csv('religious_text_train.csv')

In [13]:
# fill missing values
data_train[:] = data_train.fillna(0)
data_test[:] = data_test.fillna(0)

# rename the unnamed column to Chapters, '# foolishness' to just foolishness
data_train.rename(columns = {'Unnamed: 0' : 'Chapters'}, inplace = True)
data_test.rename(columns = {'# foolishness' : 'foolishness'}, inplace = True)

# make sure everything is an integer
data_test = data_test.applymap(int).astype(int)
data_train.iloc[:, 1:] = data_train.iloc[:, 1:].applymap(int).astype(int)

In [14]:
# Data validation stuff, more cleaning

all(data_train.iloc[:,1:].dtypes == 'int64') # True (All entries are integers)
all(data_test.dtypes == 'int64') # True (All entries are integers)

all(list(map(lambda a: ' ' not in a, data_train.iloc[:,1:].columns))) # Each column is one word
all(list(map(lambda a: ' ' not in a, data_test.columns))) # Each column is one word

True

In [15]:
# Extraaaaaa way of getting the books lmao, dont do this
books = [i[0] for i in list(map(lambda a: a.split('_'), data_train['Chapters'].unique())) if i[1] == 'Ch1']

# dataframes segregated by chapters
book_list = [data_train[[j in i for i in data_train['Chapters']]] for j in books]

# top 20 words for each book in a list containing 8 series
top_20_books = [j.sort_values(ascending = False) for j in [i.iloc[:,1:].sum() for i in book_list]]

# total words in each book
total_words_book = dict(zip(books, [i.sum() for i in top_20_books]))

# total words in each chapter of each book (dictionary)
total_words_chapter = dict(zip(books,[pd.Series(data=i.iloc[:,1:].sum(axis=1).tolist(), index=i.iloc[:,0]) for i in book_list]))

# Series for each book (Top 20 Words)
buddhism_20 = top_20_books[0]; tao_20 = top_20_books[1]
upanishad_20 = top_20_books[2]; yoga_20 = top_20_books[3]
proverb_20 = top_20_books[4]; ecclesiastes_20 = top_20_books[5]
eccleasiasticus_20 = top_20_books[6]; wisdom_20 = top_20_books[7]

# Top 20 words for all books
all_20 = data_train.iloc[:,1:].sum().sort_values(ascending = False)
# total words in the dataframe 
tototal_words = all_20.sum()

In [16]:
# Word cloud visualizations (ready for graphing)

mask = np.array(Image.open("book.png"))

# for all books in total
desc = all_20.index.tolist(); counts = all_20.tolist()
wordcloud_20 = WordCloud(mask = mask, max_words = len(all_20)).generate_from_frequencies(dict(zip(desc,counts))) # stored here

# for each book (in a list)
word_cloud_each = [] # all in this list
for i in top_20_books:
    desc = i[:20].index.tolist(); counts = i[:20].tolist()
    word_cloud_each.append(WordCloud(max_words = 20).generate_from_frequencies(dict(zip(desc,counts))))

In [45]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="lustrous-center-305403-9cd548f48b67.json"

def analyze_sentiment(text_content):
    client = language_v1.LanguageServiceClient()

    # Available types: PLAIN_TEXT, HTML
    type_ = language_v1.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "en"
    document = {"content": text_content, "type_": type_, "language": language}

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = language_v1.EncodingType.UTF8

    response = client.analyze_sentiment(request = {'document': document, 'encoding_type': encoding_type})
    # Get overall sentiment of the input document
    
    #print(u"Document sentiment score: {}".format(response.document_sentiment.score))
    return response.document_sentiment.score
        


    # Get the language of the text, which will be the same as
    # the language specified in the request or, if not specified,
    # the automatically-detected language.
    #print(u"Language of the text: {}".format(response.language))
    #print(text_content)
    

analyze_sentiment(text)

0.8999999761581421

In [43]:
buddhist_score = sum([analyze_sentiment(buddhism_20[:20].index.tolist()[i]) * buddhism_20[i] for i in range(20)]) / sum(buddhism_20[:20].tolist())
tao_score = sum([analyze_sentiment(tao_20[:20].index.tolist()[i]) * tao_20[i] for i in range(20)]) / sum(tao[:20].tolist())


Document sentiment score: 0.30000001192092896
Document sentiment magnitude: 0.30000001192092896
Document sentiment score: 0.30000001192092896
Document sentiment magnitude: 0.30000001192092896
Document sentiment score: 0.10000000149011612
Document sentiment magnitude: 0.10000000149011612
Document sentiment score: -0.6000000238418579
Document sentiment magnitude: 0.6000000238418579
Document sentiment score: 0.10000000149011612
Document sentiment magnitude: 0.10000000149011612
Document sentiment score: 0.0
Document sentiment magnitude: 0.0
Document sentiment score: 0.10000000149011612
Document sentiment magnitude: 0.10000000149011612
Document sentiment score: 0.0
Document sentiment magnitude: 0.0
Document sentiment score: -0.20000000298023224
Document sentiment magnitude: 0.20000000298023224
Document sentiment score: 0.0
Document sentiment magnitude: 0.0
Document sentiment score: -0.30000001192092896
Document sentiment magnitude: 0.30000001192092896
Document sentiment score: 0.30000001192

NameError: name 'tao' is not defined

# 2. Visualizations

In [None]:
total_words_book