### Natural Language Processing of the KJV Bible
+ Sentiment Analysis
+ EDA
+ Summarization
+ Prediction of Verse

#### Data Sources
+ https://raw.githubusercontent.com/scrollmapper/bible_databases/master/csv/t_kjv.csv

In [21]:
# Load EDA Pkgs
from typing import Tuple
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [2]:
# Load Dataset
df = pd.read_csv("t_kjv.csv", index_col=False)

In [3]:
# Columns
df.columns

Index(['id', 'b', 'c', 'v', 't'], dtype='object')

In [4]:
# Head
df.head()

Unnamed: 0,id,b,c,v,t
0,1001001,1,1,1,In the beginning God created the heaven and th...
1,1001002,1,1,2,"And the earth was without form, and void; and ..."
2,1001003,1,1,3,"And God said, Let there be light: and there wa..."
3,1001004,1,1,4,"And God saw the light, that it was good: and G..."
4,1001005,1,1,5,"And God called the light Day, and the darkness..."


In [5]:
# Rename Books
df1 = df

In [6]:
# Replacing with the correct name
df1.b.replace({1:"Genesis",
2:"Exodus",
3:"Leviticus",
4:"Numbers",
5:"Deuteronomy",
6:"Joshua",
7:"Judges",
8:"Ruth",
9:"1 Samuel (1 Kings)",
10:"2 Samuel (2 Kings)",
11:"1 Kings (3 Kings)",
12:"2 Kings (4 Kings)",
13:"1 Chronicles",
14:"2 Chronicles",
15:"Ezra",
16:"Nehemiah",
17:"Esther",
18:"Job",
19:"Psalms",
20:"Proverbs",
21:"Ecclesiastes",
22:"Song of Solomon (Canticles)",
23:"Isaiah",
24:"Jeremiah",
25:"Lamentations",
26:"Ezekiel",
27:"Daniel",
28:"Hosea",
29:"Joel",
30:"Amos",
31:"Obadiah",
32:"Jonah",
33:"Micah",
34:"Nahum",
35:"Habakkuk",
36:"Zephaniah",
37:"Haggai",
38:"Zechariah",
39:"Malachi",
40:"Matthew",
41:"Mark",
42:"Luke",
43:"John",
44:"Acts",
45:"Romans",
46:"1 Corinthians",
47:"2 Corinthians",
48:"Galatians",
49:"Ephesians",
50:"Philippians",
51:"Colossians",
52:"1 Thessalonians",
53:"2 Thessalonians",
54:"1 Timothy",
55:"2 Timothy",
56:"Titus",
57:"Philemon",
58:"Hebrews",
59:"James",
60:"1 Peter",
61:"2 Peter",
62:"1 John",
63:"2 John",
64:"3 John",
65:"Jude",
66:"Revelation"},inplace=True)

In [7]:
# Renaming Columns
df1.columns = ["id","book","chapter","verse","text"]

In [8]:
Genesis = df1[df1["book"]=='Genesis']
Chapter1 = Genesis[Genesis["chapter"]==1]
Chapter1

Unnamed: 0,id,book,chapter,verse,text
0,1001001,Genesis,1,1,In the beginning God created the heaven and th...
1,1001002,Genesis,1,2,"And the earth was without form, and void; and ..."
2,1001003,Genesis,1,3,"And God said, Let there be light: and there wa..."
3,1001004,Genesis,1,4,"And God saw the light, that it was good: and G..."
4,1001005,Genesis,1,5,"And God called the light Day, and the darkness..."
5,1001006,Genesis,1,6,"And God said, Let there be a firmament in the ..."
6,1001007,Genesis,1,7,"And God made the firmament, and divided the wa..."
7,1001008,Genesis,1,8,And God called the firmament Heaven. And the e...
8,1001009,Genesis,1,9,"And God said, Let the waters under the heaven ..."
9,1001010,Genesis,1,10,And God called the dry land Earth; and the gat...


In [9]:
Chapter1[Chapter1['verse']==1]['book'].values[0]

'Genesis'

In [10]:
import re
import math

verses = Chapter1['text'].values

def get_tf_for_verses(verses, sortby='tf', skip_stopwords: bool = False):
    tf = pd.DataFrame(columns=["word", "tc", "tf"])
    
    if skip_stopwords:
        from nltk.corpus import stopwords
        stopwords_set = set(stopwords.words('english'))
    else:
        stopwords_set = set()

    # Get n_words
    n_words = 0
    for verse in verses:
        verse_words = re.findall(r'\w+', verse)
        n_words += len(verse_words)


    for verse in verses:
        verse_words = re.findall(r'\w+', verse)
        for word in verse_words:
            word = word.lower()
            if word not in tf['word'].values and word not in stopwords_set:
                row = {}
                row['word']=[word]
                row['tf'] = [1/n_words]
                row['tc'] = 1
                row_df = pd.DataFrame(row)
                tf = pd.concat([tf, row_df], ignore_index=True)
            else:
                tf.loc[tf.word.isin([word]), 'tf']+=1/n_words  
                tf.loc[tf.word.isin([word]), 'tc']+=1
    
      
    tf = tf.sort_values(by=sortby, ascending=False)
    tf = tf.reset_index(drop=True)
    return tf

def get_idf_for_verses(verses, sortby='idf', skip_stopwords: bool = False):
    idf = pd.DataFrame(columns=["word", "dc" ,"idf"])

    N_documents = len(verses)

    if skip_stopwords:
        from nltk.corpus import stopwords
        stopwords_set = set(stopwords.words('english'))
    else:
        stopwords_set = set()

    # Get all_verse_words
    all_verse_words = []
    for verse in verses:
        verse_words = re.findall(r'\w+', verse)
        for i in range(len(verse_words)):
            verse_words[i] = verse_words[i].lower()
        all_verse_words.append(set(verse_words))


    for verse in verses:
        verse_words = re.findall(r'\w+', verse)
        for word in verse_words:
            word = word.lower()
            if word not in idf['word'].values and word not in stopwords_set:
                row = {}
                row['word']=[word]
                row['idf'] = math.log(N_documents/len([True for verse_words in all_verse_words if word in verse_words]))
                row['dc'] = len([True for verse_words in all_verse_words if word in verse_words])
                row_df = pd.DataFrame(row)
                idf = pd.concat([idf, row_df], ignore_index=True)
    idf = idf.sort_values(by=sortby, ascending=False)
    idf = idf.reset_index(drop=True)
    return idf

def get_tf_idf_for_verses(verses, sort_by='tf', skip_stopwords: bool = False):

    tf = get_tf_for_verses(verses, sortby='word', skip_stopwords=skip_stopwords)
    tf = tf.reset_index(drop=True)
    idf = get_idf_for_verses(verses, sortby='word', skip_stopwords=skip_stopwords)
    idf = idf.reset_index(drop=True)

    idf_column = {'dc': idf['dc'].values, 'idf': idf['idf'].values}
    idf_df = pd.DataFrame(idf_column)
    tf_idf = pd.concat([tf, idf_df], axis=1)

    tf_idf_column = {'tf_idf': tf['tf'].values*idf['idf'].values}
    tf_idf_df = pd.DataFrame(tf_idf_column)
    tf_idf = pd.concat([tf_idf, tf_idf_df], axis=1)

    tf_idf = tf_idf.sort_values(by=sort_by, ascending=False)
    return tf_idf



tf = get_tf_for_verses(verses, sortby='tf', skip_stopwords=True)  
idf = get_idf_for_verses(verses, sortby='idf', skip_stopwords=True)

tf_idf = get_tf_idf_for_verses(verses, skip_stopwords=True)

In [11]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(tf)

           word  tc        tf
0           god  32  0.040151
1         earth  21  0.026349
2           let  14  0.017566
3         every  12  0.015056
4        waters  11  0.013802
5          upon  10  0.012547
6           day  10  0.012547
7         light  10  0.012547
8          said  10  0.012547
9          kind  10  0.012547
10    firmament   9  0.011292
11          saw   7  0.008783
12       heaven   7  0.008783
13         good   7  0.008783
14         fowl   6  0.007528
15        thing   6  0.007528
16         seed   6  0.007528
17      evening   6  0.007528
18      morning   6  0.007528
19         made   5  0.006274
20     yielding   5  0.006274
21       called   5  0.006274
22      created   5  0.006274
23        forth   5  0.006274
24         tree   4  0.005019
25         herb   4  0.005019
26        night   4  0.005019
27        fruit   4  0.005019
28     darkness   4  0.005019
29         rule   3  0.003764
30     creature   3  0.003764
31        bring   3  0.003764
32        

In [12]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(tf_idf)


           word  tc        tf  dc       idf    tf_idf
68          god  32  0.040151  26  0.175891  0.007062
89        earth  21  0.026349  15  0.725937  0.019128
55          let  14  0.017566  10  1.131402  0.019874
87        every  12  0.015056   7  1.488077  0.022405
7        waters  11  0.013802   8  1.354546  0.018695
10         upon  10  0.012547   9  1.236763  0.015518
53        light  10  0.012547   7  1.488077  0.018671
31         said  10  0.012547  10  1.131402  0.014196
96          day  10  0.012547   9  1.236763  0.015518
58         kind  10  0.012547   5  1.824549  0.022893
82    firmament   9  0.011292   7  1.488077  0.016804
61       heaven   7  0.008783   7  1.488077  0.013070
67         good   7  0.008783   7  1.488077  0.013070
30          saw   7  0.008783   7  1.488077  0.013070
24         seed   6  0.007528   3  2.335375  0.017581
42      morning   6  0.007528   6  1.642228  0.012363
16        thing   6  0.007528   6  1.642228  0.012363
75         fowl   6  0.00752

CC: It is the conjunction of coordinating

CD: It is a digit of cardinal

DT: It is the determiner

EX: Existential

FW: It is a foreign word

IN: Preposition and conjunction

JJ: Adjective

JJR and JJS: Adjective and superlative

LS: List marker

MD: Modal

NN: Singular noun

NNS, NNP, NNPS: Proper and plural noun

PDT: Predeterminer

WRB: Adverb of wh

WP$: Possessive wh

WP: Pronoun of wh

WDT: Determiner of wp

VBZ: Verb

VBP, VBN, VBG, VBD, VB: Forms of verbs

UH: Interjection

TO: To go

RP: Particle

RBS, RB, RBR: Adverb

PRP, PRP$: Pronoun personal and professional

In [70]:
import nltk

def categorize_words(verses, skip_stopwords: bool = False):
    tf = pd.DataFrame(columns=["word", "tc", "tf"])
    
    if skip_stopwords:
        from nltk.corpus import stopwords
        stopwords_set = set(stopwords.words('english'))
    else:
        stopwords_set = set()

    nnp_set = set()

    word_types = {}

    # Get n_words
    n_words = 0
    for verse in verses:
        verse_tokenized = nltk.tokenize.word_tokenize(verse)

        verse_pos_tags = nltk.pos_tag(verse_tokenized)

        #print(verse_pos_tags)
        for word, tag in verse_pos_tags:
            if tag in ['NNP', 'NP', 'NNS', 'NNPS', 'NN']:
                nnp_set.add((word, tag, ))
        
        verse_ne = nltk.ne_chunk(verse_pos_tags, binary=False)

        for verse_word in verse_ne:
            if type(verse_word)==tuple:
                try:
                    verse_word[0].lower()
                except:
                    print(verse_word[0])
                if verse_word[0].lower() not in stopwords_set:
                    if verse_word[0].lower() not in word_types:
                        word_types[verse_word[0].lower()] = {verse_word[1]: 1}
                    else:
                        if verse_word[1] not in word_types[verse_word[0].lower()]:
                            word_types[verse_word[0].lower()][verse_word[1]] = 1
                        else:
                            word_types[verse_word[0].lower()][verse_word[1]] += 1
            else:
                label=verse_word.label()
                while verse_word:
                    verse_word_pop = verse_word.pop()
                    if verse_word_pop[0].lower() not in word_types:
                        word_types[verse_word_pop[0].lower()] = {verse_word_pop[1]: 1, label: 1}
                    else:
                        if verse_word_pop[1] not in word_types[verse_word_pop[0].lower()]:
                            word_types[verse_word_pop[0].lower()][verse_word_pop[1]] = 1
                        else:
                            word_types[verse_word_pop[0].lower()][verse_word_pop[1]] += 1
                        if label not in word_types[verse_word_pop[0].lower()]:
                            word_types[verse_word_pop[0].lower()][label] = 1
                        else:
                            word_types[verse_word_pop[0].lower()][label] += 1

    return word_types


def get_word_types_with_tf_idf(verses, sortby='tf', skip_stopwords=False, include_verbs=True):
    tf_idf = get_tf_idf_for_verses(verses, sort_by=sortby, skip_stopwords=skip_stopwords)
    tf_idf = tf_idf.reset_index(drop=True)
    print(len(tf_idf['word'].values))
    import numpy as np
    word_types = categorize_words(verses, skip_stopwords=skip_stopwords)
    
    # Create a word_type column in tf_idf, if the word is not in word_types, then it is nan
    word_type_column = {'word_type':[word_types[word] for word in tf_idf['word'].values if word in word_types]}
    print(len(word_type_column['word_type']))
    word_type_df = pd.DataFrame(word_type_column)
    tf_idf_word_types = pd.concat([tf_idf, word_type_df], axis=1)

    # Fileter out rows with nan word_type
    tf_idf_word_types = tf_idf_word_types[tf_idf_word_types['word_type'].apply(lambda x: not pd.isna(x))]


    if not include_verbs:
        tf_idf_word_types = tf_idf_word_types[tf_idf_word_types['word_type'].apply(lambda x: not {'VB', 'VBD'} & set(x.keys()))]

    new_column_order = ['word', 'word_type', 'tc', 'tf', 'dc', 'idf', 'tf_idf']
    tf_idf_word_types = tf_idf_word_types[new_column_order]

    tf_idf = tf_idf.reset_index(drop=True)


    return tf_idf_word_types


word_types = categorize_words(verses, skip_stopwords=True)  
tf_idf_word_types = get_word_types_with_tf_idf(verses, sortby='tf', skip_stopwords=True)

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(tf_idf_word_types)


def get_best_10(df, include_verbs=True):
    best_10 = []
    if include_verbs:
        chosen_types = {'NN', 'NNS', 'NNP', 'PERSON', 'VBD', 'VB', 'GPE'}
    else:
        chosen_types = {'NN', 'NNS', 'NNP', 'PERSON', 'GPE'}

    for index in range(len(df)):
        if len(best_10)==10:
            break
        elif set(df.loc[index, 'word_type'].keys()) & chosen_types:
            best_10.append(df.loc[index, 'word'])
    
    return best_10
    

get_best_10(tf_idf_word_types)

115
115


['god',
 'earth',
 'let',
 'waters',
 'light',
 'said',
 'day',
 'kind',
 'firmament',
 'heaven']

In [71]:
for chapter_index in range(1,2):
    Chapter = Genesis[Genesis["chapter"]==chapter_index]
    chapter_verses = Chapter['text'].values

    tf_idf_word_types_chapter = get_word_types_with_tf_idf(chapter_verses, skip_stopwords=True)

    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #     print(tf_idf_word_types_chapter)

    print(f"Chapter {chapter_index}", get_best_10(tf_idf_word_types_chapter, include_verbs = False))

115
115
Chapter 1 ['god', 'earth', 'waters', 'light', 'day', 'kind', 'firmament', 'heaven', 'seed', 'morning']


In [72]:
all_chapter_verses = []
for chapter_index in range(1,51):
    Chapter = Genesis[Genesis["chapter"]==chapter_index]
    chapter_verses = ''
    for verse in Chapter['text'].values:
        chapter_verses = chapter_verses + " " + verse
    all_chapter_verses.append(chapter_verses)
all_chapter_verses

# genesis_tf_idf = get_tf_idf_for_verses(all_chapter_verses, sort_by='tf', skip_stopwords=True)
genesis_tf_idf_word_type = get_word_types_with_tf_idf(all_chapter_verses, 'tf_idf', skip_stopwords=True, include_verbs=False)
genesis_tf_idf_word_type.style.set_properties(subset=['word_type'], **{'width': '400px'})

2326
2323


Unnamed: 0,word,word_type,tc,tf,dc,idf,tf_idf
0,joseph,"{'NNP': 157, 'PERSON': 146, 'GPE': 10}",157,0.004075,16,1.139434,0.004644
2,abraham,"{'NNP': 134, 'GPE': 32, 'PERSON': 94, 'ORGANIZATION': 1}",134,0.003478,17,1.07881,0.003752
6,duke,"{'JJ': 8, 'NN': 18, 'NNP': 5, 'PERSON': 5, 'CC': 1}",32,0.000831,1,3.912023,0.00325
7,abram,"{'NNP': 59, 'PERSON': 43, 'GPE': 6}",59,0.001532,7,1.966113,0.003011
8,master,{'NN': 30},30,0.000779,2,3.218876,0.002507
9,isaac,"{'NNP': 80, 'PERSON': 56, 'GPE': 11}",80,0.002077,15,1.203973,0.0025
10,sons,{'NNS': 146},146,0.00379,26,0.653926,0.002478
12,years,{'NNS': 113},113,0.002933,23,0.776529,0.002278
13,noah,"{'NNP': 41, 'PERSON': 25, 'GPE': 8}",41,0.001064,6,2.120264,0.002257
15,earth,"{'NN': 120, 'NNP': 1, 'GPE': 1}",121,0.003141,25,0.693147,0.002177
