In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sqlite3

In [3]:
conn = sqlite3.connect('bible-sqlite.db')
c = conn.cursor()

In [4]:
# Investigate the Data

c.execute("SELECT name FROM sqlite_master").fetchall()

[('bible_version_key',),
 ('cross_reference',),
 ('key_english',),
 ('t_asv',),
 ('t_bbe',),
 ('t_dby',),
 ('t_kjv',),
 ('t_wbt',),
 ('t_web',),
 ('t_ylt',)]

In [5]:
c.execute('PRAGMA table_info(t_asv)').fetchall()

[(0, 'id', 'integer zerofill', 1, None, 0),
 (1, 'b', 'integer', 1, None, 0),
 (2, 'c', 'integer', 1, None, 0),
 (3, 'v', 'integer', 1, None, 0),
 (4, 't', 'text', 1, None, 0),
 (5, 'char_length', 'integer', 0, None, 0)]

In [158]:
# The id is formatted as book, chapter, verse. So 1001001 is 1-001-001 which is Genesis 1:1,
# 40012005 is 40-012-005 which is Matthew 12:5, etc.
c.execute('SELECT * FROM t_asv LIMIT 2').fetchall()

[(1001001,
  1,
  1,
  1,
  'In the beginning God created the heavens and the earth.',
  55,
  'Genesis',
  'Genesis 1:1'),
 (1001002,
  1,
  1,
  2,
  'And the earth was waste and void; and darkness was upon the face of the deep: and the Spirit of God moved upon the face of the waters.',
  134,
  'Genesis',
  'Genesis 1:2')]

In [8]:
# Create new column of character lengths in t
# c.execute('ALTER TABLE t_asv ADD COLUMN char_length integer')
c.execute('UPDATE t_asv SET char_length = LENGTH(t)')

<sqlite3.Cursor at 0x154fcae7b90>

In [9]:
# Create new column of book names
c.execute('ALTER TABLE t_asv ADD COLUMN book_name text')
c.execute("UPDATE t_asv SET book_name = CASE b WHEN 1 THEN 'Genesis' WHEN 2 THEN 'Exodus' WHEN 3 THEN 'Leviticus' WHEN 4 THEN 'Numbers' WHEN 5 THEN 'Deuteronomy' WHEN 6 THEN 'Joshua' WHEN 7 THEN 'Judges' WHEN 8 THEN 'Ruth' WHEN 9 THEN '1 Samuel' WHEN 10 THEN '2 Samuel' WHEN 11 THEN '1 Kings' WHEN 12 THEN '2 Kings' WHEN 13 THEN '1 Chronicles' WHEN 14 THEN '2 Chronicles' WHEN 15 THEN 'Ezra' WHEN 16 THEN 'Nehemiah' WHEN 17 THEN 'Esther' WHEN 18 THEN 'Job' WHEN 19 THEN 'Psalm' WHEN 20 THEN 'Proverbs' WHEN 21 THEN 'Ecclesiastes' WHEN 22 THEN 'Song of Solomon' WHEN 23 THEN 'Isaiah' WHEN 24 THEN 'Jeremiah' WHEN 25 THEN 'Lamentations' WHEN 26 THEN 'Ezekiel' WHEN 27 THEN 'Daniel' WHEN 28 THEN 'Hosea' WHEN 29 THEN 'Joel' WHEN 30 THEN 'Amos' WHEN 31 THEN 'Obadiah' WHEN 32 THEN 'Jonah' WHEN 33 THEN 'Micah' WHEN 34 THEN 'Nahum' WHEN 35 THEN 'Habakkuk' WHEN 36 THEN 'Zephaniah' WHEN 37 THEN 'Haggai' WHEN 38 THEN 'Zechariah' WHEN 39 THEN 'Malachi' WHEN 40 THEN 'Matthew' WHEN 41 THEN 'Mark' WHEN 42 THEN 'Luke' WHEN 43 THEN 'John' WHEN 44 THEN 'Acts' WHEN 45 THEN 'Romans' WHEN 46 THEN '1 Corinthians' WHEN 47 THEN '2 Corinthians' WHEN 48 THEN 'Galatians' WHEN 49 THEN 'Ephesians' WHEN 50 THEN 'Philippians' WHEN 51 THEN 'Colossians' WHEN 52 THEN '1 Thessalonians' WHEN 53 THEN '2 Thessalonians' WHEN 54 THEN '1 Timothy' WHEN 55 THEN '2 Timothy' WHEN 56 THEN 'Titus' WHEN 57 THEN 'Philemon' WHEN 58 THEN 'Hebrews' WHEN 59 THEN 'James' WHEN 60 THEN '1 Peter' WHEN 61 THEN '2 Peter' WHEN 62 THEN '1 John' WHEN 63 THEN '2 John' WHEN 64 THEN '3 John' WHEN 65 THEN 'Jude' WHEN 66 THEN 'Revelation' ELSE 'Other' END")

<sqlite3.Cursor at 0x154fcae7b90>

In [10]:
# Create new column of references
c.execute('ALTER TABLE t_asv ADD COLUMN reference text')
c.execute("UPDATE t_asv SET reference=book_name||' '||c||':'||v")

<sqlite3.Cursor at 0x154fcae7b90>

In [155]:
# Find longest verses
print("Longest verses in ASV Bible:")
c.execute('SELECT t, reference, char_length FROM t_asv ORDER BY LENGTH(t) DESC LIMIT 2').fetchall()

Longest verses in ASV Bible:


[("Then were the king's scribes called at that time, in the third month Sivan, on the three and twentieth `day' thereof; and it was written according to all that Mordecai commanded unto the Jews, and to the satraps, and the governors and princes of the provinces which are from India unto Ethiopia, a hundred twenty and seven provinces, unto every province according to the writing thereof, and unto every people after their language, and to the Jews according to their writing, and according to their language.",
  'Esther 8:9',
  508),
 ("And king Ahaz commanded Urijah the priest, saying, Upon the great altar burn the morning burnt-offering, and the evening meal-offering, and the king's burnt-offering, and his meal-offering, with the burnt-offering of all the people of the land, and their meal-offering, and their drink-offerings; and sprinkle upon it all the blood of the burnt-offering, and all the blood of the sacrifice: but the brazen altar shall be for me to inquire by.",
  '2 Kings 16:

In [154]:
# Find shortest verses
print("Shortest verses in ASV Bible:")
c.execute('SELECT t, reference, char_length FROM t_asv ORDER BY LENGTH(t) LIMIT 7').fetchall()

Shortest verses in ASV Bible:


[('[]', '3 John 1:15', 2),
 ('Jesus wept.', 'John 11:35', 11),
 ('and the second:', 'Luke 20:30', 15),
 ('Rejoice always;', '1 Thessalonians 5:16', 15),
 ('Eber, Peleg, Reu,', '1 Chronicles 1:25', 17),
 ('Adam, Seth, Enosh,', '1 Chronicles 1:1', 18),
 ('Thou shalt not kill.', 'Exodus 20:13', 20)]

In [153]:
# Investigate why there is an empty verse:
print("3 John from ASV Bible:")
c.execute("SELECT t FROM t_asv WHERE book_name='3 John'").fetchall()

3 John from ASV Bible:


[('The elder unto Gaius the beloved, whom I love in truth.',),
 ('Beloved, I pray that in all things thou mayest prosper and be in health, even as thy soul prospereth.',),
 ('For I rejoiced greatly, when brethren came and bare witness unto thy truth, even as thou walkest in truth.',),
 ('Greater joy have I none than this, to hear of my children walking in the truth.',),
 ('Beloved, thou doest a faithful work in whatsoever thou doest toward them that are brethren and strangers withal;',),
 ('who bare witness to thy love before the church: whom thou wilt do well to set forward on their journey worthily of God:',),
 ('because that for the sake of the Name they went forth, taking nothing of the Gentiles.',),
 ('We therefore ought to welcome such, that we may be fellow-workers for the truth.',),
 ('I wrote somewhat unto the church: but Diotrephes, who loveth to have the preeminence among them, receiveth us not.',),
 ("Therefore, if I come, I will bring to remembrance his works which he doet

In [152]:
versions = ['t_asv','t_bbe','t_dby','t_kjv','t_wbt','t_web','t_ylt']
print("3 John 1:15 from different versions:")
for version in versions:
    print(version,c.execute("SELECT t FROM {} WHERE id=64001015".format(version)).fetchall())

3 John 1:15 from different versions:
t_asv [('[]',)]
t_bbe [('May you have peace. Your friends here send you their love. Give my love to our friends by name.',)]
t_dby []
t_kjv [('[]',)]
t_wbt []
t_web []
t_ylt [('[]',)]


In [149]:
# It looks as though one version splits verse 14 into two verses while the others don't.
# Search for other empty verses in all versions
print("versions and their missing verses:")
for version in versions:
    print(version,c.execute("SELECT t,id FROM {} WHERE t='[]'".format(version)).fetchall())

versions and their missing verses:
t_asv [('[]', 64001015)]
t_bbe [('[]', 23064012), ('[]', 27010021), ('[]', 40017021), ('[]', 40018011), ('[]', 40023014), ('[]', 41007016), ('[]', 41009044), ('[]', 41009046), ('[]', 41011026), ('[]', 41015028), ('[]', 42017036), ('[]', 42023017), ('[]', 43005004), ('[]', 44008037), ('[]', 44015034), ('[]', 44019041), ('[]', 44024007), ('[]', 44028029), ('[]', 45016024)]
t_dby []
t_kjv [('[]', 64001015)]
t_wbt []
t_web []
t_ylt [('[]', 64001015)]


In [57]:
# It's interesting that the Bible in Basic English (bbe) version has significantly more empty verses than the other
# versions. This has to do with translation styles which I won't get into here.

In [82]:
# Create new list of words and filtered words in pandas dataframe
import string
bible = pd.read_sql_query('SELECT * FROM t_asv',conn)
trans_table = str.maketrans({key: None for key in string.punctuation})
bible['words'] = [[word.translate(trans_table).lower() for word in bible.loc[x,'t'].split()] \
                  for x in range(bible.shape[0])]

# Find stems and filter out stop-words using natural language toolkit
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
bible['filtered_words'] = pd.Series([[stemmer.stem(word) for word in bible.loc[i,'words'] if word not in stopwords.words('english')] for i in range(bible.shape[0])])

In [91]:
# Find most common n-word phrases in ASV Bible
list_of_words = []
for x in bible['words']:
    list_of_words += x

# Create all 3-word combinations
n = 3
list_of_phrases = []
for i in range(len(list_of_words)):
    phrase = ' '.join(list_of_words[i:i+n])
    list_of_phrases.append(phrase)

# Find most common 3-word phrase in the ASV Bible
import operator
phrases_dict = {}
for i in list_of_phrases:
    if i not in phrases_dict:
        phrases_dict[i] = 1
    else:
        phrases_dict[i] += 1
sorted_phrases = sorted(phrases_dict.items(), key=operator.itemgetter(1), reverse=True)
print("The most common 3-word phrases in the ASV Bible:")
[print(sorted_phrases[x]) for x in range(5)]

The most common 3-word phrases in the ASV Bible:
('the son of', 1440)
('the children of', 1344)
('the house of', 820)
('out of the', 776)
('and i will', 725)


[None, None, None, None, None]

In [98]:
# Most common n-word phrases in ASV Bible
n = 6
list_of_phrases = []
for i in range(len(list_of_words)):
    phrase = ' '.join(list_of_words[i:i+n])
    list_of_phrases.append(phrase)
phrases_dict = {}
for i in list_of_phrases:
    if i not in phrases_dict:
        phrases_dict[i] = 1
    else:
        phrases_dict[i] += 1
sorted_phrases = sorted(phrases_dict.items(), key=operator.itemgetter(1), reverse=True)

print("The most common {}-word phrases in the ASV Bible:".format(n))
[print(sorted_phrases[x]) for x in range(5)]

The most common 6-word phrases in the ASV Bible:
('and it came to pass when', 139)
('and it shall come to pass', 101)
('out of the land of egypt', 85)
('and jehovah spake unto moses saying', 72)
('the word of jehovah came unto', 65)


[None, None, None, None, None]

In [103]:
# Find most common n-word phrases in the New Testament of the ASV Bible
list_of_words = []
for x in bible[bible.id>=40001001]['words']:
    list_of_words += x

n = 6
list_of_phrases = []
for i in range(len(list_of_words)):
    phrase = ' '.join(list_of_words[i:i+n])
    list_of_phrases.append(phrase)
phrases_dict = {}
for i in list_of_phrases:
    if i not in phrases_dict:
        phrases_dict[i] = 1
    else:
        phrases_dict[i] += 1
sorted_phrases = sorted(phrases_dict.items(), key=operator.itemgetter(1), reverse=True)
print("The most common {}-word phrases in the New Testament of the ASV Bible:".format(n))
[print(sorted_phrases[x]) for x in range(5)]

The most common 6-word phrases in the New Testament of the ASV Bible:
('verily verily i say unto you', 20)
('and it came to pass when', 14)
('he answered and said unto them', 12)
('and it came to pass as', 11)
('jesus answered and said unto them', 11)


[None, None, None, None, None]

In [109]:
# Find verses that contain a search word

search_word = 'Sluggard'
trans_table = str.maketrans({key: None for key in string.punctuation})
filtered_search_word = stemmer.stem(search_word).translate(trans_table).lower()

found = []
for i in range(len(bible)):
    found.append(filtered_search_word in bible.loc[i,'filtered_words'])
print("Number of verses found: {}".format(bible[found].shape[0]))
bible[found][['reference','t']]

Number of verses found: 14


Unnamed: 0,reference,t
16546,Proverbs 6:6,"Go to the ant, thou sluggard; Consider her ways, and be wise:"
16549,Proverbs 6:9,"How long wilt thou sleep, O sluggard? When wilt thou arise out of thy sleep?"
16682,Proverbs 10:26,"As vinegar to the teeth, and as smoke to the eyes, So is the sluggard to them that send him."
16751,Proverbs 13:4,"The soul of the sluggard desireth, and hath nothing; But the soul of the diligent shall be made fat."
16826,Proverbs 15:19,The way of the sluggard is as a hedge of thorns; But the path of the upright is made a highway.
16949,Proverbs 19:24,"The sluggard burieth his hand in the dish, And will not so much as bring it to his mouth again."
16958,Proverbs 20:4,"The sluggard will not plow by reason of the winter; Therefore he shall beg in harvest, and have nothing."
17009,Proverbs 21:25,The desire of the sluggard killeth him; For his hands refuse to labor.
17028,Proverbs 22:13,"The sluggard saith, There is a lion without: I shall be slain in the streets."
17109,Proverbs 24:30,"I went by the field of the sluggard, And by the vineyard of the man void of understanding;"


In [110]:
# Create sparse matrix for unsupervised learning
def create_sparse_matrix(series):
    sm_cols = set(x for val in series for x in val)
    sm = pd.DataFrame(np.zeros((len(series),len(sm_cols))),columns=sm_cols)
    for i,val in enumerate(series):
        for x in val:
            sm.loc[i,x] += 1
    sm.drop(sm.loc[:,sm.sum()<=1].columns,axis=1,inplace=True)
    return sm

In [125]:
romans = bible[bible.book_name=='Romans']
new_testament = bible[bible.b>=40]

In [122]:
# Initiate unsupervised learning using KMeans clustering
import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans
km = KMeans(n_clusters=20)

km.fit(create_sparse_matrix(romans['filtered_words']))
romans['group'] = km.labels_.astype(np.int)
romans['group'].value_counts()

19    140
2      62
14     43
7      37
6      33
4      32
11     19
3      14
8      13
12     12
13      7
5       6
10      4
9       3
18      2
16      2
15      1
17      1
1       1
0       1
Name: group, dtype: int64

In [126]:
km.fit(create_sparse_matrix(new_testament['filtered_words']))
new_testament['group'] = km.labels_.astype(np.int)
new_testament['group'].value_counts()

12    2811
10     721
5      572
19     529
15     430
8      356
13     347
4      323
7      304
6      294
3      261
2      251
16     240
18     170
1      162
9      128
14      20
0       19
17      18
11       2
Name: group, dtype: int64

In [147]:
new_testament[new_testament.group==9]

Unnamed: 0,id,b,c,v,t,char_length,book_name,reference,words,filtered_words,group
23217,40004008,40,4,8,"Again, the devil taketh him unto an exceeding high mountain, and showeth him all the kingdoms of the world, and the glory of them;",130,Matthew,Matthew 4:8,"[again, the, devil, taketh, him, unto, an, exceeding, high, mountain, and, showeth, him, all, the, kingdoms, of, the, world, and, the, glory, of, them]","[devil, taketh, unto, exceed, high, mountain, showeth, kingdom, world, glori]",9
23248,40005014,40,5,14,Ye are the light of the world. A city set on a hill cannot be hid.,66,Matthew,Matthew 5:14,"[ye, are, the, light, of, the, world, a, city, set, on, a, hill, cannot, be, hid]","[ye, light, world, citi, set, hill, cannot, hid]",9
23561,40013022,40,13,22,"And he that was sown among the thorns, this is he that heareth the word; and the care of the world, and the deceitfulness of riches, choke the word, and he becometh unfruitful.",176,Matthew,Matthew 13:22,"[and, he, that, was, sown, among, the, thorns, this, is, he, that, heareth, the, word, and, the, care, of, the, world, and, the, deceitfulness, of, riches, choke, the, word, and, he, becometh, unfruitful]","[sown, among, thorn, heareth, word, care, world, deceit, rich, choke, word, becometh, unfruit]",9
23577,40013038,40,13,38,"and the field is the world; and the good seed, these are the sons of the kingdom; and the tares are the sons of the evil `one';",127,Matthew,Matthew 13:38,"[and, the, field, is, the, world, and, the, good, seed, these, are, the, sons, of, the, kingdom, and, the, tares, are, the, sons, of, the, evil, one]","[field, world, good, seed, son, kingdom, tare, son, evil, one]",9
23578,40013039,40,13,39,and the enemy that sowed them is the devil: and the harvest is the end of the world; and the reapers are angels.,112,Matthew,Matthew 13:39,"[and, the, enemy, that, sowed, them, is, the, devil, and, the, harvest, is, the, end, of, the, world, and, the, reapers, are, angels]","[enemi, sow, devil, harvest, end, world, reaper, angel]",9
23579,40013040,40,13,40,As therefore the tares are gathered up and burned with fire; so shall it be in the end of the world.,100,Matthew,Matthew 13:40,"[as, therefore, the, tares, are, gathered, up, and, burned, with, fire, so, shall, it, be, in, the, end, of, the, world]","[therefor, tare, gather, burn, fire, shall, end, world]",9
23734,40018007,40,18,7,Woe unto the world because of occasions of stumbling! for it must needs be that the occasions come; but woe to that man through whom the occasion cometh!,153,Matthew,Matthew 18:7,"[woe, unto, the, world, because, of, occasions, of, stumbling, for, it, must, needs, be, that, the, occasions, come, but, woe, to, that, man, through, whom, the, occasion, cometh]","[woe, unto, world, occas, stumbl, must, need, occas, come, woe, man, occas, cometh]",9
24042,40025034,40,25,34,"Then shall the King say unto them on his right hand, Come, ye blessed of my Father, inherit the kingdom prepared for you from the foundation of the world:",154,Matthew,Matthew 25:34,"[then, shall, the, king, say, unto, them, on, his, right, hand, come, ye, blessed, of, my, father, inherit, the, kingdom, prepared, for, you, from, the, foundation, of, the, world]","[shall, king, say, unto, right, hand, come, ye, bless, father, inherit, kingdom, prepar, foundat, world]",9
24215,40028020,40,28,20,"teaching them to observe all things whatsoever I commanded you: and lo, I am with you always, even unto the end of the world.",125,Matthew,Matthew 28:20,"[teaching, them, to, observe, all, things, whatsoever, i, commanded, you, and, lo, i, am, with, you, always, even, unto, the, end, of, the, world]","[teach, observ, thing, whatsoev, command, lo, alway, even, unto, end, world]",9
24342,41004019,41,4,19,"and the cares of the world, and the deceitfulness of riches, and the lusts of other things entering in, choke the word, and it becometh unfruitful.",147,Mark,Mark 4:19,"[and, the, cares, of, the, world, and, the, deceitfulness, of, riches, and, the, lusts, of, other, things, entering, in, choke, the, word, and, it, becometh, unfruitful]","[care, world, deceit, rich, lust, thing, enter, choke, word, becometh, unfruit]",9


In [None]:
# Group 0 is the 'seven' group
# Group 17 is the 'son' group
# Group 11 is the 'olive' group
# Group 14 is the 'thing' group
# Group 9 is the 'world' group
# etc.