In [59]:
#import chardet
import pandas as pd
import numpy as np

In [60]:
# Path of the tab-separated Bible text file loaded by this notebook.
BIBLE_TEXT_FILE_PATH = "D:/LO_resources/eng-BSB.txt"

# Book names in traditional canonical order. A book's position in this list
# (plus one) becomes its `traditional_book_order`, so the sequence matters:
# Lamentations must follow Jeremiah (it was previously placed before Isaiah,
# which shifted the numbers of Isaiah, Jeremiah and Lamentations).
BOOK_NAMES_LIST = [
    # Old Testament
    'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
    'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings', '2 Kings',
    '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah', 'Esther',
    'Job', 'Psalm', 'Proverbs', 'Ecclesiastes', 'Song of Solomon',
    'Isaiah', 'Jeremiah', 'Lamentations', 'Ezekiel', 'Daniel',
    'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah', 'Nahum',
    'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah', 'Malachi',
    # New Testament
    'Matthew', 'Mark', 'Luke', 'John', 'Acts',
    'Romans', '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians',
    'Philippians', 'Colossians', '1 Thessalonians', '2 Thessalonians',
    '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
    'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John', '3 John',
    'Jude', 'Revelation',
]

# Alternative grouping and reading order of the books, taken from Biblica.com's
# 'The Books of the Bible'. Each list gives the books of one group in that
# group's own order.
COVENANT_HISTORY_ORDER_LIST = [
    'Genesis',
    'Exodus',
    'Leviticus',
    'Numbers',
    'Deuteronomy',
    'Joshua',
    'Judges',
    'Ruth',
    '1 Samuel',
    '2 Samuel',
    '1 Kings',
    '2 Kings',
]
THE_PROPHETS_ORDER_LIST = [
    'Jonah',
    'Amos',
    'Hosea',
    'Micah',
    'Isaiah',
    'Zephaniah',
    'Nahum',
    'Habakkuk',
    'Jeremiah',
    'Obadiah',
    'Ezekiel',
    'Haggai',
    'Zechariah',
    'Joel',
    'Malachi',
]
THE_WRITINGS_ORDER_LIST = [
    'Psalm',
    'Lamentations',
    'Song of Solomon',
    'Proverbs',
    'Ecclesiastes',
    'Job',
    '1 Chronicles',
    '2 Chronicles',
    'Ezra',
    'Nehemiah',
    'Esther',
    'Daniel',
]
NEW_TESTAMENT_ORDER_LIST = [
    'Luke',
    'Acts',
    '1 Thessalonians',
    '2 Thessalonians',
    '1 Corinthians',
    '2 Corinthians',
    'Galatians',
    'Romans',
    'Colossians',
    'Ephesians',
    'Philemon',
    'Philippians',
    '1 Timothy',
    'Titus',
    '2 Timothy',
    'Matthew',
    'Hebrews',
    'James',
    'Mark',
    '1 Peter',
    '2 Peter',
    'Jude',
    'John',
    '1 John',
    '2 John',
    '3 John',
    'Revelation',
]

# Group labels, listed in the same sequence as the four book lists above.
ALT_GROUP_LIST = [
    'Covenant History',
    'The Prophets',
    'The Writings',
    'New Testament',
]

# Maps every book name to its traditional group label. The dict is assembled
# from consecutive runs of books that share a label, so keys are inserted in
# the order the runs are listed ('History' appears twice because Acts sits
# between the Gospels and the Letters).
TRAD_BIBLE_GROUP_DICT = {
    book_name: group_label
    for group_label, book_names in (
        ('Law', (
            'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
        )),
        ('History', (
            'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel',
            '1 Kings', '2 Kings', '1 Chronicles', '2 Chronicles',
            'Ezra', 'Nehemiah', 'Esther',
        )),
        ('Wisdom', (
            'Job', 'Psalm', 'Proverbs', 'Ecclesiastes', 'Song of Solomon',
        )),
        ('Major Prophets', (
            'Isaiah', 'Jeremiah', 'Lamentations', 'Ezekiel', 'Daniel',
        )),
        ('Minor Prophets', (
            'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah', 'Nahum',
            'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah', 'Malachi',
        )),
        ('Gospel', (
            'Matthew', 'Mark', 'Luke', 'John',
        )),
        ('History', (
            'Acts',
        )),
        ('Letter', (
            'Romans', '1 Corinthians', '2 Corinthians', 'Galatians',
            'Ephesians', 'Philippians', 'Colossians',
            '1 Thessalonians', '2 Thessalonians',
            '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
            'Hebrews', 'James', '1 Peter', '2 Peter',
            '1 John', '2 John', '3 John', 'Jude', 'Revelation',
        )),
    )
    for book_name in book_names
}


In [61]:
# Parse the tab-separated verse file as UTF-8, then discard the rows labelled
# 0 and 1, which describe the document itself rather than holding verses.
# The remaining rows keep their original index labels (starting at 2).
df = pd.read_csv(
    BIBLE_TEXT_FILE_PATH,
    sep='\t',
    encoding='utf-8',
).drop(index=[0, 1])

In [62]:
# Derive per-verse metadata columns from the 'verse' reference and the 'text'.

# The book is the first known book name found in the reference string.
df['book'] = df['verse'].str.extract(f"({'|'.join(BOOK_NAMES_LIST)})", expand=False)

# Stop early with a readable message when a reference names no known book
# (this used to surface later as an opaque ValueError from list.index).
unmatched_rows = df['book'].isna()
if unmatched_rows.any():
    raise ValueError(
        "No known book name found in verse reference(s): "
        f"{df.loc[unmatched_rows, 'verse'].head().tolist()}"
    )

# Verse number: the text after the last ':'. Chapter number: the last
# space-separated token before the first ':'. Both are kept as strings.
reference_parts = df['verse'].str.split(':')
df['verse_number'] = reference_parts.str.get(-1)
df['chapter_number'] = reference_parts.str.get(0).str.split(' ').str.get(-1)

# Size of each verse in characters and in whitespace-separated words
# (rows whose text is missing yield NaN).
df['verse_length'] = df['text'].str.len()
df['verse_word_count'] = df['text'].str.split().str.len()

# Traditional group label looked up per book.
df['traditional_group'] = df['book'].map(TRAD_BIBLE_GROUP_DICT)

# 1-based position of the book in BOOK_NAMES_LIST; a dict lookup replaces a
# linear list.index scan for every row.
traditional_position = {name: pos for pos, name in enumerate(BOOK_NAMES_LIST, start=1)}
df['traditional_book_order'] = df['book'].map(traditional_position)

# Alternative reading order: the four group lists concatenated in group order.
alt_book_order = COVENANT_HISTORY_ORDER_LIST +THE_PROPHETS_ORDER_LIST+THE_WRITINGS_ORDER_LIST+NEW_TESTAMENT_ORDER_LIST

# Every known book must belong to an alternative group; otherwise the lookups
# below would silently produce blanks/NaN instead of failing.
uncovered_books = set(BOOK_NAMES_LIST).difference(alt_book_order)
if uncovered_books:
    raise ValueError(f"Books missing from the alternative grouping: {sorted(uncovered_books)}")

# One membership mask per alternative group, in the same order as ALT_GROUP_LIST.
alt_conditions = [
    df['book'].isin(COVENANT_HISTORY_ORDER_LIST),
    df['book'].isin(THE_PROPHETS_ORDER_LIST),
    df['book'].isin(THE_WRITINGS_ORDER_LIST),
    df['book'].isin(NEW_TESTAMENT_ORDER_LIST)
]

# Label each row with its group. The default is given explicitly as a string:
# np.select's implicit default is the integer 0, which cannot be combined with
# string choices on recent NumPy releases (and became the text '0' on older ones).
df['alternative_group'] = np.select(alt_conditions, ALT_GROUP_LIST, default='')

# 1-based position of the book within the alternative reading order.
alternative_position = {name: pos for pos, name in enumerate(alt_book_order, start=1)}
df['alternative_book_order'] = df['book'].map(alternative_position)

# Show the enriched table using the notebook's rich (truncated) display.
df

                  verse                                               text  \
2           Genesis 1:1  In the beginning God created the heavens and t...   
3           Genesis 1:2  Now the earth was formless and void, and darkn...   
4           Genesis 1:3  And God said, “Let there be light,” and there ...   
5           Genesis 1:4  And God saw that the light was good, and He se...   
6           Genesis 1:5  God called the light “day,” and the darkness H...   
...                 ...                                                ...   
31099  Revelation 22:17  The Spirit and the bride say, “Come!” Let the ...   
31100  Revelation 22:18  I testify to everyone who hears the words of p...   
31101  Revelation 22:19  And if anyone takes away from the words of thi...   
31102  Revelation 22:20  He who testifies to these things says, “Yes, I...   
31103  Revelation 22:21  The grace of the Lord Jesus be with all the sa...   

             book verse_number chapter_number  verse_length  ve

In [63]:
# Display the verses whose text contains at most two whitespace-separated words.
df.loc[df['verse_word_count'].le(2)]

Unnamed: 0,verse,text,book,verse_number,chapter_number,verse_length,verse_word_count,traditional_group,traditional_book_order,alternative_group,alternative_book_order
4699,Numbers 31:33,"72,000 cattle,",Numbers,33,31,14.0,2.0,Law,4,Covenant History,4
4700,Numbers 31:34,"61,000 donkeys,",Numbers,34,31,15.0,2.0,Law,4,Covenant History,4
4710,Numbers 31:44,"36,000 cattle,",Numbers,44,31,14.0,2.0,Law,4,Covenant History,4
4711,Numbers 31:45,"30,500 donkeys,",Numbers,45,31,15.0,2.0,Law,4,Covenant History,4
6397,Joshua 21:14,"Jattir, Eshtemoa,",Joshua,14,21,17.0,2.0,History,6,Covenant History,6
6398,Joshua 21:15,"Holon, Debir,",Joshua,15,21,13.0,2.0,History,6,Covenant History,6
10514,1 Chronicles 6:58,"Hilen, Debir,",1 Chronicles,58,6,13.0,2.0,History,13,The Writings,34
10524,1 Chronicles 6:68,"Jokmeam, Beth-horon,",1 Chronicles,68,6,20.0,2.0,History,13,The Writings,34
26560,John 11:35,Jesus wept.,John,35,11,11.0,2.0,Gospel,43,New Testament,62


In [64]:
# Write the enriched verse table to disk as a UTF-8 encoded CSV file.
df.to_csv(
    path_or_buf='D:/LO_resources/eng-BSB.csv',
    encoding='utf-8',
)