In [466]:
import os
import re
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### KJV

In [467]:
# Load the cleaned KJV verse table (one row per verse) from CSV.
df_kjv = pd.read_csv("../data/kjv_clean.csv")

In [468]:
print(df_kjv)

      testament_name   book_name  chapter_number  verse_number  \
0      Old Testament     Genesis               1             1   
1      Old Testament     Genesis               1             2   
2      Old Testament     Genesis               1             3   
3      Old Testament     Genesis               1             4   
4      Old Testament     Genesis               1             5   
...              ...         ...             ...           ...   
31097  New Testament  Revelation              22            17   
31098  New Testament  Revelation              22            18   
31099  New Testament  Revelation              22            19   
31100  New Testament  Revelation              22            20   
31101  New Testament  Revelation              22            21   

                                              verse_text  
0      In the beginning God created the heaven and th...  
1      And the earth was without form, and void; and ...  
2      And God said, Let there

In [469]:
# Renumber the rows 0..n-1, discarding whatever index the frame had before.
df_kjv = df_kjv.reset_index(drop=True)

# Build the "Book Chapter:Verse" key for every row from its three components.
df_kjv['verse_ref'] = [
    f"{book} {chapter}:{verse}"
    for book, chapter, verse in zip(
        df_kjv['book_name'], df_kjv['chapter_number'], df_kjv['verse_number']
    )
]

In [470]:
# Reorder the columns: testament and verse key first, then the reference
# components, then the verse text.
df_kjv = df_kjv.loc[:, [
    "testament_name",
    "verse_ref",
    "book_name",
    "chapter_number",
    "verse_number",
    "verse_text",
]]

In [471]:
print(df_kjv)

      testament_name         verse_ref   book_name  chapter_number  \
0      Old Testament       Genesis 1:1     Genesis               1   
1      Old Testament       Genesis 1:2     Genesis               1   
2      Old Testament       Genesis 1:3     Genesis               1   
3      Old Testament       Genesis 1:4     Genesis               1   
4      Old Testament       Genesis 1:5     Genesis               1   
...              ...               ...         ...             ...   
31097  New Testament  Revelation 22:17  Revelation              22   
31098  New Testament  Revelation 22:18  Revelation              22   
31099  New Testament  Revelation 22:19  Revelation              22   
31100  New Testament  Revelation 22:20  Revelation              22   
31101  New Testament  Revelation 22:21  Revelation              22   

       verse_number                                         verse_text  
0                 1  In the beginning God created the heaven and th...  
1            

In [472]:
# df_kjv.to_csv("../dataset/kjv.csv", index=False)

In [473]:
# Map each verse reference to its 0-based row position in df_kjv.
verse_to_index = dict(zip(df_kjv['verse_ref'], range(len(df_kjv))))

In [474]:
# Map each verse reference to the testament name stored on its df_kjv row.
verse_to_testament = dict(zip(df_kjv['verse_ref'], df_kjv['testament_name']))

### KJV Sites

In [475]:
# Load the table of locations mentioned in KJV verses from CSV.
df_kjv_sites = pd.read_csv("../data/kjv_locs_all.csv")

In [476]:
print(df_kjv_sites)

      book_name  book_number  chapter_number  verse_number  \
0       2 Kings           12               5            12   
1       2 Kings           12               5            12   
2       2 Kings           12               5            12   
3       2 Kings           12               5            12   
4       2 Kings           12               5            12   
...         ...          ...             ...           ...   
17065  1 Samuel            9               9             5   
17066  1 Samuel            9               9             5   
17067  1 Samuel            9               9             5   
17068  1 Samuel            9               9             5   
17069  1 Samuel            9               9             5   

                                              verse_text name_id_ancient  \
0      Are not Abana and Pharpar, rivers of Damascus,...           Abana   
1      Are not Abana and Pharpar, rivers of Damascus,...        Damascus   
2      Are not Abana and Ph

In [477]:
# Build the same "Book Chapter:Verse" key used in df_kjv so site rows can be
# matched against the verse lookups.
df_kjv_sites['verse_ref'] = [
    f"{book} {chapter}:{verse}"
    for book, chapter, verse in zip(
        df_kjv_sites['book_name'],
        df_kjv_sites['chapter_number'],
        df_kjv_sites['verse_number'],
    )
]

In [478]:
# The reference components are now folded into verse_ref, so drop them
# (together with book_number and verse_text) in place.
df_kjv_sites.drop(columns=["book_name", "chapter_number", "verse_number", "book_number", "verse_text"], inplace=True)

In [479]:
def get_testament_from_verse_list(verse):
    """
    Return the testament name for a single verse reference.

    Despite the name, this version takes one reference string (e.g.
    "Genesis 1:1"), not a list. The same function name is redefined further
    down the notebook for other input shapes, so the definition in effect
    depends on which cell ran last.

    Returns None when `verse` is falsy or is not a key of `verse_to_testament`.
    """
    return verse_to_testament.get(verse, None) if verse else None

# Tag every site row with the testament of the verse it is mentioned in.
df_kjv_sites['testament_name'] = df_kjv_sites['verse_ref'].apply(get_testament_from_verse_list)

In [480]:
def clean_text(text):
    """
    Strip digits from a string and normalise its whitespace.

    Every character for which `str.isdigit()` is true is removed, then runs of
    whitespace are collapsed to single spaces and the ends are trimmed.
    Non-string values (e.g. NaN) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    without_digits = ''.join(ch for ch in text if not ch.isdigit())
    return " ".join(without_digits.split())

# Remove digits and extra whitespace from both the ancient and the modern site names.
df_kjv_sites["name_id_ancient"] = df_kjv_sites["name_id_ancient"].apply(clean_text)
df_kjv_sites["name_id_modern"] = df_kjv_sites["name_id_modern"].apply(clean_text)

In [481]:
# Order matters: a "-" in the country column is relabelled "State of Palestine"
# first, because the next line turns every remaining "-" cell in the frame into NaN.
df_kjv_sites["country"] = df_kjv_sites["country"].replace("-", "State of Palestine")
df_kjv_sites.replace("-", np.nan, inplace=True)

In [482]:
# Keep only the output columns, in their final order.
df_kjv_sites = df_kjv_sites.loc[:, [
    # site names
    "name_id_ancient",
    "name_id_modern",
    # coordinates
    "latitude",
    "longitude",
    # address components
    "route",
    "neighborhood",
    "administrative_area_level_2",
    "administrative_area_level_1",
    "locality",
    "country",
    # verse linkage
    "testament_name",
    "verse_ref",
]]

In [483]:
print(df_kjv_sites)

      name_id_ancient name_id_modern   latitude  longitude  \
0               Abana   Barada River  33.513542  36.305000   
1            Damascus       Damascus  33.511112  36.306390   
2             Pharpar   Barbar River  33.416667  36.133333   
3             Pharpar     Awaj River  33.313620  36.055535   
4             Pharpar     Tora River  33.540556  36.353056   
...               ...            ...        ...        ...   
17065            Zuph         Al Ram  31.854340  35.231610   
17066            Zuph    Ramat Rahel  31.739901  35.216896   
17067            Zuph         Rantis  32.028056  35.019444   
17068            Zuph      Beit Rima  32.032860  35.102500   
17069            Zuph        Al Bira  31.905142  35.214958   

                             route       neighborhood  \
0                 جادة بين السورين  Amarah Barraniyah   
1              Al Kabakbieh Street       Al-Hamidiyeh   
2                   Route Sans Nom                NaN   
3                   Route S

In [484]:
# df_kjv_sites.to_csv("../dataset/kjv_sites.csv", index=False)

### KJV Events

In [540]:
# Load the events table; dtype=str reads every column as text, so values such
# as "0060" keep their original spelling until they are parsed explicitly.
df_kjv_events = pd.read_csv("../data/events.csv", dtype=str)

In [541]:
print(df_kjv_events)

                               title   ID startDate duration rangeFlag  \
0             Creation of all things    1     -4003       6D       NaN   
1           Creation of Adam and Eve    2     -4003       1D       NaN   
2                           The Fall    3     -4003       1D   checked   
3                   Lifetime of Adam    4     -4003     930Y       NaN   
4                    Cain kills Abel    5     -3874       1D       NaN   
..                               ...  ...       ...      ...       ...   
390  Paul's First Roman imprisonment  385      0060       3Y       NaN   
391         Feeding of Five Thousand  393        29       1D       NaN   
392             Jesus Walks on Water  394        29       1D       NaN   
393            Healing in Gennesaret  396        29       1D       NaN   
394             Bread of Life Sermon  397        29       1D       NaN   

                  predecessor  lag Lag Type  \
0                         NaN  NaN      NaN   
1                

In [542]:
# Keep only the four columns the rest of this section works with.
df_kjv_events = df_kjv_events[["title", "startDate", "duration", "verses"]]

In [543]:
def sort_verses(verse_str):
    """
    Sort a comma-separated string of dotted references (e.g. "gen.1.2,gen.1.1").

    The sort key is the tuple of integers after the first dot-separated field,
    i.e. everything except the leading book token. The sort is stable, so
    references with equal keys keep their input order.

    NOTE(review): the book token is not part of the key, so references from
    different books are interleaved by chapter/verse -- confirm this is intended.
    """
    def numeric_key(ref):
        return tuple(int(piece) for piece in ref.split(".")[1:])

    refs = verse_str.split(",")
    refs.sort(key=numeric_key)
    return ",".join(refs)

# Put each event's verse list into chapter/verse order.
df_kjv_events['verses'] = df_kjv_events['verses'].apply(sort_verses)

In [544]:
# Upper-case book abbreviation -> full book name as spelled in the verse references.
kjv_books_abv = {
    # Old Testament
    "GEN": "Genesis",
    "EXOD": "Exodus",
    "LEV": "Leviticus",
    "NUM": "Numbers",
    "DEUT": "Deuteronomy",
    "JOSH": "Joshua",
    "JUDG": "Judges",
    "RUT": "Ruth",
    "1SAM": "1 Samuel",
    "2SAM": "2 Samuel",
    "1KGS": "1 Kings",
    "2KGS": "2 Kings",
    "1CHR": "1 Chronicles",
    "2CHR": "2 Chronicles",
    "EZR": "Ezra",
    "NEH": "Nehemiah",
    "EST": "Esther",
    "JOB": "Job",
    "PSA": "Psalms",
    "PRO": "Proverbs",
    "ECC": "Ecclesiastes",
    "SNG": "Song of Solomon",
    "ISA": "Isaiah",
    "JER": "Jeremiah",
    "LAM": "Lamentations",
    "EZEK": "Ezekiel",
    "DAN": "Daniel",
    "HOS": "Hosea",
    "JOEL": "Joel",
    "AMOS": "Amos",
    "OBAD": "Obadiah",
    "JONAH": "Jonah",
    "MIC": "Micah",
    "NAH": "Nahum",
    "HAB": "Habakkuk",
    "ZEPH": "Zephaniah",
    "HAG": "Haggai",
    "ZECH": "Zechariah",
    "MAL": "Malachi",
    # New Testament
    "MATT": "Matthew",
    "MARK": "Mark",
    "LUKE": "Luke",
    "JOHN": "John",
    "ACTS": "Acts",
    "ROM": "Romans",
    "1CO": "1 Corinthians",
    "2CO": "2 Corinthians",
    "GAL": "Galatians",
    "EPH": "Ephesians",
    "PHP": "Philippians",
    "COL": "Colossians",
    "1TH": "1 Thessalonians",
    "2TH": "2 Thessalonians",
    "1TI": "1 Timothy",
    "2TI": "2 Timothy",
    "TIT": "Titus",
    "PHM": "Philemon",
    "HEB": "Hebrews",
    "JAS": "James",
    "1PE": "1 Peter",
    "2PE": "2 Peter",
    "1JN": "1 John",
    "2JN": "2 John",
    "3JN": "3 John",
    "JUD": "Jude",
    "REV": "Revelation",
}

In [545]:
def format_verses(verses_str):
    """
    Convert a comma-separated string of dotted references ("gen.1.1,gen.1.2")
    into a list of "Book Chapter:Verse" strings.

    Entries that do not split into exactly three dot-separated parts are
    skipped. The book token is upper-cased and looked up in `kjv_books_abv`;
    an unknown abbreviation raises KeyError.
    """
    formatted = []
    for raw_ref in verses_str.split(','):
        pieces = raw_ref.split('.')
        if len(pieces) != 3:
            continue
        abbreviation, chapter, verse_num = pieces
        book = kjv_books_abv[abbreviation.upper()]
        formatted.append(f"{book} {chapter}:{verse_num}")
    return formatted


# Expand each dotted verse string into a list of "Book Chapter:Verse"
# references, then drop the raw string column in place.
df_kjv_events["verses_ls"] = df_kjv_events["verses"].apply(format_verses)
df_kjv_events.drop(columns=['verses'], inplace=True)

In [546]:
def group_verses(verses):
    """
    Split a list of verse references into runs of consecutive verses.

    A verse joins the current run only when both it and the previous verse are
    keys of `verse_to_index` and its index is exactly one more than the
    previous verse's index. A verse that is missing from `verse_to_index`
    always starts a run of its own and is never followed by a continuation.

    Parameters
    ----------
    verses : list of str
        Verse references, in the order they should be grouped.

    Returns
    -------
    list of list of str
        The runs, in input order; an empty input gives an empty list.
    """
    groups = []
    for v in verses:
        idx = verse_to_index.get(v)
        if groups:
            prev_idx = verse_to_index.get(groups[-1][-1])
            # Explicit None checks instead of numeric sentinels: with defaults
            # of -1 and -2, two unknown verses compared as -1 == -2 + 1 and
            # were wrongly treated as consecutive.
            if idx is not None and prev_idx is not None and idx == prev_idx + 1:
                groups[-1].append(v)
                continue
        groups.append([v])
    return groups


# Group each event's verse list into runs of consecutive verses (a list of lists).
df_kjv_events['verses_lss'] = df_kjv_events['verses_ls'].apply(group_verses)

In [547]:
# The flat verse list is superseded by the grouped one; drop it and give the
# remaining columns their final snake_case names.
df_kjv_events.drop(columns=['verses_ls'], inplace=True)
df_kjv_events.rename(columns={'verses_lss': 'verse_ref_ls_ls',
                            'startDate': 'start_date'}, inplace=True)

In [548]:
def get_testament_from_verse_list(verse_list):
    """
    Return the testament name for the first verse of the first group.

    `verse_list` is a list of verse groups (a list of lists of references, as
    produced by `group_verses`). The first reference of the first group is
    looked up in `verse_to_testament`.

    Returns None when `verse_list` is empty or the reference is unknown.
    """
    if not verse_list:
        return None
    leading_group = verse_list[0]
    return verse_to_testament.get(leading_group[0], None)

# Tag every event with the testament of the first verse in its first group.
df_kjv_events['testament_name'] = df_kjv_events['verse_ref_ls_ls'].apply(get_testament_from_verse_list)

In [549]:
def process_start_date(value):
    """
    Extract the (possibly negative) year from a start-date value.

    Accepts a bare year ("-4003", "0060", 29) or a year followed by
    "-"-separated date parts ("0030-04-03", "-1446-03-15").

    Parameters
    ----------
    value : str or int
        The raw start date; it is converted with `str()` before parsing.

    Returns
    -------
    int
        The year, with any leading sign preserved.

    Raises
    ------
    ValueError
        If the text does not start with an optionally signed integer year.
    """
    text = str(value).strip()
    # Anchor on the leading signed integer. Splitting on "-" and taking the
    # first piece yields "" for negative dates such as "-1446-03-15" or
    # "-10000", which made int() raise.
    match = re.fullmatch(r"([+-]?\d+)(?:-.*)?", text)
    if match is None:
        raise ValueError(f"Cannot parse a year from start date {value!r}")
    return int(match.group(1))

# Derive the integer start year, then drop the raw start-date column in place.
df_kjv_events['start_year'] = df_kjv_events['start_date'].apply(process_start_date)
df_kjv_events.drop(columns=['start_date'], inplace=True)

In [550]:
def get_largest_duration(duration):
    """
    Reduce a duration string to its component with the largest unit.

    Components are a number followed by one of Y, M, W or D (e.g. "1Y2M").
    Units are tried in the order Y, M, W, D; the first component in the string
    that carries the first unit found is returned as "<number><unit>".

    Returns None when the string contains no such component.
    """
    components = re.findall(r'(\d*\.?\d+)([YMWD])', duration)
    for wanted in "YMWD":
        hit = next(
            (f"{amount}{unit}" for amount, unit in components if unit == wanted),
            None,
        )
        if hit is not None:
            return hit
    return None

# Keep only the largest-unit component of each duration string.
df_kjv_events['duration'] = df_kjv_events['duration'].apply(get_largest_duration)

In [551]:
# Final column order: event description first, verse linkage last.
df_kjv_events = df_kjv_events.loc[:, [
    'title',
    'start_year',
    'duration',
    'testament_name',
    'verse_ref_ls_ls',
]]

In [552]:
print(df_kjv_events)

                               title  start_year duration testament_name  \
0             Creation of all things       -4003       6D  Old Testament   
1           Creation of Adam and Eve       -4003       1D  Old Testament   
2                           The Fall       -4003       1D  Old Testament   
3                   Lifetime of Adam       -4003     930Y  Old Testament   
4                    Cain kills Abel       -3874       1D  Old Testament   
..                               ...         ...      ...            ...   
390  Paul's First Roman imprisonment          60       3Y  New Testament   
391         Feeding of Five Thousand          29       1D  New Testament   
392             Jesus Walks on Water          29       1D  New Testament   
393            Healing in Gennesaret          29       1D  New Testament   
394             Bread of Life Sermon          29       1D  New Testament   

                                       verse_ref_ls_ls  
0    [[Genesis 1:1, Genesis 1:

In [553]:
# df_kjv_events.to_csv("../dataset/kjv_events.csv", index=False)

### KJV Cross-References

In [608]:
# Load the cross-reference table; each row links a source verse (or range) to
# a target verse (or range) through *_min / *_max column groups.
df_kjv_cross_references = pd.read_csv("../data/kjv_cr.csv")

In [609]:
print(df_kjv_cross_references)

       book_name_source_min  chapter_number_source_min  \
0                   Genesis                          1   
1                   Genesis                          1   
2                   Genesis                          1   
3                   Genesis                          1   
4                   Genesis                          1   
...                     ...                        ...   
344794           Revelation                         22   
344795           Revelation                         22   
344796           Revelation                         22   
344797           Revelation                         22   
344798           Revelation                         22   

        verse_number_source_min  book_name_source_max  \
0                             1                   NaN   
1                             1                   NaN   
2                             1                   NaN   
3                             1                   NaN   
4                 

In [610]:
def format_book_name_source(row):
    """
    Build the source reference label for one cross-reference row.

    Returns "Book C:V" when `book_name_source_max` is missing, otherwise
    "Book C:V - Book C:V" covering the min and max ends of the range.
    Chapter and verse numbers are cast to int when present; missing values are
    formatted as they are.
    """
    def whole_number(column):
        # Missing cells are passed through untouched; everything else becomes an int.
        cell = row[column]
        return cell if pd.isna(cell) else int(cell)

    first_chapter = whole_number("chapter_number_source_min")
    first_verse = whole_number("verse_number_source_min")
    label = f"{row['book_name_source_min']} {first_chapter}:{first_verse}"

    if pd.isna(row["book_name_source_max"]):
        return label

    last_chapter = whole_number("chapter_number_source_max")
    last_verse = whole_number("verse_number_source_max")
    return f"{label} - {row['book_name_source_max']} {last_chapter}:{last_verse}"



# Build the source reference label ("Book C:V" or "Book C:V - Book C:V") row by row.
df_kjv_cross_references["verse_ref_source"] = df_kjv_cross_references.apply(format_book_name_source, axis=1)

In [611]:
def format_book_name_target(row):
    """
    Build the target reference label for one cross-reference row.

    Returns "Book C:V" when `book_name_target_max` is missing, otherwise
    "Book C:V - Book C:V" covering the min and max ends of the range.
    Chapter and verse numbers are cast to int when present; missing values are
    formatted as they are.
    """
    def whole_number(column):
        # Missing cells are passed through untouched; everything else becomes an int.
        cell = row[column]
        return cell if pd.isna(cell) else int(cell)

    first_chapter = whole_number("chapter_number_target_min")
    first_verse = whole_number("verse_number_target_min")
    label = f"{row['book_name_target_min']} {first_chapter}:{first_verse}"

    if pd.isna(row["book_name_target_max"]):
        return label

    last_chapter = whole_number("chapter_number_target_max")
    last_verse = whole_number("verse_number_target_max")
    return f"{label} - {row['book_name_target_max']} {last_chapter}:{last_verse}"



# Build the target reference label ("Book C:V" or "Book C:V - Book C:V") row by row.
df_kjv_cross_references["verse_ref_target"] = df_kjv_cross_references.apply(format_book_name_target, axis=1)

In [612]:
# The twelve component columns ({book_name, chapter_number, verse_number} x
# {source, target} x {min, max}) are now folded into the two label columns,
# so drop them in place.
df_kjv_cross_references.drop(
    columns=[
        f"{field}_{side}_{bound}"
        for side in ("source", "target")
        for bound in ("min", "max")
        for field in ("book_name", "chapter_number", "verse_number")
    ],
    inplace=True,
)

In [613]:
# Rebuild the verse reference -> 0-based row position lookup from df_kjv for
# the range expansion below.
verse_to_index = {verse: idx for idx, verse in enumerate(df_kjv['verse_ref'])}

In [614]:
# Rebuild the verse reference -> testament name lookup from df_kjv.
verse_to_testament = {verse: testament for verse, testament in zip(df_kjv['verse_ref'], df_kjv['testament_name'])}

In [None]:
def get_verses_from_range(verse_range):
    """
    Expand a range string such as "Genesis 50:25 - Exodus 1:2" into the list
    of every verse reference from the start verse to the end verse (inclusive).

    Both endpoints are resolved through `verse_to_index`, and the verses
    between them are read from `df_kjv['verse_ref']` by row position. If the
    endpoints are given in reverse order they are swapped.

    Parameters
    ----------
    verse_range : str
        Two "Book Chapter:Verse" references separated by a single "-".

    Returns
    -------
    list of str
        The expanded verse references. When the string does not split into
        exactly two parts on "-", an endpoint does not look like
        "Book Chapter:Verse", or an endpoint is not in `verse_to_index`, the
        stripped input is returned unexpanded as a single-element list.
    """
    verse_range = verse_range.strip()
    endpoints = verse_range.split('-')
    if len(endpoints) != 2:
        return [verse_range]

    # Optional leading book number, book name, then "chapter:verse".
    pattern = r"^(\d*\s*\D+?)\s+(\d+):(\d+)$"

    positions = []
    for endpoint in endpoints:
        match = re.match(pattern, endpoint.strip())
        if not match:
            return [verse_range]

        ref = f"{match.group(1).strip()} {match.group(2)}:{match.group(3)}"
        # Redirect 3 John 1:15 to 3 John 1:14 -- presumably the cross-reference
        # data numbers a 15th verse that df_kjv does not have; verify against the data.
        if ref == "3 John 1:15":
            ref = "3 John 1:14"

        if ref not in verse_to_index:
            return [verse_range]
        positions.append(verse_to_index[ref])

    start_idx, end_idx = sorted(positions)

    # verse_to_index stores row positions (it is built with enumerate), so
    # slice by position. A label-based .loc slice only gives the same rows
    # while df_kjv keeps a default 0..n-1 index.
    return df_kjv['verse_ref'].iloc[start_idx:end_idx + 1].tolist()



def get_verse_list(verse_str):
    """
    Turn a verse reference string into a list of verse references.

    - A string without '-' (e.g. "Genesis 1:1") yields a single-element list
      holding the stripped string.
    - A string containing '-' (e.g. "Genesis 50:25 - Exodus 1:2") is handed to
      `get_verses_from_range`, and its result is returned.
    """
    is_range = '-' in verse_str
    return get_verses_from_range(verse_str) if is_range else [verse_str.strip()]



def get_testament_from_verse_list(verse_list):
    """
    Return the testament name for the first reference in a flat list of verse
    references, looked up in `verse_to_testament`.

    Returns None when the list is empty or the first reference is unknown.
    """
    if not verse_list:
        return None
    return verse_to_testament.get(verse_list[0], None)



# Expand every source/target label into an explicit list of verse references.
df_kjv_cross_references['verse_ref_source_ls'] = df_kjv_cross_references['verse_ref_source'].apply(get_verse_list)
df_kjv_cross_references['verse_ref_target_ls'] = df_kjv_cross_references['verse_ref_target'].apply(get_verse_list)

# Tag each side with the testament of the first verse in its list.
df_kjv_cross_references['testament_name_source'] = df_kjv_cross_references['verse_ref_source_ls'].apply(get_testament_from_verse_list)
df_kjv_cross_references['testament_name_target'] = df_kjv_cross_references['verse_ref_target_ls'].apply(get_testament_from_verse_list)

In [618]:
print(df_kjv_cross_references[df_kjv_cross_references["verse_ref_source_ls"].apply(lambda x: len(x) > 1)])

Empty DataFrame
Columns: [verse_ref_source, verse_ref_target, verse_ref_source_ls, verse_ref_target_ls, testament_name_source, testament_name_target]
Index: []


In [619]:
print(df_kjv_cross_references[df_kjv_cross_references["verse_ref_target_ls"].apply(lambda x: len(x) > 1)])

        verse_ref_source                   verse_ref_target  \
13           Genesis 1:1  Colossians 1:16 - Colossians 1:17   
32           Genesis 1:1        Psalms 148:4 - Psalms 148:5   
41           Genesis 1:1                John 1:1 - John 1:3   
48           Genesis 1:1        Psalms 89:11 - Psalms 89:12   
54           Genesis 1:1          Romans 1:19 - Romans 1:20   
...                  ...                                ...   
344769  Revelation 22:19  Revelation 2:17 - Revelation 2:18   
344773  Revelation 22:19    Revelation 3:4 - Revelation 3:5   
344778  Revelation 22:19   Revelation 7:9 - Revelation 7:17   
344782  Revelation 22:20        2 Peter 3:12 - 2 Peter 3:14   
344794  Revelation 22:21    Ephesians 6:23 - Ephesians 6:24   

       verse_ref_source_ls                                verse_ref_target_ls  \
13           [Genesis 1:1]                 [Colossians 1:16, Colossians 1:17]   
32           [Genesis 1:1]                       [Psalms 148:4, Psalms 148:5]   


In [620]:
# Keep only the testament and expanded verse-list columns, source side first.
df_kjv_cross_references = df_kjv_cross_references.loc[:, [
    "testament_name_source",
    "verse_ref_source_ls",
    "testament_name_target",
    "verse_ref_target_ls",
]]

In [621]:
print(df_kjv_cross_references)

       testament_name_source verse_ref_source_ls testament_name_target  \
0              Old Testament       [Genesis 1:1]         New Testament   
1              Old Testament       [Genesis 1:1]         Old Testament   
2              Old Testament       [Genesis 1:1]         Old Testament   
3              Old Testament       [Genesis 1:1]         Old Testament   
4              Old Testament       [Genesis 1:1]         New Testament   
...                      ...                 ...                   ...   
344794         New Testament  [Revelation 22:21]         New Testament   
344795         New Testament  [Revelation 22:21]         New Testament   
344796         New Testament  [Revelation 22:21]         New Testament   
344797         New Testament  [Revelation 22:21]         New Testament   
344798         New Testament  [Revelation 22:21]         New Testament   

                     verse_ref_target_ls  
0                           [1 John 1:1]  
1                        

In [624]:
# df_kjv_cross_references.to_csv("../dataset/kjv_cross_references.csv", index=False)