In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Cap how many rows pandas renders when a frame is shown.
pd.set_option("display.max_rows", 50)

In [3]:
# Read the KJV verse table and store every object-typed column as pandas' 'string' dtype.
df = pd.read_csv("../data/kjv.csv").pipe(
    lambda frame: frame.astype(
        dict.fromkeys(frame.select_dtypes(include='object').columns, 'string')
    )
)

In [4]:
# List every column next to its dtype.
for column_name, column_dtype in df.dtypes.items():
    print(column_name, column_dtype)

version_name string
version_abbr string
testament_abbr string
testament_name string
book_name string
book_number int64
chapter_number int64
verse_number int64
verse_text string


In [5]:
# Remove the two version columns from the working frame (in place).
df.drop(columns=['version_name', 'version_abbr'], inplace=True)

In [6]:
# Print 10 randomly chosen rows; the fixed random_state keeps the sample the same on every run.
print(df.sample(n=10, random_state=42))

      testament_abbr testament_name     book_name  book_number  \
1520              OT  Old Testament       Genesis            1   
30806             NT  New Testament    Revelation           66   
2932              OT  Old Testament     Leviticus            3   
11068             OT  Old Testament  1 Chronicles           13   
15647             OT  Old Testament        Psalms           19   
21107             OT  Old Testament       Ezekiel           26   
18109             OT  Old Testament        Isaiah           23   
13821             OT  Old Testament           Job           18   
13488             OT  Old Testament           Job           18   
24991             NT  New Testament          Luke           42   

       chapter_number  verse_number  \
1520               50            14   
30806               6            13   
2932                8            15   
11068              25            22   
15647             105            41   
21107              26             7   


In [7]:
# Summary statistics for every column, numeric and non-numeric alike.
print(df.describe(include='all'))

       testament_abbr testament_name book_name   book_number  chapter_number  \
count           31102          31102     31102  31102.000000    31102.000000   
unique              2              2        66           NaN             NaN   
top                OT  Old Testament    Psalms           NaN             NaN   
freq            23145          23145      2461           NaN             NaN   
mean              NaN            NaN       NaN     22.496142       20.631246   
std               NaN            NaN       NaN     16.495843       23.408441   
min               NaN            NaN       NaN      1.000000        1.000000   
25%               NaN            NaN       NaN      9.000000        6.000000   
50%               NaN            NaN       NaN     19.000000       14.000000   
75%               NaN            NaN       NaN     40.000000       26.000000   
max               NaN            NaN       NaN     66.000000      150.000000   

        verse_number                   

In [8]:
# Book names in the order used to sort the "Book" axis of the charts below.
old_testament = [
    "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "Joshua", "Judges", "Ruth",
    "1 Samuel", "2 Samuel", "1 Kings", "2 Kings", "1 Chronicles", "2 Chronicles",
    "Ezra", "Nehemiah", "Esther", "Job", "Psalms", "Proverbs", "Ecclesiastes",
    "Song of Solomon", "Isaiah", "Jeremiah", "Lamentations", "Ezekiel", "Daniel",
    "Hosea", "Joel", "Amos", "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk",
    "Zephaniah", "Haggai", "Zechariah", "Malachi",
]

new_testament = [
    "Matthew", "Mark", "Luke", "John", "Acts", "Romans",
    "1 Corinthians", "2 Corinthians", "Galatians", "Ephesians", "Philippians", "Colossians",
    "1 Thessalonians", "2 Thessalonians", "1 Timothy", "2 Timothy", "Titus", "Philemon",
    "Hebrews", "James", "1 Peter", "2 Peter", "1 John", "2 John", "3 John", "Jude",
    "Revelation",
]

# Old Testament first, then New Testament.
books = [*old_testament, *new_testament]

### Verses / Book

In [9]:
import pandas as pd
import altair as alt
from IPython.display import display, Markdown

In [10]:
# Reload the KJV verses: object columns become 'string' and the version columns are dropped.
df = (
    pd.read_csv("../data/kjv.csv")
    .pipe(lambda frame: frame.astype(
        dict.fromkeys(frame.select_dtypes(include='object').columns, 'string')
    ))
    .drop(columns=['version_name', 'version_abbr'])
)

In [11]:
# One row per book holding the number of verse rows it has, labelled for display.
verse_count_per_book = (
    df.groupby("book_name")
    .size()
    .reset_index(name="# of Verses")
    .rename(columns={"book_name": "Book"})
)

In [12]:
# Section heading followed by one horizontal bar per book, ordered by the `books` list.
display(Markdown("### Verses / Book"))

chart_verse_count = (
    alt.Chart(verse_count_per_book)
    .mark_bar()
    .encode(
        alt.X("# of Verses:Q", title="# of Verses"),
        alt.Y("Book:N", title="Book", sort=books),
        tooltip=["Book", "# of Verses"],
    )
    .properties(height=1000, width='container')
    .interactive()
)

display(chart_verse_count)

### Verses / Book

### Chapters / Book

In [13]:
import pandas as pd
import altair as alt
from IPython.display import display, Markdown

In [14]:
# Reload the KJV verses: object columns become 'string' and the version columns are dropped.
df = (
    pd.read_csv("../data/kjv.csv")
    .pipe(lambda frame: frame.astype(
        dict.fromkeys(frame.select_dtypes(include='object').columns, 'string')
    ))
    .drop(columns=['version_name', 'version_abbr'])
)

In [15]:
# The highest chapter number found in each book is taken as that book's chapter count.
chapter_count_per_book = (
    df.groupby("book_name")["chapter_number"]
    .max()
    .reset_index()
    .rename(columns={"book_name": "Book", "chapter_number": "# of Chapters"})
)

In [16]:
# Section heading followed by one horizontal bar per book, ordered by the `books` list.
display(Markdown("### Chapters / Book"))

chart_chapter_count = (
    alt.Chart(chapter_count_per_book)
    .mark_bar()
    .encode(
        alt.X("# of Chapters:Q", title="# of Chapters"),
        alt.Y("Book:N", title="Book", sort=books),
        tooltip=["Book", "# of Chapters"],
    )
    .properties(height=1000, width='container')
    .interactive()
)

display(chart_chapter_count)

### Chapters / Book

### Verses / Chapter

In [17]:
import pandas as pd
import altair as alt
from IPython.display import display, Markdown

In [18]:
# Reload the KJV verses: object columns become 'string' and the version columns are dropped.
df = (
    pd.read_csv("../data/kjv.csv")
    .pipe(lambda frame: frame.astype(
        dict.fromkeys(frame.select_dtypes(include='object').columns, 'string')
    ))
    .drop(columns=['version_name', 'version_abbr'])
)

In [19]:
# One row per (book, chapter) holding the number of verse rows in that chapter.
chapter_verse_counts = (
    df.groupby(["book_name", "chapter_number"])
    .size()
    .reset_index(name="# of Verses")
    .rename(columns={"book_name": "Book", "chapter_number": "Chapter"})
)

In [20]:
# Heatmap with chapters across, books down (in `books` order) and verse count as colour.
chart_verse_heatmap = (
    alt.Chart(chapter_verse_counts)
    .mark_rect()
    .encode(
        alt.X("Chapter:O", title="Chapter"),
        alt.Y("Book:N", title="Book", sort=books),
        alt.Color("# of Verses:Q", scale=alt.Scale(scheme="blues"), title="# of Verses"),
        tooltip=["Book", "Chapter", "# of Verses"],
    )
    .properties(height=1000, width='container')
    .interactive()
)

display(chart_verse_heatmap)

### Verses / Testament

In [21]:
import pandas as pd
import altair as alt
from IPython.display import display, Markdown

In [22]:
# Reload the KJV verses: object columns become 'string' and the version columns are dropped.
df = (
    pd.read_csv("../data/kjv.csv")
    .pipe(lambda frame: frame.astype(
        dict.fromkeys(frame.select_dtypes(include='object').columns, 'string')
    ))
    .drop(columns=['version_name', 'version_abbr'])
)

In [23]:
# One row per testament holding the number of verse rows it contributes.
testament_contribution = (
    df.groupby("testament_name")
    .size()
    .reset_index(name="# of Verses")
    .rename(columns={"testament_name": "Testament"})
)

In [24]:
# Section heading followed by a pie chart whose slice angle encodes each testament's verse count.
display(Markdown("### Verses / Testament"))

chart_testament_pie = (
    alt.Chart(testament_contribution)
    .mark_arc()
    .encode(
        alt.Theta("# of Verses:Q", title="Verse Count"),
        alt.Color("Testament:N", legend=alt.Legend(title="Testament")),
        tooltip=["Testament", "# of Verses"],
    )
    .properties(height=500, width='container')
    .interactive()
)

display(chart_testament_pie)

### Verses / Testament

### Lexical Richness / Book

In [25]:
import pandas as pd
import altair as alt
from IPython.display import display, Markdown

In [26]:
# Reload the KJV verses: object columns become 'string' and the version columns are dropped.
df = (
    pd.read_csv("../data/kjv.csv")
    .pipe(lambda frame: frame.astype(
        dict.fromkeys(frame.select_dtypes(include='object').columns, 'string')
    ))
    .drop(columns=['version_name', 'version_abbr'])
)

In [27]:
def calculate_lexical_richness(df):
    """Compute each book's unique-to-total word ratio.

    Words are the whitespace-separated tokens of the book's verses, taken verbatim,
    so differently cased or punctuated spellings count as different words.

    Args:
        df: DataFrame with a "book_name" column and a "verse_text" column.

    Returns:
        DataFrame with one row per book and the columns "book_name" and
        "lexical_richness". A book with no words gets a ratio of 0. The two
        columns are present even when ``df`` has no rows.
    """
    lexical_data = []

    for book, verses in df.groupby("book_name"):
        # A missing verse text would make str.join raise, so leave those rows out.
        words = " ".join(verses["verse_text"].dropna()).split()
        total_words = len(words)
        lexical_richness = len(set(words)) / total_words if total_words > 0 else 0
        lexical_data.append({"book_name": book, "lexical_richness": lexical_richness})

    # Naming the columns keeps the result schema stable when there are no groups at all.
    return pd.DataFrame(lexical_data, columns=["book_name", "lexical_richness"])

In [28]:
# Per-book ratios with display-ready column labels.
lexical_richness_df = calculate_lexical_richness(df).rename(
    columns={"book_name": "Book", "lexical_richness": "Lexical Richness"}
)

In [29]:
# Section heading followed by one horizontal bar per book, ordered by the `books` list.
display(Markdown("### Lexical Richness by Book"))

chart_lexical_richness = (
    alt.Chart(lexical_richness_df)
    .mark_bar()
    .encode(
        alt.X("Lexical Richness:Q", title="Lexical Richness (Unique-to-Total Word Ratio)"),
        alt.Y("Book:N", sort=books, title="Book"),
        tooltip=["Book", "Lexical Richness"],
    )
    .properties(height=1000, width='container', title="Lexical Richness by Book")
    .interactive()
)

display(chart_lexical_richness)

### Lexical Richness by Book

### Word Cloud

In [30]:
import pandas as pd
import plotly.graph_objects as go

import spacy
from wordcloud import WordCloud

from IPython.display import display, Markdown

In [31]:
# Reload the KJV verses: object columns become 'string' and the version columns are dropped.
df = (
    pd.read_csv("../data/kjv.csv")
    .pipe(lambda frame: frame.astype(
        dict.fromkeys(frame.select_dtypes(include='object').columns, 'string')
    ))
    .drop(columns=['version_name', 'version_abbr'])
)

In [32]:
# Load spaCy's "en_core_web_sm" pipeline; generate_word_cloud runs it to filter tokens by part of speech.
nlp = spacy.load("en_core_web_sm")

In [33]:
# def generate_word_cloud(book_name, df, pos_tag="NOUN"):
#     book_verses = df[df["book_name"] == book_name]["verse_text"]
    
#     if book_verses.empty:
#         print(f"No Verses in Book: {book_name}")
#         return
    
#     text = " ".join(book_verses)
    
#     doc = nlp(text)
#     tokens = [token.text for token in doc if token.pos_ == pos_tag]
#     texts = " ".join(tokens)
#     wordcloud = WordCloud(width=1000, height=500, background_color="white").generate(texts)
    
#     # wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
    
#     plt.figure(figsize=(15, 10))
#     plt.imshow(wordcloud, interpolation="bilinear")
#     plt.axis("off")
#     plt.title(f"Word Cloud for {book_name}", fontsize=15)
#     plt.show()

In [34]:
def generate_word_cloud(book_name, df, pos_tag=None):
    """Render a word cloud for one book as a Plotly figure.

    Args:
        book_name: Value of the "book_name" column selecting the book.
        df: DataFrame with "book_name" and "verse_text" columns.
        pos_tag: Optional spaCy coarse part-of-speech tag (compared against
            ``token.pos_``, e.g. "VERB"). When given, only tokens carrying that tag
            feed the cloud; when None the full text of the book is used.

    Returns:
        None. A markdown heading and the figure are displayed as side effects. A
        short message is printed instead when the book has no verses or when the
        part-of-speech filter leaves nothing to draw.
    """
    book_verses = df.loc[df["book_name"] == book_name, "verse_text"]

    if book_verses.empty:
        print(f"No Verses in Book: {book_name}")
        return

    text = " ".join(book_verses)

    if pos_tag is None:
        cloud_text = text
        # The default call has no tag to capitalise, so use a generic heading.
        heading = "Word"
    else:
        doc = nlp(text)
        cloud_text = " ".join(token.text for token in doc if token.pos_ == pos_tag)
        heading = pos_tag.capitalize()

        if not cloud_text:
            # WordCloud.generate raises on input without words; report and stop instead.
            print(f"No {pos_tag} tokens in Book: {book_name}")
            return

    wordcloud = WordCloud(width=1000, height=500, background_color="white").generate(cloud_text)
    wordcloud_image = wordcloud.to_image()

    display(Markdown(f"### {heading} Cloud ({book_name})"))

    fig = go.Figure()

    # Stretch the rendered image over the whole plotting area, anchored at its top-left corner.
    fig.add_layout_image(
        dict(
            source=wordcloud_image,
            x=0,
            y=1,
            xref="paper",
            yref="paper",
            sizex=1,
            sizey=1,
            xanchor="left",
            yanchor="top",
            layer="below",
        )
    )

    # The axes carry no information for an image, so hide them.
    fig.update_layout(
        xaxis={"visible": False},
        yaxis={"visible": False},
        margin=dict(t=40, l=0, r=0, b=0),
    )

    fig.show()

In [35]:
# Draw a cloud for the book "John" using only tokens whose part-of-speech tag is VERB.
generate_word_cloud("John", df, pos_tag="VERB")

### Verb Cloud (John)

### Sentiment Analysis (Books & Chapters)

In [36]:
import pandas as pd
import altair as alt

from nltk.sentiment import SentimentIntensityAnalyzer

from IPython.display import display, Markdown

In [37]:
# import nltk
# nltk.download("vader_lexicon")

In [38]:
# Reload the KJV verses: object columns become 'string' and the version columns are dropped.
df = (
    pd.read_csv("../data/kjv.csv")
    .pipe(lambda frame: frame.astype(
        dict.fromkeys(frame.select_dtypes(include='object').columns, 'string')
    ))
    .drop(columns=['version_name', 'version_abbr'])
)

In [39]:
# Shared analyzer instance; analyze_sentiments reads the "compound" score it returns for each verse.
sia = SentimentIntensityAnalyzer()

In [40]:
def analyze_sentiments(df):
    """Score every verse and average the scores per book and per chapter.

    Adds a "sentiment" column to ``df`` in place, holding the "compound" value of
    ``sia.polarity_scores`` for each "verse_text".

    Returns:
        A ``(sentiment_by_book, sentiment_by_chapter)`` pair of DataFrames with the
        mean sentiment per "book_name" and per ("book_name", "chapter_number").
    """
    def compound_score(text):
        return sia.polarity_scores(text)["compound"]

    df["sentiment"] = df["verse_text"].apply(compound_score)

    per_book = df.groupby("book_name", as_index=False)["sentiment"].mean()
    per_chapter = df.groupby(["book_name", "chapter_number"], as_index=False)["sentiment"].mean()

    return per_book, per_chapter

In [41]:
# Score every verse (this adds a "sentiment" column to df) and keep the per-book and per-chapter means.
sentiment_by_book, sentiment_by_chapter = analyze_sentiments(df)

#### Sentiment by Book

In [42]:
# Give the per-book table display-ready column labels.
sentiment_by_book = sentiment_by_book.rename(
    columns={"book_name": "Book", "sentiment": "Average Sentiment"}
)

In [43]:
# sentiment_by_book.to_csv("../data/sentiment_by_book.csv", index=False)

In [44]:
# Section heading followed by one horizontal bar per book, ordered by the `books` list.
display(Markdown("### Sentiment by Book"))

chart_sentiment_by_book = (
    alt.Chart(sentiment_by_book)
    .mark_bar()
    .encode(
        alt.X("Average Sentiment:Q", title="Average Sentiment"),
        alt.Y("Book:N", title="Book", sort=books),
        tooltip=["Book", "Average Sentiment"],
    )
    .properties(height=1000, width='container')
    .interactive()
)

display(chart_sentiment_by_book)

### Sentiment by Book

#### Sentiment by Chapter

In [45]:
# Give the per-chapter table display-ready column labels.
sentiment_by_chapter = sentiment_by_chapter.rename(
    columns={
        "book_name": "Book",
        "chapter_number": "Chapter",
        "sentiment": "Average Sentiment",
    }
)

In [46]:
# Section heading followed by one circle per chapter: chapter number across, mean sentiment up,
# coloured by book.
display(Markdown("### Sentiment by Chapter"))

chart_sentiment_by_chapter = (
    alt.Chart(sentiment_by_chapter)
    .mark_circle(size=100)
    .encode(
        alt.X("Chapter:Q", title="Chapter"),
        alt.Y("Average Sentiment:Q", title="Average Sentiment"),
        alt.Color("Book:N", legend=alt.Legend(title="Book"), sort=books),
        tooltip=["Book", "Chapter", "Average Sentiment"],
    )
    .properties(height=1000, width='container')
    .interactive()
)

display(chart_sentiment_by_chapter)

### Sentiment by Chapter

### Biblical Places

In [47]:
import pandas as pd
import re
import requests

from tqdm import tqdm
tqdm.pandas()

import pydeck as pdk
pdk.settings.notebook_display = True
from IPython.display import display, Markdown

# from geopy.geocoders import Nominatim
# from geopy.exc import GeocoderTimedOut

In [48]:
# KJV verses: object columns become 'string' and the version columns are dropped.
df_kjv = (
    pd.read_csv("../data/kjv.csv")
    .pipe(lambda frame: frame.astype(
        dict.fromkeys(frame.select_dtypes(include='object').columns, 'string')
    ))
    .drop(columns=['version_name', 'version_abbr'])
)

# Place table; its "verse" column is parsed and joined to df_kjv further down.
df_locs = pd.read_csv("../data/bib_locs.csv")

In [49]:
# Text preview of the verse table.
print(df_kjv)

      testament_abbr testament_name   book_name  book_number  chapter_number  \
0                 OT  Old Testament     Genesis            1               1   
1                 OT  Old Testament     Genesis            1               1   
2                 OT  Old Testament     Genesis            1               1   
3                 OT  Old Testament     Genesis            1               1   
4                 OT  Old Testament     Genesis            1               1   
...              ...            ...         ...          ...             ...   
31097             NT  New Testament  Revelation           66              22   
31098             NT  New Testament  Revelation           66              22   
31099             NT  New Testament  Revelation           66              22   
31100             NT  New Testament  Revelation           66              22   
31101             NT  New Testament  Revelation           66              22   

       verse_number                    

In [50]:
# Text preview of the place table.
print(df_locs)

            verse name_id_ancient location_id_ancient name_id_modern  \
0      2 Kgs 5:12           Abana             aea17b7   Barada River   
1       Num 27:12          Abarim             aa8275b         Abarim   
2       Num 33:47          Abarim             aa8275b         Abarim   
3       Num 33:48          Abarim             aa8275b         Abarim   
4      Deut 32:49          Abarim             aa8275b         Abarim   
...           ...             ...                 ...            ...   
17065   1 Sam 9:5            Zuph             a4d0250         Al Ram   
17066   1 Sam 9:5            Zuph             a4d0250    Ramat Rahel   
17067   1 Sam 9:5            Zuph             a4d0250         Rantis   
17068   1 Sam 9:5            Zuph             a4d0250      Beit Rima   
17069   1 Sam 9:5            Zuph             a4d0250        Al Bira   

      location_id_modern   latitude  longitude  
0                m39ac0b  33.513542  36.305000  
1                m207993  31.753900  

#### Data Preparation

In [51]:
def parse_verse(verse):
    """Split a reference such as "2 Kgs 5:12" into book abbreviation, chapter and verse.

    Args:
        verse: Reference text of the form "<book abbreviation> <chapter>:<verse>".

    Returns:
        pd.Series of three items: the stripped abbreviation, the chapter as int and
        the verse as int. All three are None when ``verse`` is not a string (for
        example the NaN pandas uses for a blank cell) or does not match the pattern.
    """
    if not isinstance(verse, str):
        # re.match raises TypeError on non-strings; treat them as unparseable instead.
        return pd.Series([None, None, None])

    match = re.match(r"([0-9A-Za-z\s]+)\s+(\d+):(\d+)", verse)
    if match is None:
        return pd.Series([None, None, None])

    book_abbr, chapter, verse_num = match.groups()
    return pd.Series([book_abbr.strip(), int(chapter), int(verse_num)])

In [52]:
# Parse each reference in "verse" and store the three parts as new columns of df_locs.
df_locs[['book_abbr', 'chapter_number', 'verse_number']] = df_locs['verse'].apply(parse_verse)

In [53]:
# Distinct book names in order of first appearance in the verse table.
books_kjv = df_kjv["book_name"].drop_duplicates().tolist()
print(books_kjv)

['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings', '2 Kings', '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah', 'Esther', 'Job', 'Psalms', 'Proverbs', 'Ecclesiastes', 'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations', 'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah', 'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah', 'Malachi', 'Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans', '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians', 'Philippians', 'Colossians', '1 Thessalonians', '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon', 'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John', '3 John', 'Jude', 'Revelation']


In [54]:
# Maps the book abbreviations parsed from df_locs["verse"] to the full book names used in df_kjv.
book_mapping = {
    # Old Testament (OT)
    "Gen": "Genesis", "Ex": "Exodus", "Lev": "Leviticus", "Num": "Numbers",
    "Deut": "Deuteronomy", "Josh": "Joshua", "Judg": "Judges", "Ruth": "Ruth",
    "1 Sam": "1 Samuel", "2 Sam": "2 Samuel", "1 Kgs": "1 Kings", "2 Kgs": "2 Kings",
    "1 Chr": "1 Chronicles", "2 Chr": "2 Chronicles", "Ezra": "Ezra", "Neh": "Nehemiah",
    "Est": "Esther", "Job": "Job", "Ps": "Psalms", "Prov": "Proverbs",
    "Eccl": "Ecclesiastes", "Sng": "Song of Solomon", "Isa": "Isaiah", "Jer": "Jeremiah",
    "Lam": "Lamentations", "Ezek": "Ezekiel", "Dan": "Daniel", "Hos": "Hosea",
    "Joel": "Joel", "Amos": "Amos", "Obad": "Obadiah", "Jonah": "Jonah",
    "Mic": "Micah", "Nahum": "Nahum", "Hab": "Habakkuk", "Zeph": "Zephaniah",
    "Hag": "Haggai", "Zech": "Zechariah", "Mal": "Malachi",

    # New Testament (NT)
    "Matt": "Matthew", "Mark": "Mark", "Luke": "Luke", "John": "John",
    "Acts": "Acts", "Rom": "Romans", "1 Cor": "1 Corinthians", "2 Cor": "2 Corinthians",
    "Gal": "Galatians", "Eph": "Ephesians", "Phil": "Philippians", "Col": "Colossians",
    "1 Thes": "1 Thessalonians", "2 Thes": "2 Thessalonians",
    "1 Tim": "1 Timothy", "2 Tim": "2 Timothy", "Titus": "Titus", "Phlm": "Philemon",
    "Heb": "Hebrews", "Jas": "James", "1 Pet": "1 Peter", "2 Pet": "2 Peter",
    "1 John": "1 John", "2 John": "2 John", "3 John": "3 John", "Jude": "Jude",
    "Rev": "Revelation",
}

In [55]:
# Translate each abbreviation to its full book name; abbreviations missing from book_mapping become NaN.
df_locs['book_name'] = df_locs['book_abbr'].map(book_mapping)

In [56]:
# Attach book number and verse text to every place row; the inner join keeps only rows whose
# (book, chapter, verse) exists in both tables.
df = df_locs.merge(
    df_kjv[['book_name', 'book_number', 'chapter_number', 'verse_number', 'verse_text']],
    how='inner',
    on=['book_name', 'chapter_number', 'verse_number'],
)

In [57]:
# Keep only the verse, place and coordinate columns, in this order.
df = df.loc[:, [
    'book_name', 'book_number', 'chapter_number', 'verse_number', 'verse_text',
    'name_id_ancient', 'location_id_ancient',
    'name_id_modern', 'location_id_modern',
    'latitude', 'longitude',
]]

In [58]:
# Text preview of the merged verse/place table.
print(df)

      book_name  book_number  chapter_number  verse_number  \
0       2 Kings           12               5            12   
1       2 Kings           12               5            12   
2       2 Kings           12               5            12   
3       2 Kings           12               5            12   
4       2 Kings           12               5            12   
...         ...          ...             ...           ...   
17065  1 Samuel            9               9             5   
17066  1 Samuel            9               9             5   
17067  1 Samuel            9               9             5   
17068  1 Samuel            9               9             5   
17069  1 Samuel            9               9             5   

                                              verse_text name_id_ancient  \
0      Are not Abana and Pharpar, rivers of Damascus,...           Abana   
1      Are not Abana and Pharpar, rivers of Damascus,...        Damascus   
2      Are not Abana and Ph

In [59]:
# df = pd.DataFrame({
#     "latitude": [33.513542, 33.511112, 33.416667, 33.313620, 33.540556],
#     "longitude": [36.305000, 36.306390, 36.133333, 36.055535, 36.353056]
# })

import os

# Read the Google Maps Geocoding API key from the environment so that it is never saved in the
# notebook itself. Falls back to the previous empty-string placeholder when the variable is unset.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "")

In [60]:
# Results already fetched, keyed by (lat, lon).
geocode_cache = {}

# Address-component types copied from the API response into the result dict, in output order.
_GEOCODE_COMPONENT_FIELDS = (
    "route",
    "neighborhood",
    "administrative_area_level_2",
    "administrative_area_level_1",
    "locality",
    "country",
)

# Response statuses that are a final answer for a coordinate and therefore safe to cache.
_GEOCODE_CACHEABLE_STATUSES = ("OK", "ZERO_RESULTS")


def get_geocode_info(lat, lon, timeout=10):
    """
    Retrieve geocode information from the Google Maps Geocoding API
    for the given latitude and longitude. Uses caching to avoid duplicate API calls.

    Args:
        lat: Latitude of the point to look up.
        lon: Longitude of the point to look up.
        timeout: Seconds to wait for the HTTP request before ``requests`` raises.

    Returns a dict with keys:
      - global_code
      - formatted_address
      - route
      - neighborhood
      - administrative_area_level_2
      - administrative_area_level_1
      - locality
      - country
    If a field is not available in the API response, its value will be "-".

    A response is cached only when its "status" is OK or ZERO_RESULTS (or absent),
    so a denied or rate-limited request is retried on the next call instead of
    being remembered as all "-".
    """
    cache_key = (lat, lon)
    if cache_key in geocode_cache:
        return geocode_cache[cache_key]

    base_url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {
        "latlng": f"{lat},{lon}",
        "key": API_KEY
    }

    # Without a timeout a stalled connection would block the whole run indefinitely.
    response = requests.get(base_url, params=params, timeout=timeout)
    data = response.json()

    result_info = {"global_code": "-", "formatted_address": "-"}
    result_info.update(dict.fromkeys(_GEOCODE_COMPONENT_FIELDS, "-"))

    if "plus_code" in data and "global_code" in data["plus_code"]:
        result_info["global_code"] = data["plus_code"]["global_code"]

    results = data.get("results", [])
    if results:
        result_info["formatted_address"] = results[0].get("formatted_address", "-")

        # Pass 1: take every field the first result offers; when several of its
        # components carry the same type, the last one wins.
        for component in results[0].get("address_components", []):
            types = component.get("types", [])
            for field in _GEOCODE_COMPONENT_FIELDS:
                if field in types:
                    result_info[field] = component.get("long_name", "-")

        # Pass 2: fill fields that are still "-" from any result; the first hit wins.
        for result in results:
            for component in result.get("address_components", []):
                types = component.get("types", [])
                for field in _GEOCODE_COMPONENT_FIELDS:
                    if result_info[field] == "-" and field in types:
                        result_info[field] = component.get("long_name", "-")

    if data.get("status", "OK") in _GEOCODE_CACHEABLE_STATUSES:
        geocode_cache[cache_key] = result_info
    return result_info

def fetch_geocode(row):
    """Look up geocode details for one row's coordinates and return them as a Series.

    ``row`` must provide 'latitude' and 'longitude'; the Series is indexed by the
    keys of the dict that get_geocode_info returns.
    """
    coordinates = (row['latitude'], row['longitude'])
    return pd.Series(get_geocode_info(*coordinates))

In [61]:
# new_columns = [
#     "global_code", "formatted_address", "route", "neighborhood",
#     "administrative_area_level_2", "administrative_area_level_1", "locality", "country"
# ]

# df[new_columns] = df.progress_apply(fetch_geocode, axis=1)

#### Map Visualization

In [62]:
# Load the location table used for the map.
# NOTE(review): presumably this file is the merged frame built above plus the geocode columns
# referenced by the tooltip — confirm how kjv_locs_all.csv was produced.
df = pd.read_csv("../data/kjv_locs_all.csv")

In [63]:
# Full book names, Genesis through Revelation.
books_kjv = [
    'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges', 'Ruth',
    '1 Samuel', '2 Samuel', '1 Kings', '2 Kings', '1 Chronicles', '2 Chronicles',
    'Ezra', 'Nehemiah', 'Esther', 'Job', 'Psalms', 'Proverbs', 'Ecclesiastes',
    'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations', 'Ezekiel', 'Daniel',
    'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah', 'Nahum', 'Habakkuk',
    'Zephaniah', 'Haggai', 'Zechariah', 'Malachi',
    'Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans',
    '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians', 'Philippians', 'Colossians',
    '1 Thessalonians', '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
    'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John', '3 John', 'Jude',
    'Revelation',
]
    
# Three-character code for each book, keyed by full book name.
books_kjv_abv = {
    "Genesis": "GEN", "Exodus": "EXO", "Leviticus": "LEV", "Numbers": "NUM",
    "Deuteronomy": "DEU", "Joshua": "JOS", "Judges": "JDG", "Ruth": "RUT",
    "1 Samuel": "1SA", "2 Samuel": "2SA", "1 Kings": "1KI", "2 Kings": "2KI",
    "1 Chronicles": "1CH", "2 Chronicles": "2CH", "Ezra": "EZR", "Nehemiah": "NEH",
    "Esther": "EST", "Job": "JOB", "Psalms": "PSA", "Proverbs": "PRO",
    "Ecclesiastes": "ECC", "Song of Solomon": "SNG", "Isaiah": "ISA", "Jeremiah": "JER",
    "Lamentations": "LAM", "Ezekiel": "EZK", "Daniel": "DAN", "Hosea": "HOS",
    "Joel": "JOL", "Amos": "AMO", "Obadiah": "OBA", "Jonah": "JON",
    "Micah": "MIC", "Nahum": "NAM", "Habakkuk": "HAB", "Zephaniah": "ZEP",
    "Haggai": "HAG", "Zechariah": "ZEC", "Malachi": "MAL",
    "Matthew": "MAT", "Mark": "MRK", "Luke": "LUK", "John": "JHN",
    "Acts": "ACT", "Romans": "ROM", "1 Corinthians": "1CO", "2 Corinthians": "2CO",
    "Galatians": "GAL", "Ephesians": "EPH", "Philippians": "PHP", "Colossians": "COL",
    "1 Thessalonians": "1TH", "2 Thessalonians": "2TH",
    "1 Timothy": "1TI", "2 Timothy": "2TI", "Titus": "TIT", "Philemon": "PHM",
    "Hebrews": "HEB", "James": "JAS", "1 Peter": "1PE", "2 Peter": "2PE",
    "1 John": "1JN", "2 John": "2JN", "3 John": "3JN", "Jude": "JUD",
    "Revelation": "REV",
}


# Country names.
# NOTE(review): presumably the distinct values of the geocoded "country" column — confirm.
countries_kjv = [
    'Syria', 'Jordan', 'Israel', 'Lebanon', 'Iraq',
    'Egypt', 'Türkiye', 'Greece', 'North Macedonia', 'Italy',
    'Libya', 'Cyprus', 'Saudi Arabia', 'Sudan', 'Djibouti',
    'Iran', 'Yemen', 'Somalia', 'Azerbaijan', 'Armenia',
    'Spain', 'Uganda', 'Tunisia', 'Croatia', 'Pakistan',
    'Mozambique', 'India', 'Sri Lanka', 'Eritrea', 'Oman',
    'Kuwait', 'Georgia', 'Bangladesh', 'Malta', 'United Arab Emirates',
]

In [64]:
# Number of rows sharing each coordinate pair, repeated on every row of the pair.
df["count"] = (
    df.groupby(["latitude", "longitude"])["latitude"]
    .transform("count")
)

# Scale the counts linearly onto 50..255, the alpha channel used for the map markers.
min_count = df["count"].min()
max_count = df["count"].max()
df["color_intensity"] = np.interp(df["count"], [min_count, max_count], [50, 255])

In [65]:
# Text preview of the map table, now including the count and colour columns.
print(df)

      book_name  book_number  chapter_number  verse_number  \
0       2 Kings           12               5            12   
1       2 Kings           12               5            12   
2       2 Kings           12               5            12   
3       2 Kings           12               5            12   
4       2 Kings           12               5            12   
...         ...          ...             ...           ...   
17065  1 Samuel            9               9             5   
17066  1 Samuel            9               9             5   
17067  1 Samuel            9               9             5   
17068  1 Samuel            9               9             5   
17069  1 Samuel            9               9             5   

                                              verse_text name_id_ancient  \
0      Are not Abana and Pharpar, rivers of Damascus,...           Abana   
1      Are not Abana and Pharpar, rivers of Damascus,...        Damascus   
2      Are not Abana and Ph

In [66]:
# pydeck fills each `{name}` placeholder in the tooltip from the fields of the hovered row; it
# cannot evaluate a Python expression such as a dict lookup. The Bible.com book code therefore
# has to travel with the data as its own column.
map_points = df.assign(bible_com_book_code=df["book_name"].map(books_kjv_abv))

# One red dot per row; the alpha channel comes from the color_intensity column.
layer = pdk.Layer(
    "ScatterplotLayer",
    data=map_points,
    get_position=["longitude", "latitude"],
    get_fill_color="[255, 0, 0, color_intensity]",
    get_radius=500,
    pickable=True,
)

# Centre the initial view on the mean coordinate of all rows.
mid_lat = df["latitude"].mean()
mid_long = df["longitude"].mean()

view_state = pdk.ViewState(
    longitude=mid_long,
    latitude=mid_lat,
    zoom=5,
    pitch=0,
)

tooltip = {
    "html": (
        "<b>Verse:</b> {book_name} {chapter_number}:{verse_number}<br/>"
        "<b>Scripture:</b> \"{verse_text}\"<br/>"
        "<b>Version:</b> KJV (King James Version)<br/>"
        "<b>Ancient Place:</b> {name_id_ancient}<br/>"
        "<b>Modern Place:</b> {name_id_modern}<br/>"
        "<b>----------</b><br/>"
        # "<b>Address:</b> {formatted_address}<br/>"
        # "<b>Latitude:</b> {latitude}<br/>"
        # "<b>Longitude:</b> {longitude}<br/>"
        "<b>District / County:</b> {administrative_area_level_2}<br/>"
        "<b>State / Province:</b> {administrative_area_level_1}<br/>"
        "<b>Country:</b> {country}<br/>"
        "<b>----------</b><br/>"
        "<b>Google Maps:</b> <a href='https://www.google.com/maps/place/{latitude},{longitude}' target='_blank'>{name_id_modern}</a> <br/>"
        "<b>Bible.com:</b> <a href='https://www.bible.com/bible/1/{bible_com_book_code}.{chapter_number}.{verse_number}' target='_blank'>{book_name} {chapter_number}:{verse_number}</a>"
    ),
    "style": {
        "backgroundColor": "steelblue",
        "color": "white"
    }
}

In [67]:
# display(Markdown(f"### Biblical Places"))

# deck = pdk.Deck(
#     layers=[layer],
#     map_provider="carto",
#     initial_view_state=view_state,
#     height = 1000,
#     tooltip=tooltip,
# )

# deck.show()

### Cross References

In [68]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [69]:
# Load the KJV verse table, store text columns with the pandas 'string' dtype,
# and drop the version / testament-abbreviation columns that are not needed here.
df_kjv = pd.read_csv("../data/kjv.csv")
string_dtypes = dict.fromkeys(df_kjv.select_dtypes(include='object').columns, 'string')
df_kjv = (
    df_kjv
    .astype(string_dtypes)
    .drop(columns=['version_name', 'version_abbr', 'testament_abbr'])
)

In [70]:
# Preview the KJV frame; pandas abbreviates the printed repr of a long frame.
print(df_kjv)

      testament_name   book_name  book_number  chapter_number  verse_number  \
0      Old Testament     Genesis            1               1             1   
1      Old Testament     Genesis            1               1             2   
2      Old Testament     Genesis            1               1             3   
3      Old Testament     Genesis            1               1             4   
4      Old Testament     Genesis            1               1             5   
...              ...         ...          ...             ...           ...   
31097  New Testament  Revelation           66              22            17   
31098  New Testament  Revelation           66              22            18   
31099  New Testament  Revelation           66              22            19   
31100  New Testament  Revelation           66              22            20   
31101  New Testament  Revelation           66              22            21   

                                              verse

In [71]:
# Read the tab-separated cross-reference file: skip its first line, keep only
# the first two fields and name them from_verse / to_verse.
df_cr = pd.read_csv(
    "../data/cross_references.txt",
    sep="\t",
    skiprows=1,
    usecols=[0, 1],
    names=["from_verse", "to_verse"],
    engine='python',
)

In [72]:
# Preview the raw cross references; a `to_verse` may be a single verse or a "start-end" range.
print(df_cr)

       from_verse           to_verse
0         Gen.1.1          1John.1.1
1         Gen.1.1           Ps.134.3
2         Gen.1.1          Ps.102.25
3         Gen.1.1          Isa.51.16
4         Gen.1.1            Eph.3.9
...           ...                ...
344794  Rev.22.21  Eph.6.23-Eph.6.24
344795  Rev.22.21         2Cor.13.14
344796  Rev.22.21        2Thess.3.18
344797  Rev.22.21          Rom.16.20
344798  Rev.22.21            Rom.1.7

[344799 rows x 2 columns]


#### Data Preparation

In [73]:
# Book abbreviations as they appear in the cross-reference file.
books_cr = [
    '1Chr', '1Cor', '1John', '1Kgs', '1Pet', '1Sam', '1Thess', '1Tim',
    '2Chr', '2Cor', '2John', '2Kgs', '2Pet', '2Sam', '2Thess', '2Tim',
    '3John', 'Acts', 'Amos', 'Col', 'Dan', 'Deut', 'Eccl', 'Eph',
    'Esth', 'Exod', 'Ezek', 'Ezra', 'Gal', 'Gen', 'Hab', 'Hag',
    'Heb', 'Hos', 'Isa', 'Jas', 'Jer', 'Job', 'Joel', 'John',
    'Jonah', 'Josh', 'Jude', 'Judg', 'Lam', 'Lev', 'Luke', 'Mal',
    'Mark', 'Matt', 'Mic', 'Nah', 'Neh', 'Num', 'Obad', 'Phil',
    'Phlm', 'Prov', 'Ps', 'Rev', 'Rom', 'Ruth', 'Song', 'Titus',
    'Zech', 'Zeph',
]

In [74]:
# Lookup table from the book abbreviations used in the cross-reference file
# (e.g. "Gen", "1Cor") to the full book names used in the KJV frame's
# `book_name` column (e.g. "Genesis", "1 Corinthians").
book_mapping = {
    # Old Testament (OT)
    "Gen": "Genesis",
    "Exod": "Exodus",
    "Lev": "Leviticus",
    "Num": "Numbers",
    "Deut": "Deuteronomy",
    "Josh": "Joshua",
    "Judg": "Judges",
    "Ruth": "Ruth",
    "1Sam": "1 Samuel",
    "2Sam": "2 Samuel",
    "1Kgs": "1 Kings",
    "2Kgs": "2 Kings",
    "1Chr": "1 Chronicles",
    "2Chr": "2 Chronicles",
    "Ezra": "Ezra",
    "Neh": "Nehemiah",
    "Esth": "Esther",
    "Job": "Job",
    "Ps": "Psalms",
    "Prov": "Proverbs",
    "Eccl": "Ecclesiastes",
    "Song": "Song of Solomon",
    "Isa": "Isaiah",
    "Jer": "Jeremiah",
    "Lam": "Lamentations",
    "Ezek": "Ezekiel",
    "Dan": "Daniel",
    "Hos": "Hosea",
    "Joel": "Joel",
    "Amos": "Amos",
    "Obad": "Obadiah",
    "Jonah": "Jonah",
    "Mic": "Micah",
    "Nah": "Nahum",
    "Hab": "Habakkuk",
    "Zeph": "Zephaniah",
    "Hag": "Haggai",
    "Zech": "Zechariah",
    "Mal": "Malachi",

    # New Testament (NT)
    "Matt": "Matthew",
    "Mark": "Mark",
    "Luke": "Luke",
    "John": "John",
    "Acts": "Acts",
    "Rom": "Romans",
    "1Cor": "1 Corinthians",
    "2Cor": "2 Corinthians",
    "Gal": "Galatians",
    "Eph": "Ephesians",
    "Phil": "Philippians",
    "Col": "Colossians",
    "1Thess": "1 Thessalonians",
    "2Thess": "2 Thessalonians",
    "1Tim": "1 Timothy",
    "2Tim": "2 Timothy",
    "Titus": "Titus",
    "Phlm": "Philemon",
    "Heb": "Hebrews",
    "Jas": "James",
    "1Pet": "1 Peter",
    "2Pet": "2 Peter",
    "1John": "1 John",
    "2John": "2 John",
    "3John": "3 John",
    "Jude": "Jude",
    "Rev": "Revelation"
}

In [75]:
# Abbreviations present in books_cr that have no entry in book_mapping
# (an empty list means the mapping covers every abbreviation).
print(sorted(set(books_cr).difference(book_mapping)))

[]


In [76]:
def parse_verse_string(verse_str, mapping):
    """
    Parse a cross-reference string into its start (and optional end) verse.

    Accepted forms (book abbreviation, chapter and verse separated by '.'):
      - "Eph.6.22"            single verse
      - "Eph.6.22-Eph.6.24"   range with a fully qualified end
      - "Eph.6.22-7.2"        range end inherits the start book
      - "Eph.6.22-24"         range end inherits the start book and chapter

    Parameters
    ----------
    verse_str : str
        The reference to parse.
    mapping : dict
        Book abbreviation -> full book name. If the abbreviation is not found
        in mapping, the original is kept.

    Returns
    -------
    dict or None
        A dictionary with book_name_min, chapter_number_min, verse_number_min,
        book_name_max, chapter_number_max and verse_number_max. Chapter and
        verse numbers are ints; for a single verse the three *_max values are
        np.nan. If the string cannot be parsed (wrong number of fields,
        non-numeric chapter/verse, more than one '-'), a message is printed
        and None is returned.
    """
    # partition() splits on the first '-' only, so a reference with several
    # hyphens reaches the right-part validation below instead of blowing up
    # on tuple unpacking.
    left_part, hyphen, right_part = verse_str.partition('-')
    is_range = bool(hyphen)

    try:
        book, chapter, verse = left_part.split('.')
        # Convert inside the try block so non-numeric fields are reported as
        # parse errors rather than raising out of the function.
        chapter_min, verse_min = int(chapter), int(verse)
    except ValueError:
        label = "Error parsing (left part) verse:" if is_range else "Error parsing verse:"
        print(label, verse_str)
        return None

    if not is_range:
        return {
            'book_name_min': mapping.get(book, book),
            'chapter_number_min': chapter_min,
            'verse_number_min': verse_min,
            'book_name_max': np.nan,
            'chapter_number_max': np.nan,
            'verse_number_max': np.nan
        }

    parts = right_part.split('.')
    try:
        if len(parts) == 3:
            book_max, chapter_max, verse_max = parts
        elif len(parts) == 2:
            # "chapter.verse": the end lies in the same book as the start.
            book_max = book
            chapter_max, verse_max = parts
        elif len(parts) == 1:
            # Bare verse number: same book and chapter as the start.
            book_max, chapter_max, verse_max = book, chapter, parts[0]
        else:
            raise ValueError("too many fields in range end")
        chapter_max, verse_max = int(chapter_max), int(verse_max)
    except ValueError:
        print("Error parsing (right part) verse:", verse_str)
        return None

    return {
        'book_name_min': mapping.get(book, book),
        'chapter_number_min': chapter_min,
        'verse_number_min': verse_min,
        'book_name_max': mapping.get(book_max, book_max),
        'chapter_number_max': chapter_max,
        'verse_number_max': verse_max
    }

In [77]:
def _find_verse_position(frame, book, chapter, verse):
    """Return the 0-based row position of the first matching verse in frame, or None."""
    hit = (
        (frame['book_name'] == book) &
        (frame['chapter_number'] == chapter) &
        (frame['verse_number'] == verse)
    )
    # na_value=False treats a missing comparison result as "no match", which
    # keeps this working for nullable ('string' / 'boolean') columns as well.
    positions = np.flatnonzero(hit.to_numpy(dtype=bool, na_value=False))
    return int(positions[0]) if positions.size else None


def get_verse_text(info, df_kjv):
    """
    Given a dictionary (as returned by parse_verse_string) and the df_kjv dataframe,
    return the verse text.

    Parameters
    ----------
    info : dict
        Must contain book_name_min, chapter_number_min, verse_number_min and
        book_name_max, chapter_number_max, verse_number_max. A missing (NaN)
        book_name_max marks a single-verse reference.
    df_kjv : pandas.DataFrame
        Verse table with book_name, chapter_number, verse_number and verse_text
        columns, assumed to be sorted in Bible order.

    Returns
    -------
    str
        For a single verse, its text. For a range, the texts of every row from
        the start verse through the end verse (inclusive), joined by single
        spaces. An empty string is returned when the start verse is not found,
        or when the end verse does not occur at or after the start. Previously
        a missing end silently returned every verse up to the end of the table.
    """
    start_pos = _find_verse_position(
        df_kjv,
        info['book_name_min'],
        info['chapter_number_min'],
        info['verse_number_min'],
    )
    if start_pos is None:
        return ""

    if pd.isna(info['book_name_max']):
        return df_kjv['verse_text'].iloc[start_pos]

    # Search for the end only from the start row onwards, so a reversed range
    # is treated as "not found" rather than matched before the start.
    end_offset = _find_verse_position(
        df_kjv.iloc[start_pos:],
        info['book_name_max'],
        info['chapter_number_max'],
        info['verse_number_max'],
    )
    if end_offset is None:
        return ""

    texts = df_kjv['verse_text'].iloc[start_pos:start_pos + end_offset + 1]
    return " ".join(texts).strip()

In [78]:
# Accumulator for the row-by-row build in this cell. The loop that fills it is
# commented out, so `results` stays empty when the cell runs as written.
results = []

# for _, row in tqdm(df_cr.iterrows(), total=len(df_cr)):
#     source_str = row['from_verse']
#     target_str = row['to_verse']
    
#     source_info = parse_verse_string(source_str, book_mapping)
#     target_info = parse_verse_string(target_str, book_mapping)
    
#     if source_info is None or target_info is None:
#         continue
    
#     verse_text_source = get_verse_text(source_info, df_kjv)
#     verse_text_target = get_verse_text(target_info, df_kjv)
    
#     results.append({
#         "book_name_source_min": source_info['book_name_min'],
#         "chapter_number_source_min": source_info['chapter_number_min'],
#         "verse_number_source_min": source_info['verse_number_min'],
#         "book_name_source_max": source_info['book_name_max'],
#         "chapter_number_source_max": source_info['chapter_number_max'],
#         "verse_number_source_max": source_info['verse_number_max'],
#         "verse_text_source": verse_text_source,
#         "book_name_target_min": target_info['book_name_min'],
#         "chapter_number_target_min": target_info['chapter_number_min'],
#         "verse_number_target_min": target_info['verse_number_min'],
#         "book_name_target_max": target_info['book_name_max'],
#         "chapter_number_target_max": target_info['chapter_number_max'],
#         "verse_number_target_max": target_info['verse_number_max'],
#         "verse_text_target": verse_text_target,
#     })

In [79]:
# df = pd.DataFrame(results, columns=[
#     "book_name_source_min",
#     "chapter_number_source_min",
#     "verse_number_source_min",
#     "book_name_source_max",
#     "chapter_number_source_max",
#     "verse_number_source_max",
#     "verse_text_source",
#     "book_name_target_min",
#     "chapter_number_target_min",
#     "verse_number_target_min",
#     "book_name_target_max",
#     "chapter_number_target_max",
#     "verse_number_target_max",
#     "verse_text_target"
# ])

In [80]:
# df.to_csv("../data/kjv_cr_all.csv", index=False)

# Load the saved cross-reference table; dtype=str reads every column
# (including chapter and verse numbers) as text.
# NOTE(review): this rebinds `df`, which the earlier map cell passes to
# pdk.Layer as a different table — re-running that cell after this one would
# use the wrong frame. A dedicated name would avoid that; confirm no later
# cell relies on `df` before renaming.
df = pd.read_csv("../data/kjv_cr_all.csv", dtype=str)

In [81]:
# Preview the cross-reference table loaded from kjv_cr_all.csv.
print(df)

       book_name_source_min chapter_number_source_min verse_number_source_min  \
0                   Genesis                         1                       1   
1                   Genesis                         1                       1   
2                   Genesis                         1                       1   
3                   Genesis                         1                       1   
4                   Genesis                         1                       1   
...                     ...                       ...                     ...   
344794           Revelation                        22                      21   
344795           Revelation                        22                      21   
344796           Revelation                        22                      21   
344797           Revelation                        22                      21   
344798           Revelation                        22                      21   

       book_name_source_max

#### Data Checking

In [82]:
# Full book names, listed testament by testament and concatenated into
# the single books_kjv list.
_old_testament_books = [
    'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
    'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel',
    '1 Kings', '2 Kings', '1 Chronicles', '2 Chronicles', 'Ezra',
    'Nehemiah', 'Esther', 'Job', 'Psalms', 'Proverbs',
    'Ecclesiastes', 'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations',
    'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos',
    'Obadiah', 'Jonah', 'Micah', 'Nahum', 'Habakkuk',
    'Zephaniah', 'Haggai', 'Zechariah', 'Malachi',
]
_new_testament_books = [
    'Matthew', 'Mark', 'Luke', 'John', 'Acts',
    'Romans', '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians',
    'Philippians', 'Colossians', '1 Thessalonians', '2 Thessalonians', '1 Timothy',
    '2 Timothy', 'Titus', 'Philemon', 'Hebrews', 'James',
    '1 Peter', '2 Peter', '1 John', '2 John', '3 John',
    'Jude', 'Revelation',
]
books_kjv = _old_testament_books + _new_testament_books

In [83]:
# Collect every book name that appears as the start or end of a source or
# target reference, then discard missing values (single-verse references
# have no *_max book).
_book_name_columns = [
    "book_name_source_min",
    "book_name_source_max",
    "book_name_target_min",
    "book_name_target_max",
]
books_cr = set().union(*(df[column] for column in _book_name_columns))

books_cr_unique = {name for name in books_cr if pd.notna(name)}

In [84]:
# True when every book name found in the cross references is a known KJV book name.
print(books_cr_unique <= set(books_kjv))

True


In [85]:
def _rows_with_partial_range_end(frame, end_columns):
    """
    Return the rows of frame in which some, but not all, of end_columns are missing.

    A range end is consistent when its book / chapter / verse fields are either
    all present or all absent; anything in between is flagged.
    """
    missing = frame[end_columns].isna()
    partially_missing = missing.any(axis=1) & ~missing.all(axis=1)
    return frame[partially_missing]


invalid_cr_source = _rows_with_partial_range_end(
    df,
    ["book_name_source_max", "chapter_number_source_max", "verse_number_source_max"],
)

invalid_cr_target = _rows_with_partial_range_end(
    df,
    ["book_name_target_max", "chapter_number_target_max", "verse_number_target_max"],
)

In [86]:
# Show the flagged rows; an "Empty DataFrame" repr means no partially filled
# range end was found on that side.
print(invalid_cr_source)
print(invalid_cr_target)

Empty DataFrame
Columns: [book_name_source_min, chapter_number_source_min, verse_number_source_min, book_name_source_max, chapter_number_source_max, verse_number_source_max, verse_text_source, book_name_target_min, chapter_number_target_min, verse_number_target_min, book_name_target_max, chapter_number_target_max, verse_number_target_max, verse_text_target]
Index: []
Empty DataFrame
Columns: [book_name_source_min, chapter_number_source_min, verse_number_source_min, book_name_source_max, chapter_number_source_max, verse_number_source_max, verse_text_source, book_name_target_min, chapter_number_target_min, verse_number_target_min, book_name_target_max, chapter_number_target_max, verse_number_target_max, verse_text_target]
Index: []
