In [1]:
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Lookup table of book abbreviations; its 'Short Name' and 'Book' columns are
# read further down when verse references are resolved to full book names.
# NOTE(review): absolute, machine-specific path - this cell only runs where the
# CSV exists at exactly this location.
filepath = "C:\\Users\\hlmq\\code\\esv_bible_qa\\app\\src\\short_name_book_mapping.csv"

book_lookup_df = pd.read_csv(filepath)

In [3]:
# ------------- #
# HTML Parsing
# ------------- #

def parse_html_file(file_path):
    """
    Reads an HTML file from disk and parses it with BeautifulSoup.

    Args:
        file_path (str): Location of the HTML file to read (decoded as UTF-8).

    Returns:
        bs4.BeautifulSoup | None: The parsed document, or None when no file
            exists at file_path (an error message is printed in that case).
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            # Built with the stdlib-backed 'html.parser' backend.
            return BeautifulSoup(handle.read(), 'html.parser')
    except FileNotFoundError:
        # Best-effort: report the problem and let the caller check for None.
        print(f"Error: File not found at '{file_path}'")
        return None
    


# ------------- #
# Verses
# ------------- #

def match_book_names_to_short_names(short_code:str, book_lookup_df:pd.DataFrame=book_lookup_df)->str:
    """
    Looks up the full book name for an abbreviated book code.

    Args:
        short_code (str): Abbreviation as it appears at the start of a verse
            reference, e.g. the 'Ge' in 'Ge 1:1-2'.
        book_lookup_df (pd.DataFrame): Lookup table with a 'Short Name' column
            and a 'Book' column.  Defaults to the table loaded earlier in the
            notebook.

    Returns:
        str: The 'Book' value of the first row whose 'Short Name' equals
            short_code.

    Raises:
        IndexError: If no row has a 'Short Name' equal to short_code.
    """
    matches = book_lookup_df.loc[book_lookup_df['Short Name'] == short_code, 'Book'].values
    if len(matches) == 0:
        # Same exception type the bare `[0]` used to raise, but the message now
        # names the abbreviation that could not be resolved.
        raise IndexError(f"No book found for short name '{short_code}'")
    return matches[0]


def parse_verse(list_of_verses:list[str]):
    """
    Breaks verse references such as 'Ge 1:1-2' into four parallel lists.

    Args:
        list_of_verses (list[str]): References of the form
            '<short code> <chapter>:<verse>' or '<short code> <chapter>:<start>-<end>'.

    Returns:
        tuple: (book_list, chapter_list, start_verse_list, end_verse_list), each
            with one entry per input reference.  Books are full names from the
            lookup table; chapters and verses are kept as strings.
    """
    books, chapters, first_verses, last_verses = [], [], [], []

    for reference in list_of_verses:
        # 'Ge 1:1-2' -> ['Ge', '1:1-2']
        tokens = reference.split(" ")

        # Resolve the abbreviation to the full book name
        full_name = match_book_names_to_short_names(short_code=tokens[0])

        # '1:1-2' -> chapter '1', verse part '1-2'
        chapter_and_verses = tokens[1].split(":")
        chapter_number = chapter_and_verses[0]
        verse_part = chapter_and_verses[1]

        if "-" in reference:
            # A range: the start precedes the hyphen, the end follows the
            # first hyphen of the whole reference.
            first = verse_part.partition("-")[0]
            last = reference.split("-")[1]
        else:
            # A single verse is its own start and end.
            first = last = verse_part

        books.append(full_name)
        chapters.append(chapter_number)
        first_verses.append(first)
        last_verses.append(last)

    return books, chapters, first_verses, last_verses


# ------------- #
# Summaries
# ------------- #

def clean_verse_summaries(list_of_summaries:list[str])->list[str]:
    """
    Converts each summary to a string and removes the summary-cell markup.

    Every occurrence of '<td class="summary">' and '</td>' is deleted from the
    string form of each entry; anything else (including other tags) is kept.
    """
    opening_tag = "<td class=\"summary\">"
    closing_tag = "</td>"

    # str() first, so entries do not have to be strings already.
    return [
        str(summary).replace(opening_tag, "").replace(closing_tag, "")
        for summary in list_of_summaries
    ]




## Handle Verses

In [4]:
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
file_path = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Bible\\Outline for Book genesis.html"

# Parse the html (parse_html_file returns None when the file is missing)
soup = parse_html_file(file_path)

# Start from empty results so a missing file cannot leave these names undefined
# (fresh kernel) or still bound to values from an earlier run (reused kernel).
verses = []
summaries = []

if soup:
    # Extract the text of every verse link, e.g. 'Ge 1:1-2'
    verses = [a.text for a in soup.find_all('a', class_="vcVerseLink")]

    # Extract all summary cells (kept as tags; the markup is stripped in a later cell)
    summaries = list(soup.find_all('td', class_="summary"))

In [5]:
# Split every verse reference into parallel lists: full book name, chapter,
# start verse and end verse (one entry per reference in `verses`).
book_list, chapter_list, start_verse_list, end_verse_list = parse_verse(verses)

## Handle Summaries

In [6]:
# Turn each summary cell into a plain string with the <td class="summary"> / </td> markup removed.
clean_summaries = clean_verse_summaries(summaries)

## Compile Results and Output

In [7]:
# Assemble the parsed pieces into a single table, one row per verse reference.
output = pd.DataFrame(
    {
        'Book': book_list,
        'Chapter': chapter_list,
        'Start Verse': start_verse_list,
        'End Verse': end_verse_list,
        'Summary': clean_summaries,
    }
)

output.head()

Unnamed: 0,Book,Chapter,Start Verse,End Verse,Summary
0,Genesis,1,1,2,God creates heaven and earth;
1,Genesis,1,3,5,the light;
2,Genesis,1,6,8,the firmament;
3,Genesis,1,9,13,separates the dry land;
4,Genesis,1,14,19,"forms the sun, moon, and stars;"


In [8]:
output.tail()

Unnamed: 0,Book,Chapter,Start Verse,End Verse,Summary
321,Genesis,50,22,22,His age.
322,Genesis,50,23,23,He sees the third generation of his sons.
323,Genesis,50,24,24,He prophesies unto his brethren of their return.
324,Genesis,50,25,25,He takes an oath of them concerning his bones.
325,Genesis,50,26,26,"He dies, and is put into a coffin."
