In [1]:
import pandas as pd
import re

In [2]:
def extract_heading(row):
    """
    Return the heading text carried by a row, if the row is a heading.

    A row counts as a heading when its 'verse_ref' field equals the literal
    string 'heading'; in that case its 'parsed_text' value is returned.
    Every other row yields None.
    """
    is_heading_row = row['verse_ref'] == 'heading'
    return row['parsed_text'] if is_heading_row else None

def clean_special_characters(text):
    """
    Strip unwanted symbols from a piece of text.

    Word characters, whitespace and the punctuation marks , . ! ? ; : ' " -
    are kept; every other character is deleted. Missing values (as judged by
    pd.isna) and the empty string are handed back unchanged.
    """
    # pd.isna must be tested first so missing values never reach the
    # string comparison or the regex call.
    if not (pd.isna(text) or text == ''):
        # Negated character class: anything NOT in the allowed set is removed.
        return re.sub(r'[^\w\s,.!?;:\'\"-]', '', text)
    return text

def extract_chapter(verse_ref):
    """
    Extracts the chapter number from the verse_ref.

    Parameters
    ----------
    verse_ref : object
        A verse reference. Only strings containing '<digits>:' (for example
        '1:1') carry a chapter; any other value has none.

    Returns
    -------
    str or None
        The run of digits immediately before the first colon that follows
        digits, as a string (e.g. '1' for '1:1'), or None when the reference
        has no such pattern or is not a string at all.
    """
    # Non-string values (e.g. the float NaN pandas uses for an empty CSV
    # cell) would make re.search raise TypeError, so treat them as
    # "no chapter" instead of crashing the whole apply().
    if not isinstance(verse_ref, str):
        return None

    match = re.search(r'(\d+):', verse_ref)
    return match.group(1) if match else None

def extract_book(parsed_text):
    """
    Find the book name in parsed_text, taken to be its first all-caps word.

    The text is split on whitespace and the first token that is longer than
    one character and entirely upper-case is returned. None is returned when
    the text is missing, empty, or contains no such token.
    """
    if pd.isna(parsed_text) or parsed_text == '':
        return None

    # Lazily scan the tokens; single-character tokens are rejected so that
    # stand-alone capitals such as 'I' or 'A' are not taken for a book name.
    upper_tokens = (
        token
        for token in parsed_text.split()
        if len(token) > 1 and token.isupper()
    )
    return next(upper_tokens, None)


In [3]:
# Source CSV location (directory and file name are kept as separate constants)
INPUT_FILEPATH = "C:\\Users\\hlmq\\code\\bible-app\\Process ESV\\Out\\"
INPUT_FILENAME = 'ESV_Bible_Extracted_and_Parsed.csv'

# Load the parsed verses into a DataFrame
input_csv = INPUT_FILEPATH + INPUT_FILENAME
df = pd.read_csv(input_csv)

In [4]:
# Preview the first rows of the freshly loaded frame to check its columns
df.head()

Unnamed: 0,pageContent,original_index,verse_ref,parsed_text,verse_type
0,JOHN\nChapter 1\nChapter 2\nChapter 3\nChapter...,0,unknown,JOHN Chapter 1 Chapter 2 Chapter 3 Chapter 4 C...,unknown
1,The Word Became Flesh\n1:1 In the beginning wa...,1,heading,The Word Became Flesh,heading
2,The Word Became Flesh\n1:1 In the beginning wa...,1,1:1,"In the beginning was the Word, and the Word wa...",chapter_verse
3,The Word Became Flesh\n1:1 In the beginning wa...,1,2,He was in the beginning with God.,verse
4,The Word Became Flesh\n1:1 In the beginning wa...,1,3,"All things were made through him, and without ...",verse


In [5]:
# Pull the heading text out of heading rows (None everywhere else), then
# carry each heading forward so the rows that follow it are labelled with it
headings_per_row = df.apply(extract_heading, axis=1)
df['heading'] = headings_per_row.ffill()

In [6]:
# Read the chapter out of references shaped like '<chapter>:<verse>' and
# propagate it down to the rows that follow, whose reference has no chapter
chapter_per_row = df['verse_ref'].apply(extract_chapter)
df['chapter'] = chapter_per_row.ffill()
df.head()

Unnamed: 0,pageContent,original_index,verse_ref,parsed_text,verse_type,heading,chapter
0,JOHN\nChapter 1\nChapter 2\nChapter 3\nChapter...,0,unknown,JOHN Chapter 1 Chapter 2 Chapter 3 Chapter 4 C...,unknown,,
1,The Word Became Flesh\n1:1 In the beginning wa...,1,heading,The Word Became Flesh,heading,The Word Became Flesh,
2,The Word Became Flesh\n1:1 In the beginning wa...,1,1:1,"In the beginning was the Word, and the Word wa...",chapter_verse,The Word Became Flesh,1.0
3,The Word Became Flesh\n1:1 In the beginning wa...,1,2,He was in the beginning with God.,verse,The Word Became Flesh,1.0
4,The Word Became Flesh\n1:1 In the beginning wa...,1,3,"All things were made through him, and without ...",verse,The Word Became Flesh,1.0


In [7]:
# Create the 'book' column from the rows that can carry a book title.
#
# The 'heading' column has already been forward-filled by this point, so
# testing it for NaN alone only matches the rows before the very first
# heading. Title rows of any later book were therefore never inspected and
# their verses silently inherited the previous book's name. Rows the parser
# could not classify (verse_ref == 'unknown', like the opening title row) are
# now inspected as well.
# NOTE(review): assumes every book's title/contents row is marked
# verse_ref == 'unknown'; if other unclassified rows can contain all-caps
# words, tighten this mask - verify against the full CSV.
is_title_row = df['heading'].isna() | (df['verse_ref'] == 'unknown')

# Rows outside the mask become NaN, for which extract_book returns None
df['book'] = df['parsed_text'].where(is_title_row).apply(extract_book)

# Forward fill the book names to apply them to all relevant rows
df['book'] = df['book'].ffill()

In [18]:
# Drop every row filed under a heading that contains 'Footnotes', together
# with the heading rows themselves (verse_ref == 'heading'), then report how
# many rows remain
under_footnotes = df['heading'].str.contains('Footnotes', na=False)
is_heading_row = df['verse_ref'] == 'heading'
df = df[~under_footnotes & ~is_heading_row]
len(df)

963

In [19]:
# Inspect the first rows after heading and footnote rows have been removed
df.head()

Unnamed: 0,pageContent,original_index,verse_ref,parsed_text,verse_type,heading,chapter,book
2,The Word Became Flesh\n1:1 In the beginning wa...,1,1:1,"In the beginning was the Word, and the Word wa...",chapter_verse,The Word Became Flesh,1,JOHN
3,The Word Became Flesh\n1:1 In the beginning wa...,1,2,He was in the beginning with God.,verse,The Word Became Flesh,1,JOHN
4,The Word Became Flesh\n1:1 In the beginning wa...,1,3,"All things were made through him, and without ...",verse,The Word Became Flesh,1,JOHN
5,The Word Became Flesh\n1:1 In the beginning wa...,1,4,"In him was life,1 and the life was the light o...",verse,The Word Became Flesh,1,JOHN
6,The Word Became Flesh\n1:1 In the beginning wa...,1,5,"The light shines in the darkness, and the dark...",verse,The Word Became Flesh,1,JOHN


In [20]:
# Inspect the last rows of the filtered frame.
# NOTE(review): the recorded output shows 'The Lame Beggar Healed' rows
# (chapter 3) labelled book 'JOHN' - presumably these belong to a later book;
# confirm the 'book' column is being assigned correctly.
df.tail()

Unnamed: 0,pageContent,original_index,verse_ref,parsed_text,verse_type,heading,chapter,book
1149,The Lame Beggar Healed\n3:1 Now Peter and John...,91,6,"But Peter said, I have no silver and gold, but...",verse,The Lame Beggar Healed,3,JOHN
1150,The Lame Beggar Healed\n3:1 Now Peter and John...,91,7,And he took him by the right hand and raised h...,verse,The Lame Beggar Healed,3,JOHN
1151,The Lame Beggar Healed\n3:1 Now Peter and John...,91,8,"And leaping up he stood and began to walk, and...",verse,The Lame Beggar Healed,3,JOHN
1152,The Lame Beggar Healed\n3:1 Now Peter and John...,91,9,And all the people saw him walking and praisin...,verse,The Lame Beggar Healed,3,JOHN
1153,The Lame Beggar Healed\n3:1 Now Peter and John...,91,10,and recognized him as the one who sat at the B...,verse,The Lame Beggar Healed,3,JOHN


In [11]:
# NOTE(review): this cell's recorded execution count (In [11]) predates the
# filtering cell (In [18]), so its saved output (1154) is a stale row count;
# re-run the notebook top to bottom or remove this cell.
len(df)

1154

In [21]:
# Save the processed frame in the same directory as the input, leaving the
# DataFrame index out of the file
OUTPUT_FILENAME = 'ESV_Bible_Processed.csv'
df.to_csv(INPUT_FILEPATH + OUTPUT_FILENAME, index=False)