In [1]:
import pandas as pd
import re

In [2]:
# Functions

def parse_bible_text(pageContent):
    """
    Parse bible pageContent and split it into separate entries based on verse numbers.
    
    This function looks for patterns like:
    - 4:1 (chapter:verse at beginning)
    - 2( or 3h or 4A etc. (verse number followed by letter)
    
    Args:
        pageContent (str): The raw bible pageContent to parse
        
    Returns:
        list: List of dictionaries containing parsed verses
    """
    
    # Remove extra whitespace and normalize pageContent
    pageContent = re.sub(r'\s+', ' ', pageContent.strip())
    
    # Find all verse markers - looking for number followed by letter or punctuation
    # Pattern explanation:
    # \b(\d+) - word boundary followed by one or more digits (verse number)
    # (?=[a-zA-Z(]) - positive lookahead for letter or opening parenthesis
    verse_pattern = r'\b(\d+)(?=[a-zA-Z(])'
    
    # Also look for chapter:verse pattern at the beginning
    chapter_verse_pattern = r'\b(\d+:\d+)\s'
    
    # Find all matches
    verse_matches = list(re.finditer(verse_pattern, pageContent))
    chapter_matches = list(re.finditer(chapter_verse_pattern, pageContent))
    
    # Combine and sort all matches by position
    all_matches = []
    
    # Add chapter:verse matches
    for match in chapter_matches:
        all_matches.append({
            'start': match.start(),
            'end': match.end(),
            'verse_ref': match.group(1),
            'type': 'chapter_verse'
        })
    
    # Add verse-only matches
    for match in verse_matches:
        all_matches.append({
            'start': match.start(),
            'end': match.start() + len(match.group(1)),  # Only include the number, preserve letter
            'verse_ref': match.group(1),
            'type': 'verse'
        })
    
    # Sort by position
    all_matches.sort(key=lambda x: x['start'])
    
    # Split text based on matches
    parsed_verses = []
    
    if not all_matches:
        # No verses found, return the whole pageContent
        return [{'verse_ref': 'unknown', 'pageContent': pageContent.strip(), 'type': 'unknown'}]
    
    # Handle pageContent before first verse (usually heading)
    if all_matches[0]['start'] > 0:
        heading_pageContent = pageContent[:all_matches[0]['start']].strip()
        if heading_pageContent:
            parsed_verses.append({
                'verse_ref': 'heading',
                'pageContent': heading_pageContent,
                'type': 'heading'
            })
    
    # Process each verse
    for i, match in enumerate(all_matches):
        # Determine where this verse ends
        if i + 1 < len(all_matches):
            verse_end = all_matches[i + 1]['start']
        else:
            verse_end = len(pageContent)
        
        # Extract verse pageContent
        verse_text = pageContent[match['end']:verse_end].strip()
        
        if verse_text:  # Only add if there's actual content
            parsed_verses.append({
                'verse_ref': match['verse_ref'],
                'pageContent': verse_text,
                'type': match['type']
            })
    
    return parsed_verses


def clean_csv_data(df, text_column='pageContent'):
    """
    Apply the parsing function to a pandas DataFrame.
    
    Args:
        df (DataFrame): The input DataFrame
        text_column (str): Name of the column containing the text to parse
        
    Returns:
        DataFrame: New DataFrame with parsed verses
    """
    
    all_parsed = []
    
    for index, row in df.iterrows():
        text = str(row[text_column])
        parsed_verses = parse_bible_text(text)
        
        for verse in parsed_verses:
            new_row = row.copy()
            new_row['original_index'] = index
            new_row['verse_ref'] = verse['verse_ref']
            new_row['parsed_text'] = verse['pageContent']
            new_row['verse_type'] = verse['type']
            all_parsed.append(new_row)
    
    return pd.DataFrame(all_parsed)


# Replace special characters like â€œ with proper quotes
def clean_special_characters(text):
    """Clean common encoding issues in text"""
    replacements = {
        'â€œ': '"',
        'â€': '"', 
        'â€™': "'",
        'â€˜': "'",
        'â€"': '—',
        'â€"': '–'
    }
    
    # Apply all replacements - this handles characters anywhere in the text
    for old, new in replacements.items():
        text = text.replace(old, new)
    
    return text


def consolidate_rows_without_verses(df, text_column='pageContent'):
    """
    Consolidate rows by appending text from rows without verse numbers 
    to the previous row.
    
    Args:
        df (DataFrame): The input DataFrame
        text_column (str): Name of the column containing the text to check
        
    Returns:
        DataFrame: New DataFrame with consolidated rows
    """
    
    def has_verse_numbers(text):
        """Check if text contains verse number patterns"""
        if pd.isna(text) or text.strip() == '':
            return False
            
        text = str(text)
        
        # Check for chapter:verse pattern (e.g., "4:1")
        chapter_verse_pattern = r'\b\d+:\d+\b'
        if re.search(chapter_verse_pattern, text):
            return True
            
        # Check for verse number followed by letter or punctuation (e.g., "2But", "3(")
        verse_pattern = r'\b\d+(?=[a-zA-Z(])'
        if re.search(verse_pattern, text):
            return True
            
        return False
    
    # Create a copy of the dataframe to work with
    consolidated_df = df.copy()
    consolidated_rows = []
    
    i = 0
    while i < len(consolidated_df):
        current_row = consolidated_df.iloc[i].copy()
        current_text = str(current_row[text_column])
        
        # Check if current row has verse numbers
        if has_verse_numbers(current_text):
            # This row has verse numbers, keep it as is
            consolidated_rows.append(current_row)
        else:
            # This row doesn't have verse numbers
            if len(consolidated_rows) > 0:
                # Append to the previous row
                previous_row = consolidated_rows[-1]
                previous_text = str(previous_row[text_column])
                
                # Combine the text with a space
                combined_text = previous_text + " " + current_text
                previous_row[text_column] = combined_text
                
                print(f"Consolidated row {i}: '{current_text[:50]}...' -> appended to previous row")
            else:
                # No previous row to append to, keep this row
                consolidated_rows.append(current_row)
                print(f"Row {i}: No previous row to append to, keeping as standalone")
        
        i += 1
    
    # Convert back to DataFrame
    result_df = pd.DataFrame(consolidated_rows)
    result_df.reset_index(drop=True, inplace=True)
    
    print(f"\nConsolidation complete:")
    print(f"  Original rows: {len(df)}")
    print(f"  Consolidated rows: {len(result_df)}")
    print(f"  Rows merged: {len(df) - len(result_df)}")
    
    return result_df

In [3]:
# Variables

# Import data
import_filename = 'ESV_extracted_text.csv'
import_filepath = "C:\\Users\\hlmq\\code\\bible-app\\Process ESV\\Out\\"
# Export data
export_filename = 'ESV_Bible_Extracted_and_Parsed.csv'
export_filepath = "C:\\Users\\hlmq\\code\\bible-app\\Process ESV\\Out\\"

In [4]:
# Import Data
df = pd.read_csv(str(import_filepath)+str(import_filename))

# Delete rows with null values in 'pageContent' column
mask = df['pageContent'].isnull()
df = df[~mask]

df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,docName,pageContent,pageNumber
0,ESV Bible.pdf,The Word Became Flesh\r\n1:1 In the beginning ...,4998
1,ESV Bible.pdf,"right to become children of God, 13who were bo...",4999
2,ESV Bible.pdf,The Testimony of John the Baptist\r\n19And thi...,5000
3,ESV Bible.pdf,baptizing.,5001
4,ESV Bible.pdf,"Behold, the Lamb of God\r\n29The next day he s...",5002


In [5]:
# Apply character cleaning first
if 'pageContent' in df.columns:
    df['pageContent'] = df['pageContent'].apply(clean_special_characters)
    print("\nCharacter cleaning applied.")

# Apply the parsing function
consolidated_df = consolidate_rows_without_verses(df, text_column='pageContent')
cleaned_df = clean_csv_data(df, text_column='pageContent')
print(f"\nParsing complete. Expanded from {len(df)} to {len(cleaned_df)} rows")


Character cleaning applied.
Consolidated row 3: 'baptizing....' -> appended to previous row
Consolidated row 7: 'of God ascending and descending on the Son of
Man...' -> appended to previous row
Consolidated row 11: 'from the dead, his disciples remembered that he
h...' -> appended to previous row
Consolidated row 78: 'So the Pharisees said to one another, “You see
th...' -> appended to previous row
Consolidated row 81: 'the light, believe in the light, that you may beco...' -> appended to previous row
Consolidated row 89: 'was night....' -> appended to previous row
Consolidated row 105: 'these things to you, that in me you may have peace...' -> appended to previous row
Consolidated row 128: 'announced to the disciples, “I have seen the Lord”...' -> appended to previous row

Consolidation complete:
  Original rows: 135
  Consolidated rows: 127
  Rows merged: 8

Parsing complete. Expanded from 135 to 982 rows

Parsing complete. Expanded from 135 to 982 rows


In [6]:
output_file = str(export_filepath) + str(export_filename)
cleaned_df.to_csv(output_file, index=False)
print(f"\n=== EXPORT ===")
print(f"Cleaned data saved to: {output_file}")


=== EXPORT ===
Cleaned data saved to: C:\Users\hlmq\code\bible-app\Process ESV\Out\ESV_Bible_Extracted_and_Parsed.csv
