# Bible Text Preprocessing
This notebook loads the KJV Bible text, builds a structured DataFrame (book, chapter, verse, text), and creates an embedding for each verse.
The embeddings are saved both as a CSV file and in a ChromaDB vector store.

In [11]:
# Import required libraries
import pandas as pd
import numpy as np

#for creating embeddings
import pandas as pd
import os
import shutil
import time
import json
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document
import chromadb
from langchain.vectorstores import Chroma
from tqdm.auto import tqdm


In [12]:
# Read the KJV Bible text file
with open('assets/bible/kjv.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Skip the header lines (first two lines)
bible_lines = lines[2:]

# Initialize lists to store the components
books = []
chapters = []
verses = []
texts = []

# Process each line; every data line is expected to look like
# "<book> <chapter>:<verse>\t<verse text>"
for line_number, line in enumerate(bible_lines, start=3):
    stripped = line.strip()
    if not stripped:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        # instead of failing on the tuple unpacking below
        continue

    # Split on the FIRST tab only, so a stray tab inside the verse text
    # cannot break the reference/text split
    reference, tab, text = stripped.partition('\t')
    if not tab:
        raise ValueError(
            f"Line {line_number}: expected '<reference>\\t<text>', got {stripped!r}"
        )

    # The last whitespace-separated token is "chapter:verse";
    # everything before it is the book name (which may itself contain spaces)
    *book_parts, chapter_verse = reference.split()
    book = ' '.join(book_parts)
    chapter, verse = chapter_verse.split(':')

    # Append to respective lists
    books.append(book)
    chapters.append(int(chapter))
    verses.append(int(verse))
    texts.append(text)

# Create DataFrame
df = pd.DataFrame({
    'book': books,
    'chapter': chapters,
    'verse': verses,
    'text': texts
})

# Display the first few rows
print(f"Total verses: {len(df)}")
df.head()

Total verses: 31102


Unnamed: 0,book,chapter,verse,text
0,Genesis,1,1,In the beginning God created the heaven and th...
1,Genesis,1,2,"And the earth was without form, and void; and ..."
2,Genesis,1,3,"And God said, Let there be light: and there wa..."
3,Genesis,1,4,"And God saw the light, that [it was] good: and..."
4,Genesis,1,5,"And God called the light Day, and the darkness..."


In [13]:
# Counts
# Group once by (book, chapter) and reuse the grouping for both the
# chapter total and the per-chapter breakdown below
chapter_groups = df.groupby(['book', 'chapter'])

total_books = df['book'].nunique()
total_chapters = len(chapter_groups.size())
total_verses = len(df)

print(f"Total books: {total_books}")
print(f"Total chapters: {total_chapters}")
print(f"Total verses: {total_verses}")

# For a detailed breakdown:
# number of verses plus lowest/highest verse number in every chapter
print("\nDetailed breakdown:")
verse_stats = {'verse': ['count', 'min', 'max']}
breakdown = chapter_groups.agg(verse_stats).reset_index()
print(breakdown)

Total books: 66
Total chapters: 1189
Total verses: 31102

Detailed breakdown:
              book chapter verse        
                           count min max
0     1 Chronicles       1    54   1  54
1     1 Chronicles       2    55   1  55
2     1 Chronicles       3    24   1  24
3     1 Chronicles       4    43   1  43
4     1 Chronicles       5    26   1  26
...            ...     ...   ...  ..  ..
1184     Zechariah      13     9   1   9
1185     Zechariah      14    21   1  21
1186     Zephaniah       1    18   1  18
1187     Zephaniah       2    15   1  15
1188     Zephaniah       3    20   1  20

[1189 rows x 5 columns]


In [14]:
def save_checkpoint(checkpoint_file: str, last_processed_idx: int, completed_books: list):
    """Save progress checkpoint.

    The state is written as a JSON object with the keys
    ``last_processed_idx`` and ``completed_books``.

    The file is written to ``<checkpoint_file>.tmp`` first and then moved into
    place with ``os.replace``, so an interruption in the middle of the write
    cannot leave a truncated (unparseable) checkpoint behind.

    Args:
        checkpoint_file: Path of the JSON checkpoint file.
        last_processed_idx: Row index to resume embedding generation from.
        completed_books: Names of the books already added to the vector store.
    """
    # Make sure the target directory exists; dirname is '' for a bare filename
    checkpoint_dir = os.path.dirname(checkpoint_file)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    tmp_file = f"{checkpoint_file}.tmp"
    with open(tmp_file, 'w') as f:
        json.dump({
            'last_processed_idx': last_processed_idx,
            'completed_books': list(completed_books)
        }, f)

    # Swap the finished file into place in one step
    os.replace(tmp_file, checkpoint_file)

def load_checkpoint(checkpoint_file: str):
    """Return the saved progress, or a fresh state when no checkpoint exists.

    Args:
        checkpoint_file: Path of the JSON checkpoint file.

    Returns:
        The parsed JSON content of ``checkpoint_file`` if the file exists,
        otherwise ``{'last_processed_idx': 0, 'completed_books': []}``.
    """
    # Guard clause: nothing saved yet, start from the beginning
    if not os.path.exists(checkpoint_file):
        return {'last_processed_idx': 0, 'completed_books': []}

    with open(checkpoint_file, 'r') as handle:
        saved_state = json.load(handle)
    return saved_state

def _embed_verses_to_csv(
    df: pd.DataFrame,
    embeddings_model: OpenAIEmbeddings,
    batch_size: int,
    csv_path: str,
    checkpoint_file: str,
    start_idx: int,
    completed_books: set
) -> int:
    """Embed every verse from row ``start_idx`` onward and append each batch to the CSV.

    Returns the index of the first row that has not been embedded yet.
    """
    remaining_df = df.iloc[start_idx:]
    n_batches = (len(remaining_df) + batch_size - 1) // batch_size
    print(f"Processing {len(remaining_df)} remaining verses in {n_batches} batches...")

    # Defined before the loop so it exists when there is nothing left to embed
    # (resume after the CSV stage finished) and when the very first batch fails
    last_processed_idx = start_idx

    try:
        for batch_start in tqdm(range(0, len(remaining_df), batch_size)):
            batch_end = min(batch_start + batch_size, len(remaining_df))
            batch_df = remaining_df.iloc[batch_start:batch_end].copy()

            # Generate embeddings for batch
            batch_df['embedding'] = [
                embeddings_model.embed_query(text)
                for text in batch_df['text']
            ]

            # Save batch to CSV
            if batch_start == 0 and start_idx == 0:
                # Fresh run: start a new file with a header row
                batch_df.to_csv(csv_path, mode='w', header=True, index=False)
            else:
                # Resumed run or later batch: append; write the header only if
                # the file is missing, so the CSV never ends up headerless
                batch_df.to_csv(
                    csv_path,
                    mode='a',
                    header=not os.path.exists(csv_path),
                    index=False
                )

            # Update checkpoint only after the batch is safely on disk
            last_processed_idx = start_idx + batch_end
            save_checkpoint(checkpoint_file, last_processed_idx, list(completed_books))

            time.sleep(0.1)  # Rate limit delay

    except Exception as e:
        print(f"Error during embedding generation at index {last_processed_idx}: {str(e)}")
        print(f"Progress saved. Restart the script to continue from index {last_processed_idx}")
        raise

    print("Embeddings generation complete!")
    return last_processed_idx


def _add_verses_to_chroma(
    df: pd.DataFrame,
    embeddings_model: OpenAIEmbeddings,
    batch_size: int,
    persist_directory: str,
    checkpoint_file: str,
    last_processed_idx: int,
    completed_books: set
) -> None:
    """Add every book not listed in ``completed_books`` to the Chroma store, one book at a time."""
    # PersistentClient is constructed the same way whether or not the directory exists
    client = chromadb.PersistentClient(path=persist_directory)
    vectorstore = Chroma(
        client=client,
        embedding_function=embeddings_model,
        persist_directory=persist_directory
    )

    # NOTE(review): add_documents receives only the text, so the store presumably
    # embeds each verse again through embedding_function; confirm whether the
    # embeddings already written to the CSV could be reused instead.
    print("Adding to ChromaDB...")
    for book in df['book'].unique():
        if book in completed_books:
            print(f"Skipping {book} (already processed)")
            continue

        book_df = df[df['book'] == book]

        try:
            for batch_start in range(0, len(book_df), batch_size):
                batch = book_df.iloc[batch_start:batch_start + batch_size]

                documents = []
                ids = []
                for row in batch.itertuples(index=False):
                    chapter = int(row.chapter)
                    verse = int(row.verse)
                    documents.append(Document(
                        page_content=row.text,
                        metadata={
                            'book': row.book,
                            'chapter': chapter,
                            'verse': verse
                        }
                    ))
                    # Deterministic id per verse: re-adding a book that failed
                    # midway then targets the same ids instead of creating
                    # duplicates under fresh random ids.
                    # Assumes (book, chapter, verse) is unique in df.
                    ids.append(f"{row.book} {chapter}:{verse}")

                vectorstore.add_documents(documents, ids=ids)

            completed_books.add(book)
            save_checkpoint(checkpoint_file, last_processed_idx, list(completed_books))
            print(f"Added {len(book_df)} verses from {book}")

        except Exception as e:
            print(f"Error processing book {book}: {str(e)}")
            print(f"Progress saved. Restart the script to continue from book {book}")
            raise

    # Persist the vector store
    vectorstore.persist()


def process_verses_with_resume(
    df: pd.DataFrame, 
    embeddings_model: OpenAIEmbeddings,
    batch_size: int = 100,
    csv_path: str = 'assets/bible/kjv_embeddings.csv',
    persist_directory: str = 'assets/bible/chromadb',
    checkpoint_file: str = 'assets/bible/embedding_checkpoint.json'
) -> None:
    """
    Process Bible verse embeddings with failure recovery.

    Runs two stages and records progress in ``checkpoint_file`` so an
    interrupted run can be restarted:

    1. Embed each verse in batches and append the batches to ``csv_path``.
    2. Add the verses, book by book, to a Chroma store in ``persist_directory``.

    The checkpoint file is deleted once both stages finish.

    Args:
        df: Verses with the columns 'book', 'chapter', 'verse' and 'text'.
        embeddings_model: Model used for the CSV embeddings and as the
            embedding function of the Chroma store.
        batch_size: Number of verses handled per batch (must be >= 1).
        csv_path: Output CSV for the verses plus their embeddings.
        persist_directory: Directory of the persistent Chroma database.
        checkpoint_file: JSON file holding the resume state.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Any error from either stage is re-raised after a progress
            message has been printed.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Create directories (dirname is '' when csv_path is a bare filename)
    csv_dir = os.path.dirname(csv_path)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    # Load checkpoint if exists
    checkpoint = load_checkpoint(checkpoint_file)
    start_idx = checkpoint['last_processed_idx']
    completed_books = set(checkpoint['completed_books'])

    # Report the state of an existing CSV when resuming
    if os.path.exists(csv_path) and start_idx > 0:
        # Only the row count is needed, so read a single column rather than
        # materialising every embedding in memory
        existing_rows = len(pd.read_csv(csv_path, usecols=[0]))
        print(f"Resuming from index {start_idx}, found {existing_rows} existing records")
        if existing_rows != start_idx:
            # Can happen when a run died after a batch was appended but before
            # its checkpoint was saved
            print(
                f"Warning: CSV has {existing_rows} rows but the checkpoint expects "
                f"{start_idx}; the CSV may contain duplicated or missing verses"
            )

    last_processed_idx = _embed_verses_to_csv(
        df,
        embeddings_model,
        batch_size,
        csv_path,
        checkpoint_file,
        start_idx,
        completed_books
    )

    # ChromaDB Processing
    try:
        _add_verses_to_chroma(
            df,
            embeddings_model,
            batch_size,
            persist_directory,
            checkpoint_file,
            last_processed_idx,
            completed_books
        )

        # Clear checkpoint after successful completion
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)

        print("Processing complete!")

    except Exception as e:
        print(f"Error during ChromaDB processing: {str(e)}")
        print("Embeddings CSV is safe. Restart the script to continue ChromaDB processing.")
        raise

# Embedding model handed to the pipeline below
embeddings_model = OpenAIEmbeddings()

# Run (or resume) the embedding pipeline on the parsed verses
process_verses_with_resume(
    df=df,
    embeddings_model=embeddings_model,
    batch_size=100,  # Adjust based on your needs
)


Processing 31102 remaining verses in 312 batches...


  0%|          | 0/312 [00:00<?, ?it/s]

Embeddings generation complete!
Adding to ChromaDB...
Added 1533 verses from Genesis
Added 1213 verses from Exodus
Added 859 verses from Leviticus
Added 1288 verses from Numbers
Added 959 verses from Deuteronomy
Added 658 verses from Joshua
Added 618 verses from Judges
Added 85 verses from Ruth
Added 810 verses from 1 Samuel
Added 695 verses from 2 Samuel
Added 816 verses from 1 Kings
Added 719 verses from 2 Kings
Added 942 verses from 1 Chronicles
Added 822 verses from 2 Chronicles
Added 280 verses from Ezra
Added 406 verses from Nehemiah
Added 167 verses from Esther
Added 1070 verses from Job
Added 2461 verses from Psalm
Added 915 verses from Proverbs
Added 222 verses from Ecclesiastes
Added 117 verses from Song of Solomon
Added 1292 verses from Isaiah
Added 1364 verses from Jeremiah
Added 154 verses from Lamentations
Added 1273 verses from Ezekiel
Added 357 verses from Daniel
Added 197 verses from Hosea
Added 73 verses from Joel
Added 146 verses from Amos
Added 21 verses from Obadia

In [15]:
# Reference snippet for reloading the saved CSV and ChromaDB store.
# It is wrapped in a string literal, so running this cell executes none of it;
# being the last expression in the cell, the string is echoed as the cell output.
# NOTE(review): in the cells shown, `persist_directory` exists only as a parameter of
# process_verses_with_resume (default 'assets/bible/chromadb'), so it would have to be
# assigned at notebook scope before this snippet could run - confirm before enabling.
'''
# load csv
df = pd.read_csv('assets/bible/kjv_embeddings.csv')

#load chromadb
client = chromadb.PersistentClient(path=persist_directory)
vectorstore = Chroma(client=client, embedding_function=embeddings_model, persist_directory=persist_directory)
'''

"\n# load csv\ndf = pd.read_csv('assets/bible/kjv_embeddings.csv')\n\n#load chromadb\nclient = chromadb.PersistentClient(path=persist_directory)\nvectorstore = Chroma(client=client, embedding_function=embeddings_model, persist_directory=persist_directory)\n"