## Set up environment and check parameters

In [1]:
import os
import yaml 
import torch 

# Directory holding the Bible book files; it must exist before anything else runs.
data_directory = "../data/bible_versions/cuvs"
assert os.path.isdir(data_directory), "Data directory does not exist..."

# Load the embedding config file.
config_file = "./bible_embedding_config.yaml"
assert os.path.isfile(config_file), "Embedding config file does not exist..."

# Read with an explicit encoding so parsing does not depend on the platform's
# locale default (the book files in this notebook are also read as UTF-8).
with open(config_file, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Load the embedding model described by the "model" section of the config.
assert "model" in config, "'model' configuration missed..."

if config["model"]["framework"] == "huggingface":
    model_name = config["model"]["name"]
    print(f"Loading Huggingface embedding model: {model_name}")

    # protobuf picks its backend from this variable when it is first imported,
    # so set it BEFORE importing anything that may pull protobuf in.
    # NOTE(review): assumes nothing imported earlier in the kernel has already
    # loaded protobuf -- confirm on a fresh kernel.
    os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
    from langchain_huggingface import HuggingFaceEmbeddings

    # Run on the GPU when one is visible to torch, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    embedder = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
        # Embeddings are deliberately left un-normalized.
        encode_kwargs={"normalize_embeddings": False},
    )
else:
    # Raised explicitly (rather than `assert False`) so the check survives
    # `python -O`; the exception type and message are unchanged.
    raise AssertionError(f"Unknown embedding framework: {config['model']['framework']}")

Loading Huggingface embedding model: sentence-transformers/all-mpnet-base-v2


  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
from tqdm import tqdm 
from pathlib import Path 
from bible_study_bot.data.definitions import BibleBook

# For every configured book: validate its file, resolve the context-window
# parameters, load the verses and build the context text for each verse.
for book_name, book_params in config["books"].items():
    # Check that the book file exists and is a YAML file.
    book_file = Path(data_directory, book_params["file"])
    assert book_file.exists(), f"Book file does not exist: {book_file}"
    assert book_file.suffix in [".yaml", ".yml"], f"Unsupported book file format: {book_file.suffix}"

    # Load embedding parameters; a per-book value overrides config["defaults"].
    embedding_context_scope = book_params["context_scope"]
    n_prev_context_verses = (
        book_params["n_prev_context_verses"]
        if "n_prev_context_verses" in book_params
        else config["defaults"]["n_prev_context_verses"]
    )
    n_next_context_verses = (
        book_params["n_next_context_verses"]
        if "n_next_context_verses" in book_params
        else config["defaults"]["n_next_context_verses"]
    )

    # The scope does not change between verses, so validate it once per book.
    # "book" is the only supported scope: the window is clipped only at the
    # book's first and last verse, which the slice bounds below already do.
    if embedding_context_scope != "book":
        raise AssertionError(f"Unknown embedding context scope: {embedding_context_scope}")

    # Load the book file; the handle is closed as soon as the YAML is parsed.
    with open(book_file, "r", encoding="utf-8") as f:
        bible_book_dict = yaml.safe_load(f)
    bible_book = BibleBook(**bible_book_dict)

    # Iterate through the verses.
    n_verses = len(bible_book.verses)
    for i_bible_verse, bible_verse in tqdm(
        enumerate(bible_book.verses),
        desc=f"Processing book {book_name}...",
        total=n_verses,
    ):
        # Window = n_prev verses before, the verse itself, n_next verses after.
        # Slice ends are exclusive, hence the "+ 1"; max() keeps a negative
        # start from wrapping around to the end of the list.
        context_start = max(0, i_bible_verse - n_prev_context_verses)
        context_end = min(n_verses, i_bible_verse + n_next_context_verses + 1)
        verse_context = bible_book.verses[context_start:context_end]

        # Text to embed: the stripped verse texts joined by single spaces.
        verse_context_text = " ".join(verse.text.strip() for verse in verse_context)

        # TODO: Save the text to vector store



Embedding book: genesis


Processing book genesis...: 100%|██████████| 1533/1533 [00:00<00:00, 381843.82it/s]
