## Setup environment and check parameters

In [1]:
import os
import yaml 
import torch 

# Bible data directory 
data_directory = "../data/bible_versions/cuvs"
assert(os.path.isdir(data_directory)), f"Data directory does not exist..."

# Load the embedding config file 
config_file = "./bible_embedding_config.yaml"
assert(os.path.isfile(config_file)), "Embedding config file does not exist..."

with open(config_file, "r") as f: 
    config = yaml.safe_load(f) 

# Load embedding model 
assert("model" in config), f"'model' configuration missed..."

if (config["model"]["framework"] == "huggingface"): 
    model_name = config["model"]["name"] 
    print(f"Loading Huggingface embedding model: {model_name}")

    from langchain_huggingface import HuggingFaceEmbeddings 
    os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

    embedder = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={
            "device": (
                "cuda" 
                if (torch.cuda.is_available()) 
                else "cpu"
            )
        },
        encode_kwargs={
            "normalize_embeddings": False
        }
    )
else: 
    assert(False), f"Unknown embedding framework: {config['model']['framework']}"

# Probe the embedding dimension 
embedding_dim = len(embedder.embed_query("this is a test")) 
print(f"Embedding dimension: {embedding_dim}")

# Load the vector store (client) 
vs_provider = config["vector_store"]["provider"]

if (vs_provider == "qdrant"): 
    # Create Qdrant client 
    from qdrant_client import QdrantClient 
    qdrant_client = QdrantClient(**config["vector_store"]["client_args"])

    # Create the collection if it does not exist 
    from qdrant_client.models import VectorParams, Distance
    collection_name = config["vector_store"]["collection_name"]

    if (not qdrant_client.collection_exists(collection_name)): 
        qdrant_client.create_collection(
            collection_name, 
            vectors_config=VectorParams(
                size=embedding_dim, 
                distance=Distance.COSINE
            )
        )

    # Wrap the Qdrant client with LangChain 
    from langchain_qdrant import QdrantVectorStore 
    vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=collection_name, 
        embedding=embedder
    )

else: 
    assert(False), f"Unsupported vector store provider: {vs_provider}"

Loading Huggingface embedding model: hfl/chinese-roberta-wwm-ext


  from tqdm.autonotebook import tqdm, trange
No sentence-transformers model found with name hfl/chinese-roberta-wwm-ext. Creating a new one with mean pooling.
  return self.fget.__get__(instance, owner)()


Embedding dimension: 768


In [2]:
import uuid 
from tqdm import tqdm 
from pathlib import Path 
from langchain_core.documents import Document 
from bible_study_bot.data.loaders import load_bible_book_from_file, load_verse_context
from bible_study_bot.data.preprocesses import preproc_for_embedding

for book_name, book_params in config["books"].items():
    # Check if the book file exist
    book_file = Path(data_directory, book_params["file"])
    assert(book_file.exists()), f"Book file does not exist: {book_file}"
    assert(book_file.suffix in [".yaml", ".yml"]), f"Unsupported book file format: {book_file.suffix}"

    # Load embedding parameters 
    embedding_context_scope = book_params["context_scope"] 
    n_prev_context_verses = (
        config["defaults"]["n_prev_context_verses"]
        if ("n_prev_context_verses" not in book_params)
        else book_params["n_prev_context_verses"]
    )
    n_next_context_verses = (
        config["defaults"]["n_next_context_verses"]
        if ("n_next_context_verses" not in book_params)
        else book_params["n_next_context_verses"]
    )

    # Load the book file 
    bible_book = load_bible_book_from_file(book_file)

    # Iterate through the verses 
    for bible_verse in tqdm(bible_book.verses, desc=f"Processing book {book_name}...", total=len(bible_book.verses)): 
        verse_context_text = load_verse_context(
            bible_book=bible_book, 
            chapter=bible_verse.metadata["chapter"], 
            verse=bible_verse.metadata["verse"], 
            context_scope=embedding_context_scope, 
            n_prev_context_verses=n_prev_context_verses, 
            n_next_context_verses=n_next_context_verses
        )

        verse_context_text = preproc_for_embedding(verse_context_text)
        
        # Save the text to vector store 
        vector_store.add_documents(
            documents=[Document(
                page_content=verse_context_text, 
                metadata={
                    "text": bible_verse.text, 
                    **bible_book.metadata, 
                    **bible_verse.metadata
                }
            )], 
            ids=[str(uuid.uuid4())]
        )


Processing book genesis...: 100%|██████████| 1533/1533 [12:54<00:00,  1.98it/s]


In [3]:
# rels = vector_store.similarity_search(
#     "谁进入了挪亚的方舟？",
#     k=10
# )