## Build a vector store from a duckdb table using custom code

In [1]:
import os
import sys

# Please change the paths to your own paths
paper_parser_dir = "/work/data/projects/data2report/paper-parser"

# Hugging Face cache location. It is set here, before any Hugging Face
# library is imported by the cells below.
os.environ["HF_HOME"] = "/work/data/environments/cache/"

# Make the paper-parser scripts importable
# (presumably where the `embedding` module used below lives — confirm).
script_dir = os.path.join(paper_parser_dir, "scripts")
if script_dir not in sys.path:
    # Guard keeps sys.path free of duplicate entries when this cell is re-run.
    sys.path.append(script_dir)

print(sys.path)

In [None]:
!pip install -qU qdrant-client duckdb

In [2]:
import duckdb

# Sanity check: print the number of rows in the paragraphs parquet file.
row_count_query = "SELECT COUNT(*) FROM read_parquet('/work/data/projects/data2report/paragraphs.parquet') LIMIT 1"

conn = duckdb.connect()
row_count_relation = conn.sql(row_count_query)
row_count_relation.show()

In [3]:
# Run nvidia-smi in a shell to check which GPUs are visible before loading models.
!nvidia-smi

In [None]:
from itertools import islice

from embedding import DuckDBLoader

# How many documents to pull into the in-memory sample.
NUM_PREVIEW_DOCUMENTS = 4096

# Load the paragraphs table through the project's DuckDB loader.
raw_documents = DuckDBLoader(
    "/work/data/projects/data2report/paragraphs.duckdb",
    page_content_column="text",
    metadata_columns=["pmid", "pmc", "doi", "pubdate"],
).load()

# islice stops after the first NUM_PREVIEW_DOCUMENTS items instead of copying
# every loaded document into a temporary list just to slice it.
# NOTE(review): assumes each document is dict-like with a "text" key — confirm
# against embedding.DuckDBLoader.
sentences = [doc.get("text") for doc in islice(raw_documents, NUM_PREVIEW_DOCUMENTS)]

# Display a short preview rather than dumping thousands of strings into the output.
print(f"Collected {len(sentences)} sentences")
sentences[:5]

### Run NV-Embed-v2

In [None]:
# AutoTokenizer is imported here but not used in this cell.
from embedding import EmbeddingVectorDB
import torch
from transformers import AutoModel, AutoTokenizer
from accelerate import init_empty_weights, infer_auto_device_map

# Hugging Face model id of the embedding model to load.
model_name = "nvidia/NV-Embed-v2"

# Instantiate the model inside accelerate's init_empty_weights context so a device map
# can be inferred from it. trust_remote_code=True allows code shipped with the model
# repository to run.
# NOTE(review): accelerate's documented pattern builds the skeleton with
# AutoModel.from_config(...) inside this context; from_pretrained may still fetch/load the
# real weights here — confirm this behaves as intended.
with init_empty_weights():
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Ask accelerate for a placement of the model's modules on the available devices.
device_map = infer_auto_device_map(model)
# Rebind `model` to whatever the project helper returns, handing it the skeleton, the
# device map and float16 as dtype.
# NOTE(review): load_model lives in the project's `embedding` module; its return value
# is not visible from this notebook.
model = EmbeddingVectorDB.load_model(model_name=model_name, model=model, device_map=device_map, torch_dtype=torch.float16)

In [None]:
import os
from embedding import build_vector_db

# Source DuckDB file and destination directory for the NV-Embed-v2 vector store.
duckdb_file = "/work/data/projects/data2report/paragraphs.duckdb"
vector_db_dir = "/work/data/projects/data2report/nvembed-vector-store"

# Run size: documents to embed and documents per batch (2 * 256).
num_documents = 10000
batch_size = 512

# Gather the arguments first so the configuration can be inspected before the run.
# `model` is the object produced by the NV-Embed-v2 loading cell; it is passed
# directly instead of model_name="nvidia/NV-Embed-v2".
nvembed_build_args = dict(
    cache_filepath=vector_db_dir,
    raw_document_path=duckdb_file,
    model=model,
    page_content_column="text",
    metadata_columns=["pmid", "pmc", "doi", "pubdate"],
    num_documents=num_documents,
    batch_size=batch_size,
    allow_batch_mode=True,
)

build_vector_db(**nvembed_build_args)

### Run all-MiniLM-L6-v2

In [4]:
from sentence_transformers import SentenceTransformer
import torch

# Load all-MiniLM-L6-v2 with float16 weights.
# Note: this rebinds `model`, replacing any model loaded by an earlier cell.
minilm_model_id = "sentence-transformers/all-MiniLM-L6-v2"
half_precision_kwargs = {"torch_dtype": torch.float16}

model = SentenceTransformer(
    minilm_model_id,
    trust_remote_code=True,
    model_kwargs=half_precision_kwargs,
)

In [None]:
import os
from embedding import build_vector_db

# Source DuckDB file and destination RocksDB-backed store for the MiniLM run.
duckdb_file = "/work/data/projects/data2report/paragraphs.duckdb"
vector_db_dir = "/work/data/projects/data2report/minillm-rocksdb-2025012500"
batch_size = 4096
# NOTE(review): 0 presumably means "no limit" — confirm against build_vector_db.
num_documents = 0

# RocksDB tuning knobs, exported as environment variables before the build starts.
# Sizes are in bytes: 8 GiB write buffer, 50 GiB block cache.
GIB = 1024 * 1024 * 1024
os.environ.update(
    {
        "ROCKSDB_WRITE_BUFFER_SIZE": str(8 * GIB),
        "ROCKSDB_MAX_WRITE_BUFFER_NUMBER": "3",
        "ROCKSDB_MIN_WRITE_BUFFER_NUMBER_TO_MERGE": "3",
        "ROCKSDB_MAX_BACKGROUND_FLUSHES": "10",
        "ROCKSDB_MAX_BACKGROUND_COMPACTIONS": "10",
        "ROCKSDB_BLOCK_CACHE_SIZE": str(50 * GIB),
    }
)

# `model` is the SentenceTransformer created in the preceding cell; it is passed
# directly instead of model_name="sentence-transformers/all-MiniLM-L6-v2".
minilm_build_args = dict(
    cache_filepath=vector_db_dir,
    raw_document_path=duckdb_file,
    model=model,
    page_content_column="text",
    metadata_columns=["pmid", "pmc", "doi", "pubdate"],
    num_documents=num_documents,
    batch_size=batch_size,
    allow_batch_mode=True,
    num_threads=80,
    next_batch_count=0,
)

build_vector_db(**minilm_build_args)

## Example: Build a vector store from a duckdb table

### Install dependencies

In [11]:
!pip install -qU langchain-community duckdb chromadb "langchain-chroma>=0.1.2" ipywidgets sentence-transformers einops datasets python-dotenv

### Load the environment variables

In [None]:
import os
from dotenv import load_dotenv

# Location of the .env file holding the credentials used by later cells.
# Set DOTENV_PATH to use a different file; the original location stays the default.
dotenv_path = os.environ.get("DOTENV_PATH", "/Users/jy006/.ssh/.env")

# Make a missing file visible instead of silently continuing without credentials.
if not os.path.isfile(dotenv_path):
    print(f"Warning: no .env file found at {dotenv_path}; environment variables were not loaded from it.")

load_dotenv(dotenv_path=dotenv_path)

### Test the duckdb loader

In [6]:
from langchain_community.document_loaders import DuckDBLoader

In [None]:
# Read one paragraphs parquet file through the DuckDB loader: the "text" column
# becomes the page content, the identifier columns become metadata.
sample_parquet_query = "SELECT * FROM read_parquet('/Volumes/Backup/ProjectData/Papers/PMC_OA_Bulk/processed/oa_other/paragraphs/0a96d2c04ab604cd71eed3b268c298c9_20250103_135911.parquet')"
content_columns = ["text"]
identifier_columns = ["pmid", "pmc", "doi", "pubdate"]

loader = DuckDBLoader(
    sample_parquet_query,
    page_content_columns=content_columns,
    metadata_columns=identifier_columns,
)
data = loader.load()

# Show the first loaded document.
data[0]

### Build the knowledge base

In [None]:
from langchain_community.embeddings import OpenAIEmbeddings, OllamaEmbeddings, SentenceTransformerEmbeddings
from langchain_community.document_loaders import DuckDBLoader
from langchain_community.vectorstores import Chroma

import os

def load_vector_store(chroma_db_dir, num_documents=1000, parquet_path=None):
    """Build a Chroma vector store from a paragraphs parquet file and persist it.

    Despite its name, this function creates the store: it reads rows through
    DuckDB, embeds the "text" column with nvidia/NV-Embed-v2 and writes the
    collection to ``chroma_db_dir``. Failures are reported with ``print`` and
    never raised.

    Args:
        chroma_db_dir: Directory in which Chroma persists the collection.
        num_documents: Maximum number of rows to read (used as the SQL LIMIT).
            Coerced to ``int`` so arbitrary text cannot be spliced into the query.
        parquet_path: Parquet file to read. Defaults to the sample PMC OA
            paragraphs file this notebook was written against.

    Returns:
        The created Chroma vector store, or ``None`` if any step failed.
    """
    if parquet_path is None:
        parquet_path = "/Volumes/Backup/ProjectData/Papers/PMC_OA_Bulk/processed/oa_other/paragraphs/0a96d2c04ab604cd71eed3b268c298c9_20250103_135911.parquet"

    def preprocess_metadata(documents):
        """Replace None metadata values with "" on every document (in place)."""
        for doc in documents:
            if doc.metadata:
                # Presumably needed because the vector store rejects None metadata values.
                doc.metadata = {k: (v if v is not None else "") for k, v in doc.metadata.items()}
        return documents

    try:
        # Validate the limit and escape quotes in the path before building the SQL text.
        row_limit = int(num_documents)
        quoted_path = str(parquet_path).replace("'", "''")

        # Load the documents
        raw_documents = DuckDBLoader(
            query=f"SELECT * FROM read_parquet('{quoted_path}') LIMIT {row_limit}",
            page_content_columns=["text"],
            metadata_columns=["pmid", "pmc", "doi", "pubdate"],
        ).load()

        # Paragraphs are embedded as loaded; no text splitting is applied.
        vector_store = Chroma.from_documents(
            documents=preprocess_metadata(raw_documents),
            embedding=SentenceTransformerEmbeddings(
                model_name="nvidia/NV-Embed-v2", model_kwargs={"trust_remote_code": True}
            ),
            persist_directory=str(chroma_db_dir),
        )
        vector_store.persist()
        print("Vector store successfully created and persisted.")
        return vector_store
    except Exception as e:
        # Best effort by design: report the problem and let the notebook continue.
        print(f"Error loading vector store: {e}")
        return None


# Directory of the NV-Embed-v2 Chroma database, resolved against the current
# working directory (the OpenAI variant would live in "./chroma_db_chatgpt").
chroma_db_dir = os.path.abspath("./chroma_db_nvidia")
print("Chroma vector database path:", chroma_db_dir)

# Build the store only when the directory is absent; an existing directory is
# taken as "already built".
if os.path.exists(chroma_db_dir):
    print("Chroma database found. Skipping creation.")
else:
    print("Chroma database not found. Creating a new one...")
    load_vector_store(chroma_db_dir, num_documents=100)

In [None]:
import os

from phi.agent import Agent
from phi.knowledge.langchain import LangChainKnowledgeBase
from phi.model.ollama.chat import Ollama
from phi.model.openai.chat import OpenAIChat
from langchain_chroma import Chroma
from langchain_community.embeddings import OpenAIEmbeddings, OllamaEmbeddings

# Path to Chroma vector database
# NOTE(review): this opens ./chroma_db_chatgpt with OpenAI embeddings, while the
# build cell above writes ./chroma_db_nvidia with NV-Embed-v2 — confirm which
# store/embedding pair is intended.
chroma_db_dir = os.path.abspath("./chroma_db_chatgpt")
print("Chroma vector database path:", chroma_db_dir)

# Get the vector database
db = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory=str(chroma_db_dir),
)

# Query the collection size once and reuse it, so the check and the message
# cannot disagree and the store is not queried twice.
document_count = db._collection.count()
if document_count:
    print(f"Chroma database contains {document_count} documents.")
else:
    print("Chroma database is empty. Please ensure documents are loaded.")

In [None]:
import duckdb

# Preview the sample paragraphs parquet file with a fresh DuckDB connection.
# Note: `data` is rebound here to the query result.
paragraphs_query = "SELECT * FROM read_parquet('/Volumes/Backup/ProjectData/Papers/PMC_OA_Bulk/processed/oa_other/paragraphs/0a96d2c04ab604cd71eed3b268c298c9_20250103_135911.parquet')"

conn = duckdb.connect()
data = conn.sql(paragraphs_query)

data.show()

In [None]:
# How many distinct `pmc` values does the sample parquet file contain?
distinct_pmc_query = "SELECT COUNT(DISTINCT pmc) FROM read_parquet('/Volumes/Backup/ProjectData/Papers/PMC_OA_Bulk/processed/oa_other/paragraphs/0a96d2c04ab604cd71eed3b268c298c9_20250103_135911.parquet')"
distinct_pmc_relation = conn.sql(distinct_pmc_query)
distinct_pmc_relation.show()


In [None]:
# Number of documents to retrieve, shared by the retriever and the knowledge base
# so both agree on how many hits are returned.
NUM_RETRIEVED_DOCUMENTS = 10
probe_query = "observed base pair difference"

# Create a retriever from the vector store. The hit count is configured on the
# retriever itself via search_kwargs, which is where the vector store reads it.
retriever = db.as_retriever(search_kwargs={"k": NUM_RETRIEVED_DOCUMENTS})

# `invoke` replaces the deprecated `get_relevant_documents`.
docs = retriever.invoke(probe_query)
if not docs:
    print("No relevant documents retrieved.")
else:
    print(f"Retrieved {len(docs)} documents.")

    # Create a knowledge base from the vector store
    knowledge_base = LangChainKnowledgeBase(retriever=retriever, num_documents=NUM_RETRIEVED_DOCUMENTS)

    # Initialize the Agent
    # https://docs.phidata.com/agents/knowledge#step-3-agentic-rag
    # NOTE(review): `add_reference_to_prompt` is kept as written; confirm it is a
    # parameter the installed phidata Agent actually recognises.
    kb_agent = Agent(
        model=OpenAIChat(id="gpt-4o"),
        knowledge_base=knowledge_base,
        add_reference_to_prompt=True,
        instructions=[
            "Always prioritize information from the knowledge base over your training data.",
            "If the knowledge base does not contain information relevant to the query, respond with: 'No relevant information found in the knowledge base.'",
            "Do not generate answers based on prior training data unless explicitly instructed.",
        ],
        markdown=True,
    )

    # Test the Agent with a query
    kb_agent.print_response(
        "What was the observed base pair difference between many of the strain types?"
    )

## Run deepspeed

In [None]:
# Launch run_deepspeed.py through the deepspeed launcher on 2 GPUs (master port 60000).
# TRITON_CACHE_DIR is exported to /tmp in the same shell command, so it applies to this launch.
!export TRITON_CACHE_DIR=/tmp && deepspeed --num_gpus 2 --master_port 60000 /work/data/projects/data2report/deepspeed/run_deepspeed.py

## Reorganize the rocksdb database

In [None]:
# Imported locally so this cell also runs on its own in a fresh kernel.
import os

from embedding import CacheDB

# RocksDB tuning knobs, exported before the database is opened.
# Sizes are in bytes: 8 GiB write buffer, 50 GiB block cache.
# NOTE(review): presumably read by CacheDB when it opens the database — confirm
# they are not read at import time of `embedding`.
os.environ["ROCKSDB_WRITE_BUFFER_SIZE"] = str(1024 * 1024 * 1024 * 8)
os.environ["ROCKSDB_MAX_WRITE_BUFFER_NUMBER"] = "3"
os.environ["ROCKSDB_MIN_WRITE_BUFFER_NUMBER_TO_MERGE"] = "3"
os.environ["ROCKSDB_MAX_BACKGROUND_FLUSHES"] = "16"
os.environ["ROCKSDB_MAX_BACKGROUND_COMPACTIONS"] = "16"
os.environ["ROCKSDB_BLOCK_CACHE_SIZE"] = str(1024 * 1024 * 1024 * 50)

# Open the MiniLM store in rocksdb mode and call compact_range() on the underlying
# database handle; the context manager takes care of leaving the CacheDB context afterwards.
with CacheDB(cache_filepath="/work/data/projects/data2report/minillm-rocksdb-2025012500", cache_mode="rocksdb") as cache_db:
    cache_db.db.compact_range()