In [0]:
# Run this in the first cell of your notebook.
# Installs the libraries used by the cells below via the %pip magic.
# NOTE(review): every package is version-pinned except torch, so the resolved
# torch version may differ between runs — consider pinning it as well.
%pip install langchain==0.1.0 chromadb==0.4.0 sentence-transformers==2.2.2 openai==1.0.0 torch

# Restart the Python kernel after installation.
# `dbutils` is not defined in this notebook; presumably it is the utility
# object provided by the Databricks runtime — this cell will fail elsewhere.
dbutils.library.restartPython()

In [0]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import chromadb
from chromadb.config import Settings
import os

# Ensure the vector-store location is present (no error if it already exists)
os.makedirs("/dbfs/vector_store", exist_ok=True)

# Build the sentence-transformers embedding model:
#   - model_kwargs pins the model to the CPU device
#   - encode_kwargs requests normalized embeddings at encode time
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Smoke test: embed a short query and report the resulting vector length
test_embedding = embeddings.embed_query("test")
print(f"Embedding dimension: {len(test_embedding)}")

# Configure ChromaDB.
# The install cell pins chromadb==0.4.0. From 0.4.0 onward the legacy
# `Settings(chroma_db_impl="duckdb+parquet", ...)` configuration is rejected
# with a ValueError ("deprecated configuration"), so the on-disk client is
# created through `chromadb.PersistentClient` instead.
# NOTE(review): chromadb >= 0.4 persists through SQLite; SQLite on the /dbfs
# FUSE mount may be unreliable — confirm, or persist to local disk and copy.
PERSIST_DIR = "/dbfs/vector_store"
chroma_client = chromadb.PersistentClient(path=PERSIST_DIR)

# Create (or reopen) the collection and wrap it in the LangChain vector store.
# `persist_directory` is still passed alongside the explicit client so the
# wrapper records where the data lives.
collection_name = "knowledge_base"
vector_store = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    client=chroma_client,
    persist_directory=PERSIST_DIR,
)