In [0]:
# Run this in the first cell of your notebook.
# Each package gets its own %pip command (install packages one by one to
# ensure each succeeds), so a failing install is easy to attribute.
%pip install langchain==0.1.0
%pip install chromadb==0.4.0
%pip install sentence-transformers==2.2.2
%pip install openai==1.0.0
# NOTE(review): torch and transformers are not version-pinned, unlike the
# packages above -- consider pinning them so re-runs install the same versions.
%pip install torch
%pip install transformers

# Restart the Python kernel after installation.
# NOTE(review): `dbutils` is not defined in this notebook; presumably it is the
# utility object injected by the Databricks runtime -- confirm before running
# elsewhere. Variables defined before the restart are presumably lost.
dbutils.library.restartPython()

In [0]:
# Run this after restart to verify all packages are installed.
# Prints one status line per required package.
import sys
import pkg_resources

required_packages = ['langchain', 'chromadb', 'sentence_transformers', 'openai', 'torch', 'transformers']

for package in required_packages:
    try:
        version = pkg_resources.get_distribution(package).version
    except pkg_resources.DistributionNotFound:
        # The distribution is genuinely absent from the environment.
        print(f"✗ {package} is NOT installed")
    except Exception as exc:
        # Any other failure (e.g. broken metadata) is reported as such instead
        # of being mislabelled as "not installed". KeyboardInterrupt and
        # SystemExit are deliberately not caught so the cell can be stopped.
        print(f"✗ {package} could not be checked: {exc}")
    else:
        print(f"✓ {package} version {version} is installed")

# If any packages are missing, try installing them individually

In [0]:
# Import with error handling.
# Primary path: LangChain's Chroma wrapper plus its HuggingFaceEmbeddings class.
# If any of those imports fails, fall back to importing sentence-transformers
# directly together with the remaining modules.
try:
    from langchain.vectorstores import Chroma
    from langchain.embeddings import HuggingFaceEmbeddings
    import chromadb
    from chromadb.config import Settings
    import os
    print("✓ All imports successful")
except ImportError as e:
    print(f"Import error: {e}")
    print("Trying alternative import...")

    # Alternative: Use sentence-transformers directly
    from sentence_transformers import SentenceTransformer
    from langchain.vectorstores import Chroma
    import chromadb
    from chromadb.config import Settings
    import os


# Defined unconditionally (not only when the imports above fail): the
# embeddings initialisation further down falls back to this class whenever
# HuggingFaceEmbeddings(...) raises, which can also happen after a fully
# successful import.
class CustomEmbeddings:
    """Minimal embedding adapter backed directly by a SentenceTransformer model.

    Exposes the two methods this notebook calls on an embeddings object:
    ``embed_documents`` and ``embed_query``.
    """

    def __init__(self, model_name):
        """Load the sentence-transformers model identified by ``model_name``."""
        # Imported here so the class works regardless of which import branch
        # above was taken.
        from sentence_transformers import SentenceTransformer

        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Embed a list of strings; returns one list of floats per input text."""
        # normalize_embeddings=True mirrors the encode_kwargs used on the
        # HuggingFaceEmbeddings path, so both paths yield comparable vectors.
        return self.model.encode(
            texts, convert_to_tensor=False, normalize_embeddings=True
        ).tolist()

    def embed_query(self, text):
        """Embed a single string; returns its vector as a list of floats."""
        return self.model.encode(
            [text], convert_to_tensor=False, normalize_embeddings=True
        )[0].tolist()

# Keep the ChromaDB files in a local directory rather than on DBFS;
# create the directory when it does not exist yet.
persist_directory = "/tmp/vector_store"
os.makedirs(persist_directory, exist_ok=True)

# Build the embeddings model, running on CPU
try:
    # Preferred path: the HuggingFaceEmbeddings wrapper
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs=dict(device="cpu"),
        encode_kwargs=dict(normalize_embeddings=True),
    )
except Exception as init_error:
    # Any failure above switches to the sentence-transformers based adapter
    print(f"Standard initialization failed: {init_error}")
    print("Using custom embeddings...")
    embeddings = CustomEmbeddings("sentence-transformers/all-MiniLM-L6-v2")

# Smoke test: embed one short query and report the vector length
test_embedding = embeddings.embed_query("test")
print(f"Embedding dimension: {len(test_embedding)}")

# Configure ChromaDB with local storage.
# chromadb 0.4.x (the version pinned in the install cell) rejects the legacy
# `chroma_db_impl="duckdb+parquet"` setting with a ValueError; persistent local
# storage is created through `PersistentClient` there. The legacy Settings
# form is kept only as a fallback for older chromadb releases that do not
# provide `PersistentClient`.
if hasattr(chromadb, "PersistentClient"):
    chroma_client = chromadb.PersistentClient(
        path=persist_directory,
        settings=Settings(anonymized_telemetry=False),
    )
else:
    chroma_client = chromadb.Client(Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_directory,
        anonymized_telemetry=False
    ))

# Create collection: wrap the client in LangChain's Chroma vector store,
# using the embeddings object initialised earlier in this cell.
collection_name = "knowledge_base"
vector_store = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    client=chroma_client,
    persist_directory=persist_directory
)

print(f"Vector store initialized at {persist_directory}")