In [1]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.openai import OpenAI
from llama_index.core.agent.workflow import FunctionAgent
import chromadb
import openai
import os
import asyncio
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv(dotenv_path=os.path.expanduser("~/.scoracle.env"))

True

In [3]:
# === CONFIGURATION ===
CHROMA_PATH = "../chroma_db"
COLLECTION_NAME = "scoracle_index"
TOP_K = 5
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# === OPENAI API KEY ===
openai.api_key = os.getenv("OPENAI_API_KEY")
if openai.api_key is None:
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

# === Retrieve the Chroma vector store ===
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = chroma_client.get_or_create_collection(COLLECTION_NAME)
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# === Rebuild the index ===
index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = index.as_query_engine(similarity_top_k=TOP_K)

In [6]:
response = await query_engine.aquery("Is scanpy in the knowledge base?")
print(response.source_nodes)



In [4]:
# Check model consistency
print("✅ Embedding model:", Settings.embed_model)

# Confirm embedding dimension
print("✅ Embedding dim:", Settings.embed_model.get_query_embedding("test"))

# Confirm vector store path
print("✅ Chroma path:", CHROMA_PATH)

# Reconnect to collection
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection("scoracle_index")
print("✅ Collection name:", collection.name)
print("✅ # of vectors in Chroma:", collection.count())

✅ Embedding model: model_name='sentence-transformers/all-MiniLM-L6-v2' embed_batch_size=10 callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f4c437bea50> num_workers=None max_length=256 normalize=True query_instruction=None text_instruction=None cache_folder=None show_progress_bar=False
✅ Embedding dim: [0.011573437601327896, 0.0251361895352602, -0.036701854318380356, 0.05932486057281494, -0.0071490430273115635, -0.04119423031806946, 0.07708737999200821, 0.037442535161972046, 0.01244898047298193, -0.00611766055226326, 0.01703425496816635, -0.07701537013053894, -0.0003942012262996286, 0.027909034863114357, -0.015989141538739204, -0.06827524304389954, 0.008884700015187263, -0.02028077282011509, -0.08035991340875626, -0.013074060901999474, -0.04109995812177658, -0.02589803747832775, -0.026538634672760963, 0.03305227309465408, -0.022079160436987877, 0.021046141162514687, -0.05792197957634926, 0.03294876962900162, 0.02970741130411625, -0.06224839389324188, 0.038

In [9]:
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection("scoracle_index")
print("🧠 DB path:", CHROMA_PATH)
print("🧾 Collection list:", chroma_client.list_collections())
print("✅ Collection name:", collection.name)
print("✅ # of vectors in Chroma:", collection.count())
print("🔬 Welcome to scOracle — Ask about single-cell analysis!\n")

🧠 DB path: ../chroma_db
🧾 Collection list: [Collection(name=scoracle_index)]
✅ Collection name: scoracle_index
✅ # of vectors in Chroma: 7904
🔬 Welcome to scOracle — Ask about single-cell analysis!

