In [10]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    ServiceContext,
)
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
import os

from html_doc_loader import load_html_documents
from llama_index.core.node_parser import SentenceSplitter

In [5]:
# === CONFIGURATION ===
# Directory holding the HTML documentation pages (read by load_html_documents).
DATA_DIR_DOCS = "../data/scanpy_docs"
# Directory holding the Python source files (.py) to ingest.
DATA_DIR_CODE = "../data/scanpy_src/core"
# Directory holding the tutorial notebooks (.ipynb) to ingest.
DATA_DIR_TUTORIALS = "../data/scanpy_src/tutorials"
# Backward-compatible alias: the original spelling ended in a lowercase "s",
# unlike every other constant here. Kept so existing cells that reference the
# old name keep working; prefer DATA_DIR_TUTORIALS in new code.
DATA_DIR_TUTORIALs = DATA_DIR_TUTORIALS
# NOTE(review): presumably the on-disk location and collection name of the
# Chroma vector store - their use is not visible in this part of the notebook.
CHROMA_PATH = "chroma_db"
COLLECTION_NAME = "scoracle_index"
# Hugging Face model identifier passed to HuggingFaceEmbedding(model_name=...).
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

In [11]:
# Load the HTML documentation under DATA_DIR_DOCS via the project-local helper
# load_html_documents (imported from html_doc_loader). The result is chunked
# by the splitter in the next cell.
html_docs = load_html_documents(DATA_DIR_DOCS)

[INFO] Loaded 298 HTML documents from ../data/scanpy_docs


In [12]:
# Break the loaded HTML documents into overlapping chunks.
# NOTE(review): chunk_size=500 is larger than the max_length=256 reported by
# the embedding model's repr further down. If both limits are token counts,
# the tail of long chunks would presumably be truncated at embedding time -
# confirm, and consider a smaller chunk_size.
splitter = SentenceSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
html_nodes = splitter.get_nodes_from_documents(
    html_docs,
)

In [15]:
# === Source files (.py) ===
# Walk DATA_DIR_CODE, including subdirectories, keeping only Python files.
code_loader = SimpleDirectoryReader(
    required_exts=[".py"],
    recursive=True,
    input_dir=DATA_DIR_CODE,
)
# NOTE(review): despite the "_nodes" name, these are presumably whole loaded
# documents - no splitter is applied to them in the visible cells. Confirm
# whether they should be chunked before indexing.
code_nodes = code_loader.load_data()
# Label every loaded item as source code.
for doc in code_nodes:
    doc.metadata.update({"type": "code"})

In [18]:
# === Tutorial notebooks (.ipynb) ===
# Walk DATA_DIR_TUTORIALs, including subdirectories, keeping only notebooks.
notebook_loader = SimpleDirectoryReader(
    required_exts=[".ipynb"],
    recursive=True,
    input_dir=DATA_DIR_TUTORIALs,
)
notebook_nodes = notebook_loader.load_data()
# Label every loaded item as tutorial material.
for doc in notebook_nodes:
    doc.metadata.update({"type": "tutorial"})
    

In [19]:
# Sanity check: item counts in the order (tutorials, code, HTML chunks).
tuple(map(len, (notebook_nodes, code_nodes, html_nodes)))

(344, 117, 632)

In [20]:
# === Set up embedding model ===
# Instantiate the Hugging Face embedding wrapper for the model named by
# EMBED_MODEL. The output below shows the weights being fetched over HTTP
# when they are downloaded.
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [21]:
# Show the model's repr to inspect its effective settings
# (e.g. embed_batch_size, max_length, normalize).
embed_model

HuggingFaceEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f3f6e4e6110>, num_workers=None, max_length=256, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False)