In [1]:
from ingestion.file_loader import collect_files_from_path, extract_text_from_file
from ingestion.chunker import simple_text_chunker
from vector_store.embedder import get_openai_embedding
from vector_store.vector_index import add_documents_to_index
import uuid

def ingest_path(path, collection_name="default"):
    """Chunk, embed and index the text of every file collected from ``path``.

    Files are obtained from ``collect_files_from_path`` and their text from
    ``extract_text_from_file``. A file whose extracted content is falsy is
    reported and skipped. Each chunk produced by ``simple_text_chunker`` is
    embedded with ``get_openai_embedding``; if that call raises, the chunk is
    reported and skipped while ingestion carries on with the next chunk.

    Every successfully embedded chunk is recorded with:
      * metadata ``{"source": str(file), "chunk_index": i}``, where ``i`` is
        the chunk's position in the chunker output (skipped chunks leave
        gaps in the indices);
      * a freshly generated random UUID4 string as its id.

    All collected records are handed to ``add_documents_to_index`` in a
    single call once every file has been processed.

    Args:
        path: Forwarded unchanged to ``collect_files_from_path``
            (presumably a file or directory path -- confirm against the loader).
        collection_name: Name forwarded to ``add_documents_to_index``.

    Returns:
        None. Progress and the final summary are printed.
    """
    documents, vectors, metadata_records, record_ids = [], [], [], []

    for source_file in collect_files_from_path(path):
        text = extract_text_from_file(source_file)
        if not text:
            print(f"Skipped empty or unreadable file: {source_file}")
            continue

        for position, piece in enumerate(simple_text_chunker(text)):
            try:
                vector = get_openai_embedding(piece)
            except Exception as err:
                # Best-effort: one failed embedding must not abort the whole run.
                print(f"Embedding failed for chunk {position} in {source_file}: {err}")
                continue

            # The four lists are kept in lockstep: index k in each describes
            # the same chunk.
            documents.append(piece)
            vectors.append(vector)
            metadata_records.append({"source": str(source_file), "chunk_index": position})
            record_ids.append(str(uuid.uuid4()))

    # Nothing survived extraction/embedding: report and leave the index untouched.
    if not documents:
        print("No valid content was ingested.")
        return

    add_documents_to_index(
        collection_name,
        documents=documents,
        embeddings=vectors,
        metadatas=metadata_records,
        ids=record_ids,
    )
    print(f"Ingested {len(documents)} chunks into '{collection_name}' index.")

from vector_store.vector_index import query_index, compile_context

def get_context(query, collection_name='default'):
    """Look up ``query`` in a collection and return the compiled context.

    The query is run through ``query_index`` against ``collection_name`` and
    the raw results are passed straight to ``compile_context``, whose return
    value is returned unchanged.

    Args:
        query: Query forwarded to ``query_index`` (a text string in this
            notebook's example).
        collection_name: Name of the collection to search.

    Returns:
        Whatever ``compile_context`` produces for the query results.
    """
    return compile_context(query_index(collection_name, query))

In [2]:
# Run ingestion over the test-data directory (relative to the notebook's
# working directory), using the default collection name.
path = './data/test_data/'
ingest_path(path)

Ingested 3 chunks into 'default' index.


In [7]:
# Retrieve the compiled context for the query 'docx' from the default collection.
context = get_context('docx')

In [8]:
# Show the retrieved context as this cell's output.
context

['Test DOCX Document\nThis is a paragraph from a test DOCX file.\nIt is being used to validate document ingestion and chunking.']