In [None]:
import os
from pdfplumber import open as open_pdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import chromadb
from chromadb.config import Settings
import getpass

# Get the OpenAI API key from the environment, or prompt for it if it is not set.
# if not os.getenv("OPENAI_API_KEY"):
#     openai_api_key = getpass.getpass("OpenAI API key: ")
#     os.environ["OPENAI_API_KEY"] = openai_api_key

# 1. extract & chunk
# Collect every PDF in the data directory, extract the text of each page with
# pdfplumber, and split the combined text into overlapping chunks.
pdf_dir = "/Users/tanishbhowmick/Documents/Projects/MediMind/MediBackend/ai-training/data"
# os.listdir() returns entries in arbitrary order; sort so the chunk order (and the
# positional chunk IDs derived from it) is reproducible from run to run.
pdf_files = sorted(
    os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf")
)

# Accumulate per-document text in a list and join once, instead of repeatedly
# growing a string inside the loop.
document_texts = []
for pdf_path in pdf_files:
    with open_pdf(pdf_path) as pdf:
        # Fall back to "" when extract_text() yields a falsy value for a page.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Pages are newline-separated; each document ends with a trailing newline.
    document_texts.append("\n".join(page_texts) + "\n")
all_text = "".join(document_texts)

chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_text(all_text)

# 2. embed
# Make sure an OpenAI API key is available before building the embedding client:
# use OPENAI_API_KEY when it is set, otherwise prompt for it without echoing so the
# key never has to be written into the notebook.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Embed all chunks; `vectors` is passed to Chroma together with `chunks` below.
vectors = embeddings.embed_documents(chunks)

# 3. init Chroma with a persist directory
# Open the on-disk store, then fetch the target collection, creating it on first use.
client = chromadb.PersistentClient(path="../.chromadb")
collection = client.get_or_create_collection(
    name="inceptive_data",
    metadata=dict(source="Inceptive Data"),
)

# 4. add data
# IDs are positional ("chunk_0", "chunk_1", ...) and each chunk's metadata records
# the same index.
ids = [f"chunk_{i}" for i in range(len(chunks))]
metadatas = [{"chunk_index": i} for i in range(len(chunks))]
# Use upsert instead of add so this cell is safe to re-run against the persisted
# store: records whose IDs already exist are replaced with the current
# text/embedding rather than being left stale.
collection.upsert(
    documents=chunks,
    embeddings=vectors,
    metadatas=metadatas,
    ids=ids,
)

print(f"Ingest complete: {collection.count()} chunks stored in “.chromadb/”.")


Ingest complete: 295 chunks stored in “.chromadb/”.


In [3]:
# Sanity check: show which collections exist in the persisted Chroma store.
existing_collections = client.list_collections()
print(existing_collections)


[Collection(name=inceptive_data)]
