In [None]:
import os
from pdfplumber import open as open_pdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import chromadb
from chromadb.config import Settings
import getpass

# Ingest pipeline: PDF -> text chunks -> OpenAI embeddings -> persistent Chroma collection.
# The cell is safe to re-run: chunks are upserted under deterministic ids.

# Source PDF. Defaults to the original location; set INCEPTIVE_MANUAL_PDF to
# point at the file on another machine without editing the notebook.
PDF_PATH = os.environ.get(
    "INCEPTIVE_MANUAL_PDF",
    "/Users/tanishbhowmick/Downloads/inceptivemanual_clinician.pdf",
)

# OpenAIEmbeddings takes its credentials from the OPENAI_API_KEY environment
# variable. Prompt only when it is missing so the key never lands in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# 1. extract & chunk
with open_pdf(PDF_PATH) as pdf:
    # `or ""` keeps the join working for pages where extract_text() yields nothing.
    text = "\n".join(p.extract_text() or "" for p in pdf.pages)
chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_text(text)
if not chunks:
    # Fail early with a clear message instead of sending empty lists to the
    # embedding API and to Chroma.
    raise ValueError(f"No text could be extracted from {PDF_PATH!r}; nothing to ingest.")

# 2. embed
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectors = embeddings.embed_documents(chunks)

# 3. init Chroma with a persist directory
client = chromadb.PersistentClient(path=".chromadb")
collection = client.get_or_create_collection(
    name="inceptive_clinician_manual",
    metadata={"source": "Inceptive Clinician Manual"}
)

# 4. add data
# Ids are derived from the chunk position, so a second run reuses the same ids.
# upsert() overwrites those rows; add() would collide with the persisted ones.
# NOTE(review): if a re-run yields fewer chunks, rows with higher indices from
# the earlier run remain in the collection -- delete the collection first when
# changing the chunking parameters.
ids = [f"chunk_{i}" for i in range(len(chunks))]
metadatas = [{"chunk_index": i} for i in range(len(chunks))]
collection.upsert(
    documents=chunks,
    embeddings=vectors,
    metadatas=metadatas,
    ids=ids
)

print(f"Ingest complete: {collection.count()} chunks stored in “.chromadb/”.")


Ingest complete: 244 chunks stored in “.chromadb/”.


In [2]:
# Sanity check: show which collections the persistent Chroma client holds.
existing_collections = client.list_collections()
print(existing_collections)


[Collection(name=inceptive_clinician_manual)]
