In [81]:
import yaml

# Load the notebook settings (API keys, tenant id) from config.yaml.
# The encoding is pinned to UTF-8 so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open("config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

## extracting

In [1]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the plain text of a PDF as a single string.

    Every page is read with PyMuPDF's "text" extraction mode and stripped of
    leading/trailing whitespace. Pages that are empty after stripping are
    skipped; the remaining page texts are joined with one blank line between
    consecutive pages.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The concatenated text of all non-empty pages, or an empty string when
        no page yields any text.
    """
    # Use the document as a context manager so it is closed (and the file
    # handle released) even if extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text").strip() for page in doc]

    # Drop pages that contained only whitespace before joining.
    return "\n\n".join(text for text in page_texts if text)

In [58]:
# Source PDF, given relative to the notebook's working directory.
path = "doc/Ethan Rasiel, Ph.D., Paul N. Friga - The McKinsey Mind.pdf"

# Pull the text of the whole document into one string.
raw_text = extract_text_from_pdf(path)

## chunking

In [59]:
from langchain.text_splitter import CharacterTextSplitter

# Break the extracted text into chunks: split on newlines, with a target
# chunk size of 500 and an overlap of 50 between neighbouring chunks.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_text(raw_text)

## embedding

In [65]:
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model and encode every chunk in one call.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(chunks)

# Sanity check: report the length of the first embedding vector.
print(f"Vector length: {len(embeddings[0])}")

Vector length: 384


## store in db

### chroma cloud

In [48]:
import chromadb
  
# Connect to the "first_rag" database on Chroma Cloud. The API key and
# tenant come from the loaded config rather than being hard-coded here.
chroma_client = chromadb.CloudClient(
    api_key=config["chroma_api"],
    tenant=config["chroma_tenant"],
    database="first_rag",
)

In [66]:
# Fetch the collection named "my_collection", creating it if it is missing.
collection = chroma_client.get_or_create_collection(
    name="my_collection",
)

In [61]:
# # Cloud collections work the same way!
# collection.add(
#     ids=[f"doc_{i}" for i in range(len(chunks))],  # unique IDs
#     embeddings=embeddings.tolist(),               # must be list of lists!
#     documents=chunks,                              # optional, but useful
#     metadatas=[{"source": "example"} for _ in chunks]  # optional metadata
# )

# print("✅ Vectors stored in Chroma Cloud!")

In [76]:
# Store at most the first 300 chunks instead of the whole document.
# `n` is clamped to the number of chunks actually available: the slices
# below silently shrink when there are fewer than 300 chunks, while the id
# and metadata lists would otherwise still contain 300 entries, leaving the
# four argument lists with different lengths.
n = min(300, len(chunks))
collection.add(
    ids=[f"doc_{i}" for i in range(n)],            # one id per stored chunk
    embeddings=embeddings[:n].tolist(),            # array -> list of lists
    documents=chunks[:n],                          # raw chunk text
    metadatas=[{"source": "example"} for _ in range(n)]
)

### chroma local

In [75]:
# import chromadb
# from chromadb.config import Settings

# chroma_client = chromadb.Client(Settings(
#     chroma_db_impl="duckdb+parquet",
#     persist_directory="./chroma_local"
# ))


## retrieve

In [85]:
# Question to answer from the indexed document.
query = "What is Mckinsey"

# Encode the question locally with the same model used for the chunks.
query_embedding = embedding_model.encode([query])

# Ask the Chroma collection for the three closest stored chunks.
results = collection.query(query_embeddings=query_embedding.tolist(), n_results=3)

In [86]:
# Only one query was sent, so take the first entry of the returned
# documents and merge those chunks into one newline-separated context.
contexts = results["documents"][0]
context_text = "\n".join(contexts)

In [87]:
import google.generativeai as genai

# Authenticate the Gemini SDK with the key stored in the loaded config.
# NOTE(review): the config key is spelled "genimi_api" - presumably a typo
# for "gemini_api"; it has to match config.yaml, so rename both together.
genai.configure(api_key=config["genimi_api"])

# Handle for the Gemini 2.5 Flash model used to generate the answer.
gemini_model = genai.GenerativeModel("gemini-2.5-flash")

In [88]:
# Build a grounded prompt: the model is instructed to answer using only the
# retrieved context, followed by the user's question.
# Note: this goes to `gemini_model` (Gemini), not to a local Llama model.
prompt = f"""Answer the question below using ONLY the context below.

Context:
{context_text}

Question:
{query}

Answer:"""

# Send the prompt to the model and print the text of its reply.
response = gemini_model.generate_content(prompt)

print(response.text)

McKinsey is described as a "unique organization" with consultants and a flat hierarchy. Its consultants work with clients and have freedom of access and action. It is also referred to as "the Firm."
