In [81]:
import yaml

# load config
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

## extracting

In [1]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    all_text = []
    
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text("text")
        text = text.strip()
        if text:
            all_text.append(text)
    
    full_text = "\n\n".join(all_text)
    return full_text

In [58]:
path = "doc/Ethan Rasiel, Ph.D., Paul N. Friga - The McKinsey Mind.pdf"
raw_text = extract_text_from_pdf(pdf_path=path)

## chunking

In [113]:
from langchain.text_splitter import CharacterTextSplitter

# 2️⃣ Chunk your text
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=50,
    chunk_overlap=10
)
chunks = text_splitter.split_text(raw_text)

Created a chunk of size 75, which is longer than the specified 50
Created a chunk of size 67, which is longer than the specified 50
Created a chunk of size 125, which is longer than the specified 50
Created a chunk of size 127, which is longer than the specified 50
Created a chunk of size 122, which is longer than the specified 50
Created a chunk of size 91, which is longer than the specified 50
Created a chunk of size 127, which is longer than the specified 50
Created a chunk of size 119, which is longer than the specified 50
Created a chunk of size 118, which is longer than the specified 50
Created a chunk of size 124, which is longer than the specified 50
Created a chunk of size 121, which is longer than the specified 50
Created a chunk of size 122, which is longer than the specified 50
Created a chunk of size 125, which is longer than the specified 50
Created a chunk of size 127, which is longer than the specified 50
Created a chunk of size 129, which is longer than the specified 5

## embedding

In [114]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Embed text
embeddings = embedding_model.encode(chunks)
print(f"Vector length: {len(embeddings[0])}")

KeyboardInterrupt: 

## store in db

### chroma cloud

In [48]:
import chromadb
  
chroma_client = chromadb.CloudClient(
  api_key=config['chroma_api'],
  tenant=config['chroma_tenant'],
  database='first_rag'
)

In [66]:
# 3️⃣ Create or get collection
collection = chroma_client.get_or_create_collection(name="my_collection")

In [61]:
# # Cloud collections work the same way!
# collection.add(
#     ids=[f"doc_{i}" for i in range(len(chunks))],  # unique IDs
#     embeddings=embeddings.tolist(),               # must be list of lists!
#     documents=chunks,                              # optional, but useful
#     metadatas=[{"source": "example"} for _ in chunks]  # optional metadata
# )

# print("✅ Vectors stored in Chroma Cloud!")

In [76]:
# Just store the first 200 chunks instead of all
n = 300
collection.add(
    ids=[f"doc_{i}" for i in range(n)],
    embeddings=embeddings[:n].tolist(),
    documents=chunks[:n],
    metadatas=[{"source": "example"} for _ in range(n)]
)

### chroma local

In [100]:
import chromadb

# Local vector DB (PersistentClient = new style!)
chroma_client = chromadb.PersistentClient(path="./vector_database")

collection = chroma_client.get_or_create_collection(name="my_local_collection")

In [101]:
# Cloud collections work the same way!
collection.add(
    ids=[f"doc_{i}" for i in range(len(chunks))],  # unique IDs
    embeddings=embeddings.tolist(),               # must be list of lists!
    documents=chunks,                              # optional, but useful
    metadatas=[{"source": "example"} for _ in chunks]  # optional metadata
)

## retrieve

In [111]:
query = "How to think like McKinsey?"

# Embed locally
query_embedding = embedding_model.encode([query])

# Search Chroma Cloud
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=3
)

# Extract relevant chunks
contexts = results['documents'][0]
context_text = "\n".join(contexts)

In [107]:
contexts

['after already achieving great success with his solo effort on his first\nbook, The McKinsey Way. Second, he offered incredible insight,\nenthusiasm, and direction. And third, he taught me lessons in per-\nsistence, storytelling, and unselfishness. —Paul N. Friga\n\x02  \x02  \x02\nThe authors owe thanks to many without whom this book\nwould not be in your hands. First, their agent, Daniel Greenberg at\nJames Levine Communications, Inc.; their editor, Mary Glenn;',
 'without a recognizable structure or factual support? When\nMcKinsey-ites exit the Firm, they are often shocked by the sloppy\nthinking processes prevalent in many organizations.\nMost of us are not blessed from birth with the ability to think\nin a rigorous, structured manner; we have to learn how. Unfortu-\nnately, that skill is not part of most university curricula, and few\ncompanies have the resources or the inclination to teach it to their',
 'MIND\nMCKINSEY\nCopyright © 2002 by The McGraw-Hill Companies, Inc. All ri

In [108]:
import google.generativeai as genai

# Configure the SDK
genai.configure(api_key=config['genimi_api'])

# Create the Gemini 2.5 Flash model
gemini_model = genai.GenerativeModel("gemini-2.5-flash")

In [109]:
# response = gemini_model.generate_content(query)

# print(response.text)

In [112]:
# Call your local Llama
prompt = f"""Answer the question below using ONLY the context below.

Context:
{context_text}

Question:
{query}

Answer:"""

response = gemini_model.generate_content(prompt)

print(response.text)

The ability to think in a rigorous, structured manner is a skill that must be learned, as most people are not born with it. McKinsey-ites do not engage in sloppy thinking processes.
