## Install Dependencies

In [1]:
!pip -q install langchain langchain-community langchain-google-genai pinecone-client langchain-pinecone pypdf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.2/328.2 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.4/476.4 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.7/84.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Keys and config

In [26]:
import os, re, time, json
from datetime import datetime
from google.colab import userdata

os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
os.environ["PINECONE_API_KEY"] = userdata.get('PINECONE_API_KEY')

INDEX_NAME = "resumetan1"
PINECONE_CLOUD = "aws"
PINECONE_REGION = "us-east-1"

RESUME_PDF_PATH = "/content/Tanay_Mehendale_Resume_AI2.pdf" # rename to your uploaded file
RESUME_ID = f"resume_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"

  RESUME_ID = f"resume_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"


## Imports

In [27]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

## Load PDF

In [28]:
loader = PyPDFLoader(RESUME_PDF_PATH)
pages = loader.load()

print("Pages:", len(pages))
print("Sample page snippet:", pages[0].page_content[:300])

Pages: 1
Sample page snippet: Tanay Mehendale 
San Jose, CA (can relocate) | tanay.mehendale@tamu.edu | 979-344-3679 | LinkedIn/tanay-mehendale | Portfolio | GitHub 
SUMMARY 
AI focused Data Engineer with 2.5+ years of experience building systems using Python, LLMs, and cloud platforms. Hands-on 
experience designing AI agents, 


## Chunking - Recursive Character Text Splitter

In [29]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=550,
    chunk_overlap=80,
    separators=[
        "\nEXPERIENCE\n", "\nPROJECTS\n", "\nEDUCATION\n",  "\nSKILLS\n",
        "\n\n", "\n", "•", "-", " ", ""
    ],
)
chunks = splitter.split_documents(pages)
print("Chunks:", len(chunks))


Chunks: 1


## Embedding

In [7]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.0)

LABEL_SET = ["experience", "projects", "skills", "education", "other"]

def label_section(text: str) -> str:
  prompt = ("""
    You label resume text chunks.
    Return exactly one lowercase label from this list: experience, projects, skills, education, other
    Rules:
    - Choose experience if the text looks like a job role with bullets, impacts, responsibilities.
    - Choose projects if it describes a project, build, pipeline, dashboard, system.
    - Choose skills if it is mostly tools, technologies, languages.
    - Choose education if it is school, degree, coursework.
    - Otherwise other.
    Chunk:
    {text[:2000]}
    """)
  out = llm.invoke(prompt).content.strip().lower()
  out = re.sub(r"[^a-z]", "", out)
  return out if out in LABEL_SET else "other"



import re

def guess_section_rule(text: str) -> str | None:
    t = text.lower()

    # strong skills signal
    if ("skills" in t and len(t) < 900) or ("certifications" in t) or ("languages" in t and "python" in t):
        if sum(kw in t for kw in ["python", "sql", "aws", "spark", "tableau", "kafka", "snowflake"]) >= 3:
            return "skills"

    # strong projects signal
    if " | link" in t or "github" in t or "vercel" in t or "deployed" in t:
        return "projects"

    # strong experience signals
    has_dates = bool(re.search(r"\b(20\d{2}|'\d{2})\b", t)) or ("present" in t)
    has_role_words = any(w in t for w in ["intern", "engineer", "analyst", "developer", "assistant"])
    has_action_bullets = ("•" in text) or any(v in t for v in ["built", "designed", "developed", "implemented", "optimized", "migrated", "reduced", "improved"])

    if (has_dates and has_role_words) or (has_role_words and has_action_bullets):
        return "experience"

    return None


In [8]:
LABEL_SET = ["experience", "projects", "skills", "education", "other"]

def label_section_hybrid(text: str) -> str:
    rule = guess_section_rule(text)
    if rule:
        return rule

    prompt = (
        "Return exactly one label from: experience, projects, skills, education, other.\n"
        "You are labeling resume chunks.\n\n"
        "Choose experience for job roles, responsibilities, impact bullets.\n"
        "Choose projects for named projects, systems, pipelines, dashboards, links.\n"
        "Choose skills for tool lists and certifications.\n"
        "Choose education for degree, university, graduation.\n\n"
        "Return only the label.\n\n"
        f"Chunk:\n{text[:1500]}"
    )
    out = llm.invoke(prompt).content.strip().lower()
    out = re.sub(r"[^a-z]", "", out)
    return out if out in LABEL_SET else "other"


## Metadata, id, label

In [9]:
from collections import Counter
import time

for i, d in enumerate(chunks):
    if d.metadata is None:
        d.metadata = {}
    d.metadata["doc_type"] = "resume"
    d.metadata["resume_id"] = RESUME_ID
    d.metadata["chunk_index"] = i
    d.metadata["chunk_id"] = f"{RESUME_ID}:c{i}"

for d in chunks:
    d.metadata["section"] = label_section_hybrid(d.page_content)
    time.sleep(0.02)

print(Counter([d.metadata.get("section") for d in chunks]))

# print 3 examples from each bucket so you can sanity check
for sec in ["experience", "projects", "skills", "other"]:
    print("\nSECTION:", sec)
    shown = 0
    for d in chunks:
        if d.metadata.get("section") == sec:
            print(d.page_content[:220].replace("\n", " "))
            shown += 1
            if shown == 3:
                break


Counter({'projects': 6, 'experience': 2, 'skills': 1})

SECTION: experience
EXPERIENCE  Data Engineer Intern – Texas A&M University | Remote, San Jose, CA Aug 2025 – Present   (Python, ETL, AWS, Data Pipelines, Data Modeling, Pandas, Sentiment Analysis)  • Architected a Python based ETL pipeline
across 25+ subreddits to expand data coverage by 300%  • Optimized performance by resolving API failures, resulting in 100% data capture reliability and a 34% increase in speed  • Integrated LLM based sentiment analysis 

SECTION: projects
Tanay Mehendale  San Jose, CA (can relocate) | tanay.mehendale@tamu.edu | 979-344-3679 | LinkedIn/tanay-mehendale | Portfolio | GitHub  SUMMARY  AI focused Data Engineer with 2.5+ years of experience building systems usi
(SQL, Python, ETL, AWS, S3, Glue, Data Modeling, RCA)  • Developed 45+ custom ERP transactions using ABAP with SQL, improving data retrieval speed by 95% for 400+ users  • Diagnosed and resolved 30+ technical support iss
issues by 84% across 

In [10]:
ids = [d.metadata["chunk_id"] for d in chunks]
print("Unique IDs now:", len(set(ids)))
print("Sample IDs:", ids[:10])

Unique IDs now: 9
Sample IDs: ['resume_20251219_024419:c0', 'resume_20251219_024419:c1', 'resume_20251219_024419:c2', 'resume_20251219_024419:c3', 'resume_20251219_024419:c4', 'resume_20251219_024419:c5', 'resume_20251219_024419:c6', 'resume_20251219_024419:c7', 'resume_20251219_024419:c8']


## Creating Pinecone index

In [11]:
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

test_vec = embeddings.embed_query("dimension test")
dim = len(test_vec)
print("Embedding dimension:", dim)

existing = [idx["name"] for idx in pc.list_indexes()]
if INDEX_NAME not in existing:
  pc.create_index(
  name=INDEX_NAME,
  dimension=dim,
  metric="cosine",
  spec=ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION),
  )
  print("Created index:", INDEX_NAME)
else:
  print("Index already exists:", INDEX_NAME)

Embedding dimension: 768
Created index: resumetan1


## Upsert chunks to Pinecone via LangChain

In [12]:
vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)

texts = [d.page_content for d in chunks]
metadatas = [d.metadata for d in chunks]
ids = [d.metadata["chunk_id"] for d in chunks]

vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids)

print("Upsert complete. Total chunks indexed:", len(ids))

Upsert complete. Total chunks indexed: 9


## Validating Chunk Retrieval and Connections

In [13]:
from pinecone import Pinecone

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
stats = pc.Index(INDEX_NAME).describe_index_stats()
print(stats)


{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 9}},
 'total_vector_count': 9,
 'vector_type': 'dense'}


In [14]:
print("Pages:", len(pages))
print("Chunks:", len(chunks))
print("First 3 chunk lengths:", [len(c.page_content) for c in chunks[:3]])


Pages: 1
Chunks: 9
First 3 chunk lengths: [516, 539, 455]


In [15]:
ids = [d.metadata.get("chunk_id") for d in chunks]
print("IDs count:", len(ids))
print("Unique IDs:", len(set(ids)))
print("Sample IDs:", ids[:10])


IDs count: 9
Unique IDs: 9
Sample IDs: ['resume_20251219_024419:c0', 'resume_20251219_024419:c1', 'resume_20251219_024419:c2', 'resume_20251219_024419:c3', 'resume_20251219_024419:c4', 'resume_20251219_024419:c5', 'resume_20251219_024419:c6', 'resume_20251219_024419:c7', 'resume_20251219_024419:c8']


In [16]:
def preview_hits(query: str, k: int = 5, section_filter=None):
    flt = {"doc_type": "resume", "resume_id": RESUME_ID}

    if section_filter is not None:
        flt["section"] = section_filter

    hits = vectorstore.similarity_search_with_score(
        query,
        k=k,
        filter=flt
    )

    print("Filter used:", flt)
    print("Hits returned:", len(hits))
    print()

    for doc, score in hits:
        cid = doc.metadata.get("chunk_id")
        sec = doc.metadata.get("section")
        print("score:", round(float(score), 4), "| section:", sec, "| id:", cid)
        print(doc.page_content[:180].replace("\n", " "))
        print()

    return hits


In [17]:
preview_hits("Kafka streaming pipeline", k=5)
preview_hits("Kafka streaming pipeline", k=5, section_filter="experience")

Filter used: {'doc_type': 'resume', 'resume_id': 'resume_20251219_024419'}
Hits returned: 5

score: 0.4549 | section: projects | id: resume_20251219_024419:c5
• Developed an AI agent to tailor cover letters by matching user writing style, leveraged rapid prototyping in Cursor  InterviewBuddy | Link | RAG, Python, LLM, Generative AI, Gemi

score: 0.4506 | section: experience | id: resume_20251219_024419:c1
EXPERIENCE  Data Engineer Intern – Texas A&M University | Remote, San Jose, CA Aug 2025 – Present   (Python, ETL, AWS, Data Pipelines, Data Modeling, Pandas, Sentiment Analysis)  •

score: 0.4188 | section: experience | id: resume_20251219_024419:c2
across 25+ subreddits to expand data coverage by 300%  • Optimized performance by resolving API failures, resulting in 100% data capture reliability and a 34% increase in speed  • 

score: 0.4153 | section: projects | id: resume_20251219_024419:c4
issues by 84% across 52+ releases  PROJECTS  ApartmentFinder – AI Agent for Relocations | Lin

[(Document(id='resume_20251219_024419:c1', metadata={'author': 'Resume Editing', 'chunk_id': 'resume_20251219_024419:c1', 'chunk_index': 1.0, 'creationdate': '2025-12-15T14:29:45-08:00', 'creator': 'Microsoft® Word 2021', 'doc_type': 'resume', 'moddate': '2025-12-15T14:29:45-08:00', 'page': 0.0, 'page_label': '1', 'producer': 'Microsoft® Word 2021', 'resume_id': 'resume_20251219_024419', 'section': 'experience', 'source': '/content/Tanay_Mehendale_Resume_AI2.pdf', 'total_pages': 1.0}, page_content='EXPERIENCE \nData Engineer Intern – Texas A&M University | Remote, San Jose, CA Aug 2025 – Present \n (Python, ETL, AWS, Data Pipelines, Data Modeling, Pandas, Sentiment Analysis) \n• Architected a Python based ETL pipeline to consolidate 3 data sources into a multi-layer data lake on Amazon S3, \nreducing data preparation time by 40% and aiding time series analysis \n• Developed scalable logic to ingest 2M+ unstructured raw text via Reddit API, implementing regex-based alias matching \nacro

## Generating Questions

In [30]:
def retrieve_resume_evidence(
    query: str,
    section: str,
    resume_id: str,
    top_k: int = 8
):
    flt = {
        "doc_type": "resume",
        "resume_id": resume_id,
        "section": section
    }

    hits = vectorstore.similarity_search_with_score(
        query,
        k=top_k,
        filter=flt
    )

    evidence = []
    for doc, score in hits:
        evidence.append({
            "chunk_id": doc.metadata.get("chunk_id"),
            "section": doc.metadata.get("section"),
            "score": float(score),
            "text": doc.page_content
        })

    # Deduplicate by chunk_id
    seen = {}
    for e in evidence:
        cid = e["chunk_id"]
        if cid not in seen or e["score"] < seen[cid]["score"]:
            seen[cid] = e

    return list(seen.values())


## Prompts

In [31]:
# Colab cell: Query-time RAG. One prompt. Resume-only. Pinecone already populated.

from pathlib import Path

# Assumes these already exist in your notebook:
# - vectorstore: PineconeVectorStore(index_name=..., embedding=embeddings)
# - llm: ChatGoogleGenerativeAI(...)
# - RESUME_ID: the resume_id you indexed (example: "resume_20251219_021401")

USER_QUERY = "Generate interview questions based on my resume."
OUT_PATH = "/content/interview_questions.md"

N_EXP = 1
N_PROJ = 1
N_SKILL = 1
N_BEH = 2

def _truncate(text: str, max_chars: int = 850) -> str:
    text = (text or "").strip()
    if len(text) <= max_chars:
        return text
    return text[:max_chars].rstrip() + "..."

def _retrieve(section: str, query: str, top_k: int):
    flt = {"doc_type": "resume", "resume_id": RESUME_ID, "section": section}
    hits = vectorstore.similarity_search_with_score(query, k=top_k, filter=flt)

    evidence = []
    for doc, score in hits:
        evidence.append({
            "chunk_id": doc.metadata.get("chunk_id"),
            "section": doc.metadata.get("section"),
            "score": float(score),
            "text": doc.page_content
        })

    # Dedup by chunk_id, keep best score
    best = {}
    for e in evidence:
        cid = e.get("chunk_id")
        if not cid:
            continue
        if cid not in best or e["score"] < best[cid]["score"]:
            best[cid] = e

    out = list(best.values())
    out.sort(key=lambda x: x["score"])
    return out

def _format_block(evidence_list, empty_msg: str):
    if not evidence_list:
        return empty_msg
    parts = []
    for e in evidence_list:
        parts.append(f"[{e['chunk_id']}]\n{_truncate(e['text'])}")
    return "\n\n".join(parts)

def build_single_prompt(user_query: str, exp_block: str, proj_block: str, skill_block: str) -> str:
    return f"""
  You are an interview coach. You generate grounded interview practice content from resume evidence.

  User request:
  {user_query}

  Resume evidence. This is the ONLY source of candidate facts. Do not use outside knowledge.
  Each evidence item starts with an id in square brackets.

  EXPERIENCE EVIDENCE:
  {exp_block}

  PROJECTS EVIDENCE:
  {proj_block}

  SKILLS EVIDENCE:
  {skill_block}

  Rules:
  - Strict grounding. Do not invent employers, titles, dates, tools, metrics, or outcomes not in evidence.
  - If a detail is needed for STAR and not present, write "Missing detail:" and state what is missing.
  - Every question must include an Evidence line listing chunk ids used.
  - Avoid repeating the same question phrasing.

  Task:
  Create a single interview prep pack with these sections:
  1) Experience-Based Questions. Generate exactly {N_EXP} questions.
  2) Project Deep Dive Questions. Generate exactly {N_PROJ} questions.
  3) Skills Verification Questions. Generate exactly {N_SKILL} questions.
  4) Behavioral Questions. Generate exactly {N_BEH} questions grounded in the resume evidence themes.

  Output format. Return Markdown only:

  # Resume-Based Interview Question Pack

  ## Experience-Based Questions
  ### Q1. {{Question}}
  Why They'll Ask This:
  {{Reason}}
  How To Prepare:
  - {{Tip 1}}
  - {{Tip 2}}
  Sample Answer (STAR):
  Situation:
  Task:
  Action:
  Result:
  Evidence:
  - [chunk_id]

  Repeat Q2..Q{N_EXP}.

  ## Project Deep Dive Questions
  Same structure, Q1..Q{N_PROJ}.

  ## Skills Verification Questions
  No STAR required.
  ### Q1. {{Question}}
  Why They'll Ask This:
  {{Reason}}
  How To Prepare:
  - {{Tip}}
  Evidence:
  - [chunk_id]

  Repeat Q2..Q{N_SKILL}.

  ## Behavioral Questions
  Same structure as Experience, Q1..Q{N_BEH}.
  """.strip()

# 1) Retrieve capped evidence per section
exp_evidence = _retrieve(
    section="experience",
    query="work experience responsibilities impact achievements ownership debugging collaboration",
    top_k=2
)

proj_evidence = _retrieve(
    section="projects",
    query="projects built designed implemented deployed pipeline dashboard system RAG vector database langchain",
    top_k=4
)

skill_evidence = _retrieve(
    section="skills",
    query="skills technologies tools languages frameworks cloud databases",
    top_k=1
)

# 2) Format evidence blocks (truncate to control tokens)
exp_block = _format_block(exp_evidence, "None found in index for this section.")
proj_block = _format_block(proj_evidence, "None found in index for this section.")
skill_block = _format_block(skill_evidence, "None found in index for this section.")

# 3) Build one prompt and generate
prompt = build_single_prompt(USER_QUERY, exp_block, proj_block, skill_block)
result_md = llm.invoke(prompt).content

# 4) Save to file
Path(OUT_PATH).write_text(result_md, encoding="utf-8")

print("Resume ID:", RESUME_ID)
print("Experience chunks used:", [e["chunk_id"] for e in exp_evidence])
print("Project chunks used:", [e["chunk_id"] for e in proj_evidence])
print("Skills chunks used:", [e["chunk_id"] for e in skill_evidence])
print("Saved:", OUT_PATH)
print()
print(result_md[:1400])


Resume ID: resume_20251219_030037
Experience chunks used: []
Project chunks used: []
Skills chunks used: []
Saved: /content/interview_questions.md

# Resume-Based Interview Question Pack

## Experience-Based Questions
### Q1. Can you tell me about a significant professional experience you've had that you believe showcases your capabilities?
Why They'll Ask This: To understand your professional background and how you apply your skills in a real-world setting, even if not explicitly detailed on your resume. This helps them gauge your ability to articulate your past roles and responsibilities.
How To Prepare:
- Think of a relevant professional experience, ideally one that highlights skills pertinent to the role you're interviewing for.
- Structure your answer using the STAR method, focusing on your specific contributions and the impact you made.
Sample Answer (STAR):
Situation: Missing detail: No professional experience is provided in your resume evidence to ground this answer.
Task: Miss

In [32]:
print(result_md)

# Resume-Based Interview Question Pack

## Experience-Based Questions
### Q1. Can you tell me about a significant professional experience you've had that you believe showcases your capabilities?
Why They'll Ask This: To understand your professional background and how you apply your skills in a real-world setting, even if not explicitly detailed on your resume. This helps them gauge your ability to articulate your past roles and responsibilities.
How To Prepare:
- Think of a relevant professional experience, ideally one that highlights skills pertinent to the role you're interviewing for.
- Structure your answer using the STAR method, focusing on your specific contributions and the impact you made.
Sample Answer (STAR):
Situation: Missing detail: No professional experience is provided in your resume evidence to ground this answer.
Task: Missing detail: No professional experience is provided in your resume evidence to ground this answer.
Action: Missing detail: No professional experience