In [1]:
# 2. Import required libraries
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from pinecone import Pinecone, ServerlessSpec

# 3. Load Environment Variables
# Create a .env file with your PINECONE_API_KEY and GOOGLE_API_KEY
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Set them as environment variables if needed
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
os.chdir("../")

In [9]:
# 1. Create a directory named 'job_documents' and place your resume and job description PDFs inside.

# 2. Load documents from the directory
def load_documents(directory_path):
    # Supports various file types, focusing on PDFs here
    loader = DirectoryLoader(
        directory_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents


In [10]:
docs = load_documents("data")
print(f"Loaded {len(docs)} documents.")

Loaded 5 documents.


In [11]:
# Assume you have a unique identifier for the current user or session
current_user_id = "user_abc_123"

# Split documents into smaller chunks (as before)
def split_text_into_chunks(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100
    )
    chunks = text_splitter.split_documents(documents)
    return chunks

text_chunks = split_text_into_chunks(docs)

# **NEW STEP: Add the user_id to each chunk's metadata**
for chunk in text_chunks:
    chunk.metadata['user_id'] = current_user_id

print(f"Added user ID '{current_user_id}' to the metadata of {len(text_chunks)} chunks.")

# The next step, `PineconeVectorStore.from_documents`, will automatically
# save this new metadata along with the embeddings in Pinecone.

Added user ID 'user_abc_123' to the metadata of 10 chunks.


In [12]:
print(text_chunks)

[Document(metadata={'producer': 'pdfTeX-1.40.20', 'creator': 'LaTeX with hyperref', 'creationdate': '2020-08-29T04:02:33+00:00', 'author': '', 'keywords': '', 'moddate': '2020-08-29T04:02:33+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.20 (TeX Live 2019) kpathsea version 6.3.1', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data\\jakes-resume.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'user_id': 'user_abc_123'}, page_content='Jake Ryan\n123-456-7890 | jake@su.edu | linkedin.com/in/jake | github.com/jake\nEducation\nSouthwestern University Georgetown, TX\nBachelor of Arts in Computer Science, Minor in Business Aug. 2018 – May 2021\nBlinn College Bryan, TX\nAssociate’s in Liberal Arts Aug. 2014 – May 2018\nExperience\nUndergraduate Research Assistant June 2020 – Present\nTexas A&M University College Station, TX\n• Developed a REST API using FastAPI and PostgreSQL to store data from learning management systems\n• Developed a full-stack w

In [13]:
# 1. Initialize the embedding model
def get_embedding_model():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

embeddings = get_embedding_model()
print("Embedding model loaded.")


  embeddings = HuggingFaceEmbeddings(model_name=model_name)


Embedding model loaded.


In [14]:

# 2. Initialize Pinecone and create an index
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "interview-prep-assistant"
embedding_dimension = 384 # Based on all-MiniLM-L6-v2

if index_name not in pc.list_indexes().names():
    print(f"Creating new index: {index_name}")
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# 3. Store the document chunks in Pinecone
# This will embed the text_chunks and upload them to your index.
print("Storing document embeddings in Pinecone...")
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name
)
print("Embeddings stored successfully.")

Creating new index: interview-prep-assistant
Storing document embeddings in Pinecone...
Embeddings stored successfully.


In [25]:
# 1. Initialize the LLM
llm = ChatGoogleGenerativeAI(model="models/gemini-2.0-flash", temperature=0.6)

# 2. Create a prompt for question generation
question_gen_prompt_text = """
Based on the provided resume and job description below, please act as a senior hiring manager.
Your task is to generate a list of 5-7 insightful interview questions that thoroughly probe the candidate's suitability for the role.
The questions should cover technical skills, behavioral competencies, and past project experiences mentioned in the resume.

<context>
{context}
</context>

Questions:
"""
question_gen_prompt = ChatPromptTemplate.from_template(question_gen_prompt_text)

# 3. Create and run the generation chain
# We combine the content of all documents to provide full context
full_document_content = "\\n\\n".join([doc.page_content for doc in docs])
question_generation_chain = question_gen_prompt | llm

# 4. Generate the questions
response = question_generation_chain.invoke({"context": full_document_content})
interview_questions = response.content.strip().split('\\n')
print("--- Generated Interview Questions ---")
for q in interview_questions:
    print(q)

--- Generated Interview Questions ---
Okay, based on Jake Ryan's resume and the Software Developer job description, here are 6 insightful interview questions designed to assess his suitability for the role:

1.  **Technical Depth & Problem Solving (FastAPI & PostgreSQL):** "In your role as an Undergraduate Research Assistant, you developed a REST API using FastAPI and PostgreSQL. Can you describe a challenging technical problem you encountered during that project, and how you approached solving it?  Specifically, I'm interested in hearing about any performance bottlenecks you identified and how you optimized your code or database queries to address them. Please provide specific code examples if possible."
    *   *Rationale:* This question probes his understanding of the technologies mentioned, his problem-solving abilities under pressure, and his ability to articulate technical solutions clearly. The specific request for code examples tests his practical coding skills and attention to

In [26]:
response

AIMessage(content='Okay, based on Jake Ryan\'s resume and the Software Developer job description, here are 6 insightful interview questions designed to assess his suitability for the role:\n\n1.  **Technical Depth & Problem Solving (FastAPI & PostgreSQL):** "In your role as an Undergraduate Research Assistant, you developed a REST API using FastAPI and PostgreSQL. Can you describe a challenging technical problem you encountered during that project, and how you approached solving it?  Specifically, I\'m interested in hearing about any performance bottlenecks you identified and how you optimized your code or database queries to address them. Please provide specific code examples if possible."\n    *   *Rationale:* This question probes his understanding of the technologies mentioned, his problem-solving abilities under pressure, and his ability to articulate technical solutions clearly. The specific request for code examples tests his practical coding skills and attention to detail.\n\n2.

In [28]:
import re

# Extract only the quoted question part after the numbered heading
interview_questions = re.findall(r'“([^”]+)”|"([^"]+)"', response.content)

# The regex returns tuples because it captures two possible quote styles (“ ” or " ")
# Flatten and clean them:
interview_questions = [q1 or q2 for q1, q2 in interview_questions]

print("Extracted Questions:")
for q in interview_questions:
    print("-", q)


Extracted Questions:
- In your role as an Undergraduate Research Assistant, you developed a REST API using FastAPI and PostgreSQL. Can you describe a challenging technical problem you encountered during that project, and how you approached solving it?  Specifically, I'm interested in hearing about any performance bottlenecks you identified and how you optimized your code or database queries to address them. Please provide specific code examples if possible.
- Your Gitlytics project showcases full-stack development experience. Given the requirement for program maintenance, modifications, and enhancements in this role, describe a situation where you had to refactor a significant portion of the Gitlytics codebase. What were the key considerations you made when deciding how to approach the refactoring, and how did you ensure minimal disruption to the existing functionality and data integrity?
- The job description emphasizes communication and collaboration. Tell me about your experience co

In [29]:
interview_questions

["In your role as an Undergraduate Research Assistant, you developed a REST API using FastAPI and PostgreSQL. Can you describe a challenging technical problem you encountered during that project, and how you approached solving it?  Specifically, I'm interested in hearing about any performance bottlenecks you identified and how you optimized your code or database queries to address them. Please provide specific code examples if possible.",
 'Your Gitlytics project showcases full-stack development experience. Given the requirement for program maintenance, modifications, and enhancements in this role, describe a situation where you had to refactor a significant portion of the Gitlytics codebase. What were the key considerations you made when deciding how to approach the refactoring, and how did you ensure minimal disruption to the existing functionality and data integrity?',
 "The job description emphasizes communication and collaboration. Tell me about your experience collaborating with 

In [30]:
# 1. Create a retriever from our Pinecone vector store with the filter
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={
        'k': 3,
        'filter': {'user_id': current_user_id} # <-- THIS IS THE CRITICAL CHANGE
    }
)

# 2. Create a prompt for answer generation
answer_gen_prompt_text = """
You are an expert interview coach. Answer the following question based on the provided context from the candidate's resume and the job description.
Craft a clear, concise, and compelling answer that highlights the candidate's strengths and aligns them with the job requirements.

<context>
{context}
</context>

Question: {input}

Answer:
"""
answer_gen_prompt = ChatPromptTemplate.from_messages(
    [("system", answer_gen_prompt_text), ("human", "{input}")]
)

# 3. Create the RAG chain
question_answer_chain = create_stuff_documents_chain(llm, answer_gen_prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# 4. Generate ideal answers for each question
ideal_answers = {}
print("\\n--- Generating Ideal Answers ---")
for question in interview_questions:
    if not question: continue
    print(f"Generating answer for: {question}")
    response = rag_chain.invoke({"input": question})
    ideal_answers[question] = response['answer']

\n--- Generating Ideal Answers ---
Generating answer for: In your role as an Undergraduate Research Assistant, you developed a REST API using FastAPI and PostgreSQL. Can you describe a challenging technical problem you encountered during that project, and how you approached solving it?  Specifically, I'm interested in hearing about any performance bottlenecks you identified and how you optimized your code or database queries to address them. Please provide specific code examples if possible.
Generating answer for: Your Gitlytics project showcases full-stack development experience. Given the requirement for program maintenance, modifications, and enhancements in this role, describe a situation where you had to refactor a significant portion of the Gitlytics codebase. What were the key considerations you made when deciding how to approach the refactoring, and how did you ensure minimal disruption to the existing functionality and data integrity?
Generating answer for: The job description

In [31]:
# 1. Select a question for the user to answer
practice_question = interview_questions[6] 
print(f"\\n--- Interview Practice ---")
print(f"Question: {practice_question}")

# 2. Get the user's answer
user_answer = input("Your answer: ")

# 3. Define a prompt for feedback generation
feedback_prompt_text = """
You are a helpful and constructive interview coach. Your task is to compare the User's Answer with the Ideal Answer for the given interview question.
Provide feedback on the user's answer, highlighting its strengths and suggesting specific, actionable improvements.
Focus on whether the user effectively leveraged their experience (as detailed in the ideal answer) and how they could better align with the job's needs.

Interview Question:
{question}

Ideal Answer (based on resume and job description):
{ideal_answer}

User's Answer:
{user_answer}

Constructive Feedback:
"""
feedback_prompt = ChatPromptTemplate.from_template(feedback_prompt_text)

# 4. Create the feedback generation chain
feedback_chain = feedback_prompt | llm

# 5. Generate and display the feedback
ideal_answer_for_q = ideal_answers[practice_question]
feedback = feedback_chain.invoke({
    "question": practice_question,
    "ideal_answer": ideal_answer_for_q,
    "user_answer": user_answer
})

print("\\n--- Your Feedback ---")
print(feedback.content)

\n--- Interview Practice ---
Question: This role requires effectively managing time while working on multiple assignments with guidance as to relative priorities of assignments. Describe a time where you had to juggle multiple projects or tasks with competing deadlines. How did you prioritize your work, and what strategies did you use to ensure that you met all of your commitments?
\n--- Your Feedback ---
Okay, let's break down your answer and see how we can improve it to better align with the ideal answer and the job requirements.

**Strengths:**

*   **Addresses the core question:** You acknowledge the need to juggle multiple projects and mention prioritizing based on importance and urgency.

**Weaknesses:**

*   **Lacks Specificity and Context:** The answer is extremely vague. It doesn't provide any details about the projects, the deadlines, or the specific strategies you used. "Academic journey" is too broad.
*   **Doesn't Demonstrate Strategies:** While you mention "importance and