In [3]:
# Step 1: Upload the processed Excel file
from google.colab import files
uploaded = files.upload()

# Step 2: Load Excel
import pandas as pd
filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)

# Step 3: Clean whitespace in key columns
for col in ['Company', 'Role', 'Round Name', 'Topic Questions']:
    df[col] = df[col].astype(str).str.strip()

# Step 4: Create combined text for embedding
df['combined_text'] = df.apply(
    lambda row: f"{row['Company']} - {row['Role']} - {row['Round Name']} - {row['Topic Questions']}",
    axis=1
)

# Step 5: Install and load embedding model
!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer
import json

model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 6: Generate embeddings
df['embedding'] = df['combined_text'].apply(lambda x: model.encode(x).tolist())

# Step 7: Drop combined text (optional)
df.drop(columns=['combined_text'], inplace=True)

# Step 8: Convert to records and save as JSONL
jsonl_file = 'processed_interview_with_embeddings.jsonl'
with open(jsonl_file, 'w') as f:
    for record in df.to_dict(orient='records'):
        f.write(json.dumps(record) + '\n')

# Step 9: Download the JSONL file
files.download(jsonl_file)

Saving RAG_processed_updated.csv to RAG_processed_updated (2).csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# Sanity check: pretty-print the first records of the exported JSONL file
import json

with open('processed_interview_with_embeddings.jsonl', 'r') as fh:
    lines = list(fh)

# Decode the leading five lines and show each one under a numbered banner
for i, line in enumerate(lines[:5]):
    data = json.loads(line)
    banner = f"\n--- Record {i+1} ---"
    print(banner, json.dumps(data, indent=2), sep="\n")


--- Record 1 ---
{
  "Company": "Meta",
  "Role": "Machine Learning Engineer",
  "Tags": "Behavioral,Machinelearning,Programminglanguages,Frameworks",
  "Round Number": "Round 1",
  "Round Name": "HR Interview",
  "Topic Questions": "Why do you want to join Meta?",
  "embedding": [
    -0.028817638754844666,
    0.03692211955785751,
    0.019611019641160965,
    0.04577888548374176,
    0.15243586897850037,
    -0.04528956860303879,
    0.04119359329342842,
    0.013343761675059795,
    -0.04114455729722977,
    -0.06190158799290657,
    -0.16111835837364197,
    -0.051127590239048004,
    -0.06963231414556503,
    -0.03054792247712612,
    0.015425600111484528,
    -0.005130475386977196,
    -0.01619338057935238,
    -0.07248613238334656,
    -0.027303021401166916,
    -0.06193158030509949,
    -0.026261577382683754,
    -0.038942739367485046,
    0.04584978520870209,
    -0.0448080375790596,
    -0.04424094408750534,
    0.03950132057070732,
    -0.01852833479642868,
    0.084595665

In [7]:
# Step 1: Uninstall the `pinecone-client` package (-y skips the confirmation prompt);
# a later cell installs the `pinecone` package, which the code below imports from.
!pip uninstall -y pinecone-client

Found existing installation: pinecone-client 6.0.0
Uninstalling pinecone-client-6.0.0:
  Successfully uninstalled pinecone-client-6.0.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.3/259.3 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
# Install the `pinecone` package quietly (-q); it provides the `Pinecone` and
# `ServerlessSpec` names imported by the following cell.
!pip install -q pinecone

In [11]:
# Step 2: Imports
import getpass
import json
import os
import uuid
from pinecone import Pinecone, ServerlessSpec

# Step 3: Initialize Pinecone client
# The API key is taken from the PINECONE_API_KEY environment variable, with an
# interactive hidden prompt as a fallback, so the secret is never stored in the notebook.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
INDEX_NAME = "ai-mock-interview-questions"

pc = Pinecone(api_key=PINECONE_API_KEY)

# Step 4: Load JSONL file (one JSON record per line; blank lines are skipped)
with open('processed_interview_with_embeddings.jsonl', 'r') as f:
    records = [json.loads(line) for line in f if line.strip()]

# Step 5: Prepare data
# texts      -> the question text of each record
# embeddings -> the stored vector of each record
# metadatas  -> every remaining field, minus missing values (None / float NaN),
#               because Pinecone does not accept null metadata values
# ids        -> positional ids "q_0", "q_1", ...
texts = [r['Topic Questions'] for r in records]
embeddings = [r['embedding'] for r in records]
metadatas = [
    {
        k: v
        for k, v in r.items()
        if k not in ('embedding', 'Topic Questions')
        and v is not None
        and not (isinstance(v, float) and v != v)
    }
    for r in records
]
ids = [f"q_{i}" for i in range(len(texts))]

# Step 6: Create (or get) Pinecone index
# Infer embedding dimension from the first vector
if not embeddings:
    raise ValueError("No embeddings found to index.")

embed_dim = len(embeddings[0])
# Fail early with a clear message rather than partway through the upsert
if any(len(vec) != embed_dim for vec in embeddings):
    raise ValueError(f"Embeddings do not all have dimension {embed_dim}.")

existing = {idx.name for idx in pc.list_indexes()}

if INDEX_NAME not in existing:
    pc.create_index(
        name=INDEX_NAME,
        dimension=embed_dim,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(INDEX_NAME)

In [12]:
# Step 7: Upsert to Pinecone
# Build one payload per record: id, vector values, and a copy of the metadata
# that additionally carries the original question under the "text" key.
vectors = [
    {
        "id": vector_id,
        "values": values,
        "metadata": {**meta, "text": question},
    }
    for vector_id, values, meta, question in zip(ids, embeddings, metadatas, texts)
]

# Send the payloads in fixed-size slices rather than in a single request
BATCH = 100
for start in range(0, len(vectors), BATCH):
    chunk = vectors[start:start + BATCH]
    index.upsert(vectors=chunk)

print(f"Upserted {len(vectors)} vectors to Pinecone index '{INDEX_NAME}' (dim={embed_dim}, metric=cosine).")


Upserted 817 vectors to Pinecone index 'ai-mock-interview-questions' (dim=384, metric=cosine).


In [17]:
# Using Gemini to extract interview details.
import getpass
import json, re
import os

import google.generativeai as genai
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# 1) Configure Gemini
# Secrets come from environment variables, with a hidden interactive prompt as a
# fallback, so no key is ever written into the notebook.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Gemini API key: "))
# NOTE: this rebinds `model`, which the embedding-export cell used for its
# SentenceTransformer; from here on `model` is the Gemini model.
model = genai.GenerativeModel('gemini-2.5-flash')

# ---- Pinecone + embeddings ----
# Pinecone client (use the same API key that was used to build the index)
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
INDEX_NAME = "ai-mock-interview-questions"

pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)

# Embedding model must match what was used to build the index
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim


# 2) Metadata extractor using Gemini prompt
def _first_json_object(text):
    """Return the first JSON object embedded in ``text`` as a dict, or None if there is none."""
    decoder = json.JSONDecoder()
    # Try every '{' as a possible start: raw_decode stops at the end of the object,
    # so trailing prose (even prose containing braces) does not break parsing.
    for brace in re.finditer(r'\{', text):
        try:
            candidate, _end = decoder.raw_decode(text, brace.start())
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict):
            return candidate
    return None


def extract_metadata_from_query(user_query=None):
    """Ask Gemini to pull Company / Role / Round Number out of an interview query.

    Args:
        user_query: The query text. When omitted (None) the user is prompted with
            ``input()``, which is the original interactive behaviour.

    Returns:
        A ``(user_query, metadata)`` tuple. ``metadata`` is the dict parsed from the
        model's reply, or ``None`` when the query is empty, the request fails, or
        the reply contains no JSON object.

    Relies on the module-level ``model`` (the Gemini ``GenerativeModel``). Errors
    from the model call are reported with ``print`` and turned into a ``None``
    result instead of being raised, so the calling pipeline can fall back to defaults.
    """
    if user_query is None:
        user_query = input("Enter your interview query: ")

    # Nothing to extract from an empty query; skip the API call.
    if not user_query.strip():
        print("Empty query; skipping metadata extraction.")
        return user_query, None

    prompt = f"""
Extract the following information from the sentence below:

Sentence: "{user_query}"

Return a JSON object with:
- Company
- Role
- Round Number

Format:
{{
  "Company": "...",
  "Role": "...",
  "Round Number": "..."
}}
"""
    try:
        response = model.generate_content(prompt)
        metadata = _first_json_object(response.text.strip())
    except Exception as e:
        # Deliberate best-effort: any API or response error degrades to "no metadata".
        print("Error extracting metadata:", e)
        return user_query, None

    if metadata is None:
        print("Could not find valid JSON in response.")
        return user_query, None

    print(" Extracted Metadata:", metadata)
    return user_query, metadata

In [39]:
# --- Pinecone-based pipeline

from sentence_transformers import SentenceTransformer

# Load the embedding model only when no earlier cell has already defined it.
# A membership test on globals() replaces the former `_ = embed_model` probe:
# assigning to `_` in a notebook rebinds IPython's "last output" variable.
if "embed_model" not in globals():
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim


def _normalize_for_filter(md: dict) -> dict:
    """Normalize values"""
    if md is None:
        return {}
    out = {}

    # Company: strip only (assumes you indexed proper case like 'Meta')
    if md.get("Company"):
        out["Company"] = str(md["Company"]).strip()

    # Role: title-case to match e.g. 'Data Scientist'
    if md.get("Role"):
        out["Role"] = str(md["Role"]).strip().title()

    # Round Number: ensure it starts with 'Round '
    rn = md.get("Round Number")
    if rn is not None and str(rn).strip() != "":
        rn_str = str(rn).strip()
        out["Round Number"] = rn_str if rn_str.lower().startswith("round") else f"Round {rn_str}"

    return out


def _build_metadata_filter(norm):
    """Build a Pinecone ``$and`` of ``$eq`` clauses from normalized metadata, or None if empty."""
    clauses = [
        {field: {"$eq": norm[field]}}
        for field in ("Company", "Role", "Round Number")
        if norm.get(field)
    ]
    return {"$and": clauses} if clauses else None


def run_full_pipeline(top_k=10):
    """Prompt for an interview query, retrieve matching questions from Pinecone, and print them.

    Steps:
      1. ``extract_metadata_from_query()`` asks the user for a query and extracts
         Company / Role / Round Number.
      2. The metadata is normalized with ``_normalize_for_filter`` and turned into
         an equality filter.
      3. A query string built from the role and round is embedded with
         ``embed_model`` and sent to ``index``.
      4. If the filtered query returns nothing, the query is retried without a filter.

    Args:
        top_k: Maximum number of matches to request (default 10).

    Returns:
        None. Results are printed.
    """
    user_query, metadata = extract_metadata_from_query()
    if metadata is None:
        metadata = {}

    # --- normalize BEFORE building the filter ---
    norm = _normalize_for_filter(metadata)
    metadata_filter = _build_metadata_filter(norm)

    # Query string: use normalized values only. Falling back to the raw metadata
    # could let a whitespace-only value through and yield an empty query term.
    role_for_query = (norm.get("Role") or "data science").strip()
    round_for_query = (norm.get("Round Number") or "Round 1").strip()
    query_string = f"interview questions for {role_for_query} {round_for_query}"
    print("\n Querying:", query_string)
    if metadata_filter:
        print("Using metadata filter:", metadata_filter)

    # Encode and query Pinecone
    qvec = embed_model.encode(query_string).tolist()
    res = index.query(
        vector=qvec,
        top_k=top_k,
        include_metadata=True,
        filter=metadata_filter  # can be None
    )
    matches = res.get("matches", []) or []

    # A strict filter may match nothing; fall back to pure similarity search.
    if not matches and metadata_filter:
        print("No results with filter; retrying without filter…")
        res = index.query(vector=qvec, top_k=top_k, include_metadata=True)
        matches = res.get("matches", []) or []

    if not matches:
        print(" No results found.")
        return

    for i, m in enumerate(matches, start=1):
        md = m.get("metadata", {}) or {}
        text = md.get("text", "")
        print(f"\n--- Match {i} ---")
        print("Question:", text)
        print("Metadata:", md)



In [40]:
run_full_pipeline()

Enter your interview query: Could you help me practice for Data Scientist Round 3 interview for Meta
 Extracted Metadata: {'Company': 'Meta', 'Role': 'Data Scientist', 'Round Number': 3}

 Querying: interview questions for Data Scientist Round 3
Using metadata filter: {'$and': [{'Company': {'$eq': 'Meta'}}, {'Role': {'$eq': 'Data Scientist'}}, {'Round Number': {'$eq': 'Round 3'}}]}

--- Match 1 ---
Question: Which data sets would you use to answer specific product-related questions?
Metadata: {'Company': 'Meta', 'Role': 'Data Scientist', 'Round Name': 'Analytical Reasoning Interview', 'Round Number': 'Round 3', 'Tags': 'Behavioural,Statistics,ProductCase,ExperimentDesign,BiasDetection,DataInterpretation,Storytelling,Metrics', 'text': 'Which data sets would you use to answer specific product-related questions?'}

--- Match 2 ---
Question: How do you draw meaningful conclusions from a dataset?
Metadata: {'Company': 'Meta', 'Role': 'Data Scientist', 'Round Name': 'Analytical Reasoning Int