# Interview Q&A: RAG + NER steps followed in this notebook

**Step 1: Load the RAG Data**  
- Load `processed_interview_with_embeddings.jsonl`  
- Bulk-add the following into a ChromaDB collection:  
  - **documents**: interview questions  
  - **embeddings**: precomputed vectors  
  - **metadatas**: Company, Role, Tags, Round Number, Round Name  
  - **ids**: unique identifiers

**Step 2: Prepare NLP “Lens”**  
1. **Load spaCy** model `en_core_web_sm`  
2. **Create PhraseMatchers** seeded from the RAG DB:  
   - **Role** matcher: all distinct `Role` values  
   - **Round Name** matcher: all distinct `Round Name` values  
3. **Compile regex** `\bRound\s*(\d+)\b` to capture **Round Number**  
4. **Use spaCy NER** to extract **Company** as an ORG entity

**Step 3: Normalize Extracted Fields**  
- Fuzzy-match each raw span (Company, Role, Round Number, Round Name) back to the exact metadata values stored in the RAG collection, using RapidFuzz  
- This guarantees that we only ever filter on values that are actually present in the collection

**Step 4: Build a Metadata Filter**  
Package the four canonical values into a Mongo-style filter:
```json
{
  "$and": [
    { "Company":      { "$eq": "Meta"                    } },
    { "Role":         { "$eq": "Machine Learning Engineer" } },
    { "Round Number": { "$eq": "Round 1"                  } },
    { "Round Name":   { "$eq": "HR Interview"             } }
  ]
}
```

**Step 5: Run the Semantic Search**

* Call collection.query() with:
  * query_texts: your user’s prompt (e.g. “interview questions”)
  * n_results: number of matches to retrieve
  * where: the metadata filter built above
* Returns the top-N interview questions that both semantically match the query text and exactly fit the Company/Role/Round context

In [None]:
# 1: Install Dependencies
!pip install -q spacy rapidfuzz chromadb
!python -m spacy download en_core_web_sm


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m93.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.9/18.9 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[

In [None]:
#  2: Imports & Initialization
import json
import re
import spacy
from spacy.matcher import PhraseMatcher
from rapidfuzz import process
from chromadb import Client

# Case-insensitive pattern for "Round <number>"; group 1 holds the digits.
ROUND_RE = re.compile(r"\bRound\s*(\d+)\b", re.IGNORECASE)

# spaCy pipeline shared by the NER step and both phrase matchers.
nlp = spacy.load("en_core_web_sm")

# The matchers compare on the LOWER token attribute, so phrase matching
# ignores case. They start out empty; refresh_matchers() fills them from the
# metadata values stored in the collection.
role_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
round_name_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")


In [None]:
# 3: Define NER & Filter Functions
def refresh_matchers(collection):
    """Rebuild the Role and Round Name PhraseMatchers from collection metadata.

    Reads the metadata of every record in ``collection``, collects the
    distinct non-empty string values stored under ``"Role"`` and
    ``"Round Name"``, and replaces the patterns registered under the
    ``"ROLE"`` / ``"ROUND_NAME"`` keys of the module-level ``role_matcher``
    and ``round_name_matcher``.

    Args:
        collection: ChromaDB collection whose record metadatas carry the
            ``Role`` and ``Round Name`` fields.

    Returns:
        None. The module-level matchers are updated in place.
    """
    # Without `limit`, get() returns every record; this also avoids a
    # separate count() call and a `limit=0` request on an empty collection.
    fetched = collection.get(include=["metadatas"])
    # Individual entries may be None for records stored without metadata.
    metas = [meta for meta in (fetched.get("metadatas") or []) if meta]

    def distinct(key):
        # nlp.make_doc() needs strings, so other value types are ignored.
        # Sorted so that the pattern order is the same on every run.
        return sorted(
            {meta[key] for meta in metas
             if isinstance(meta.get(key), str) and meta[key]}
        )

    for matcher, label, values in (
        (role_matcher, "ROLE", distinct("Role")),
        (round_name_matcher, "ROUND_NAME", distinct("Round Name")),
    ):
        # Drop stale patterns first so values that are no longer in the
        # collection stop matching. Checking membership replaces the old
        # try/except that silently swallowed errors from remove().
        if label in matcher:
            matcher.remove(label)
        # Nothing to register when the collection has no values for this field.
        if values:
            matcher.add(label, [nlp.make_doc(value) for value in values])

def canonicalize(raw: str, choices: list[str], cutoff: int = 75) -> str | None:
    """Fuzzy-match `raw` span back to one of the known metadata values in `choices`."""
    if not raw or not choices:
        return None
    match = process.extractOne(raw, choices, score_cutoff=cutoff)
    return match[0] if match else None

def extract_entities(text: str, collection) -> dict:
    """Extract Company, Role, Round Number and Round Name from free-form input.

    Extraction:
      * Company      - text of the first spaCy entity labelled ``ORG``.
      * Role         - longest span matched by the module-level ``role_matcher``.
      * Round Name   - longest span matched by ``round_name_matcher``.
      * Round Number - ``"Round <n>"`` rebuilt from the first ``ROUND_RE`` hit.

    Each raw value is then fuzzy-matched (via ``canonicalize``) against the
    distinct values stored under the same metadata key in ``collection``.

    Args:
        text: The user's free-form request.
        collection: ChromaDB collection whose metadatas supply the canonical
            values.

    Returns:
        Dict with keys ``"Company"``, ``"Role"``, ``"Round Number"`` and
        ``"Round Name"``; each value is a canonical metadata value or ``None``
        when nothing was extracted or nothing matched closely enough.
    """
    doc = nlp(text)

    # Company via spaCy NER: first ORG entity, if any.
    company_raw = next((ent.text for ent in doc.ents if ent.label_ == "ORG"), None)

    def longest_match(matcher):
        # Overlapping patterns (e.g. "Engineer" inside "Machine Learning
        # Engineer") can all fire; keep the most specific (longest) span
        # instead of whichever match happens to be reported last.
        spans = [doc[start:end] for _, start, end in matcher(doc)]
        return max(spans, key=len).text if spans else None

    role_raw = longest_match(role_matcher)
    round_name_raw = longest_match(round_name_matcher)

    # Round Number via regex, normalised to the "Round <n>" form.
    round_hit = ROUND_RE.search(text)
    round_number_raw = f"Round {round_hit.group(1)}" if round_hit else None

    # Fetch the stored metadata to canonicalise against. Without `limit`,
    # get() returns every record and an empty collection does not turn into a
    # `limit=0` request. NOTE: this reads the whole collection on each call.
    fetched = collection.get(include=["metadatas"])
    # Individual entries may be None for records stored without metadata.
    metas = [meta for meta in (fetched.get("metadatas") or []) if meta]

    def known(key):
        # Fuzzy matching needs string choices; sorted so that ties between
        # equally scored choices resolve the same way on every run.
        return sorted(
            {meta[key] for meta in metas
             if isinstance(meta.get(key), str) and meta[key]}
        )

    # Fuzzy-canonicalize each field against the values present in the collection.
    return {
        "Company":      canonicalize(company_raw, known("Company")),
        "Role":         canonicalize(role_raw, known("Role")),
        "Round Number": canonicalize(round_number_raw, known("Round Number")),
        "Round Name":   canonicalize(round_name_raw, known("Round Name")),
    }

def build_metadata_filter(ents: dict) -> dict:
    """Convert extracted entity dict into a Mongo-style {'$and': [...]} filter."""
    clauses = []
    for key, val in ents.items():
        if val:
            clauses.append({ key: {"$eq": val} })
    return {"$and": clauses}


In [None]:
# 4: Initialize ChromaDB & Load Embeddings
# In-memory Chroma client; the collection is created on first run and reused
# afterwards.
client = Client()
collection = client.get_or_create_collection(name="interview_questions")

# Metadata fields copied from each JSONL record into the collection.
METADATA_KEYS = ("Company", "Role", "Tags", "Round Number", "Round Name")

docs, embs, metas, ids = [], [], [], []
# Read as UTF-8 explicitly instead of relying on the platform default encoding.
with open("/content/processed_interview_with_embeddings.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        rec = json.loads(line)
        docs.append(rec["Topic Questions"])
        embs.append(rec["embedding"])
        # Leave out fields the record lacks rather than storing None, which
        # Chroma's metadata validation may reject depending on the version.
        # Downstream code treats a missing key and None the same way.
        meta = {key: rec[key] for key in METADATA_KEYS if rec.get(key) is not None}
        # An empty metadata dict is likewise not accepted; use None instead.
        metas.append(meta or None)
        # Chroma ids are strings; fall back to a positional id when absent.
        ids.append(str(rec.get("id", f"rec_{i}")))
collection.add(documents=docs, embeddings=embs, metadatas=metas, ids=ids)
print(f"Loaded {len(docs)} records into ChromaDB")


Loaded 657 records into ChromaDB


In [None]:
# 5: Refresh Matchers
# Run this after the collection has been populated: the Role / Round Name
# phrase patterns are built from the metadata values currently stored in it.
refresh_matchers(collection)


In [None]:
# 6: Extract Entities & Build Filter
# Sample free-form request mentioning a round name, round number, role and company.
user_input = "I have an HR Interview (Round 1) for a Machine Learning Engineer at Meta"
# Pull the four fields out of the text and fuzzy-match them to values stored
# in the collection.
ents = extract_entities(user_input, collection)
# Turn the fields that were found into a metadata filter for collection.query().
metadata_filter = build_metadata_filter(ents)
print("Built metadata filter:", metadata_filter)


Built metadata filter: {'$and': [{'Company': {'$eq': 'Meta'}}, {'Role': {'$eq': 'Machine Learning Engineer'}}, {'Round Number': {'$eq': 'Round 1'}}, {'Round Name': {'$eq': 'HR Interview'}}]}


In [None]:
# 7: Run Semantic Query & Show Results
# Semantic search restricted to records whose metadata satisfies the filter.
# NOTE(review): query_texts is embedded by the collection's own embedding
# function, while the stored vectors were precomputed - confirm both come
# from the same model.
results = collection.query(
    query_texts=["interview questions"],
    n_results=5,
    where=metadata_filter,
)
# Only one query text was sent, so index 0 holds its matches.
for rank, question in enumerate(results["documents"][0], start=1):
    print(f"\n--- Match {rank} ---")
    print("Question:", question)
    print("Metadata:", results["metadatas"][0][rank - 1])

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 103MiB/s]



--- Match 1 ---
Question: If so, can you provide an example and explain your thought process?
Metadata: {'Company': 'Meta', 'Round Name': 'HR Interview', 'Tags': 'Behavioral,Machinelearning,Programminglanguages,Frameworks', 'Role': 'Machine Learning Engineer', 'Round Number': 'Round 1'}

--- Match 2 ---
Question: Why do you think you will be a good fit for the role?
Metadata: {'Round Number': 'Round 1', 'Round Name': 'HR Interview', 'Tags': 'Behavioral,Machinelearning,Programminglanguages,Frameworks', 'Role': 'Machine Learning Engineer', 'Company': 'Meta'}

--- Match 3 ---
Question: Have you worked with large datasets?
Metadata: {'Role': 'Machine Learning Engineer', 'Company': 'Meta', 'Round Name': 'HR Interview', 'Tags': 'Behavioral,Machinelearning,Programminglanguages,Frameworks', 'Round Number': 'Round 1'}

--- Match 4 ---
Question: What responsibilities do you expect to have from your job at Meta?
Metadata: {'Tags': 'Behavioral,Machinelearning,Programminglanguages,Frameworks', 'Ro