In [5]:
import pandas as pd
import numpy as np


# Load the resume table and the job-description table from pickle files.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
df = pd.read_pickle("data/resumes_with_tfidf.pkl")
df_jd = pd.read_pickle("data/processed_jds.pkl")

# Load the saved embedding arrays.
# Presumably row i of each array belongs to row i of the matching table — TODO confirm.
resume_embeddings = np.load("embeddings/resume_embeddings.npy")
jd_embeddings = np.load("embeddings/jd_embeddings.npy")


In [7]:
# SBERT cosine similarity of every resume embedding against one job description
jd_idx = 0
jd_vec = jd_embeddings[jd_idx]

# cos(r, j) = (r . j) / (||r|| * ||j||), evaluated for all resume rows at once
sbert_scores = resume_embeddings.dot(jd_vec) / (
    np.linalg.norm(resume_embeddings, axis=1) * np.linalg.norm(jd_vec)
)

df["sbert_similarity"] = sbert_scores


In [8]:
# Range and average of the TF-IDF similarity column
tfidf_col = df["tfidf_similarity"]
print("TF-IDF similarity stats:")
print("Min:", tfidf_col.min())
print("Max:", tfidf_col.max())
print("Mean:", tfidf_col.mean())


TF-IDF similarity stats:
Min: 0.0003479989961072339
Max: 0.17980836819990206
Mean: 0.029674027536393012


In [9]:
# Range and average of the SBERT similarity column
sbert_col = df["sbert_similarity"]
print("SBERT similarity stats:")
print("Min:", sbert_col.min())
print("Max:", sbert_col.max())
print("Mean:", sbert_col.mean())


SBERT similarity stats:
Min: 0.12103441
Max: 0.6265481
Mean: 0.44218734


In [10]:
# List the column names currently present in df
print(list(df.columns))

['Category', 'Resume', 'cleaned_resume', 'skills', 'education', 'education_level', 'experience_years', 'experience_level', 'match_score', 'rule_score', 'tfidf_similarity', 'sbert_similarity']


In [11]:
# Hybrid score: weighted blend of the rule score (0.30), the TF-IDF
# similarity (0.20) and the SBERT similarity (0.50).
# NOTE(review): the two similarity columns are combined on their raw scales
# here; the min-max scaling cell appears later in the notebook — confirm this
# ordering is intended.
df["final_score"] = (
    df["rule_score"].mul(0.30)
    .add(df["tfidf_similarity"].mul(0.20))
    .add(df["sbert_similarity"].mul(0.50))
)

# Highest-scoring resumes first; display the top 10
ranked = df.sort_values(by="final_score", ascending=False)
ranked.head(10)


Unnamed: 0,Category,Resume,cleaned_resume,skills,education,education_level,experience_years,experience_level,match_score,rule_score,tfidf_similarity,sbert_similarity,final_score
37,Data Science,Education Details \r\n B.Tech Rayat and Bahr...,education details b tech rayat and bahra insti...,"[machine learning, natural language processing...",[undergraduate],1,10.0,senior,0.95,0.95,0.116957,0.626548,0.621665
17,Data Science,Education Details \r\n B.Tech Rayat and Bahr...,education details b tech rayat and bahra insti...,"[machine learning, natural language processing...",[undergraduate],1,10.0,senior,0.95,0.95,0.116957,0.626548,0.621665
27,Data Science,Education Details \r\n B.Tech Rayat and Bahr...,education details b tech rayat and bahra insti...,"[machine learning, natural language processing...",[undergraduate],1,10.0,senior,0.95,0.95,0.116957,0.626548,0.621665
7,Data Science,Education Details \r\n B.Tech Rayat and Bahr...,education details b tech rayat and bahra insti...,"[machine learning, natural language processing...",[undergraduate],1,10.0,senior,0.95,0.95,0.116957,0.626548,0.621665
39,Data Science,Expertise â Data and Quantitative Analysis â...,expertise data and quantitative analysis decis...,"[opencv, machine learning, natural language pr...",[undergraduate],1,10.0,senior,0.95,0.95,0.179808,0.57796,0.609942
9,Data Science,Expertise â Data and Quantitative Analysis â...,expertise data and quantitative analysis decis...,"[opencv, machine learning, natural language pr...",[undergraduate],1,10.0,senior,0.95,0.95,0.179808,0.57796,0.609942
19,Data Science,Expertise â Data and Quantitative Analysis â...,expertise data and quantitative analysis decis...,"[opencv, machine learning, natural language pr...",[undergraduate],1,10.0,senior,0.95,0.95,0.179808,0.57796,0.609942
29,Data Science,Expertise â Data and Quantitative Analysis â...,expertise data and quantitative analysis decis...,"[opencv, machine learning, natural language pr...",[undergraduate],1,10.0,senior,0.95,0.95,0.179808,0.57796,0.609942
13,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...,"[machine learning, natural language processing...",[undergraduate],1,10.0,senior,0.95,0.95,0.116877,0.541997,0.579374
23,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...,"[machine learning, natural language processing...",[undergraduate],1,10.0,senior,0.95,0.95,0.116877,0.541997,0.579374


In [16]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale both similarity columns to [0, 1]; the raw values in df are
# overwritten by the scaled ones.
scaler = MinMaxScaler()

similarity_cols = ["sbert_similarity", "tfidf_similarity"]
df[similarity_cols] = scaler.fit_transform(df[similarity_cols])



## Evaluation

In [17]:
# Synthetic relevance label: 1 = good fit, 0 = not fit.
# NOTE(review): the label is thresholded from final_score itself, so a metric
# that ranks by final_score is being scored against its own output.
df["label"] = df["final_score"].gt(0.6).astype(int)


In [24]:
# precision@k, shows ranking quality

def precision_at_k(df, score_col, k=10):
    """Return the share of relevant rows among the ``k`` highest-scoring rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``score_col`` and a ``label`` column (1 = good fit,
        0 = not fit).
    score_col : str
        Column to rank by, highest value first.
    k : int, default 10
        Number of top rows to evaluate. When ``df`` has fewer than ``k`` rows,
        all rows are used.

    Returns
    -------
    float
        Mean of ``label`` over the selected rows (NaN when ``df`` is empty).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. ``DataFrame.head`` would otherwise return
        an empty frame (k == 0) or drop rows from the end (k < 0), silently
        producing a meaningless value.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    top_k = df.sort_values(score_col, ascending=False).head(k)
    return top_k["label"].mean()



In [25]:
# Rank by the similarity columns that already exist at this point. The
# "tfidf_norm" / "sbert_norm" columns are only created by a later cell, so
# referencing them here raises KeyError on a fresh top-to-bottom run.
# Min-max scaling is order-preserving, so the top-10 selection is the same.
print("TF-IDF Precision@10:",
      precision_at_k(df, "tfidf_similarity", 10))

print("SBERT Precision@10:",
      precision_at_k(df, "sbert_similarity", 10))

print("Hybrid Precision@10:",
      precision_at_k(df, "final_score", 10))


TF-IDF Precision@10: 0.4
SBERT Precision@10: 0.4
Hybrid Precision@10: 0.8


In [26]:
# List the column names currently present in df
print(list(df.columns))


['Category', 'Resume', 'cleaned_resume', 'skills', 'education', 'education_level', 'experience_years', 'experience_level', 'match_score', 'rule_score', 'tfidf_similarity', 'sbert_similarity', 'final_score', 'label', 'tfidf_norm', 'sbert_norm', 'tfidf_only', 'sbert_only']


## Model comparison

In [27]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each similarity column on its own; the scaler is re-fit per column.
scaler = MinMaxScaler()
for raw_col, norm_col in (
    ("tfidf_similarity", "tfidf_norm"),
    ("sbert_similarity", "sbert_norm"),
):
    df[norm_col] = scaler.fit_transform(df[[raw_col]])

# Single-signal baselines: the normalised columns under a second name
df["tfidf_only"] = df["tfidf_norm"]
df["sbert_only"] = df["sbert_norm"]


In [30]:
# Precision@10 for each ranking signal, printed in a fixed order
for title, score_col in (
    ("TF-IDF Precision@10:", "tfidf_norm"),
    ("SBERT Precision@10:", "sbert_norm"),
    ("Hybrid Precision@10:", "final_score"),
):
    print(title, precision_at_k(df, score_col, 10))


TF-IDF Precision@10: 0.4
SBERT Precision@10: 0.4
Hybrid Precision@10: 0.8


In [67]:
# Summary statistics of the hybrid score over all rows of df
df["final_score"].describe()


count    962.000000
mean       0.332216
std        0.090408
min        0.076195
25%        0.277047
50%        0.334359
75%        0.392083
max        0.621665
Name: final_score, dtype: float64

Scores are continuous and overlapping, so ranking is more appropriate than binary classification.

## Auto-Generated Interview Questions

After ranking candidates, the system automatically generates tailored interview questions, based on matched skills and candidate experience level

In [31]:
# Column inventory of df before the interview-question step
df.columns

Index(['Category', 'Resume', 'cleaned_resume', 'skills', 'education',
       'education_level', 'experience_years', 'experience_level',
       'match_score', 'rule_score', 'tfidf_similarity', 'sbert_similarity',
       'final_score', 'label', 'tfidf_norm', 'sbert_norm', 'tfidf_only',
       'sbert_only'],
      dtype='object')

In [49]:
# Question templates keyed by lower-case skill name, then by experience level
# ("junior" / "mid" / "senior"). A template may contain a "{skill}" placeholder,
# which generate_interview_questions fills in with str.format.
SKILL_QUESTION_TEMPLATES = {
    "machine learning": {
        "junior": "What is {skill}? Explain with an example.",
        "mid": "How have you applied {skill} in a real project?",
        "senior": "How would you design a scalable system using {skill}?"
    },
    "deep learning": {
        "junior": "What is deep learning? How is it different from ML?",
        "mid": "Describe a deep learning model you have trained.",
        "senior": "How would you optimize a deep learning model in production?"
    },
    "python": {
        "junior": "What are Python lists and dictionaries?",
        "mid": "How do you write efficient Python code for large datasets?",
        "senior": "How would you optimize Python code for performance?"
    },
    "nlp": {
        "junior": "What is NLP? Name common NLP tasks.",
        "mid": "Explain an NLP pipeline you have implemented.",
        "senior": "How would you build an end-to-end NLP system?"
    }
}


In [50]:
def fallback_question(skill, level):
    """Build a generic question for a skill that has no dedicated template.

    ``level`` "junior" yields a basics question and "mid" a project-usage
    question; every other value yields the production-challenges question.
    """
    if level == "junior":
        return f"What is {skill}? Explain the basics."
    if level == "mid":
        return f"Explain how you have used {skill} in a project."
    return f"What are the challenges of using {skill} in production systems?"


In [51]:
def generate_interview_questions(skills, experience_level, max_questions=5):
    """Generate one interview question per skill, for at most ``max_questions`` skills.

    Parameters
    ----------
    skills : list
        Candidate skill names. Any non-list value yields an empty result.
    experience_level : str
        Level used to pick the question wording ("junior", "mid", "senior").
    max_questions : int, default 5
        Only the first ``max_questions`` entries of ``skills`` are used.

    Returns
    -------
    list of str
        Questions in the same order as the skills they were built from.

    Notes
    -----
    A skill listed in SKILL_QUESTION_TEMPLATES uses its template for
    ``experience_level``. If the skill has no template, or its template set
    has no entry for that level, the question comes from fallback_question,
    which accepts any level. The second case previously raised KeyError.
    """
    questions = []

    if not isinstance(skills, list):
        return questions

    for skill in skills[:max_questions]:
        # Template keys are lower-case; the original spelling is kept for display.
        level_templates = SKILL_QUESTION_TEMPLATES.get(skill.lower())
        template = level_templates.get(experience_level) if level_templates else None

        if template is not None:
            question = template.format(skill=skill)
        else:
            question = fallback_question(skill, experience_level)

        questions.append(question)

    return questions


In [52]:
# Independent copy of the 10 best-scoring rows, so new columns do not touch df
top_candidates = (
    df.sort_values(by="final_score", ascending=False)
    .head(10)
    .copy()
)


In [53]:
# One list of template-based questions per shortlisted candidate
top_candidates["interview_questions"] = [
    generate_interview_questions(candidate_skills, candidate_level)
    for candidate_skills, candidate_level in zip(
        top_candidates["skills"], top_candidates["experience_level"]
    )
]


In [54]:
# Preview the generated questions for the first three shortlisted candidates
top_candidates.loc[:, ["skills", "experience_level", "interview_questions"]].head(3)


Unnamed: 0,skills,experience_level,interview_questions
37,"[machine learning, natural language processing...",senior,[How would you design a scalable system using ...
17,"[machine learning, natural language processing...",senior,[How would you design a scalable system using ...
27,"[machine learning, natural language processing...",senior,[How would you design a scalable system using ...


Designed an interview-assist module that generates job-aware validation questions by analyzing skill overlap and gaps between resumes and job descriptions.

In [56]:
# Skills of the job description selected by jd_idx.
# NOTE(review): .loc looks jd_idx up as an index label, whereas jd_embeddings
# was indexed by position with the same value — confirm df_jd has a default
# 0..n-1 index so both refer to the same job description.
jd_skills = df_jd.loc[jd_idx, "jd_skills"]


In [57]:
def skill_gap(resume_skills, jd_skills):
    """Return the job-description skills that do not appear in the resume.

    The result is de-duplicated and follows the order of ``jd_skills``, so it
    is the same on every run. Building it from a set difference made the
    order depend on hash iteration order.

    Parameters
    ----------
    resume_skills : iterable of hashable
        Skills found in the resume.
    jd_skills : iterable of hashable
        Skills required by the job description.

    Returns
    -------
    list
        Skills in ``jd_skills`` that are absent from ``resume_skills``.
    """
    known = set(resume_skills)
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return [skill for skill in dict.fromkeys(jd_skills) if skill not in known]


In [58]:
def generate_smart_questions(resume_skills, jd_skills, experience_level):
    """Generate interview questions from the overlap and gap between resume and JD skills.

    Up to two questions validate skills that appear in both lists, up to two
    probe job-description skills missing from the resume, and one extra
    question is added for senior candidates.

    Skills are picked in the order they appear in ``jd_skills``, so the output
    is the same on every run. Slicing a set made the choice depend on hash
    iteration order.

    Parameters
    ----------
    resume_skills : iterable of hashable
        Skills found in the resume.
    jd_skills : iterable of hashable
        Skills required by the job description.
    experience_level : str
        Only the value "senior" changes the output.

    Returns
    -------
    list of str
        Validation questions, then gap questions, then the optional senior question.
    """
    resume_set = set(resume_skills)
    # dict.fromkeys removes duplicate JD skills while preserving their order.
    jd_ordered = list(dict.fromkeys(jd_skills))

    common = [skill for skill in jd_ordered if skill in resume_set]
    missing = [skill for skill in jd_ordered if skill not in resume_set]

    # Validate strong skills
    questions = [
        f"Can you explain a real project where you used {skill}?"
        for skill in common[:2]
    ]

    # Probe missing skills
    questions.extend(
        f"This role requires {skill}. How would you get up to speed on it?"
        for skill in missing[:2]
    )

    # Senior depth
    if experience_level == "senior":
        questions.append(
            "Describe a technical decision you made that significantly impacted system performance."
        )

    return questions


In [62]:
# Replace the earlier questions with job-aware ones built against jd_skills
top_candidates["interview_questions"] = [
    generate_smart_questions(candidate_skills, jd_skills, candidate_level)
    for candidate_skills, candidate_level in zip(
        top_candidates["skills"], top_candidates["experience_level"]
    )
]


In [63]:
# Preview the job-aware questions for the first three shortlisted candidates
top_candidates.loc[:, ["skills", "experience_level", "interview_questions"]].head(3)


Unnamed: 0,skills,experience_level,interview_questions
37,"[machine learning, natural language processing...",senior,[Can you explain a real project where you used...
17,"[machine learning, natural language processing...",senior,[Can you explain a real project where you used...
27,"[machine learning, natural language processing...",senior,[Can you explain a real project where you used...
