In [1]:
import nltk
from nltk.corpus import stopwords
import re
import string
from collections import Counter
from wordcloud import WordCloud

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [3]:
df = pd.read_csv('resumes/UpdatedResumeDataSet.csv')
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [4]:
df.shape

(962, 2)

In [None]:
df.Category.value_counts()

In [None]:
plt.figure(figsize=(15,5))
sns.histplot(df['Category'])
plt.xticks(rotation=90)
plt.show()


In [None]:
counts=df['Category'].value_counts()
labels=df['Category'].nunique()
labels

In [None]:
df['Category'].unique()

In [None]:
import numpy as np
counts=df['Category'].value_counts()
labels=df['Category'].unique()
plt.figure(figsize=(15,10))
plt.pie(counts,labels=labels,autopct='%1.1f%%',shadow=True,colors=plt.cm.plasma(np.linspace(0,1,3)))
plt.show()

In [None]:
df['Resume'][0]

## Skill extraction

In [5]:
import string
import re
def clean_resume_text(text):
    if pd.isna(text):
        return ""

    # lowercase
    text = text.lower()

    # remove urls
    text = re.sub(r'http\S+|www\S+', ' ', text)

    # remove emails
    text = re.sub(r'\S+@\S+', ' ', text)

    # remove html tags
    text = re.sub(r'<.*?>', ' ', text)

    # keep letters, numbers and spaces (IMPORTANT)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)

    # normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df['cleaned_resume']=df['Resume'].apply(clean_resume_text)

In [None]:
df['Resume'][0]

In [None]:

print("\nCLEANED RESUME:\n")
print(df['cleaned_resume'][0][:600])


In [6]:
SKILL_ALIASES = {
    'naive bayes': ['naive bayes', 'na ve bayes'],
    'scikit-learn': ['scikit learn', 'scikit-learn'],
    'opencv': ['opencv', 'open cv'],
    'natural language processing': ['nlp', 'natural language processing'],
    'machine learning': ['machine learning', 'ml'],
    'deep learning': ['deep learning', 'dl']
}


In [7]:
def extract_skills_advanced(text, skill_aliases):
    extracted = set()
    for skill, variants in skill_aliases.items():
        for variant in variants:
            if variant in text:
                extracted.add(skill)
    return list(extracted)


In [8]:
df['skills'] = df['cleaned_resume'].apply(
    lambda x: extract_skills_advanced(x, SKILL_ALIASES)
)


In [None]:
df.head()

## Education Extraction


In [9]:
EDUCATION_PATTERNS = {
    "doctorate": [
        r"\bph\s*\.?\s*d\b",
        r"\bdoctorate\b"
    ],
    "postgraduate": [
        r"\bm\s*\.?\s*tech\b",
        r"\bm\s*\.?\s*e\b",
        r"\bm\s*\.?\s*sc\b",
        r"\bmaster\b",
        r"\bmba\b"
    ],
    "undergraduate": [
        r"\bb\s*\.?\s*tech\b",
        r"\bb\s*\.?\s*e\b",
        r"\bb\s*\.?\s*sc\b",
        r"\bbachelor\b",
        r"\bdegree\b"
    ]
}


In [10]:
import re
def extract_education(text, education_patterns):
    found_levels = set()

    for level, patterns in education_patterns.items():
        for pattern in patterns:
            if re.search(pattern, text):
                found_levels.add(level)

    return list(found_levels)


In [11]:
df['education'] = df['cleaned_resume'].apply(
    lambda x: extract_education(x, EDUCATION_PATTERNS)
)


In [None]:
df[['education']].head(10)


In [12]:
EDU_RANK = {
    "undergraduate": 1,
    "postgraduate": 2,
    "doctorate": 3
}


In [13]:
def get_highest_education(education_list):
    if not education_list:
        return 0
    return max(EDU_RANK[e] for e in education_list)


In [14]:
df['education_level'] = df['education'].apply(get_highest_education)


In [None]:
df['education_level'].head(10)

## Experience extraction

1. Layer A — Numeric duration extraction (regex)
    Convert months → years
    Normalize phrases like “less than 1 year”

2. Layer B — Aggregate signal
    If multiple durations appear → take max or sum (capped)

3. Layer C — Bucket into levels
    Junior
    Mid
    Senior

In [15]:
EXPERIENCE_PATTERNS = {
    "months": r"(\d+)\s+months?",
    "years": r"(\d+)\s*\+?\s*years?",
    "less_than_year": r"less than\s+1\s+year"
}


In [16]:
import re

def extract_experience_years(text):
    years = []

    # match months
    months = re.findall(EXPERIENCE_PATTERNS["months"], text)
    for m in months:
        years.append(int(m) / 12)

    # match years
    yrs = re.findall(EXPERIENCE_PATTERNS["years"], text)
    for y in yrs:
        years.append(int(y))

    # handle "less than 1 year"
    if re.search(EXPERIENCE_PATTERNS["less_than_year"], text):
        years.append(0.5)

    if not years:
        return 0.0

    # cap total experience to avoid exaggeration
    return min(sum(years), 10)


In [17]:
df['experience_years'] = df['cleaned_resume'].apply(extract_experience_years)



In [18]:
def experience_bucket(years):
    if years == 0:
        return "unknown"
    elif years < 2:
        return "junior"
    elif years < 5:
        return "mid"
    else:
        return "senior"


In [19]:

df['experience_level'] = df['experience_years'].apply(experience_bucket)


In [None]:
df['experience_years'].head(10)

In [20]:
jd_path = "resumes/job_descriptions.csv"
df_jd = pd.read_csv(jd_path)

print(df_jd.shape)
df_jd.head(2)


(39, 4)


Unnamed: 0,job_title,company,location,job_description
0,Data Scientist,Dhurin,,About the job About the Company Dhurin is a fa...
1,Data Scientist,GrowExx,,About the job About the Company Growexx is loo...


In [21]:
df_jd["cleaned_jd"] = df_jd["job_description"].apply(clean_resume_text)


In [22]:
#extract skills from JD
df_jd["jd_skills"] = df_jd["cleaned_jd"].apply(
    lambda x: extract_skills_advanced(x, SKILL_ALIASES)
)



In [23]:
df_jd[["job_title", "jd_skills"]].head(5)


Unnamed: 0,job_title,jd_skills
0,Data Scientist,"[machine learning, natural language processing]"
1,Data Scientist,"[machine learning, natural language processing]"
2,Data Scientist,"[machine learning, natural language processing..."
3,Data Scientist,"[machine learning, natural language processing..."
4,Data Scientist,"[machine learning, natural language processing..."


In [24]:
# extracting experience requirements from JD
df_jd["jd_experience_years"] = df_jd["cleaned_jd"].apply(extract_experience_years)
df_jd["jd_experience_level"] = df_jd["jd_experience_years"].apply(experience_bucket)


In [None]:
df_jd[["job_title", "jd_experience_level"]].head()


In [25]:
df_jd["jd_education"] = df_jd["cleaned_jd"].apply(
    lambda x: extract_education(x, EDUCATION_PATTERNS)
)

df_jd["jd_education_level"] = df_jd["jd_education"].apply(get_highest_education)


In [None]:
df_jd[["job_title", "jd_education_level"]].head()


In [26]:
# save processed resumes
df.to_pickle("data/processed_resumes.pkl")

# save processed job descriptions
df_jd.to_pickle("data/processed_jds.pkl")

print("Preprocessing completed and saved")


Preprocessing completed and saved


## Matching & ranking

In [27]:
# skill overlap score
def skill_match_score(resume_skills, jd_skills):
    if not jd_skills:
        return 0.0
    overlap = set(resume_skills).intersection(set(jd_skills))
    return len(overlap) / len(jd_skills)


In [28]:
# experience compatibility score
EXPERIENCE_RANK = {
    "unknown": 0,
    "junior": 1,
    "mid": 2,
    "senior": 3
}

def experience_match_score(resume_level, jd_level):
    if jd_level == "unknown":
        return 0.5  # neutral
    r = EXPERIENCE_RANK.get(resume_level, 0)
    j = EXPERIENCE_RANK.get(jd_level, 0)

    if r >= j:
        return 1.0   # meets or exceeds requirement
    elif r == j - 1:
        return 0.5   # slightly underqualified
    else:
        return 0.0   # underqualified


In [29]:
#eductation compatibility score
def education_match_score(resume_edu, jd_edu):
    if jd_edu == 0:
        return 0.5  # neutral
    if resume_edu >= jd_edu:
        return 1.0
    else:
        return 0.0


In [30]:
# FINAL WEIGHTED MATCH SCORE

def final_match_score(resume_row, jd_row,
                      w_skill=0.6, w_exp=0.3, w_edu=0.1):

    skill_score = skill_match_score(
        resume_row["skills"], jd_row["jd_skills"]
    )

    exp_score = experience_match_score(
        resume_row["experience_level"], jd_row["jd_experience_level"]
    )

    edu_score = education_match_score(
        resume_row["education_level"], jd_row["jd_education_level"]
    )

    final_score = (
        w_skill * skill_score +
        w_exp * exp_score +
        w_edu * edu_score
    )

    return final_score


In [31]:
jd_idx = 0
jd_row = df_jd.iloc[jd_idx]

df["match_score"] = df.apply(
    lambda r: final_match_score(r, jd_row),
    axis=1
)



In [32]:
ranked_resumes = df.sort_values(
    by="match_score", ascending=False
)

ranked_resumes[
    ["Category", "skills", "experience_level", "education_level", "match_score"]
].head(10)


Unnamed: 0,Category,skills,experience_level,education_level,match_score
0,Data Science,"[deep learning, naive bayes, opencv, scikit-le...",senior,1,0.95
3,Data Science,"[machine learning, natural language processing...",senior,1,0.95
37,Data Science,"[machine learning, natural language processing...",senior,1,0.95
39,Data Science,"[opencv, machine learning, natural language pr...",senior,1,0.95
19,Data Science,"[opencv, machine learning, natural language pr...",senior,1,0.95
29,Data Science,"[opencv, machine learning, natural language pr...",senior,1,0.95
30,Data Science,"[deep learning, naive bayes, opencv, scikit-le...",senior,1,0.95
13,Data Science,"[machine learning, natural language processing...",senior,1,0.95
10,Data Science,"[deep learning, naive bayes, opencv, scikit-le...",senior,1,0.95
7,Data Science,"[machine learning, natural language processing...",senior,1,0.95


In [36]:

jd_idx = 0
jd_row = df_jd.iloc[jd_idx]

df["rule_score"] = df.apply(
    lambda row: final_match_score(row, jd_row),
    axis=1
)


In [37]:
df.to_pickle("data/processed_resumes.pkl")


obsrvations :
 1. You are seeing a baseline saturation effect, which is expected
 2. This is exactly why TF-IDF / embeddings / learned models come next

final_score =
  0.45 * rule_based_score +
  0.55 * text_similarity


In [None]:
ranked_resumes = df.sort_values(
    by="final_score_tfidf",
    ascending=False
)

ranked_resumes[
    ["Category", "experience_level", "rule_score", "tfidf_similarity", "final_score_tfidf"]
].head(10)


observations :
1. Ordering changed → TF-IDF is influencing ranking

2. TF-IDF values differ → signal exists

3. Differences are small → expected for long documents


TF-IDF improved ranking slightly, but due to lexical mismatch and long document length, semantic embeddings were needed for better discrimination

TF-IDF provided limited resolution due to lexical mismatch in long unstructured documents, which motivated the use of Sentence-BERT for semantic similarity

## SBERT

In [None]:
#rank resume using SBERT- enhanced scores
ranked_resumes = df.sort_values(
    by="final_score_sbert",
    ascending=False
)

ranked_resumes[
    ["Category", "rule_score", "tfidf_similarity", "sbert_similarity", "final_score_sbert"]
].head(10)


In [None]:
#comparing TF_IDF vs SBERT
rank_compare = df.sort_values(
    by="final_score_sbert",
    ascending=False
)[
    ["rule_score", "tfidf_similarity", "sbert_similarity"]
].head(10)

rank_compare
