In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [8]:
df = pd.read_csv("data/resumes.csv")
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [9]:
df.columns

Index(['ID', 'Resume_str', 'Resume_html', 'Category'], dtype='str')

In [10]:
def clean_text(text):
    text = str(text).lower()
    text = text.replace('\n', ' ')
    return text

df['clean_resume'] = df['Resume_str'].apply(clean_text)
df[['Category', 'clean_resume']].head()

Unnamed: 0,Category,clean_resume
0,HR,hr administrator/marketing associate ...
1,HR,"hr specialist, us hr operations ..."
2,HR,hr director summary over 2...
3,HR,hr specialist summary dedica...
4,HR,hr manager skill highlights ...


In [11]:
with open("data/job_description.txt", "r") as file:
    job_description = file.read().lower()

job_description

'we are looking for a machine learning engineer with skills in python,\nmachine learning, data analysis, pandas, numpy, scikit-learn,\nnatural language processing, sql, and statistics.\nexperience with model building and data preprocessing is preferred.'

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer(stop_words='english')

resume_vectors = vectorizer.fit_transform(df['clean_resume'])
job_vector = vectorizer.transform([job_description])

In [13]:
similarity_scores = cosine_similarity(resume_vectors, job_vector)
df['match_score'] = similarity_scores.flatten()

ranked_candidates = df.sort_values(by='match_score', ascending=False)
ranked_candidates[['Category', 'match_score']].head(10)

Unnamed: 0,Category,match_score
1339,AUTOMOBILE,0.229493
926,AGRICULTURE,0.205026
1762,ENGINEERING,0.203273
1218,CONSULTANT,0.160629
2153,BANKING,0.158932
1348,AUTOMOBILE,0.129368
2291,ARTS,0.124561
1040,SALES,0.116136
1091,SALES,0.110984
1142,CONSULTANT,0.106977
