In [1]:
import pandas as pd

# Relative paths of the two raw CSV inputs
RESUMES_CSV = '../data/raw/UpdatedResumeDataSet.csv'
JOBS_CSV = '../data/raw/job_title_des.csv'

# Read the resumes and the job descriptions into DataFrames
resumes = pd.read_csv(RESUMES_CSV)
jobs = pd.read_csv(JOBS_CSV)

Check the quality of the data

In [2]:
# Dimensions and column names as loaded, before any tidying
print(jobs.shape)
print(jobs.columns)

# Drop the 'Unnamed: 0' column (ignored if it is absent) and
# switch the remaining columns to snake_case names for easier access
jobs = (
    jobs
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .rename(columns={'Job Title': 'job_title', 'Job Description': 'job_description'})
)

# Count missing values in each column
for column in ('job_title', 'job_description'):
    print(jobs[column].isnull().sum())

# Inspect the first posting
for column in ('job_title', 'job_description'):
    print(jobs[column][0])



(2277, 3)
Index(['Unnamed: 0', 'Job Title', 'Job Description'], dtype='object')
0
0
Flutter Developer
We are looking for hire experts flutter developer. So you are eligible this post then apply your resume.
Job Types: Full-time, Part-time
Salary: ₹20,000.00 - ₹40,000.00 per month
Benefits:
Flexible schedule
Food allowance
Schedule:
Day shift
Supplemental Pay:
Joining bonus
Overtime pay
Experience:
total work: 1 year (Preferred)
Housing rent subsidy:
Yes
Industry:
Software Development
Work Remotely:
Temporarily due to COVID-19


Check out some more job descriptions

In [3]:
# Print the first three postings: title, description, then a "\n" entry
# (plus print's own newline) to leave blank lines between postings.
for row in range(3):
    print(jobs['job_title'][row], jobs['job_description'][row], "\n", sep="\n")

Flutter Developer
We are looking for hire experts flutter developer. So you are eligible this post then apply your resume.
Job Types: Full-time, Part-time
Salary: ₹20,000.00 - ₹40,000.00 per month
Benefits:
Flexible schedule
Food allowance
Schedule:
Day shift
Supplemental Pay:
Joining bonus
Overtime pay
Experience:
total work: 1 year (Preferred)
Housing rent subsidy:
Yes
Industry:
Software Development
Work Remotely:
Temporarily due to COVID-19


Django Developer
PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ - 04)
Strong Python experience in API development (REST/RPC).
Experience working with API Frameworks (Django/flask).
Experience evaluating and improving the efficiency of programs in a Linux environment.
Ability to effectively handle multiple tasks with a high level of accuracy and attention to detail.
Good verbal and written communication skills.
Working knowledge of SQL.
JSON experience preferred.
Good knowledge in automated unit testing using PyUnit.


Machine Learning
Data Scien

In [4]:
# Dimensions and column names as loaded
print(resumes.shape)
print(resumes.columns)

# Lower-case the column names for easier access
resumes = resumes.rename(columns={'Category': 'category', 'Resume': 'resume'})

# Count missing values in each column
for column in ('category', 'resume'):
    print(resumes[column].isnull().sum())


(962, 2)
Index(['Category', 'Resume'], dtype='object')
0
0


In [5]:
# Print the first three resumes: category, text, then a '\n' entry
# (plus print's own newline) to leave blank lines between resumes.
for row in range(3):
    print(resumes['category'][row], resumes['resume'][row], '\n', sep='\n')

Data Science
Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning.Education Details 

Data Science Assurance Associate 

Data Science Assurance Associate - Ernst & Young LLP
Skill Details 
JAVASCRIPT- Exprience - 24 months
jQuery- Exprience - 24 months
Python- Exprience - 24 monthsCompany Details 
company - Ernst & Young LLP
description - Fraud Investigations and Dispute Ser

Clean the resumes and job descriptions for more consistent formatting

In [6]:
import re


def clean_text(text):
    """Strip HTML-like tags from ``text`` and normalise its whitespace.

    Args:
        text: The raw text to clean. ``None`` and float NaN (the marker
            pandas uses for missing values) are treated as empty text.

    Returns:
        The text with every ``<...>`` span removed, each run of whitespace
        (spaces, tabs, newlines) collapsed to a single space, and leading
        and trailing whitespace stripped. Missing input yields ``''``.
    """
    # A missing value would otherwise make re.sub raise TypeError.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # In case of any HTML. `.` does not match a newline, so only tags that
    # open and close on the same line are removed.
    text = re.sub(r'<.*?>', '', text)

    # Collapse all whitespace runs so the result is a single line.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

Load SBERT

In [7]:
from sentence_transformers import SentenceTransformer

# Name of the pre-trained SBERT model used for all embeddings
SBERT_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(SBERT_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Prefix every document with its label (resume category / job title)
# so that the label is part of the text SBERT embeds
full_resumes = [
    f'{label} {body}'
    for label, body in zip(resumes['category'], resumes['resume'])
]
full_jobs = [
    f'{label} {body}'
    for label, body in zip(jobs['job_title'], jobs['job_description'])
]

# Clean the combined resume and job texts
cleaned_resumes = list(map(clean_text, full_resumes))
cleaned_jobs = list(map(clean_text, full_jobs))

# Encode both sets of texts, asking for tensors rather than the default output
resume_embeddings = model.encode(cleaned_resumes, convert_to_tensor=True)
job_embeddings = model.encode(cleaned_jobs, convert_to_tensor=True)

In [9]:
from sentence_transformers.util import cos_sim

# Get cosine similarity between each resume and each job.
# Resume embeddings are passed first: the following cell reads the result as
# similarity_matrix[resume_idx] and uses the top-scoring positions in that row
# to look up rows of `jobs`.
similarity_matrix = cos_sim(resume_embeddings, job_embeddings)

In [10]:
import torch
# Scores of the first resume against the jobs
resume_idx = 0
similarities = similarity_matrix[resume_idx]

# Five highest scores and their positions, converted to plain Python lists
top_k = torch.topk(similarities, k=5)
top_scores, top_indices = top_k.values.tolist(), top_k.indices.tolist()

# Report each match: job position, score, then the job title
for job_idx, score in zip(top_indices, top_scores):
    print(f'Job {job_idx} - Score: {score:.4f}')
    print(jobs['job_title'].iloc[job_idx])
    print()

Job 329 - Score: 0.7256
Machine Learning

Job 1183 - Score: 0.7078
Machine Learning

Job 1831 - Score: 0.7025
Machine Learning

Job 1217 - Score: 0.6935
Machine Learning

Job 1688 - Score: 0.6863
Machine Learning



In [11]:
# Show the category and the original text of resume 0
# (this is the 'resume' column as loaded, not the cleaned version)
print("Resume 0 Title:", resumes['category'].iloc[0])
print("\nResume 0 Text:", resumes['resume'].iloc[0], sep="\n")


Resume 0 Title: Data Science

Resume 0 Text:
Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning.Education Details 

Data Science Assurance Associate 

Data Science Assurance Associate - Ernst & Young LLP
Skill Details 
JAVASCRIPT- Exprience - 24 months
jQuery- Exprience - 24 months
Python- Exprience - 24 monthsCompany Details 
company - Ernst & Young LLP
description - Frau

In [12]:
# Positions of the job postings to print in full.
# NOTE(review): only 329 and 1688 appear in the top-5 matches printed for
# resume 0 earlier in this notebook; the other indices look hand-picked or
# left over from an earlier run -- confirm they are intended.
job_indices = [329, 775, 605, 1688, 90]

for idx in job_indices:
    posting = jobs.iloc[idx]  # original (uncleaned) row for this job
    print(f"\n--- Job {idx} ---")
    print("Title:", posting['job_title'])
    print("Description:", posting['job_description'])



--- Job 329 ---
Title: Machine Learning
Description: Job Description:
We are looking for an experienced Data Scientist having excellent working knowledge of NLP & deep learning frameworks. He/She should be keen to collaborate with product and engineering heads using notebooks and visualizations. He/she should ask right questions, connect the dots and uncover hidden potential of data.
Responsibilities:
· Work as the lead data strategist, identifying and integrating new datasets that can be leveraged through the product capabilities and work closely with the engineering team to strategize and execute the development of data products
· Develop advanced algorithms that solve problems of large dimensionality in a computationally efficient and statistically effective manner
· Execute application of statistical and data mining techniques (e.g. hypothesis testing, machine learning and retrieval processes) on large, unstructured data sets to identify trends, figures and other relevant informat