## Job Matching Algorithm

In [25]:
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mulwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mulwa\AppData\Roaming\nltk_data...


In [26]:
# Single shared WordNet lemmatizer; preprocess_text() reads this module-level name.
lemmatizer = WordNetLemmatizer()

In [3]:
# Load the job postings from a CSV located next to the notebook (relative path),
# then preview the first rows.
data = pd.read_csv('./dataset_first_100k.csv')
data.head()

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


In [6]:
# List the available columns; only 'Job Id' and 'skills' are used for matching below.
data.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [19]:
# Keep just the job identifier and its free-text skills description.
# A later cell adds a 'skills_cleaned' column to this frame.
skills = data.loc[:, ['Job Id', 'skills']]
skills

Unnamed: 0,Job Id,skills
0,1089843540111562,"Social media platforms (e.g., Facebook, Twitte..."
1,398454096642776,"HTML, CSS, JavaScript Frontend frameworks (e.g..."
2,481640072963533,Quality control processes and methodologies St...
3,688192671473044,Wireless network design and architecture Wi-Fi...
4,117057806156508,Event planning Conference logistics Budget man...
...,...,...
99995,44173633240886,Document management Recordkeeping Attention to...
99996,1737306657093458,Financial planning Investment strategy Retirem...
99997,1542439567458634,User interface (UI) design User experience (UX...
99998,1682763660855356,Computer-aided design (CAD) software Technical...


In [29]:
# Hand-written sample applicants: each record carries an integer id and a
# free-text summary of the applicant's experience.
applicants = [
    {"id": 1, "experience": "Data Scientist with 5 years experience in python, machine learning and NLP"},
    {"id": 2, "experience": "Software engineer skilled in Java, web development, and cloud infrastructure"}
]

In [36]:
# Tabular form of the sample applicants (columns: id, experience).
df_applicants = pd.DataFrame(applicants)
df_applicants

Unnamed: 0,id,experience
0,1,Data Scientist with 5 years experience in pyth...
1,2,"Software engineer skilled in Java, web develop..."


In [27]:
## preprocessing

def preprocess_text(text):
    """Lowercase, tokenize and lemmatize a piece of free text.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None`` and float ``NaN`` (what pandas yields for an empty
        CSV cell) are treated as empty text instead of raising on ``.lower()``.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces; ``''`` for missing input.
    """
    # `text != text` is true only for NaN, so this detects missing cells
    # without needing any extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    tokens = word_tokenize(text.lower())
    # Relies on the module-level `lemmatizer` instance.
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


In [37]:
# Normalise each applicant's experience text with preprocess_text and store it
# in a new 'experience_cleaned' column; display the frame to inspect the result.
df_applicants['experience_cleaned'] = df_applicants['experience'].apply(preprocess_text)
df_applicants

Unnamed: 0,id,experience,experience_cleaned
0,1,Data Scientist with 5 years experience in pyth...,data scientist with 5 year experience in pytho...
1,2,"Software engineer skilled in Java, web develop...","software engineer skilled in java , web develo..."


In [69]:
# Run the same preprocess_text normalisation over every job's skills text so
# applicants and jobs are cleaned identically before vectorisation.
skills['skills_cleaned'] = skills['skills'].apply(preprocess_text)
# Sanity check: number of distinct Job Ids in the frame.
len(skills['Job Id'].unique())

100000

In [72]:

# Build one corpus (applicants first, then jobs) so both sides are embedded
# with the same fitted TF-IDF vectorizer.
all_texts = pd.concat(
    [df_applicants['experience_cleaned'], skills['skills_cleaned']]
)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(all_texts)

# Rows keep the concatenation order: the leading len(df_applicants) rows are
# applicants, everything after them is a job posting.
applicant_vectors, job_vectors = (
    tfidf_matrix[:len(df_applicants)],
    tfidf_matrix[len(df_applicants):],
)

job_vectors.shape

(100000, 994)

In [80]:
from scipy.spatial.distance import cdist  # kept so the name stays available to any later cell
from sklearn.metrics.pairwise import manhattan_distances

# Manhattan (city-block / L1) distance between every applicant and every job.
# manhattan_distances accepts the sparse TF-IDF matrices directly, so the job
# matrix no longer has to be expanded with .toarray() first (100000 x 994
# values is roughly 0.8 GB when stored densely as float64).
# The result is a dense array of shape (n_applicants, n_jobs); smaller = closer.
distances = manhattan_distances(applicant_vectors, job_vectors)
distances.shape

(2, 100000)

In [104]:
# NOTE: despite the name, nothing is negated here -- this is the very same
# array object as `distances` (an alias, not a copy). Smaller values mean a
# closer match, so the ascending argsort used for ranking picks the nearest jobs.
negative_distances = distances
negative_distances.shape

(2, 100000)

In [105]:
# Maps applicant id -> array of recommended Job Ids; filled in by the ranking loop.
top_recommendations = {}

# Number of jobs to recommend per applicant.
num_top_recommendations = 5

In [106]:
# For each applicant, sort jobs by ascending distance and keep the ids of the
# closest `num_top_recommendations` postings.
job_ids = skills['Job Id']
for position, applicant_id in enumerate(df_applicants['id']):
    ranked = np.argsort(negative_distances[position])
    closest = ranked[:num_top_recommendations]
    top_recommendations[applicant_id] = job_ids.iloc[closest].values

In [107]:

# Report the recommended Job Ids for every applicant.
for applicant_id, recommended_jobs in top_recommendations.items():
    print(f"Applicant {applicant_id} is recommended for the following jobs: {recommended_jobs}")

Applicant 1 is recommended for the following jobs: [2885091299391555 1489755324022800  969514974815671 1356755775095376
 1015855334130895]
Applicant 2 is recommended for the following jobs: [1531014351382409  836608419713153  380757586176342    6611294559332
 2394491125353872]


In [109]:
# Spot-check: look up the full posting for one of the Job Ids recommended to applicant 1.
data[data['Job Id'] == 969514974815671]

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
89147,969514974815671,0 to 8 Years,B.Tech,$63K-$84K,Guatemala City,Guatemala,15.7835,-90.2308,Temporary,90822,...,7686325500,Data Scientist,Machine Learning Engineer,Idealist,Machine Learning Engineers develop machine lea...,"{'Health Insurance, Retirement Plans, Flexible...",Machine learning algorithms Python programming...,Develop machine learning models and algorithms...,Chewy,"{""Sector"":""E-commerce"",""Industry"":""Internet Se..."
