In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Work on a reproducible 10% random sample of the job offers and of the job seekers (CVs)
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42

job_offers = pd.read_csv('FinalDataSetJobOffers.csv').sample(
    frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED
)
job_seekers = pd.read_csv('CvDatasetFinal_3.csv').sample(
    frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED
)

In [8]:
# Preview the first rows of the sampled job offers
job_offers.head()

Unnamed: 0,Job post,Company name,Job description,Required skills,Location,Company rating,Company review,Experience required
19774,incent compens,axtria india privat limit,hands-on experi work rang healthcar data sale ...,excel sql oper manag data analyt vba report ad...,noida hyderabad/secunderabad pune bangalore/be...,3.6,162.0,2-6 yr
17859,contractu hire data analyst,axi bank,master graduat data scienc engin comput scienc...,data analysi sql data scienc bank,remot,3.9,16434.0,3-4 yr
7220,senior softwar engin,unizen technolog,design develop unit test linux devic driver re...,git linux o commun u boot unit test jtag arm l...,bangalore/bengaluru,3.5,7.0,10 yr
4744,senior softwar develop,ibm,minimum 7 year experi fortran c c java shell s...,comput scienc softwar design code xml html ski...,kochi/cochin,4.2,13315.0,10 yr
4170,android develop awc softwar pvt ltd noida,awc softwar pvt ltd,posit android kotlin 3 year noida.not look 15 ...,android kotlin,noida,4.1,83.0,3-5 yr


In [9]:
# Preview the first rows of the sampled job seekers.
# NOTE(review): this preview shows Name, Email and Phone columns — confirm the data is synthetic before sharing.
job_seekers.head()

Unnamed: 0,Category,Name,Email,Phone,Education,Skills,Experience,Experience_Rating
140,Java Developer,Karen Thompson,karen.thompson@gmail.com,343.781.4444,detail bachelor engineering computer savitriba...,skill language java operating system window li...,32 month linux 6 month adavance java le 1 year...,5
398,Hadoop,Andrew Adams,andrew.adams@gmail.com,001-559-587-9020x6074,detail electronics communication indore madhya...,set programming language apache hadoop python ...,31 month hadoop 31 month hadoop 31 month hive ...,5
6,Data Science,Christine Myers,christine.myers@gmail.com,+1-131-902-7364x03539,detail january 2017 b tech computer science en...,skill python tableau data visualization studio...,13 month python 24 month solution 24 month dat...,5
334,Network Security Engineer,Jade Hensley,jade.hensley@gmail.com,(558)588-2984x33032,detail july 2012 april 2015 bachelor science i...,skill writing skill english good certainly cle...,24 monthscompany detail company karvy innotech...,3
322,Network Security Engineer,Tiffany Bailey,tiffany.bailey@gmail.com,548.866.0991,detail september 2006 august 2011 bachelor eng...,set skill skilled analyzing monitoring network...,72 month checkpoint 72 month cisco 72 month ci...,5


In [3]:
# Report the per-column count of missing values for both dataframes
# (the second heading says "job_applications" but refers to job_seekers)
missing_value_reports = [
    ("Missing values in job_offers:", job_offers),
    ("Missing values in job_applications:", job_seekers),
]
for heading, frame in missing_value_reports:
    print(heading)
    print(frame.isna().sum())

Missing values in job_offers:
Job post               0
Company name           0
Job description        0
Required skills        0
Location               0
Company rating         0
Company review         0
Experience required    0
dtype: int64
Missing values in job_applications:
Category             0
Name                 0
Email                0
Phone                0
Education            0
Skills               0
Experience           2
Experience_Rating    0
dtype: int64


In [4]:
# Discard every row with at least one missing field, rebinding each name to the filtered frame
job_offers = job_offers.dropna()
job_seekers = job_seekers.dropna()


In [5]:
# Download the NLTK resources needed for preprocessing:
# the stopword lists and the WordNet corpus (used by WordNetLemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:
# Download the 'punkt' tokenizer models; nltk.word_tokenize (called in preprocess_text) relies on them.
# NOTE(review): newer NLTK releases may require 'punkt_tab' instead — confirm against the installed version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
# Shared resources for preprocess_text: a lemmatizer and the set of English stopwords
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words('english')}

In [7]:
def preprocess_text(text):
    """Normalize free text into a space-separated string of lemmatized tokens.

    The text is lowercased, tokenized with ``nltk.word_tokenize``, stripped of
    English stopwords (module-level ``stop_words``) and of every token that is
    not purely alphanumeric, lemmatized with the module-level ``lemmatizer``,
    and finally re-joined with single spaces.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` or a float NaN (how pandas represents an empty
        cell) yields an empty string instead of raising ``AttributeError``.
        Any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text; may be empty if nothing survives filtering.
    """
    # Missing values: None, or NaN (the only float that is not equal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert text to lowercase
    text = text.lower()

    # Tokenize text into words
    words = nltk.word_tokenize(text)

    # Remove stopwords; isalnum() also drops any token that contains a
    # non-alphanumeric character (punctuation-only or mixed tokens alike)
    words = [w for w in words if w not in stop_words and w.isalnum()]

    # Lemmatize words
    words = [lemmatizer.lemmatize(w) for w in words]

    # Join words back into text
    return ' '.join(words)


In [13]:
# Clean the skill text of both frames element by element, overwriting the columns.
# Re-running this cell preprocesses the already-cleaned text again.
job_offers['Required skills'] = job_offers['Required skills'].map(preprocess_text)
job_seekers['Skills'] = job_seekers['Skills'].map(preprocess_text)


In [15]:
# Vectorize the skill text of job offers and job seekers with TF-IDF.
# The vectorizer is fitted on the job offers' 'Required skills' only; the job seekers'
# 'Skills' are then transformed with that same fitted vectorizer so both matrices
# share one feature space.
vectorizer = TfidfVectorizer()
job_offer_vec = vectorizer.fit_transform(job_offers['Required skills'] )
job_seeker_vecs = vectorizer.transform(job_seekers['Skills'])

In [16]:
# Calculate cosine similarity between the job offer and job seeker skill vectors.
# Job offers are passed first, so row i holds the scores of offer i against every job seeker.
similarity_scores = cosine_similarity(job_offer_vec, job_seeker_vecs)

In [18]:
# Rank the job seekers for the first job offer (row 0) and keep the 10 best matches.
# argsort is ascending, so walking backwards from the end over at most 10 items
# yields positions in descending score order.
job_seeker_scores = similarity_scores[0]
top_candidates = np.argsort(job_seeker_scores)[:-11:-1]
# top_candidates are row positions (not index labels), hence iloc
recommended_job_seekers = job_seekers.iloc[top_candidates]

In [23]:
# Title and description of the job we want candidates for
target_job_title = 'Data Scientist'
target_job_description = 'We are looking for a Data Scientist to join our team'

# Look for an existing offer whose title AND description match exactly.
# NOTE(review): target_job is not referenced by the cells that follow in this notebook.
title_matches = job_offers['Job post'].eq(target_job_title)
description_matches = job_offers['Job description'].eq(target_job_description)
target_job = job_offers.loc[title_matches & description_matches]


In [24]:
# Create a one-row job offer from the target job title and description.
# Column names follow the job_offers schema ('Job post', 'Job description');
# the title column was previously misspelled as 'Job psot'.
new_job_offer = pd.DataFrame({'Job post': [target_job_title],
                              'Job description': [target_job_description]})

In [26]:
# Run the new offer's description through the same text cleaning used for the skill columns
new_job_offer['Job description'] = new_job_offer['Job description'].map(preprocess_text)


In [27]:
# Vectorize the new job offer description with the already-fitted vectorizer.
# The vectorizer was fitted on job_offers['Required skills'], so only terms present in
# that vocabulary contribute to the vector; other words in the description are ignored.
new_job_offer_vec = vectorizer.transform(new_job_offer['Job description'])

In [28]:
# Calculate cosine similarity between the new job offer and the job seeker skill vectors.
# This rebinds similarity_scores (previously the offers-vs-seekers matrix); the new
# offer is the first argument, so its scores are in row 0.
similarity_scores = cosine_similarity(new_job_offer_vec, job_seeker_vecs)

In [30]:
# Rank the job seekers against the new job offer and keep the 10 best matches.
# argsort sorts ascending, so the order is flipped before taking the first 10.
job_seeker_scores = similarity_scores[0]
top_candidates = np.flip(np.argsort(job_seeker_scores))[:10]
# top_candidates are row positions (not index labels), hence iloc
recommended_job_seekers = job_seekers.iloc[top_candidates]

In [31]:
# Print the recommended job seekers (the top matches for the new job offer).
# NOTE(review): the printed rows include Name, Email and Phone — confirm the data is
# synthetic, or drop those columns, before sharing the notebook output.
print(recommended_job_seekers)

             Category               Name                        Email  \
6        Data Science    Christine Myers    christine.myers@gmail.com   
180     SAP Developer         Linda Hill         linda.hill@gmail.com   
167    Java Developer        Anna Carter        anna.carter@gmail.com   
225  Python Developer     Michael Phelps     michael.phelps@gmail.com   
2        Data Science       Jason Montes       jason.montes@gmail.com   
30       Data Science    Robert Gonzalez    robert.gonzalez@gmail.com   
10       Data Science        Sandy Wells        sandy.wells@gmail.com   
504        Blockchain       Wesley Brown       wesley.brown@gmail.com   
529        Blockchain        William Lee        william.lee@gmail.com   
456     ETL Developer  Shannon Rodriguez  shannon.rodriguez@gmail.com   

                      Phone  \
6     +1-131-902-7364x03539   
180  001-905-818-0939x86817   
167           (921)385-5941   
225   001-231-615-3663x4270   
2        (962)653-8542x1217   
30        