In [2]:
# Import Library Python
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import os

In [3]:
clean_postings = pd.read_csv('clean_postings.csv')
clean_profiles = pd.read_csv('clean_profiles.csv')

In [4]:
# Count similarity score between job description and profile description using TF-IDF

vectorizer = TfidfVectorizer(min_df=10, max_df=0.8, sublinear_tf=True, use_idf=True)
job_vectors = vectorizer.fit_transform(clean_postings['combined'])
profile_vectors = vectorizer.transform(clean_profiles['combined'])

similarity_matrix = cosine_similarity(profile_vectors, job_vectors)

top_recommendations = []

for profile_idx in range(len(clean_profiles)):
    # Get the index of the job with the highest similarity score for this profile
    top_job_idx = np.argmax(similarity_matrix[profile_idx])
    top_similarity_score = similarity_matrix[profile_idx, top_job_idx]

    # Append the top recommendation details to the list
    top_recommendations.append({
        'profile_id': clean_profiles.iloc[profile_idx]['id'],
        'profile_position': clean_profiles.iloc[profile_idx]['position'],
        'job_title': clean_postings.iloc[top_job_idx]['title'],
        'description_similarity': top_similarity_score
    })

# Create a DataFrame from the list of top recommendations
recommendations_df = pd.DataFrame(top_recommendations)
recommendations_df = recommendations_df.sort_values(by='description_similarity', ascending=False)
recommendations_df


Unnamed: 0,profile_id,profile_position,job_title,description_similarity
251,mariateresa-e-6694b497,elementary school teacher miur,elementary school teacher,0.677383
292,dr-hala-el-tawil-049289156,clinical pharmacist bsc pharmbcpscphq teamstep...,clinical pharmacist,0.504882
206,david-mcadams-9498a846,architecturesolution engineer project developm...,healthrules developer,0.504097
299,a-g%C3%B6khan-%C3%A7imen-2b0365100,english teacher,teacher,0.494033
349,syed-m-hassan-01066aa1,intelligent automation team lead rpa developer...,robotic process automation consultant,0.451779
...,...,...,...,...
170,ahmad-alshugairi-0322b241,ceo aram alehsan holding ltd,multiple positions,0.115120
164,jonahmolina,certified level thermographer president infrar...,customer service representative,0.110605
278,loo-jia-liang-2134aa26,student nanyang technological university,lead auditor banking,0.108357
224,tony-van-de-kasteele-6150a933,owner browniesdownies zwolle,pt entrepreneur intern,0.106452


In [5]:
recommendations_df = recommendations_df.dropna(subset=['profile_position', 'job_title'])
recommendations_df.isnull().sum()

profile_id                0
profile_position          0
job_title                 0
description_similarity    0
dtype: int64

In [6]:
# Count similarity score between job title and profile position to determine how good the model 

vectorizer1 = TfidfVectorizer(min_df=0, max_df=0.8, sublinear_tf=True, use_idf=True)
position_vectors = vectorizer.fit_transform(recommendations_df['profile_position'])
title_vectors = vectorizer.transform(recommendations_df['job_title'])

similarity_matrix1 = cosine_similarity(position_vectors, title_vectors)

recommendations_df['title_similarity'] = np.diagonal(similarity_matrix1)
recommendations1_df = recommendations_df.sort_values(by='description_similarity', ascending=False)
recommendations1_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations_df['title_similarity'] = np.diagonal(similarity_matrix1)


Unnamed: 0,profile_id,profile_position,job_title,description_similarity,title_similarity
251,mariateresa-e-6694b497,elementary school teacher miur,elementary school teacher,0.677383,0.000000
292,dr-hala-el-tawil-049289156,clinical pharmacist bsc pharmbcpscphq teamstep...,clinical pharmacist,0.504882,0.000000
206,david-mcadams-9498a846,architecturesolution engineer project developm...,healthrules developer,0.504097,0.438653
299,a-g%C3%B6khan-%C3%A7imen-2b0365100,english teacher,teacher,0.494033,0.000000
349,syed-m-hassan-01066aa1,intelligent automation team lead rpa developer...,robotic process automation consultant,0.451779,0.000000
...,...,...,...,...,...
170,ahmad-alshugairi-0322b241,ceo aram alehsan holding ltd,multiple positions,0.115120,0.000000
164,jonahmolina,certified level thermographer president infrar...,customer service representative,0.110605,0.000000
278,loo-jia-liang-2134aa26,student nanyang technological university,lead auditor banking,0.108357,0.000000
224,tony-van-de-kasteele-6150a933,owner browniesdownies zwolle,pt entrepreneur intern,0.106452,0.000000


In [7]:
recommendations2_df = recommendations_df.sort_values(by='title_similarity', ascending=False)
recommendations2_df

Unnamed: 0,profile_id,profile_position,job_title,description_similarity,title_similarity
205,melissa-sninsky-13a6a08b,assistant property manager,assistant property manager,0.208958,1.0
414,david-walker-1624a3198,student western illinois university,veterinary student representative western univ...,0.163315,1.0
93,lexi-smith-9607ba172,experienced team manager,front office manager,0.239136,1.0
324,stefan-kalenderov-4ba886164,certified bc salesforce commerce cloud fullsta...,salesforce developer,0.227415,1.0
486,tim-anderson-mba-81a7ab163,chief financial officer ready electric company...,chief financial officer national retail growth...,0.227536,1.0
...,...,...,...,...,...
386,donna-bianchi-06,administrative receptionist fresenius medical ...,front desk receptionist,0.228908,0.0
492,nagylagouveia,marketing endomarketing social media,sap developer,0.229408,0.0
367,bill-kingzett-42b09632,business entrepreneur,pt entrepreneur intern,0.229523,0.0
39,anthony-cordeiro-41857b6b,administration home healthsmith llc,environmental operations manager,0.229659,0.0


In [8]:
# DCG
def dcg(scores_dcg, k):
    scores_dcg = np.array(scores_dcg[:k])
    return np.sum(scores_dcg / np.log2(np.arange(1, len(scores_dcg) + 1) + 1))
k = 100
dcg_score = dcg(recommendations1_df['title_similarity'], k)

print(f"DCG untuk {k} item teratas: {dcg_score}")

DCG untuk 100 item teratas: 2.295903477733675


In [9]:
# IDCG
def idcg(scores_idcg, k):
    scores_idcg = np.array(scores_idcg[:k])
    return np.sum(scores_idcg / np.log2(np.arange(1, len(scores_idcg) + 1) + 1))
k = 100
idcg_score = idcg(recommendations2_df['title_similarity'], k)

print(f"IDCG untuk {k} item teratas: {idcg_score}")

IDCG untuk 100 item teratas: 16.11654641460292


In [12]:
# NDCG
dcg_score/idcg_score

0.1424562942128468