# Models for Ranking

A single word embedding model is used to score how similar each role's skills are to a person's skills and to their interests.
Each Word Mover's Distance is converted into a similarity score, and the final score is a weighted combination of these scores.

## Import Libraries

In [3665]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Enter Person's Profile

In [3666]:
# Get the person's profile
person_id = 19

# Manual setting
#person_skills = ['Python', 'Microsoft SQL Server', 'Microsoft Azure']
#person_interests = ['Python']
#person_experience = 5 # years

# Get the profile from the collected data from the survey
employee = pd.read_csv(filepath_or_buffer="../2-data/survey_people.csv", sep=",", encoding="latin1")
employee = employee[employee['id']==person_id]

# Fail with an explicit message instead of an IndexError from .values[0]
if employee.empty:
    raise ValueError(f"No survey respondent found with id {person_id}")


def _split_tokens(raw):
    """Split a ';'-separated survey cell into a list of tokens.

    Empty tokens (e.g. from a trailing ';') are dropped, matching how the
    roles' skills are tokenized. A missing cell (NaN/None) yields [].
    """
    if pd.isna(raw):
        return []
    return [token for token in str(raw).split(';') if token != '']


person_skills = _split_tokens(employee['skills'].values[0])
person_interests = _split_tokens(employee['skills-want'].values[0])
person_experience = employee['YearsCodePro'].values[0]

print(person_id)
print(person_skills)
print(person_interests)
print(person_experience)


19
['MATLAB', 'Python', 'SQL', 'Microsoft SQL Server', 'Google Cloud', 'Microsoft Azure']
['SQL', 'Microsoft Azure']
2.0


# Load IT Roles Dataset

In [3667]:
# Load the IT roles dataset (one row per role, skills stored as a ';'-separated string)
df_roles = pd.read_csv("../2-data/ITroles.csv", sep=",", encoding="latin1")

# Turn each skills string into a list of tokens, discarding empty tokens
df_roles['skills'] = df_roles['skills'].apply(
    lambda raw: [token for token in raw.split(';') if token != '']
)

# Only the role id and its tokenized skills are used from here on
df_roles = df_roles[['id', 'skills']]
df_roles



Unnamed: 0,id,skills
0,19805,"[diploma, machining, cnc m, mould, conventiona..."
1,80208,"[Compensation, Benefits, HR Functions, Alm, Pa..."
2,122729,"[Simulink, stateflow, Matlab developer, target..."
3,4772,"[gis, analysis, geographic_information_system,..."
4,44923,"[Full Stack Developer, AngularJS, SaaS applica..."
...,...,...
10353,91663,"[customer interaction, knowledge, java, androi..."
10354,86050,"[Technical Management, Project Management, MS ..."
10355,54515,"[XCode, IOS, Objective C, Project Management]"
10356,36160,"[Director, NoSQL, Node.js, CTO, SQL, JIRA, Agi..."


# Load the Word Embedding Model

In [3668]:
# Load the saved Word2Vec model used for every Word Mover's Distance below.
# NOTE(review): gensim model files are pickle-based - only load files from a trusted source.
w2v_model_path = "../3-word_embedding/model-w2vcombinedfiltered"
model = Word2Vec.load(w2v_model_path)

# Sample the roles dataset as available roles

In [3669]:
# Get only a sample to simulate the available roles.
# A fixed random_state makes the sample - and therefore every score and rank
# computed below - reproducible on Restart & Run All. n is capped so that a
# dataset with fewer than 5 roles does not raise.
df_roles = df_roles.sample(n=min(5, len(df_roles)), random_state=42)
df_roles

Unnamed: 0,id,skills
3899,59422,"[Workflows, Build, cascading, stylesheets, Res..."
6152,60173,"[development, java, xml, struts, hibernate, al..."
5297,14797,"[experience, visual, after effects, sound, be,..."
7256,28564,"[Codeigniter Developer, API development, PHP, ..."
4664,114576,"[JSON, Javascript, JQuery, Ajax, User Interfac..."


# Compute Similarity Scores

In [3670]:
# Scaler shared by the scoring cells below; each cell refits it via fit_transform
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling maps every score column onto the [0, 1] interval
scaler = MinMaxScaler(feature_range=(0, 1))

## Score from Skills

In [3671]:
# Word Mover's Distance between the person's skills and each role's skills.
# This is a distance: lower means more similar. It is converted to a
# similarity further down.
df_roles['similarity_score_skills'] = [
    model.wv.wmdistance(person_skills, role_skills) for role_skills in df_roles['skills']
]

# Get the highest distance before inf
is_finite = df_roles['similarity_score_skills'] < np.inf
max_score = df_roles.loc[is_finite, 'similarity_score_skills'].max()
print(max_score)

if is_finite.any():
    # Replace rows where the distance is inf - meaning there is no similarity -
    # with the largest finite distance so they can be scaled
    df_roles['similarity_score_skills'] = df_roles['similarity_score_skills'].replace(np.inf, max_score)

    # Normalize and convert distance to similarity - 0 = more dissimilar, 1 = more similar
    df_roles['similarity_score_skills'] = 1 - scaler.fit_transform(df_roles[['similarity_score_skills']]).ravel()
else:
    # No role has a finite distance (or there are no roles at all): max_score is
    # NaN here and would turn every score into NaN, so mark all roles as fully
    # dissimilar instead
    df_roles['similarity_score_skills'] = 0.0

# Sort the roles by similarity score
df_roles = df_roles.sort_values(by=['similarity_score_skills'], ascending=False)

df_roles

1.285084561570378


Unnamed: 0,id,skills,similarity_score_skills
3899,59422,"[Workflows, Build, cascading, stylesheets, Res...",1.0
7256,28564,"[Codeigniter Developer, API development, PHP, ...",0.9818126
4664,114576,"[JSON, Javascript, JQuery, Ajax, User Interfac...",0.6571857
5297,14797,"[experience, visual, after effects, sound, be,...",0.09731251
6152,60173,"[development, java, xml, struts, hibernate, al...",8.881784e-16


## Score from Interests

In [3672]:
# Word Mover's Distance between the person's interests and each role's skills.
# This is a distance: lower means more similar. It is converted to a
# similarity further down.
df_roles['similarity_score_interests'] = [
    model.wv.wmdistance(person_interests, role_skills) for role_skills in df_roles['skills']
]

# Get the highest distance before inf
is_finite = df_roles['similarity_score_interests'] < np.inf
max_score = df_roles.loc[is_finite, 'similarity_score_interests'].max()
print(max_score)

if is_finite.any():
    # Replace rows where the distance is inf - meaning there is no similarity -
    # with the largest finite distance so they can be scaled
    df_roles['similarity_score_interests'] = df_roles['similarity_score_interests'].replace(np.inf, max_score)

    # Normalize and convert distance to similarity - 0 = more dissimilar, 1 = more similar
    df_roles['similarity_score_interests'] = 1 - scaler.fit_transform(df_roles[['similarity_score_interests']]).ravel()
else:
    # No role has a finite distance (e.g. the person listed no interests, or
    # there are no roles at all): max_score is NaN here and would turn every
    # score into NaN, so mark all roles as fully dissimilar instead
    df_roles['similarity_score_interests'] = 0.0

# Sort the roles by similarity score
df_roles = df_roles.sort_values(by=['similarity_score_interests'], ascending=False)

df_roles

1.306481125338535


Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests
7256,28564,"[Codeigniter Developer, API development, PHP, ...",0.9818126,1.0
3899,59422,"[Workflows, Build, cascading, stylesheets, Res...",1.0,0.985507
4664,114576,"[JSON, Javascript, JQuery, Ajax, User Interfac...",0.6571857,0.730745
5297,14797,"[experience, visual, after effects, sound, be,...",0.09731251,0.120547
6152,60173,"[development, java, xml, struts, hibernate, al...",8.881784e-16,0.0


## Score from Experience - Data not available in the dataset

In [3673]:
# Cannot calculate score for Experience - Skill2Vec dataset has no experience column

# Add Results from all 3 Models

## Model 1 - Compute Final Score

In [3674]:
# Model 1 weights: skills only.
# The experience term (weight 0.2) is left out because no experience score is
# computed in this notebook, so the weights sum to 0.8.
weight_skills = 0.8
weight_interests = 0
#weight_experience = 0.2

# Final score = weighted skills similarity + weighted interests similarity
df_roles['score-model1'] = (
    df_roles['similarity_score_skills'].mul(weight_skills)
    .add(df_roles['similarity_score_interests'].mul(weight_interests))
)

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1
7256,28564,"[Codeigniter Developer, API development, PHP, ...",0.9818126,1.0,0.7854501
3899,59422,"[Workflows, Build, cascading, stylesheets, Res...",1.0,0.985507,0.8
4664,114576,"[JSON, Javascript, JQuery, Ajax, User Interfac...",0.6571857,0.730745,0.5257485
5297,14797,"[experience, visual, after effects, sound, be,...",0.09731251,0.120547,0.07785001
6152,60173,"[development, java, xml, struts, hibernate, al...",8.881784e-16,0.0,7.105427e-16


## Model 2 - Compute Final Score

In [3675]:
# Model 2 weights: skills and interests count equally.
# The experience term (weight 0.2) is left out because no experience score is
# computed in this notebook, so the weights sum to 0.8.
weight_skills = 0.4
weight_interests = 0.4
#weight_experience = 0.2

# Final score = weighted skills similarity + weighted interests similarity
df_roles['score-model2'] = (
    df_roles['similarity_score_skills'].mul(weight_skills)
    .add(df_roles['similarity_score_interests'].mul(weight_interests))
)

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1,score-model2
7256,28564,"[Codeigniter Developer, API development, PHP, ...",0.9818126,1.0,0.7854501,0.7927251
3899,59422,"[Workflows, Build, cascading, stylesheets, Res...",1.0,0.985507,0.8,0.7942027
4664,114576,"[JSON, Javascript, JQuery, Ajax, User Interfac...",0.6571857,0.730745,0.5257485,0.5551721
5297,14797,"[experience, visual, after effects, sound, be,...",0.09731251,0.120547,0.07785001,0.0871437
6152,60173,"[development, java, xml, struts, hibernate, al...",8.881784e-16,0.0,7.105427e-16,3.552714e-16


## Model 3 - Compute Final Score

In [3676]:
# Model 3 weights: interests only.
# The experience term (weight 0.2) is left out because no experience score is
# computed in this notebook, so the weights sum to 0.8.
weight_skills = 0
weight_interests = 0.8
#weight_experience = 0.2

# Final score = weighted skills similarity + weighted interests similarity
df_roles['score-model3'] = (
    df_roles['similarity_score_skills'].mul(weight_skills)
    .add(df_roles['similarity_score_interests'].mul(weight_interests))
)

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1,score-model2,score-model3
7256,28564,"[Codeigniter Developer, API development, PHP, ...",0.9818126,1.0,0.7854501,0.7927251,0.8
3899,59422,"[Workflows, Build, cascading, stylesheets, Res...",1.0,0.985507,0.8,0.7942027,0.788405
4664,114576,"[JSON, Javascript, JQuery, Ajax, User Interfac...",0.6571857,0.730745,0.5257485,0.5551721,0.584596
5297,14797,"[experience, visual, after effects, sound, be,...",0.09731251,0.120547,0.07785001,0.0871437,0.096437
6152,60173,"[development, java, xml, struts, hibernate, al...",8.881784e-16,0.0,7.105427e-16,3.552714e-16,0.0


# Add Columns for Ordered List

In [3677]:
# Rank the roles per model: rank 1 goes to the highest final score
for model_key in ('model1', 'model2', 'model3'):
    df_roles[f'rank-{model_key}'] = df_roles[f'score-{model_key}'].rank(ascending=False)

# Keep only the columns needed for the exported ranking
ranking_columns = ['id', 'skills', 'rank-model1', 'rank-model2', 'rank-model3']
df_roles = df_roles[ranking_columns]
df_roles


Unnamed: 0,id,skills,rank-model1,rank-model2,rank-model3
7256,28564,"[Codeigniter Developer, API development, PHP, ...",2.0,2.0,1.0
3899,59422,"[Workflows, Build, cascading, stylesheets, Res...",1.0,1.0,2.0
4664,114576,"[JSON, Javascript, JQuery, Ajax, User Interfac...",3.0,3.0,3.0
5297,14797,"[experience, visual, after effects, sound, be,...",4.0,4.0,4.0
6152,60173,"[development, java, xml, struts, hibernate, al...",5.0,5.0,5.0


# Export Results

In [3599]:
# Write this person's ranking to a CSV file named after their id
results_path = f"../6-results/ranking_ITProfessional_{person_id}.csv"
df_roles.to_csv(results_path)
df_roles


Unnamed: 0,id,skills,rank-model1,rank-model2,rank-model3
2138,14086,"[Executive, Data Analytics, Analytics, Analyst...",2.0,1.0,1.0
8734,86820,"[Django, Python, software developer, python de...",1.0,2.0,2.0
2751,83654,"[DevOps, GIT, Gradle, stash, accurev, perforce...",3.0,3.0,3.0
2490,15389,"[investment banking, sql, capital market, linu...",4.0,4.0,4.0
4020,128096,"[development, maintaining, javascript, html, a...",5.0,5.0,5.0
