# Models for Ranking

One single word embedding model is used for modelling the similarity score for skills and interests.
A score is calculated from each Word Mover's Distance computed and a final score is calculated as a weighted combination of all scores.

## Import Libraries

In [3783]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Enter Person's Profile

In [3784]:
# Get the person's profile
person_id = 20

# Manual setting
#person_skills = ['Python', 'Microsoft SQL Server', 'Microsoft Azure']
#person_interests = ['Python']
#person_experience = 5 # years

# Get the profile from the collected data from the survey
employee = pd.read_csv(filepath_or_buffer="../2-data/survey_people.csv", sep=",", encoding="latin1")
employee = employee[employee['id']==person_id]
person_skills = employee['skills'].values[0].split(';')
person_interests = employee['skills-want'].values[0].split(';')
person_experience = employee['YearsCodePro'].values[0]

print(person_id)
print(person_skills)
print(person_interests)
print(person_experience)


20
['Bash/Shell', 'C', 'HTML/CSS', 'Java', 'JavaScript', 'MATLAB', 'Python', 'R', 'SQL', 'Microsoft SQL Server', 'MySQL', 'Oracle', 'PostgreSQL', 'AWS', 'Google Cloud', 'Microsoft Azure', 'Node.js', 'React.js', 'Apache Kafka', 'Keras', 'NumPy', 'Pandas', 'Scikit-learn', 'Spring', 'TensorFlow', 'Torch/PyTorch', 'Docker', 'Homebrew', 'Kubernetes', 'npm', 'Yarn']
['Bash/Shell', 'Go', 'Java', 'JavaScript', 'Python', 'Rust', 'SQL', 'Cassandra', 'Microsoft SQL Server', 'MySQL', 'Oracle', 'PostgreSQL', 'AWS', 'Google Cloud', 'Microsoft Azure', 'Oracle Cloud Infrastructure', 'Angular.js', 'Django', 'Laravel', 'Node.js', 'React.js', 'Vue.js', 'Keras', 'NumPy', 'Pandas', 'Scikit-learn', 'Spring', 'TensorFlow', 'Torch/PyTorch', 'Docker', 'Kubernetes', 'npm']
2.0


# Load IT Roles Dataset

In [3785]:
# Read the roles collected from the Survey
df_roles = pd.read_csv(filepath_or_buffer="../2-data/ITroles.csv", sep=",", encoding="latin1")

# Tokenize the skills
df_roles['skills'] = df_roles['skills'].apply(lambda x: x.split(';'))
df_roles['skills'] = df_roles['skills'].apply(lambda x: [value for value in x if value != ''])

df_roles = df_roles[['id', 'skills']]
df_roles



Unnamed: 0,id,skills
0,19805,"[diploma, machining, cnc m, mould, conventiona..."
1,80208,"[Compensation, Benefits, HR Functions, Alm, Pa..."
2,122729,"[Simulink, stateflow, Matlab developer, target..."
3,4772,"[gis, analysis, geographic_information_system,..."
4,44923,"[Full Stack Developer, AngularJS, SaaS applica..."
...,...,...
10353,91663,"[customer interaction, knowledge, java, androi..."
10354,86050,"[Technical Management, Project Management, MS ..."
10355,54515,"[XCode, IOS, Objective C, Project Management]"
10356,36160,"[Director, NoSQL, Node.js, CTO, SQL, JIRA, Agi..."


# Load the Word Embedding Model

In [3786]:
model = Word2Vec.load("../3-word_embedding/model-w2vcombinedfiltered")

# Sample the roles dataset as available roles

In [3787]:
# Get only a sample to simulate the available roles
df_roles = df_roles.sample(5)
df_roles

Unnamed: 0,id,skills
5910,100527,"[Java Application Developer, Software Developm..."
5785,110581,"[sql queries, customer profiling, direct marke..."
1019,28326,"[GWT, Rest, Subversion, angularjs, angular JS,..."
9978,118374,"[development, tools, ux, html css, browser com..."
3704,103401,"[angular js, Sharepoint Development, .Net, Cli..."


# Compute Similarity Scores

In [3788]:
# Normalize scores
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

## Score from Skills

In [3789]:
# Calculate the distance between the person's skills and each role skills
similarity_scores = []
for skills in df_roles['skills']:
    similarity_scores.append(model.wv.wmdistance(person_skills, skills))

# Add the dissimilarity score to each role
df_roles['similarity_score_skills'] = similarity_scores

# Get the highest distance before inf
df_temp = df_roles.loc[(df_roles['similarity_score_skills'] < np.inf)]
max_score = df_temp['similarity_score_skills'].max()
print(max_score)

# Replace rows where the score is inf - meaning there is no similarity
df_roles['similarity_score_skills'] = df_roles['similarity_score_skills'].replace(np.inf, max_score)

# Normalize scores and convert distance to similarity - 0 = more dissimilar, 1 = more similar
df_roles['similarity_score_skills'] = 1 - scaler.fit_transform(df_roles[['similarity_score_skills']])

# Sort the roles by similarity score
df_roles = df_roles.sort_values(by=['similarity_score_skills'], ascending=False)

df_roles

1.2688089722826326


Unnamed: 0,id,skills,similarity_score_skills
5910,100527,"[Java Application Developer, Software Developm...",1.0
1019,28326,"[GWT, Rest, Subversion, angularjs, angular JS,...",0.7545789
3704,103401,"[angular js, Sharepoint Development, .Net, Cli...",0.7496262
5785,110581,"[sql queries, customer profiling, direct marke...",0.2335439
9978,118374,"[development, tools, ux, html css, browser com...",4.440892e-16


## Score from Interests

In [3790]:
# Calculate the similarity between the person's skills and each role skills
similarity_scores = []
for skills in df_roles['skills']:
    similarity_scores.append(model.wv.wmdistance(person_interests, skills))

# Add the similarity scores to each role
df_roles['similarity_score_interests'] = similarity_scores

# Get the highest distance before inf
df_temp = df_roles.loc[(df_roles['similarity_score_interests'] < np.inf)]
max_score = df_temp['similarity_score_interests'].max()
print(max_score)

# Replace rows where the score is inf - meaning there is no similarity
df_roles['similarity_score_interests'] = df_roles['similarity_score_interests'].replace(np.inf, max_score)

# Normalize scores and convert distance to similarity - 0 = more dissimilar, 1 = more similar
df_roles['similarity_score_interests'] =  1 - scaler.fit_transform(df_roles[['similarity_score_interests']])

# Sort the roles by similarity score
df_roles = df_roles.sort_values(by=['similarity_score_interests'], ascending=False)

df_roles

1.2724068055956426


Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests
5910,100527,"[Java Application Developer, Software Developm...",1.0,1.0
1019,28326,"[GWT, Rest, Subversion, angularjs, angular JS,...",0.7545789,0.788334
3704,103401,"[angular js, Sharepoint Development, .Net, Cli...",0.7496262,0.7698732
5785,110581,"[sql queries, customer profiling, direct marke...",0.2335439,0.2282753
9978,118374,"[development, tools, ux, html css, browser com...",4.440892e-16,-4.440892e-16


## Score from Experience - Data not available in the dataset

In [3791]:
# Cannot calculate score for Experience - Skill2Vec dataset has no experience column

# Add Results from all 3 Models

## Model 1 - Compute Final Score

In [3792]:
# Set weights
weight_skills = 0.8
weight_interests = 0
#weight_experience = 0.2

# Calculate final score
df_roles['score-model1'] = df_roles['similarity_score_skills'] * weight_skills + df_roles['similarity_score_interests'] * weight_interests #+ df_roles['similarity_score_experience'] * weight_experience

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1
5910,100527,"[Java Application Developer, Software Developm...",1.0,1.0,0.8
1019,28326,"[GWT, Rest, Subversion, angularjs, angular JS,...",0.7545789,0.788334,0.6036631
3704,103401,"[angular js, Sharepoint Development, .Net, Cli...",0.7496262,0.7698732,0.5997009
5785,110581,"[sql queries, customer profiling, direct marke...",0.2335439,0.2282753,0.1868351
9978,118374,"[development, tools, ux, html css, browser com...",4.440892e-16,-4.440892e-16,3.552714e-16


## Model 2 - Compute final score

In [3793]:
# Set weights
weight_skills = 0.4
weight_interests = 0.4
#weight_experience = 0.2

# Calculate final score
df_roles['score-model2'] = df_roles['similarity_score_skills'] * weight_skills + df_roles['similarity_score_interests'] * weight_interests #+ df_roles['similarity_score_experience'] * weight_experience

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1,score-model2
5910,100527,"[Java Application Developer, Software Developm...",1.0,1.0,0.8,0.8
1019,28326,"[GWT, Rest, Subversion, angularjs, angular JS,...",0.7545789,0.788334,0.6036631,0.617165
3704,103401,"[angular js, Sharepoint Development, .Net, Cli...",0.7496262,0.7698732,0.5997009,0.6078
5785,110581,"[sql queries, customer profiling, direct marke...",0.2335439,0.2282753,0.1868351,0.184728
9978,118374,"[development, tools, ux, html css, browser com...",4.440892e-16,-4.440892e-16,3.552714e-16,0.0


## Model 3 - Compute Final Score

In [3794]:
# Set weights
weight_skills = 0
weight_interests = 0.8
#weight_experience = 0.2

# Calculate final score
df_roles['score-model3'] = df_roles['similarity_score_skills'] * weight_skills + df_roles['similarity_score_interests'] * weight_interests #+ df_roles['similarity_score_experience'] * weight_experience

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1,score-model2,score-model3
5910,100527,"[Java Application Developer, Software Developm...",1.0,1.0,0.8,0.8,0.8
1019,28326,"[GWT, Rest, Subversion, angularjs, angular JS,...",0.7545789,0.788334,0.6036631,0.617165,0.6306672
3704,103401,"[angular js, Sharepoint Development, .Net, Cli...",0.7496262,0.7698732,0.5997009,0.6078,0.6158985
5785,110581,"[sql queries, customer profiling, direct marke...",0.2335439,0.2282753,0.1868351,0.184728,0.1826202
9978,118374,"[development, tools, ux, html css, browser com...",4.440892e-16,-4.440892e-16,3.552714e-16,0.0,-3.552714e-16


# Add Columns for Ordered List

In [3795]:
# Rank similarities
df_roles['rank-model1'] = df_roles['score-model1'].rank(ascending=False)
df_roles['rank-model2'] = df_roles['score-model2'].rank(ascending=False)
df_roles['rank-model3'] = df_roles['score-model3'].rank(ascending=False)

# Select only required columns
df_roles = df_roles[['id','skills', 'rank-model1', 'rank-model2', 'rank-model3']]
df_roles


Unnamed: 0,id,skills,rank-model1,rank-model2,rank-model3
5910,100527,"[Java Application Developer, Software Developm...",1.0,1.0,1.0
1019,28326,"[GWT, Rest, Subversion, angularjs, angular JS,...",2.0,2.0,2.0
3704,103401,"[angular js, Sharepoint Development, .Net, Cli...",3.0,3.0,3.0
5785,110581,"[sql queries, customer profiling, direct marke...",4.0,4.0,4.0
9978,118374,"[development, tools, ux, html css, browser com...",5.0,5.0,5.0


# Export Results

In [3756]:
# Save results in a CSV file
df_roles.to_csv(f"../6-results/ranking_ITProfessional_{person_id}.csv")
df_roles


Unnamed: 0,id,skills,rank-model1,rank-model2,rank-model3
5798,113710,"[IT Services, Laravel, REST, API Development, ...",2.0,1.0,1.0
1536,21340,"[Codeigniter, PHP, MySQL, Ubuntu, SVN, MVC, Li...",1.0,2.0,2.0
6032,25999,"[Design, Operational Support, Process Transiti...",3.0,3.0,3.0
10163,104465,"[Production Planning, Industrial Engineering, ...",4.0,4.0,4.0
10124,15381,"[java, j2ee, sql, oracle, unix, linux, html, a...",5.0,5.0,5.0
