# Models for Ranking

The word embedding model is used for computing the similarity score between roles and people, based on skills and interests.
A score is calculated from each Word Mover's Distance computed and a final score is calculated as a weighted combination of all scores.

## Import Libraries

In [718]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

## Enter Role Requirements

In [719]:
# Get the role requirements
role_id = 25
#role_skills = ['C#', 'Kotlin', 'Python', 'SQL', 'PostgreSQL', '.NET', 'Apache Kafka', 'Pandas', 'Docker', 'Kubernetes']
#role_experience = 'Associate'

# Get the role from the collected data from the survey
role = pd.read_csv(filepath_or_buffer="../2-data/survey_roles.csv", sep=",", encoding="latin1")
role = role[role['id']==role_id]
role_skills = role['skills'].values[0].split(';')
role_experience = role['explevel'].values[0]

print(role_id)
print(role_skills)
print(role_experience)

25
['C#', 'Java', 'JavaScript', 'Kotlin', 'Python', 'Rust', 'SQL', 'MongoDB', 'PostgreSQL', 'AWS', 'Google Cloud', 'Microsoft Azure', 'React.js', '.NET', 'Apache Kafka', 'Apache Spark', 'Hadoop', 'Pandas', 'Docker', 'Kubernetes', 'Terraform']
Architect


# Load Employees Dataset

In [720]:
# Read the employees dataset - from the StackOverflow Survey Dataset
df_people = pd.read_csv(filepath_or_buffer="../2-data/employees.csv", sep=",", encoding="latin1")

# Tokenize the skills
df_people['skills'] = df_people['skills'].apply(lambda x: x.split(';'))

# Get only required columns
df_people = df_people[['id', 'skills', 'skills-want', 'YearsCodePro']]
df_people



Unnamed: 0,id,skills,skills-want,YearsCodePro
0,2,"[JavaScript, TypeScript]",Rust;TypeScript,0
1,3,"[C#, C++, HTML/CSS, JavaScript, Python, Micros...",C#;C++;HTML/CSS;JavaScript;TypeScript;Microsof...,5
2,4,"[C#, JavaScript, SQL, TypeScript, Microsoft SQ...",C#;SQL;TypeScript;Microsoft SQL Server;;ASP.NE...,17
3,5,"[C#, HTML/CSS, JavaScript, SQL, Swift, TypeScr...",C#;Elixir;F#;Go;JavaScript;Rust;TypeScript;Clo...,3
4,6,"[C++, Lua, , , , , Homebrew]",Lua;;;;;Homebrew,0
...,...,...,...,...
68821,73264,"[Bash/Shell, Dart, JavaScript, PHP, Python, SQ...",Bash/Shell;Go;JavaScript;Python;SQL;TypeScript...,5
68822,73265,"[Bash/Shell, HTML/CSS, JavaScript, Python, SQL...",HTML/CSS;JavaScript;Python;Elasticsearch;Neo4j...,5
68823,73266,"[HTML/CSS, JavaScript, PHP, Python, SQL, Maria...",C#;HTML/CSS;JavaScript;PHP;Python;SQL;MariaDB;...,33
68824,73267,"[C#, Delphi, VBA, Microsoft SQL Server, MongoD...",Delphi,31


# Load Word Embedding Model

In [721]:
model = Word2Vec.load("../3-word_embedding/model-w2vcombinedfiltered")

# Sample the employees dataset as available people

In [722]:
# Get only a sample - to simulate available employees
df_people = df_people.sample(5)
df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro
16198,16923,"[Bash/Shell, Java, Python, SQL, VBA, Microsoft...",Haskell;Python;;;;NumPy,7
6988,7304,"[C#, SQL, Cloud Firestore, SQLite, Firebase, ,...",C#;Dart;JavaScript;SQL;Cloud Firestore;SQLite;...,5
56167,59425,"[C, HTML/CSS, JavaScript, Lua, PHP, Python, SQ...",C;HTML/CSS;Lua;PHP;Python;SQL;MySQL;;FastAPI;E...,0
37066,39114,"[HTML/CSS, JavaScript, TypeScript, Neo4j, Post...",C#;TypeScript;Neo4j;PostgreSQL;Redis;DigitalOc...,25
5620,5866,"[C#, HTML/CSS, JavaScript, MongoDB, , Express,...",HTML/CSS;JavaScript;SQL;;;Express;jQuery;Node....,0


# Compute Similarity Scores

In [723]:
# Scaler to normalize scores
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

## Score from Skills

In [724]:
# Calculate the distance between each person's skills and the role skills
similarity_scores = []
for skills in df_people['skills']:
    similarity_scores.append(model.wv.wmdistance(role_skills, skills))

# Add the dissimilarity score to each row
df_people['similarity_score_skills'] = similarity_scores

# Get the highest distance before inf
df_temp = df_people.loc[(df_people['similarity_score_skills'] < np.inf)]
max_score = df_temp['similarity_score_skills'].max()
print(max_score)

# Remove rows where the score is inf - meaning there is no similarity
df_people['similarity_score_skills'] = df_people['similarity_score_skills'].replace(np.inf, max_score)

# Normalize scores and convert distance to similarity -> 0 = more dissimilar, 1 = more similar
df_people['similarity_score_skills'] = 1 - scaler.fit_transform(df_people[['similarity_score_skills']])

df_people

0.5939105425731112


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills
16198,16923,"[Bash/Shell, Java, Python, SQL, VBA, Microsoft...",Haskell;Python;;;;NumPy,7,1.0
6988,7304,"[C#, SQL, Cloud Firestore, SQLite, Firebase, ,...",C#;Dart;JavaScript;SQL;Cloud Firestore;SQLite;...,5,0.0
56167,59425,"[C, HTML/CSS, JavaScript, Lua, PHP, Python, SQ...",C;HTML/CSS;Lua;PHP;Python;SQL;MySQL;;FastAPI;E...,0,0.366308
37066,39114,"[HTML/CSS, JavaScript, TypeScript, Neo4j, Post...",C#;TypeScript;Neo4j;PostgreSQL;Redis;DigitalOc...,25,0.557821
5620,5866,"[C#, HTML/CSS, JavaScript, MongoDB, , Express,...",HTML/CSS;JavaScript;SQL;;;Express;jQuery;Node....,0,0.229488


## Score from Interests

In [725]:
# Tokenize the skills
df_people['skills-want'] = df_people['skills-want'].apply(lambda x: x.split(';'))

# Calculate the similarity between each person's skills and the role skills
similarity_scores = []
for skills in df_people['skills-want']:
    similarity_scores.append(model.wv.wmdistance(role_skills, skills))

# Add the similarity scores to each row
df_people['similarity_score_interests'] = similarity_scores

# Get the highest distance before inf
df_temp = df_people.loc[(df_people['similarity_score_interests'] < np.inf)]
max_score = df_temp['similarity_score_interests'].max()
print(max_score)

# Remove rows where the score is inf - meaning there is no similarity
df_people['similarity_score_interests'] = df_people['similarity_score_interests'].replace(np.inf, max_score)

# Normalize scores and convert distance to similarity - 0 = more dissimilar, 1 = more similar
df_people['similarity_score_interests'] =  1 - scaler.fit_transform(df_people[['similarity_score_interests']])

df_people

0.6640734201269305


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests
16198,16923,"[Bash/Shell, Java, Python, SQL, VBA, Microsoft...","[Haskell, Python, , , , NumPy]",7,1.0,0.0
6988,7304,"[C#, SQL, Cloud Firestore, SQLite, Firebase, ,...","[C#, Dart, JavaScript, SQL, Cloud Firestore, S...",5,0.0,0.710448
56167,59425,"[C, HTML/CSS, JavaScript, Lua, PHP, Python, SQ...","[C, HTML/CSS, Lua, PHP, Python, SQL, MySQL, , ...",0,0.366308,0.259322
37066,39114,"[HTML/CSS, JavaScript, TypeScript, Neo4j, Post...","[C#, TypeScript, Neo4j, PostgreSQL, Redis, Dig...",25,0.557821,1.0
5620,5866,"[C#, HTML/CSS, JavaScript, MongoDB, , Express,...","[HTML/CSS, JavaScript, SQL, , , Express, jQuer...",0,0.229488,0.331061


## Score from Experience

In [726]:
# Define bin edges for each level of experience
bin_edges = [-1, 2, 5, 10, 50]

# Define the labels for each level of experience
labels = ['Junior', 'Associate', 'Senior', 'Architect']

# Convert the 'years' column to categorical levels of experience
df_people['Experience'] = pd.cut(pd.to_numeric(df_people['YearsCodePro']), bins=bin_edges, labels=labels)

df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience
16198,16923,"[Bash/Shell, Java, Python, SQL, VBA, Microsoft...","[Haskell, Python, , , , NumPy]",7,1.0,0.0,Senior
6988,7304,"[C#, SQL, Cloud Firestore, SQLite, Firebase, ,...","[C#, Dart, JavaScript, SQL, Cloud Firestore, S...",5,0.0,0.710448,Associate
56167,59425,"[C, HTML/CSS, JavaScript, Lua, PHP, Python, SQ...","[C, HTML/CSS, Lua, PHP, Python, SQL, MySQL, , ...",0,0.366308,0.259322,Junior
37066,39114,"[HTML/CSS, JavaScript, TypeScript, Neo4j, Post...","[C#, TypeScript, Neo4j, PostgreSQL, Redis, Dig...",25,0.557821,1.0,Architect
5620,5866,"[C#, HTML/CSS, JavaScript, MongoDB, , Express,...","[HTML/CSS, JavaScript, SQL, , , Express, jQuer...",0,0.229488,0.331061,Junior


In [727]:
from scipy.spatial.distance import euclidean

# Define a function to compute the similarity metric
def similarity(level1, level2):
    # Define the vectors representing the levels of experience
    levels = {'Junior': [0, 0, 0, 0],
              'Associate': [1, 0, 0, 0],
              'Senior': [1, 1, 0, 0],
              'Architect': [1, 1, 1, 0]}
    
    # Compute the Euclidean distance between the two vectors
    distance = euclidean(levels[level1], levels[level2])
    
    # Return the similarity metric
    return 1 / (1 + distance)

In [728]:
# Compute the similarity based on level
df_people['similarity_score_experience'] = df_people["Experience"].apply(lambda x: similarity(role_experience, x))
df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience
16198,16923,"[Bash/Shell, Java, Python, SQL, VBA, Microsoft...","[Haskell, Python, , , , NumPy]",7,1.0,0.0,Senior,0.5
6988,7304,"[C#, SQL, Cloud Firestore, SQLite, Firebase, ,...","[C#, Dart, JavaScript, SQL, Cloud Firestore, S...",5,0.0,0.710448,Associate,0.414214
56167,59425,"[C, HTML/CSS, JavaScript, Lua, PHP, Python, SQ...","[C, HTML/CSS, Lua, PHP, Python, SQL, MySQL, , ...",0,0.366308,0.259322,Junior,0.366025
37066,39114,"[HTML/CSS, JavaScript, TypeScript, Neo4j, Post...","[C#, TypeScript, Neo4j, PostgreSQL, Redis, Dig...",25,0.557821,1.0,Architect,1.0
5620,5866,"[C#, HTML/CSS, JavaScript, MongoDB, , Express,...","[HTML/CSS, JavaScript, SQL, , , Express, jQuer...",0,0.229488,0.331061,Junior,0.366025


# Add Results from all 3 Models

## Model 1 - Compute Final Score

In [729]:
# Set weights
weight_skills = 0.8
weight_interests = 0
weight_experience = 0.2

# Calculate final score
df_people['score-model1'] = df_people['similarity_score_skills'].astype(float) * weight_skills + df_people['similarity_score_interests'].astype(float) * weight_interests + df_people['similarity_score_experience'].astype(float) * weight_experience

df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1
16198,16923,"[Bash/Shell, Java, Python, SQL, VBA, Microsoft...","[Haskell, Python, , , , NumPy]",7,1.0,0.0,Senior,0.5,0.9
6988,7304,"[C#, SQL, Cloud Firestore, SQLite, Firebase, ,...","[C#, Dart, JavaScript, SQL, Cloud Firestore, S...",5,0.0,0.710448,Associate,0.414214,0.082843
56167,59425,"[C, HTML/CSS, JavaScript, Lua, PHP, Python, SQ...","[C, HTML/CSS, Lua, PHP, Python, SQL, MySQL, , ...",0,0.366308,0.259322,Junior,0.366025,0.366251
37066,39114,"[HTML/CSS, JavaScript, TypeScript, Neo4j, Post...","[C#, TypeScript, Neo4j, PostgreSQL, Redis, Dig...",25,0.557821,1.0,Architect,1.0,0.646257
5620,5866,"[C#, HTML/CSS, JavaScript, MongoDB, , Express,...","[HTML/CSS, JavaScript, SQL, , , Express, jQuer...",0,0.229488,0.331061,Junior,0.366025,0.256796


## Model 2 - Compute Final score

In [730]:
# Set weights
weight_skills = 0.4
weight_interests = 0.4
weight_experience = 0.2

# Calculate final score
df_people['score-model2'] = df_people['similarity_score_skills'].astype(float) * weight_skills + df_people['similarity_score_interests'].astype(float) * weight_interests + df_people['similarity_score_experience'].astype(float) * weight_experience

df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1,score-model2
16198,16923,"[Bash/Shell, Java, Python, SQL, VBA, Microsoft...","[Haskell, Python, , , , NumPy]",7,1.0,0.0,Senior,0.5,0.9,0.5
6988,7304,"[C#, SQL, Cloud Firestore, SQLite, Firebase, ,...","[C#, Dart, JavaScript, SQL, Cloud Firestore, S...",5,0.0,0.710448,Associate,0.414214,0.082843,0.367022
56167,59425,"[C, HTML/CSS, JavaScript, Lua, PHP, Python, SQ...","[C, HTML/CSS, Lua, PHP, Python, SQL, MySQL, , ...",0,0.366308,0.259322,Junior,0.366025,0.366251,0.323457
37066,39114,"[HTML/CSS, JavaScript, TypeScript, Neo4j, Post...","[C#, TypeScript, Neo4j, PostgreSQL, Redis, Dig...",25,0.557821,1.0,Architect,1.0,0.646257,0.823128
5620,5866,"[C#, HTML/CSS, JavaScript, MongoDB, , Express,...","[HTML/CSS, JavaScript, SQL, , , Express, jQuer...",0,0.229488,0.331061,Junior,0.366025,0.256796,0.297425


## Model 3 - Compute Final Score

In [731]:
# Set weights
weight_skills = 0
weight_interests = 0.8
weight_experience = 0.2

# Calculate final score
df_people['score-model3'] = df_people['similarity_score_skills'].astype(float) * weight_skills + df_people['similarity_score_interests'].astype(float) * weight_interests + df_people['similarity_score_experience'].astype(float) * weight_experience

df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1,score-model2,score-model3
16198,16923,"[Bash/Shell, Java, Python, SQL, VBA, Microsoft...","[Haskell, Python, , , , NumPy]",7,1.0,0.0,Senior,0.5,0.9,0.5,0.1
6988,7304,"[C#, SQL, Cloud Firestore, SQLite, Firebase, ,...","[C#, Dart, JavaScript, SQL, Cloud Firestore, S...",5,0.0,0.710448,Associate,0.414214,0.082843,0.367022,0.651201
56167,59425,"[C, HTML/CSS, JavaScript, Lua, PHP, Python, SQ...","[C, HTML/CSS, Lua, PHP, Python, SQL, MySQL, , ...",0,0.366308,0.259322,Junior,0.366025,0.366251,0.323457,0.280663
37066,39114,"[HTML/CSS, JavaScript, TypeScript, Neo4j, Post...","[C#, TypeScript, Neo4j, PostgreSQL, Redis, Dig...",25,0.557821,1.0,Architect,1.0,0.646257,0.823128,1.0
5620,5866,"[C#, HTML/CSS, JavaScript, MongoDB, , Express,...","[HTML/CSS, JavaScript, SQL, , , Express, jQuer...",0,0.229488,0.331061,Junior,0.366025,0.256796,0.297425,0.338054


# Add Column for Ordering the List

In [732]:
# Rank similarities
df_people['rank-model1'] = df_people['score-model1'].rank(ascending=False)
df_people['rank-model2'] = df_people['score-model2'].rank(ascending=False)
df_people['rank-model3'] = df_people['score-model3'].rank(ascending=False)

# Select only required columns
df_people = df_people[['id','skills','skills-want', 'Experience', 'rank-model1', 'rank-model2', 'rank-model3']]

df_people

Unnamed: 0,id,skills,skills-want,Experience,rank-model1,rank-model2,rank-model3
16198,16923,"[Bash/Shell, Java, Python, SQL, VBA, Microsoft...","[Haskell, Python, , , , NumPy]",Senior,1.0,2.0,5.0
6988,7304,"[C#, SQL, Cloud Firestore, SQLite, Firebase, ,...","[C#, Dart, JavaScript, SQL, Cloud Firestore, S...",Associate,5.0,3.0,2.0
56167,59425,"[C, HTML/CSS, JavaScript, Lua, PHP, Python, SQ...","[C, HTML/CSS, Lua, PHP, Python, SQL, MySQL, , ...",Junior,3.0,4.0,4.0
37066,39114,"[HTML/CSS, JavaScript, TypeScript, Neo4j, Post...","[C#, TypeScript, Neo4j, PostgreSQL, Redis, Dig...",Architect,2.0,1.0,1.0
5620,5866,"[C#, HTML/CSS, JavaScript, MongoDB, , Express,...","[HTML/CSS, JavaScript, SQL, , , Express, jQuer...",Junior,4.0,5.0,3.0


# Export Results

In [733]:
# Save results in a CSV file
df_people.to_csv(f"../6-results/ranking_role_{role_id}.csv")
df_people

Unnamed: 0,id,skills,skills-want,Experience,rank-model1,rank-model2,rank-model3
16198,16923,"[Bash/Shell, Java, Python, SQL, VBA, Microsoft...","[Haskell, Python, , , , NumPy]",Senior,1.0,2.0,5.0
6988,7304,"[C#, SQL, Cloud Firestore, SQLite, Firebase, ,...","[C#, Dart, JavaScript, SQL, Cloud Firestore, S...",Associate,5.0,3.0,2.0
56167,59425,"[C, HTML/CSS, JavaScript, Lua, PHP, Python, SQ...","[C, HTML/CSS, Lua, PHP, Python, SQL, MySQL, , ...",Junior,3.0,4.0,4.0
37066,39114,"[HTML/CSS, JavaScript, TypeScript, Neo4j, Post...","[C#, TypeScript, Neo4j, PostgreSQL, Redis, Dig...",Architect,2.0,1.0,1.0
5620,5866,"[C#, HTML/CSS, JavaScript, MongoDB, , Express,...","[HTML/CSS, JavaScript, SQL, , , Express, jQuer...",Junior,4.0,5.0,3.0
