# Match Recommendations for ranking

The word embedding model is used to compute the similarity between a role and each person, based on the person's skills and interests.
Each Word Mover's Distance is converted into a normalized similarity score (one for skills, one for interests), an experience-level score is added, and the final score is a weighted combination of these scores.

## Enter Role Requirements

In [32]:
# Requirements of the role that people are ranked against
role_id = 1  # also used in the name of the exported results file
role_skills = [
    'C#', 'Kotlin', 'Python', 'SQL', 'PostgreSQL',
    '.NET', 'Apache Kafka', 'Pandas', 'Docker', 'Kubernetes',
]
role_experience = 'Associate'  # one of: Junior, Associate, Senior, Architect

## Import Libraries

In [33]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Load Employees Dataset

In [34]:
# Load the employees CSV, split the ';'-separated skills string of every
# person into a list of tokens, and keep only the columns used for scoring.
df_people = (
    pd.read_csv("../2-data/employees.csv", sep=",", encoding="latin1")
    .assign(skills=lambda frame: frame['skills'].apply(lambda raw: raw.split(';')))
    .loc[:, ['id', 'skills', 'skills-want', 'YearsCodePro']]
)
df_people



Unnamed: 0,id,skills,skills-want,YearsCodePro
0,2,"[JavaScript, TypeScript]",Rust;TypeScript,0
1,3,"[C#, C++, HTML/CSS, JavaScript, Python, Micros...",C#;C++;HTML/CSS;JavaScript;TypeScript;Microsof...,5
2,4,"[C#, JavaScript, SQL, TypeScript, Microsoft SQ...",C#;SQL;TypeScript;Microsoft SQL Server;;ASP.NE...,17
3,5,"[C#, HTML/CSS, JavaScript, SQL, Swift, TypeScr...",C#;Elixir;F#;Go;JavaScript;Rust;TypeScript;Clo...,3
4,6,"[C++, Lua, , , , , Homebrew]",Lua;;;;;Homebrew,0
...,...,...,...,...
68821,73264,"[Bash/Shell, Dart, JavaScript, PHP, Python, SQ...",Bash/Shell;Go;JavaScript;Python;SQL;TypeScript...,5
68822,73265,"[Bash/Shell, HTML/CSS, JavaScript, Python, SQL...",HTML/CSS;JavaScript;Python;Elasticsearch;Neo4j...,5
68823,73266,"[HTML/CSS, JavaScript, PHP, Python, SQL, Maria...",C#;HTML/CSS;JavaScript;PHP;Python;SQL;MariaDB;...,33
68824,73267,"[C#, Delphi, VBA, Microsoft SQL Server, MongoD...",Delphi,31


# Load Word Embedding Model

In [35]:
# Load the saved Word2Vec embedding; its word vectors (model.wv) are used
# further down to compute the Word Mover's Distance scores.
# NOTE(review): gensim model files are presumably unpickled on load - only
# load model files that come from a trusted source (confirm against gensim docs).
model = Word2Vec.load("../3-word_embedding/model-w2vcombinedfiltered")

# Sample the employees dataset as available people

In [36]:
# Get only a sample of five people to act as the "available" candidates.
# A fixed random_state makes the same five people come back on every
# Restart & Run All, so the ranking and the exported file are reproducible.
df_people = df_people.sample(n=5, random_state=42)

# Compute Similarity Scores

In [37]:
# Scaler to normalize scores
# One MinMaxScaler instance is shared by the skills and the interests cells;
# each of them calls fit_transform, so the scaler is re-fitted on whichever
# column is being normalized at that point.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

## Score from Skills

In [38]:
# Word Mover's Distance between the role's skills and each person's skills
similarity_scores = [
    model.wv.wmdistance(role_skills, person_skills)
    for person_skills in df_people['skills']
]

# Store the raw distance for each person (lower = closer to the role)
df_people['similarity_score_skills'] = similarity_scores

# Largest distance in the sample that is not inf
df_temp = df_people[df_people['similarity_score_skills'] < np.inf]
max_score = df_temp['similarity_score_skills'].max()
print(max_score)

# Rows with an inf distance (no similarity) are kept, but their distance is
# replaced by the largest non-inf distance so that they can be normalized
capped_distances = df_people['similarity_score_skills'].replace(np.inf, max_score)
df_people['similarity_score_skills'] = capped_distances

# Min-max scale the distances, then invert them into a similarity:
# 0 = most dissimilar person in the sample, 1 = most similar
scaled_distances = scaler.fit_transform(df_people[['similarity_score_skills']])
df_people['similarity_score_skills'] = 1 - scaled_distances

df_people

0.6368627724939029


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills
5081,5302,"[Go, Java, Kotlin, MariaDB, Oracle, PostgreSQL...",Go;Java;Kotlin;Rust;Elasticsearch;MariaDB;Neo4...,21,1.0
64159,68142,"[Assembly, C#, C++, SQL, Microsoft SQL Server,...",C#;Microsoft SQL Server;OVH;ASP.NET Core ;Blaz...,27,0.879412
32654,34402,"[Python, MongoDB]",Python;Rust;MongoDB,9,0.0
25078,26298,"[Kotlin, , , , , Homebrew]",Dart;Kotlin;;;;;Homebrew,8,0.278468
11062,11561,"[C#, HTML/CSS, JavaScript, Kotlin, Python, SQL...",C#;HTML/CSS;JavaScript;Python;SQL;TypeScript;D...,7,0.571893


## Score from Interests

In [39]:
# Tokenize the interests (the ';'-separated 'skills-want' string).
# Values that are already lists are left untouched so the cell can be re-run,
# and anything else (e.g. a missing value) becomes an empty list instead of
# raising on .split().
df_people['skills-want'] = df_people['skills-want'].apply(
    lambda wanted: wanted.split(';') if isinstance(wanted, str)
    else (wanted if isinstance(wanted, list) else [])
)

# Word Mover's Distance between the role's skills and each person's interests
similarity_scores = []
for skills in df_people['skills-want']:
    similarity_scores.append(model.wv.wmdistance(role_skills, skills))

# Store the raw distance for each person (lower = closer to the role)
df_people['similarity_score_interests'] = similarity_scores

# Get the highest distance before inf
df_temp = df_people.loc[(df_people['similarity_score_interests'] < np.inf)]
max_score = df_temp['similarity_score_interests'].max()
print(max_score)

# Rows with an inf distance (no similarity) are kept, but their distance is
# replaced by the largest non-inf distance so that they can be normalized
df_people['similarity_score_interests'] = df_people['similarity_score_interests'].replace(np.inf, max_score)

# Normalize scores and convert distance to similarity - 0 = more dissimilar, 1 = more similar
df_people['similarity_score_interests'] = 1 - scaler.fit_transform(df_people[['similarity_score_interests']])

df_people

0.6405709671096291


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests
5081,5302,"[Go, Java, Kotlin, MariaDB, Oracle, PostgreSQL...","[Go, Java, Kotlin, Rust, Elasticsearch, MariaD...",21,1.0,0.760188
64159,68142,"[Assembly, C#, C++, SQL, Microsoft SQL Server,...","[C#, Microsoft SQL Server, OVH, ASP.NET Core ,...",27,0.879412,0.684502
32654,34402,"[Python, MongoDB]","[Python, Rust, MongoDB]",9,0.0,0.0
25078,26298,"[Kotlin, , , , , Homebrew]","[Dart, Kotlin, , , , , Homebrew]",8,0.278468,0.142306
11062,11561,"[C#, HTML/CSS, JavaScript, Kotlin, Python, SQL...","[C#, HTML/CSS, JavaScript, Python, SQL, TypeSc...",7,0.571893,1.0


## Score from Experience

In [40]:
# Years of professional coding are bucketed into four experience levels.
# pd.cut uses right-closed intervals by default, so the buckets are
# (-1, 4] Junior, (4, 8] Associate, (8, 12] Senior and (12, 100] Architect.
bin_edges = [-1, 4, 8, 12, 100]
labels = ['Junior', 'Associate', 'Senior', 'Architect']

# Make sure the years are numeric before binning them
years_pro = pd.to_numeric(df_people['YearsCodePro'])
df_people['Experience'] = pd.cut(years_pro, bins=bin_edges, labels=labels)

df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience
5081,5302,"[Go, Java, Kotlin, MariaDB, Oracle, PostgreSQL...","[Go, Java, Kotlin, Rust, Elasticsearch, MariaD...",21,1.0,0.760188,Architect
64159,68142,"[Assembly, C#, C++, SQL, Microsoft SQL Server,...","[C#, Microsoft SQL Server, OVH, ASP.NET Core ,...",27,0.879412,0.684502,Architect
32654,34402,"[Python, MongoDB]","[Python, Rust, MongoDB]",9,0.0,0.0,Senior
25078,26298,"[Kotlin, , , , , Homebrew]","[Dart, Kotlin, , , , , Homebrew]",8,0.278468,0.142306,Associate
11062,11561,"[C#, HTML/CSS, JavaScript, Kotlin, Python, SQL...","[C#, HTML/CSS, JavaScript, Python, SQL, TypeSc...",7,0.571893,1.0,Associate


In [41]:
from scipy.spatial.distance import euclidean

# Define a function to compute the similarity metric
def similarity(level1, level2):
    """Score how close two experience levels are.

    Every level is encoded as a 4-element 0/1 vector with one more leading 1
    per step up the ladder (Junior = [0, 0, 0, 0], Architect = [1, 1, 1, 0]).
    The score is 1 / (1 + d), where d is the Euclidean distance between the
    two encodings.

    Args:
        level1: 'Junior', 'Associate', 'Senior' or 'Architect'.
        level2: 'Junior', 'Associate', 'Senior' or 'Architect'.

    Returns:
        1.0 when both levels are the same, and a smaller positive value the
        further apart the levels are.

    Raises:
        KeyError: if either argument is not one of the four level names.
    """
    ladder = ('Junior', 'Associate', 'Senior', 'Architect')

    # The level at position k on the ladder gets k leading ones, zero-padded
    encoding = {
        name: [1] * step + [0] * (len(ladder) - step)
        for step, name in enumerate(ladder)
    }

    gap = euclidean(encoding[level1], encoding[level2])
    return 1 / (1 + gap)

In [42]:
# Score each person's experience level against the level the role asks for
experience_levels = df_people["Experience"]
df_people['similarity_score_experience'] = experience_levels.apply(
    lambda person_level: similarity(role_experience, person_level)
)
df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience
5081,5302,"[Go, Java, Kotlin, MariaDB, Oracle, PostgreSQL...","[Go, Java, Kotlin, Rust, Elasticsearch, MariaD...",21,1.0,0.760188,Architect,0.414214
64159,68142,"[Assembly, C#, C++, SQL, Microsoft SQL Server,...","[C#, Microsoft SQL Server, OVH, ASP.NET Core ,...",27,0.879412,0.684502,Architect,0.414214
32654,34402,"[Python, MongoDB]","[Python, Rust, MongoDB]",9,0.0,0.0,Senior,0.5
25078,26298,"[Kotlin, , , , , Homebrew]","[Dart, Kotlin, , , , , Homebrew]",8,0.278468,0.142306,Associate,1.0
11062,11561,"[C#, HTML/CSS, JavaScript, Kotlin, Python, SQL...","[C#, HTML/CSS, JavaScript, Python, SQL, TypeSc...",7,0.571893,1.0,Associate,1.0


# Add Results from all 3 Models

## Model 1 - Compute Final Score

In [43]:
# Model 1 weights: skills dominate, interests are switched off
weight_skills, weight_interests, weight_experience = 0.8, 0, 0.2

# Final score = weighted sum of the three per-person similarity scores
df_people['score-model1'] = (
    df_people['similarity_score_skills'] * weight_skills
    + df_people['similarity_score_interests'] * weight_interests
    + df_people['similarity_score_experience'] * weight_experience
)

df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1
5081,5302,"[Go, Java, Kotlin, MariaDB, Oracle, PostgreSQL...","[Go, Java, Kotlin, Rust, Elasticsearch, MariaD...",21,1.0,0.760188,Architect,0.414214,0.882843
64159,68142,"[Assembly, C#, C++, SQL, Microsoft SQL Server,...","[C#, Microsoft SQL Server, OVH, ASP.NET Core ,...",27,0.879412,0.684502,Architect,0.414214,0.786373
32654,34402,"[Python, MongoDB]","[Python, Rust, MongoDB]",9,0.0,0.0,Senior,0.5,0.1
25078,26298,"[Kotlin, , , , , Homebrew]","[Dart, Kotlin, , , , , Homebrew]",8,0.278468,0.142306,Associate,1.0,0.422775
11062,11561,"[C#, HTML/CSS, JavaScript, Kotlin, Python, SQL...","[C#, HTML/CSS, JavaScript, Python, SQL, TypeSc...",7,0.571893,1.0,Associate,1.0,0.657515


## Model 2 - Compute Final Score

In [44]:
# Model 2 weights: skills and interests count equally
weight_skills, weight_interests, weight_experience = 0.4, 0.4, 0.2

# Final score = weighted sum of the three per-person similarity scores
df_people['score-model2'] = (
    df_people['similarity_score_skills'] * weight_skills
    + df_people['similarity_score_interests'] * weight_interests
    + df_people['similarity_score_experience'] * weight_experience
)

df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1,score-model2
5081,5302,"[Go, Java, Kotlin, MariaDB, Oracle, PostgreSQL...","[Go, Java, Kotlin, Rust, Elasticsearch, MariaD...",21,1.0,0.760188,Architect,0.414214,0.882843,0.786918
64159,68142,"[Assembly, C#, C++, SQL, Microsoft SQL Server,...","[C#, Microsoft SQL Server, OVH, ASP.NET Core ,...",27,0.879412,0.684502,Architect,0.414214,0.786373,0.708408
32654,34402,"[Python, MongoDB]","[Python, Rust, MongoDB]",9,0.0,0.0,Senior,0.5,0.1,0.1
25078,26298,"[Kotlin, , , , , Homebrew]","[Dart, Kotlin, , , , , Homebrew]",8,0.278468,0.142306,Associate,1.0,0.422775,0.36831
11062,11561,"[C#, HTML/CSS, JavaScript, Kotlin, Python, SQL...","[C#, HTML/CSS, JavaScript, Python, SQL, TypeSc...",7,0.571893,1.0,Associate,1.0,0.657515,0.828757


## Model 3 - Compute Final Score

In [45]:
# Model 3 weights: interests dominate, skills are switched off
weight_skills, weight_interests, weight_experience = 0, 0.8, 0.2

# Final score = weighted sum of the three per-person similarity scores
df_people['score-model3'] = (
    df_people['similarity_score_skills'] * weight_skills
    + df_people['similarity_score_interests'] * weight_interests
    + df_people['similarity_score_experience'] * weight_experience
)

df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1,score-model2,score-model3
5081,5302,"[Go, Java, Kotlin, MariaDB, Oracle, PostgreSQL...","[Go, Java, Kotlin, Rust, Elasticsearch, MariaD...",21,1.0,0.760188,Architect,0.414214,0.882843,0.786918,0.690993
64159,68142,"[Assembly, C#, C++, SQL, Microsoft SQL Server,...","[C#, Microsoft SQL Server, OVH, ASP.NET Core ,...",27,0.879412,0.684502,Architect,0.414214,0.786373,0.708408,0.630444
32654,34402,"[Python, MongoDB]","[Python, Rust, MongoDB]",9,0.0,0.0,Senior,0.5,0.1,0.1,0.1
25078,26298,"[Kotlin, , , , , Homebrew]","[Dart, Kotlin, , , , , Homebrew]",8,0.278468,0.142306,Associate,1.0,0.422775,0.36831,0.313845
11062,11561,"[C#, HTML/CSS, JavaScript, Kotlin, Python, SQL...","[C#, HTML/CSS, JavaScript, Python, SQL, TypeSc...",7,0.571893,1.0,Associate,1.0,0.657515,0.828757,1.0


# Add Column for Ordering the List

In [46]:
# Rank the people per model: the highest final score gets rank 1
for model_no in (1, 2, 3):
    df_people[f'rank-model{model_no}'] = df_people[f'score-model{model_no}'].rank(ascending=False)

# Keep only the identifying columns and the three ranks
result_columns = ['id', 'skills', 'skills-want', 'Experience', 'rank-model1', 'rank-model2', 'rank-model3']
df_people = df_people[result_columns]

df_people

Unnamed: 0,id,skills,skills-want,Experience,rank-model1,rank-model2,rank-model3
5081,5302,"[Go, Java, Kotlin, MariaDB, Oracle, PostgreSQL...","[Go, Java, Kotlin, Rust, Elasticsearch, MariaD...",Architect,1.0,2.0,2.0
64159,68142,"[Assembly, C#, C++, SQL, Microsoft SQL Server,...","[C#, Microsoft SQL Server, OVH, ASP.NET Core ,...",Architect,2.0,3.0,3.0
32654,34402,"[Python, MongoDB]","[Python, Rust, MongoDB]",Senior,5.0,5.0,5.0
25078,26298,"[Kotlin, , , , , Homebrew]","[Dart, Kotlin, , , , , Homebrew]",Associate,4.0,4.0,4.0
11062,11561,"[C#, HTML/CSS, JavaScript, Kotlin, Python, SQL...","[C#, HTML/CSS, JavaScript, Python, SQL, TypeSc...",Associate,3.0,1.0,1.0


# Export Results

In [31]:
# Write the ranking to a results file named after the role
output_path = f"../6-results/ranking_role_{role_id}.csv"
df_people.to_csv(output_path)
df_people

Unnamed: 0,id,skills,skills-want,Experience,rank-model1,rank-model2,rank-model3
128,137,"[C#, C++, DynamoDB, SQLite, AWS, Flask, .NET, ...","[C++, , , , Electron, Torch/PyTorch]",Junior,1.0,3.0,5.0
45201,47746,"[HTML/CSS, Python, MongoDB, Firebase, Django]","[HTML/CSS, Python, MongoDB, AWS, Google Cloud,...",Junior,4.0,4.0,3.0
25914,27196,"[Bash/Shell, C++, HTML/CSS, Java, JavaScript, ...","[Bash/Shell, C++, Go, HTML/CSS, Java, JavaScri...",Architect,3.0,2.0,2.0
24016,25158,"[JavaScript, TypeScript, Elasticsearch, MongoD...","[Go, JavaScript, TypeScript, MongoDB, PostgreS...",Junior,5.0,5.0,4.0
40234,42488,"[C#, HTML/CSS, JavaScript, SQL, TypeScript, Mi...","[Bash/Shell, C#, F#, Go, HTML/CSS, JavaScript,...",Associate,2.0,1.0,1.0
