# Models for Rating

A single word embedding model is used to model the similarity scores for both skills and interests.
Each computed Word Mover's Distance is converted into a similarity score, and the final score is calculated as a weighted combination of all scores.

## Import Libraries

In [1437]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Enter Person's Profile

In [1438]:
# Get the person's profile
person_id = 19

# Manual setting (alternative to loading the profile from the survey data).
# Note: the experience score computed later looks the value up by level label
# ('Junior', 'Associate', 'Senior', 'Architect'), not by a number of years.
#person_skills = ['Python', 'Microsoft SQL Server', 'Microsoft Azure']
#person_interests = ['Python']
#person_experience = 5 # years

# Get the profile from the collected data from the survey.
# .copy() makes the filtered frame independent of the full survey frame, so the
# column assignment further down does not raise pandas' SettingWithCopyWarning.
survey_people = pd.read_csv(filepath_or_buffer="../2-data/survey_people.csv", sep=",", encoding="latin1")
employee = survey_people[survey_people['id'] == person_id].copy()

# Fail early with a clear message instead of an IndexError on `.values[0]` below
if employee.empty:
    raise ValueError(f"No survey profile found for person_id={person_id}")

# Skills and interests are stored as ';'-separated strings
person_skills = employee['skills'].values[0].split(';')
person_interests = employee['skills-want'].values[0].split(';')

# Define ranges for each level of experience (pd.cut bins are right-inclusive):
# (-1, 2] -> Junior, (2, 5] -> Associate, (5, 10] -> Senior, (10, 50] -> Architect
bin_edges = [-1, 2, 5, 10, 50]

# Define the labels for each level of experience
labels = ['Junior', 'Associate', 'Senior', 'Architect']

# Convert the 'years' column to categorical levels of experience
employee['YearsCodePro'] = pd.cut(pd.to_numeric(employee['YearsCodePro']), bins=bin_edges, labels=labels)

person_experience = employee['YearsCodePro'].values[0]

print(person_id)
print(person_skills)
print(person_interests)
print(person_experience)


19
['MATLAB', 'Python', 'SQL', 'Microsoft SQL Server', 'Google Cloud', 'Microsoft Azure']
['SQL', 'Microsoft Azure']
Junior


# Load Survey Roles Dataset

In [1439]:
# Read the roles collected from the survey
roles_raw = pd.read_csv(filepath_or_buffer="../2-data/survey_roles.csv", sep=",", encoding="latin1")

# Keep only the required columns and drop incomplete roles BEFORE tokenizing:
# str(NaN).split(';') would turn a missing skills cell into ['nan'], which
# dropna() can no longer detect as missing.
df_roles = roles_raw[['id', 'skills', 'explevel', 'name']].dropna().copy()

# Tokenize the ';'-separated skills into a list of skills per role
df_roles['skills'] = df_roles['skills'].apply(lambda x: str(x).split(';'))

df_roles



Unnamed: 0,id,skills,explevel,name
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer
6,22,"[JavaScript, PostgreSQL, jQuery, React.js, Rea...",Associate,Software Engineer - Frontend
7,23,"[HTML/CSS, JavaScript, PostgreSQL, Angular.js,...",Senior,Senior Software Engineer - Frontend
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect


# Load the Word Embedding Model

In [1440]:
# Load the saved Word2Vec model; its word vectors (model.wv) are used for the
# Word Mover's Distance computations below
embedding_model_path = "../3-word_embedding/model-w2vcombinedfiltered"
model = Word2Vec.load(embedding_model_path)

# Compute Similarity Scores

In [1441]:
# Min-max scaler used by the score cells below to rescale each distance column
# before it is flipped into a similarity (MinMaxScaler's default output range is [0, 1]).
# It is re-fitted on every fit_transform call, so one instance can serve all columns.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

## Score from Skills

In [1442]:
# Word Mover's Distance between the person's skills and each role's skills.
# These are distances: a smaller value means the two skill sets are closer.
similarity_scores = [
    model.wv.wmdistance(person_skills, role_skills)
    for role_skills in df_roles['skills']
]

# Store the raw distances; the column is converted into a similarity further down
df_roles['similarity_score_skills'] = similarity_scores

# Largest finite distance among the roles
finite_rows = df_roles['similarity_score_skills'].lt(np.inf)
max_score = df_roles.loc[finite_rows, 'similarity_score_skills'].max()
print(max_score)

# An infinite distance is treated as "no similarity": cap it at the largest finite distance
df_roles['similarity_score_skills'] = df_roles['similarity_score_skills'].replace(np.inf, max_score)

# Rescale the distances to [0, 1] and flip them into similarities:
# 0 = most dissimilar role, 1 = most similar role
scaled_distances = scaler.fit_transform(df_roles[['similarity_score_skills']])
df_roles['similarity_score_skills'] = 1 - scaled_distances

# Most similar roles first
df_roles = df_roles.sort_values(by='similarity_score_skills', ascending=False)

df_roles

0.6766020696024676


Unnamed: 0,id,skills,explevel,name,similarity_score_skills
18,44,"[Python, SQL, Microsoft SQL Server, Microsoft ...",Junior,Junior Data engineer,1.0
19,45,"[PowerShell, SQL, Microsoft SQL Server, Micros...",Junior,Junior Power BI Developer,0.724982
17,43,"[SQL, Cassandra, Microsoft SQL Server, Oracle,...",Associate,Power BI Developer,0.699003
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.499424
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.467757
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.439763
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.41581
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.346037
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.343501
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.326729


## Score from Interests

In [1443]:
# Word Mover's Distance between the person's interests and each role's skills.
# These are distances: a smaller value means the interests are closer to the role.
similarity_scores = [
    model.wv.wmdistance(person_interests, role_skills)
    for role_skills in df_roles['skills']
]

# Store the raw distances; the column is converted into a similarity further down
df_roles['similarity_score_interests'] = similarity_scores

# Largest finite distance among the roles
finite_rows = df_roles['similarity_score_interests'].lt(np.inf)
max_score = df_roles.loc[finite_rows, 'similarity_score_interests'].max()
print(max_score)

# An infinite distance is treated as "no similarity": cap it at the largest finite distance
df_roles['similarity_score_interests'] = df_roles['similarity_score_interests'].replace(np.inf, max_score)

# Rescale the distances to [0, 1] and flip them into similarities:
# 0 = most dissimilar role, 1 = most similar role
scaled_distances = scaler.fit_transform(df_roles[['similarity_score_interests']])
df_roles['similarity_score_interests'] = 1 - scaled_distances

# Roles closest to the person's interests first
df_roles = df_roles.sort_values(by='similarity_score_interests', ascending=False)

df_roles

0.6747590105645208


Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests
19,45,"[PowerShell, SQL, Microsoft SQL Server, Micros...",Junior,Junior Power BI Developer,0.724982,1.0
18,44,"[Python, SQL, Microsoft SQL Server, Microsoft ...",Junior,Junior Data engineer,1.0,0.9559693
17,43,"[SQL, Cassandra, Microsoft SQL Server, Oracle,...",Associate,Power BI Developer,0.699003,0.5978984
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.439763,0.4010676
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.41581,0.3642856
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.467757,0.2768181
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.499424,0.2762548
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.343501,0.1980593
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.266455,0.182095
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.276853,0.1612028


## Score from Experience

In [1444]:
from scipy.spatial.distance import euclidean

# Define a function to compute the similarity score
def similarity(level1, level2):
    """Return a similarity in (0, 1] between two experience levels.

    Each level is encoded as a 4-component 0/1 vector in which every step up
    the ladder switches on one more leading component
    (Junior = [0, 0, 0, 0] ... Architect = [1, 1, 1, 0]).
    The similarity is 1 / (1 + Euclidean distance between the two vectors),
    so identical levels give 1.0 and the value shrinks as the levels move apart.

    Parameters
    ----------
    level1, level2 : str
        One of 'Junior', 'Associate', 'Senior', 'Architect'.

    Raises
    ------
    KeyError
        If either level is not one of the four known labels.
    """
    ladder = ('Junior', 'Associate', 'Senior', 'Architect')
    width = len(ladder)

    # Position i on the ladder -> i leading ones, padded with zeros to `width`
    encoding = {
        name: [1] * step + [0] * (width - step)
        for step, name in enumerate(ladder)
    }

    first_vector = encoding[level1]
    second_vector = encoding[level2]

    return 1 / (1 + euclidean(first_vector, second_vector))

# Score every role by how close its required level is to the person's level
def _experience_match(role_level):
    """Similarity between the person's experience level and one role's level."""
    return similarity(person_experience, role_level)

df_roles['similarity_score_experience'] = df_roles["explevel"].apply(_experience_match)
df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience
19,45,"[PowerShell, SQL, Microsoft SQL Server, Micros...",Junior,Junior Power BI Developer,0.724982,1.0,1.0
18,44,"[Python, SQL, Microsoft SQL Server, Microsoft ...",Junior,Junior Data engineer,1.0,0.9559693,1.0
17,43,"[SQL, Cassandra, Microsoft SQL Server, Oracle,...",Associate,Power BI Developer,0.699003,0.5978984,0.5
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.439763,0.4010676,0.5
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.41581,0.3642856,0.414214
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.467757,0.2768181,0.5
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.499424,0.2762548,0.414214
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.343501,0.1980593,0.5
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.266455,0.182095,0.5
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.276853,0.1612028,0.5


# Add Results from all 3 Models

## Model 1 - Compute Final Score

In [1445]:
# Model 1 weights: skills and experience only, interests are ignored
weight_skills, weight_interests, weight_experience = 0.8, 0, 0.2

# Final score = weighted sum of the three component scores
skills_part = df_roles['similarity_score_skills'] * weight_skills
interests_part = df_roles['similarity_score_interests'] * weight_interests
experience_part = df_roles['similarity_score_experience'] * weight_experience
df_roles['score-model1'] = skills_part + interests_part + experience_part

df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience,score-model1
19,45,"[PowerShell, SQL, Microsoft SQL Server, Micros...",Junior,Junior Power BI Developer,0.724982,1.0,1.0,0.779986
18,44,"[Python, SQL, Microsoft SQL Server, Microsoft ...",Junior,Junior Data engineer,1.0,0.9559693,1.0,1.0
17,43,"[SQL, Cassandra, Microsoft SQL Server, Oracle,...",Associate,Power BI Developer,0.699003,0.5978984,0.5,0.659203
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.439763,0.4010676,0.5,0.45181
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.41581,0.3642856,0.414214,0.415491
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.467757,0.2768181,0.5,0.474206
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.499424,0.2762548,0.414214,0.482382
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.343501,0.1980593,0.5,0.374801
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.266455,0.182095,0.5,0.313164
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.276853,0.1612028,0.5,0.321483


## Model 2 - Compute Final Score

In [1446]:
# Model 2 weights: skills and interests count equally, plus experience
weight_skills, weight_interests, weight_experience = 0.4, 0.4, 0.2

# Final score = weighted sum of the three component scores
skills_part = df_roles['similarity_score_skills'] * weight_skills
interests_part = df_roles['similarity_score_interests'] * weight_interests
experience_part = df_roles['similarity_score_experience'] * weight_experience
df_roles['score-model2'] = skills_part + interests_part + experience_part

df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience,score-model1,score-model2
19,45,"[PowerShell, SQL, Microsoft SQL Server, Micros...",Junior,Junior Power BI Developer,0.724982,1.0,1.0,0.779986,0.889993
18,44,"[Python, SQL, Microsoft SQL Server, Microsoft ...",Junior,Junior Data engineer,1.0,0.9559693,1.0,1.0,0.982388
17,43,"[SQL, Cassandra, Microsoft SQL Server, Oracle,...",Associate,Power BI Developer,0.699003,0.5978984,0.5,0.659203,0.618761
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.439763,0.4010676,0.5,0.45181,0.436332
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.41581,0.3642856,0.414214,0.415491,0.394881
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.467757,0.2768181,0.5,0.474206,0.39783
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.499424,0.2762548,0.414214,0.482382,0.393114
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.343501,0.1980593,0.5,0.374801,0.316624
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.266455,0.182095,0.5,0.313164,0.27942
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.276853,0.1612028,0.5,0.321483,0.275223


## Model 3 - Compute Final Score

In [1447]:
# Model 3 weights: interests and experience only, current skills are ignored
weight_skills, weight_interests, weight_experience = 0, 0.8, 0.2

# Final score = weighted sum of the three component scores
skills_part = df_roles['similarity_score_skills'] * weight_skills
interests_part = df_roles['similarity_score_interests'] * weight_interests
experience_part = df_roles['similarity_score_experience'] * weight_experience
df_roles['score-model3'] = skills_part + interests_part + experience_part

df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience,score-model1,score-model2,score-model3
19,45,"[PowerShell, SQL, Microsoft SQL Server, Micros...",Junior,Junior Power BI Developer,0.724982,1.0,1.0,0.779986,0.889993,1.0
18,44,"[Python, SQL, Microsoft SQL Server, Microsoft ...",Junior,Junior Data engineer,1.0,0.9559693,1.0,1.0,0.982388,0.964775
17,43,"[SQL, Cassandra, Microsoft SQL Server, Oracle,...",Associate,Power BI Developer,0.699003,0.5978984,0.5,0.659203,0.618761,0.578319
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.439763,0.4010676,0.5,0.45181,0.436332,0.420854
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.41581,0.3642856,0.414214,0.415491,0.394881,0.374271
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.467757,0.2768181,0.5,0.474206,0.39783,0.321454
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.499424,0.2762548,0.414214,0.482382,0.393114,0.303847
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.343501,0.1980593,0.5,0.374801,0.316624,0.258447
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.266455,0.182095,0.5,0.313164,0.27942,0.245676
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.276853,0.1612028,0.5,0.321483,0.275223,0.228962


In [1448]:
# Side-by-side view of the three model scores for each role
score_columns = ['id','skills', 'explevel','score-model1', 'score-model2', 'score-model3']
df_roles[score_columns]

Unnamed: 0,id,skills,explevel,score-model1,score-model2,score-model3
19,45,"[PowerShell, SQL, Microsoft SQL Server, Micros...",Junior,0.779986,0.889993,1.0
18,44,"[Python, SQL, Microsoft SQL Server, Microsoft ...",Junior,1.0,0.982388,0.964775
17,43,"[SQL, Cassandra, Microsoft SQL Server, Oracle,...",Associate,0.659203,0.618761,0.578319
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,0.45181,0.436332,0.420854
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,0.415491,0.394881,0.374271
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,0.474206,0.39783,0.321454
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,0.482382,0.393114,0.303847
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,0.374801,0.316624,0.258447
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,0.313164,0.27942,0.245676
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,0.321483,0.275223,0.228962


# Add Columns for Ordered List

In [1449]:
# Rank the roles per model: rank 1 = highest score.
# NOTE(review): rank() is called with its default tie handling, so tied scores
# would share a fractional (averaged) rank - confirm this is acceptable for the
# top-3 selection that follows.
ranked_roles = df_roles.assign(**{
    'rank-model1': df_roles['score-model1'].rank(ascending=False),
    'rank-model2': df_roles['score-model2'].rank(ascending=False),
    'rank-model3': df_roles['score-model3'].rank(ascending=False),
})

# Keep the role description and the ranks; the intermediate scores are dropped
output_columns = ['id','skills', 'explevel','name','rank-model1', 'rank-model2', 'rank-model3']
df_roles = ranked_roles[output_columns]
df_roles


Unnamed: 0,id,skills,explevel,name,rank-model1,rank-model2,rank-model3
19,45,"[PowerShell, SQL, Microsoft SQL Server, Micros...",Junior,Junior Power BI Developer,2.0,2.0,1.0
18,44,"[Python, SQL, Microsoft SQL Server, Microsoft ...",Junior,Junior Data engineer,1.0,1.0,2.0
17,43,"[SQL, Cassandra, Microsoft SQL Server, Oracle,...",Associate,Power BI Developer,3.0,3.0,3.0
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,6.0,4.0,4.0
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,7.0,6.0,5.0
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,5.0,5.0,6.0
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,4.0,7.0,7.0
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,8.0,8.0,8.0
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,13.0,10.0,10.0
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,12.0,11.0,11.0


In [1450]:
# Keep a role if at least one of the three models ranks it in its top 3 (rank below 4)
rank_columns = ['rank-model1', 'rank-model2', 'rank-model3']
in_any_top3 = df_roles[rank_columns].lt(4).any(axis=1)
df_roles = df_roles[in_any_top3]
df_roles

Unnamed: 0,id,skills,explevel,name,rank-model1,rank-model2,rank-model3
19,45,"[PowerShell, SQL, Microsoft SQL Server, Micros...",Junior,Junior Power BI Developer,2.0,2.0,1.0
18,44,"[Python, SQL, Microsoft SQL Server, Microsoft ...",Junior,Junior Data engineer,1.0,1.0,2.0
17,43,"[SQL, Cassandra, Microsoft SQL Server, Oracle,...",Associate,Power BI Developer,3.0,3.0,3.0


# Export Results

In [1436]:
# Write the shortlisted roles to a CSV file named after the person's id
results_path = f"../6-results/rating_ITProfessional_{person_id}.csv"
df_roles.to_csv(results_path)
df_roles


Unnamed: 0,id,skills,explevel,name,rank-model1,rank-model2,rank-model3
19,45,"[PowerShell, SQL, Microsoft SQL Server, Micros...",Junior,Junior Power BI Developer,2.0,2.0,1.0
18,44,"[Python, SQL, Microsoft SQL Server, Microsoft ...",Junior,Junior Data engineer,1.0,1.0,2.0
17,43,"[SQL, Cassandra, Microsoft SQL Server, Oracle,...",Associate,Power BI Developer,3.0,3.0,3.0
