# Models for rating

A single word embedding model is used to model the similarity between skills; the same model is also used for interests.
Each Word Mover's Distance that is computed is converted into a score, and the final score is a weighted combination of the skills, interests and experience scores.

## Import Libraries

In [174]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Enter Person's Profile

In [175]:
# Get the person's profile
person_id = 8

# Manual setting
# NOTE: the experience score further down looks the value up by level label
# ('Junior', 'Associate', 'Senior', 'Architect'), so a manual
# person_experience must be one of those labels, not a number of years.
#person_skills = ['Python', 'Microsoft SQL Server', 'Microsoft Azure']
#person_interests = ['Python']
#person_experience = 5 # years

# Get the profile from the collected data from the survey
employee = pd.read_csv(filepath_or_buffer="../2-data/survey_people.csv", sep=",", encoding="latin1")

# Keep only this person's row. Take an explicit copy so the column assignment
# below writes to an independent frame rather than to a slice of the full
# survey (which triggers pandas' SettingWithCopyWarning).
employee = employee[employee['id'] == person_id].copy()
if employee.empty:
    # Fail with a clear message instead of an IndexError on .values[0]
    raise ValueError(f"No survey respondent found with id {person_id}")

# Skills and interests are stored as ';'-separated strings
person_skills = employee['skills'].values[0].split(';')
person_interests = employee['skills-want'].values[0].split(';')

# Define bin edges for each level of experience
bin_edges = [-1, 4, 8, 12, 100]

# Define the labels for each level of experience
labels = ['Junior', 'Associate', 'Senior', 'Architect']

# Convert the 'years' column to categorical levels of experience
employee['YearsCodePro'] = pd.cut(pd.to_numeric(employee['YearsCodePro']), bins=bin_edges, labels=labels)

person_experience = employee['YearsCodePro'].values[0]

print(person_id)
print(person_skills)
print(person_interests)
print(person_experience)


8
['Bash/Shell', 'C#', 'C++', 'Java', 'PowerShell', 'Python', 'R', 'Scala', 'SQL', 'Swift', 'MariaDB', 'Microsoft SQL Server', 'MongoDB', 'MySQL', 'SQLite', 'Microsoft Azure', 'jQuery', 'Ruby on Rails', 'Apache Kafka', 'Apache Spark', 'Hadoop', 'NumPy', 'Pandas', 'Docker', 'Kubernetes', 'npm']
['C#', 'Python', 'SQL', 'Cassandra', 'MariaDB', 'Microsoft SQL Server', 'MySQL', 'Microsoft Azure', 'Angular', 'Apache Spark', 'Hadoop', 'NumPy', 'Pandas', 'Docker', 'npm', 'Terraform']
Associate


# Load Survey Roles Dataset

In [176]:
# Read the survey roles dataset
df_roles = pd.read_csv(filepath_or_buffer="../2-data/survey_roles.csv", sep=",", encoding="latin1")

# Keep only the columns used for rating and drop incomplete roles *before*
# tokenizing: str(NaN) would otherwise turn a missing skills cell into the
# token list ['nan'], which dropna() can no longer detect.
df_roles = df_roles[['id', 'skills', 'explevel', 'name']].dropna()

# Tokenize the ';'-separated skills into one list of skill names per role
df_roles['skills'] = df_roles['skills'].apply(lambda x: str(x).split(';'))

df_roles



Unnamed: 0,id,skills,explevel,name
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer
6,22,"[JavaScript, PostgreSQL, jQuery, React.js, Rea...",Associate,Software Engineer - Frontend
7,23,"[HTML/CSS, JavaScript, PostgreSQL, Angular.js,...",Senior,Senior Software Engineer - Frontend
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect


# Load Word Embedding Model

In [177]:
# Load the saved Word2Vec model from disk; its word vectors (model.wv) are used
# below to compute both the skills and the interests distances.
model = Word2Vec.load("../3-word_embedding/model-w2vcombinedfiltered")

# Compute Similarity Scores

In [178]:
# Normalize scores
from sklearn.preprocessing import MinMaxScaler

# A single scaler instance; it is re-fitted on each score column through
# fit_transform() in the skills and interests cells below.
scaler = MinMaxScaler()

## Score from Skills

In [179]:
# Word Mover's Distance between the person's skills and each role's skills
# (this is a distance: inf means there is no similarity at all)
skill_distances = [
    model.wv.wmdistance(person_skills, role_skills)
    for role_skills in df_roles['skills']
]
df_roles['similarity_score_skills'] = skill_distances

# Largest finite distance, used as the stand-in value for inf
is_finite = df_roles['similarity_score_skills'] < np.inf
max_score = df_roles.loc[is_finite, 'similarity_score_skills'].max()
print(max_score)

# Cap inf at that maximum so the column can be scaled
capped_distances = df_roles['similarity_score_skills'].replace(np.inf, max_score)

# Min-max scale the distances and flip them into a similarity:
# 0 = most dissimilar, 1 = most similar
scaled_distances = scaler.fit_transform(capped_distances.to_frame())
df_roles['similarity_score_skills'] = 1 - scaled_distances

# Most similar roles first
df_roles = df_roles.sort_values(by='similarity_score_skills', ascending=False)

df_roles

0.6519635075952824


Unnamed: 0,id,skills,explevel,name,similarity_score_skills
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.998113
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,Senior Developer,0.909704
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.82096
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.797442
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.767842
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.741227
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.708211
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.547626
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.514202


## Score from Interests

In [180]:
# Word Mover's Distance between the person's interests and each role's skills
# (this is a distance: inf means there is no similarity at all)
interest_distances = [
    model.wv.wmdistance(person_interests, role_skills)
    for role_skills in df_roles['skills']
]
df_roles['similarity_score_interests'] = interest_distances

# Largest finite distance, used as the stand-in value for inf
is_finite = df_roles['similarity_score_interests'] < np.inf
max_score = df_roles.loc[is_finite, 'similarity_score_interests'].max()
print(max_score)

# Cap inf at that maximum so the column can be scaled
capped_distances = df_roles['similarity_score_interests'].replace(np.inf, max_score)

# Min-max scale the distances and flip them into a similarity:
# 0 = most dissimilar, 1 = most similar
scaled_distances = scaler.fit_transform(capped_distances.to_frame())
df_roles['similarity_score_interests'] = 1 - scaled_distances

# Most similar roles first
df_roles = df_roles.sort_values(by='similarity_score_interests', ascending=False)

df_roles

0.657794439625153


Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.82096,1.0
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.708211,0.798005
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.797442,0.77992
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.998113,0.687765
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.767842,0.666778
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.547626,0.57928
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,0.547639
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.741227,0.519765
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.514202,0.514165
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.261611,0.473818


## Score from Experience

In [181]:
from scipy.spatial.distance import euclidean

# Define a function to compute the similarity metric
def similarity(level1, level2):
    """Return a similarity score between two experience levels.

    Each level is encoded as a cumulative 0/1 vector (Junior has no ones,
    Architect has three). The Euclidean distance ``d`` between the two
    encodings is turned into ``1 / (1 + d)``, so identical levels score 1.0
    and the score shrinks as the levels move apart.

    Args:
        level1: One of 'Junior', 'Associate', 'Senior', 'Architect'.
        level2: One of 'Junior', 'Associate', 'Senior', 'Architect'.

    Returns:
        The similarity ``1 / (1 + d)``.

    Raises:
        KeyError: If either argument is not one of the four level names.
    """
    ordered_levels = ('Junior', 'Associate', 'Senior', 'Architect')
    width = len(ordered_levels)

    # The level at position i gets i leading ones, zero-padded to `width`
    encoding = {
        name: [1] * position + [0] * (width - position)
        for position, name in enumerate(ordered_levels)
    }

    gap = euclidean(encoding[level1], encoding[level2])
    return 1 / (1 + gap)

# Score each role's experience level against the person's level
df_roles['similarity_score_experience'] = df_roles["explevel"].map(
    lambda role_level: similarity(person_experience, role_level)
)
df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.82096,1.0,1.0
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.708211,0.798005,0.5
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.797442,0.77992,1.0
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.998113,0.687765,0.414214
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.767842,0.666778,0.5
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.547626,0.57928,1.0
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,0.547639,0.5
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.741227,0.519765,0.5
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.514202,0.514165,1.0
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.261611,0.473818,1.0


# Add Results from all 3 Models

## Model 1 - Compute Final Score

In [182]:
# Model 1 weights: skills and experience only, interests are switched off
weight_skills, weight_interests, weight_experience = 0.8, 0, 0.2

# Final score = weighted sum of the three component scores
skills_part = df_roles['similarity_score_skills'] * weight_skills
interests_part = df_roles['similarity_score_interests'] * weight_interests
experience_part = df_roles['similarity_score_experience'] * weight_experience
df_roles['score-model1'] = skills_part + interests_part + experience_part

df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience,score-model1
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.82096,1.0,1.0,0.856768
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.708211,0.798005,0.5,0.666569
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.797442,0.77992,1.0,0.837953
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.998113,0.687765,0.414214,0.881333
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.767842,0.666778,0.5,0.714274
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.547626,0.57928,1.0,0.638101
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,0.547639,0.5,0.9
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.741227,0.519765,0.5,0.692982
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.514202,0.514165,1.0,0.611361
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.261611,0.473818,1.0,0.409288


## Model 2 - Compute final score

In [183]:
# Model 2 weights: skills and interests count equally, plus experience
weight_skills, weight_interests, weight_experience = 0.4, 0.4, 0.2

# Final score = weighted sum of the three component scores
skills_part = df_roles['similarity_score_skills'] * weight_skills
interests_part = df_roles['similarity_score_interests'] * weight_interests
experience_part = df_roles['similarity_score_experience'] * weight_experience
df_roles['score-model2'] = skills_part + interests_part + experience_part

df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience,score-model1,score-model2
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.82096,1.0,1.0,0.856768,0.928384
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.708211,0.798005,0.5,0.666569,0.702487
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.797442,0.77992,1.0,0.837953,0.830945
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.998113,0.687765,0.414214,0.881333,0.757194
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.767842,0.666778,0.5,0.714274,0.673848
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.547626,0.57928,1.0,0.638101,0.650762
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,0.547639,0.5,0.9,0.719056
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.741227,0.519765,0.5,0.692982,0.604397
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.514202,0.514165,1.0,0.611361,0.611347
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.261611,0.473818,1.0,0.409288,0.494171


## Model 3 - Compute Final Score

In [184]:
# Model 3 weights: interests and experience only, skills are switched off
weight_skills, weight_interests, weight_experience = 0, 0.8, 0.2

# Final score = weighted sum of the three component scores
skills_part = df_roles['similarity_score_skills'] * weight_skills
interests_part = df_roles['similarity_score_interests'] * weight_interests
experience_part = df_roles['similarity_score_experience'] * weight_experience
df_roles['score-model3'] = skills_part + interests_part + experience_part

df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience,score-model1,score-model2,score-model3
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.82096,1.0,1.0,0.856768,0.928384,1.0
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.708211,0.798005,0.5,0.666569,0.702487,0.738404
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.797442,0.77992,1.0,0.837953,0.830945,0.823936
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.998113,0.687765,0.414214,0.881333,0.757194,0.633055
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.767842,0.666778,0.5,0.714274,0.673848,0.633423
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,0.547626,0.57928,1.0,0.638101,0.650762,0.663424
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,0.547639,0.5,0.9,0.719056,0.538111
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.741227,0.519765,0.5,0.692982,0.604397,0.515812
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.514202,0.514165,1.0,0.611361,0.611347,0.611332
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.261611,0.473818,1.0,0.409288,0.494171,0.579054


# Add Columns for Ordered List

In [185]:
# Rank the roles under each model; the highest score gets the best (lowest) rank
for score_column, rank_column in (
    ('score-model1', 'rank-model1'),
    ('score-model2', 'rank-model2'),
    ('score-model3', 'rank-model3'),
):
    df_roles[rank_column] = df_roles[score_column].rank(ascending=False)

# Keep the role description and the three ranks; drop the score columns
output_columns = ['id', 'skills', 'explevel', 'name', 'rank-model1', 'rank-model2', 'rank-model3']
df_roles = df_roles[output_columns]
df_roles


Unnamed: 0,id,skills,explevel,name,rank-model1,rank-model2,rank-model3
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,3.0,1.0,1.0
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,8.0,5.0,3.0
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,4.0,2.0,2.0
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,2.0,3.0,6.0
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,6.0,6.0,5.0
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer,9.0,7.0,4.0
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,4.0,9.0
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,7.0,10.0,10.0
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,10.0,9.0,7.0
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,13.0,11.0,8.0


In [186]:

# Keep the roles whose rank is below 4 under at least one of the three models
rank_columns = ['rank-model1', 'rank-model2', 'rank-model3']
ranked_below_four = (df_roles[rank_columns] < 4).any(axis=1)
df_roles = df_roles[ranked_below_four]
df_roles

Unnamed: 0,id,skills,explevel,name,rank-model1,rank-model2,rank-model3
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,3.0,1.0,1.0
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,8.0,5.0,3.0
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,4.0,2.0,2.0
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,2.0,3.0,6.0
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,4.0,9.0


# Export Results

In [187]:
# Write the shortlisted roles to a CSV whose file name carries person_id
df_roles.to_csv(f"../6-results/rating_ITProfessional_{person_id}.csv")
df_roles


Unnamed: 0,id,skills,explevel,name,rank-model1,rank-model2,rank-model3
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,3.0,1.0,1.0
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,8.0,5.0,3.0
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,4.0,2.0,2.0
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,2.0,3.0,6.0
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,4.0,9.0
