# Models for Rating

A single word embedding model is used to score the similarity of both skills and interests.
Each Word Mover's Distance is converted into a normalized similarity score, an experience-level score is computed separately, and the final score is a weighted combination of all three scores.

## Import Libraries

In [1451]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Enter Person's Profile

In [1452]:
# Get the person's profile
person_id = 20

# Manual setting (alternative to loading the profile from the survey below)
#person_skills = ['Python', 'Microsoft SQL Server', 'Microsoft Azure']
#person_interests = ['Python']
#person_experience = 'Senior' # must be one of the experience labels defined below

# Get the profile from the collected data from the survey
employee = pd.read_csv(filepath_or_buffer="../2-data/survey_people.csv", sep=",", encoding="latin1")

# Take an explicit copy of the matching row(s): the 'YearsCodePro' column is
# overwritten further down, and writing into a filtered slice of the survey
# table triggers pandas' chained-assignment (SettingWithCopy) hazard.
employee = employee[employee['id']==person_id].copy()

# Fail with a clear message instead of an IndexError on `.values[0]` below
if employee.empty:
    raise ValueError(f"No survey profile found for person_id={person_id}")

# Skills and interests are stored as ';'-separated strings; use the first matching row
person_skills = employee['skills'].values[0].split(';')
person_interests = employee['skills-want'].values[0].split(';')

# Define ranges for each level of experience
# (pd.cut intervals are right-inclusive by default: (-1, 2], (2, 5], (5, 10], (10, 50])
bin_edges = [-1, 2, 5, 10, 50]

# Define the labels for each level of experience
labels = ['Junior', 'Associate', 'Senior', 'Architect']

# Convert the 'years' column to categorical levels of experience
employee['YearsCodePro'] = pd.cut(pd.to_numeric(employee['YearsCodePro']), bins=bin_edges, labels=labels)

person_experience = employee['YearsCodePro'].values[0]

print(person_id)
print(person_skills)
print(person_interests)
print(person_experience)


20
['Bash/Shell', 'C', 'HTML/CSS', 'Java', 'JavaScript', 'MATLAB', 'Python', 'R', 'SQL', 'Microsoft SQL Server', 'MySQL', 'Oracle', 'PostgreSQL', 'AWS', 'Google Cloud', 'Microsoft Azure', 'Node.js', 'React.js', 'Apache Kafka', 'Keras', 'NumPy', 'Pandas', 'Scikit-learn', 'Spring', 'TensorFlow', 'Torch/PyTorch', 'Docker', 'Homebrew', 'Kubernetes', 'npm', 'Yarn']
['Bash/Shell', 'Go', 'Java', 'JavaScript', 'Python', 'Rust', 'SQL', 'Cassandra', 'Microsoft SQL Server', 'MySQL', 'Oracle', 'PostgreSQL', 'AWS', 'Google Cloud', 'Microsoft Azure', 'Oracle Cloud Infrastructure', 'Angular.js', 'Django', 'Laravel', 'Node.js', 'React.js', 'Vue.js', 'Keras', 'NumPy', 'Pandas', 'Scikit-learn', 'Spring', 'TensorFlow', 'Torch/PyTorch', 'Docker', 'Kubernetes', 'npm']
Junior


# Load Survey Roles Dataset

In [1453]:
# Read the roles collected from the survey
df_roles = pd.read_csv(filepath_or_buffer="../2-data/survey_roles.csv", sep=",", encoding="latin1")

# Keep only the required columns and drop incomplete roles BEFORE tokenizing.
# Tokenizing first would turn a missing 'skills' value into the list ['nan'],
# which dropna() no longer recognises as missing, so the row would survive.
df_roles = df_roles[['id', 'skills', 'explevel', 'name']]
df_roles = df_roles.dropna()

# Tokenize the skills (stored as a ';'-separated string)
df_roles['skills'] = df_roles['skills'].apply(lambda x: str(x).split(';'))

df_roles



Unnamed: 0,id,skills,explevel,name
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer
1,12,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Associate,DevOps Engineer
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer
6,22,"[JavaScript, PostgreSQL, jQuery, React.js, Rea...",Associate,Software Engineer - Frontend
7,23,"[HTML/CSS, JavaScript, PostgreSQL, Angular.js,...",Senior,Senior Software Engineer - Frontend
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect


# Load the Word Embedding Model

In [1454]:
# Load the saved gensim Word2Vec model from disk. Its keyed vectors (model.wv)
# are used in the cells below to compute the Word Mover's Distance between the
# person's skills/interests and each role's skills.
model = Word2Vec.load("../3-word_embedding/model-w2vcombinedfiltered")

# Compute Similarity Scores

In [1455]:
# Scaler used to normalize the distance columns computed in the cells below.
# Nothing is normalized in this cell: the scaler is only created here and is
# re-fitted on each column through fit_transform where it is used.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

## Score from Skills

In [1456]:
# Word Mover's Distance between the person's skills and the skills of every role
similarity_scores = [
    model.wv.wmdistance(person_skills, role_skills)
    for role_skills in df_roles['skills']
]

# Store the raw distances (still a dissimilarity at this point) on each role
df_roles['similarity_score_skills'] = similarity_scores

# Largest finite distance, i.e. the worst score among roles that are comparable at all
df_temp = df_roles[df_roles['similarity_score_skills'] < np.inf]
max_score = df_temp['similarity_score_skills'].max()
print(max_score)

# An infinite distance is treated as "no similarity": cap it at the largest finite distance
df_roles['similarity_score_skills'] = df_roles['similarity_score_skills'].replace(np.inf, max_score)

# Min-max scale the distances, then flip them into a similarity:
# 0 = more dissimilar, 1 = more similar
df_roles['similarity_score_skills'] = 1 - scaler.fit_transform(
    df_roles[['similarity_score_skills']]
)

# Most similar roles first
df_roles = df_roles.sort_values(by=['similarity_score_skills'], ascending=False)

df_roles

0.6340424671558986


Unnamed: 0,id,skills,explevel,name,similarity_score_skills
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,Senior Developer,0.976833
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.646705
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.620822
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.478085
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.461471
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.453598
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.406263
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.305344
5,21,"[C#, Kotlin, Python, SQL, PostgreSQL, .NET, Ap...",Associate,Software Engineer,0.292644


## Score from Interests

In [1457]:
# Word Mover's Distance between the person's interests and the skills of every role
similarity_scores = [
    model.wv.wmdistance(person_interests, role_skills)
    for role_skills in df_roles['skills']
]

# Store the raw distances (still a dissimilarity at this point) on each role
df_roles['similarity_score_interests'] = similarity_scores

# Largest finite distance, i.e. the worst score among roles that are comparable at all
df_temp = df_roles[df_roles['similarity_score_interests'] < np.inf]
max_score = df_temp['similarity_score_interests'].max()
print(max_score)

# An infinite distance is treated as "no similarity": cap it at the largest finite distance
df_roles['similarity_score_interests'] = df_roles['similarity_score_interests'].replace(np.inf, max_score)

# Min-max scale the distances, then flip them into a similarity:
# 0 = more dissimilar, 1 = more similar
df_roles['similarity_score_interests'] = 1 - scaler.fit_transform(
    df_roles[['similarity_score_interests']]
)

# Most similar roles first
df_roles = df_roles.sort_values(by=['similarity_score_interests'], ascending=False)

df_roles

0.6532152027937397


Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,Senior Developer,0.976833,1.0
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,0.924276
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.620822,0.641969
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.646705,0.596432
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.461471,0.496594
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.453598,0.484687
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.478085,0.396409
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.406263,0.377443
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.305344,0.349512
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.205082,0.297977


## Score from Experience

In [1458]:
from scipy.spatial.distance import euclidean

# Define a function to compute the similarity score
def similarity(level1, level2):
    """Score how close two experience levels are.

    Each level is encoded as a cumulative 0/1 vector (one more leading 1 per
    step from 'Junior' to 'Architect'). The score is ``1 / (1 + d)``, where
    ``d`` is the Euclidean distance between the two encodings, so identical
    levels score 1.0 and the score shrinks as the levels move apart.

    Parameters
    ----------
    level1, level2 : str
        One of 'Junior', 'Associate', 'Senior', 'Architect'.

    Returns
    -------
    float
        Similarity between the two levels.

    Raises
    ------
    KeyError
        If either level is not one of the known labels.
    """
    # Vector encoding of every known experience level
    encoding_by_level = {
        'Junior': [0, 0, 0, 0],
        'Associate': [1, 0, 0, 0],
        'Senior': [1, 1, 0, 0],
        'Architect': [1, 1, 1, 0],
    }

    first_vector = encoding_by_level[level1]
    second_vector = encoding_by_level[level2]

    # Euclidean distance between the two encodings
    gap = euclidean(first_vector, second_vector)

    # Map the distance into a similarity: a distance of 0 gives 1.0
    return 1 / (1 + gap)

# Score every role by how close its experience level is to the person's level
df_roles['similarity_score_experience'] = [
    similarity(person_experience, role_level) for role_level in df_roles["explevel"]
]
df_roles

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,Senior Developer,0.976833,1.0,0.414214
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,0.924276,0.414214
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.620822,0.641969,0.366025
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.646705,0.596432,0.5
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.461471,0.496594,0.414214
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.453598,0.484687,0.5
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.478085,0.396409,0.414214
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.406263,0.377443,0.414214
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.305344,0.349512,0.414214
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.205082,0.297977,0.5


# Add Results from all 3 Models

## Model 1 - Compute Final Score

In [1459]:
# Model 1 weights: skills and experience only, interests are ignored
weight_skills = 0.8
weight_interests = 0
weight_experience = 0.2

# Final score = weighted combination of the three similarity scores
df_roles['score-model1'] = (
    df_roles['similarity_score_skills'] * weight_skills
    + df_roles['similarity_score_interests'] * weight_interests
    + df_roles['similarity_score_experience'] * weight_experience
)

df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience,score-model1
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,Senior Developer,0.976833,1.0,0.414214,0.864309
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,0.924276,0.414214,0.882843
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.620822,0.641969,0.366025,0.569863
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.646705,0.596432,0.5,0.617364
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.461471,0.496594,0.414214,0.45202
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.453598,0.484687,0.5,0.462878
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.478085,0.396409,0.414214,0.46531
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.406263,0.377443,0.414214,0.407853
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.305344,0.349512,0.414214,0.327118
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.205082,0.297977,0.5,0.264066


## Model 2 - Compute Final Score

In [1460]:
# Model 2 weights: skills and interests weighted equally, plus experience
weight_skills = 0.4
weight_interests = 0.4
weight_experience = 0.2

# Final score = weighted combination of the three similarity scores
df_roles['score-model2'] = (
    df_roles['similarity_score_skills'] * weight_skills
    + df_roles['similarity_score_interests'] * weight_interests
    + df_roles['similarity_score_experience'] * weight_experience
)

df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience,score-model1,score-model2
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,Senior Developer,0.976833,1.0,0.414214,0.864309,0.873576
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,0.924276,0.414214,0.882843,0.852553
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.620822,0.641969,0.366025,0.569863,0.578322
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.646705,0.596432,0.5,0.617364,0.597255
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.461471,0.496594,0.414214,0.45202,0.466069
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.453598,0.484687,0.5,0.462878,0.475314
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.478085,0.396409,0.414214,0.46531,0.43264
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.406263,0.377443,0.414214,0.407853,0.396325
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.305344,0.349512,0.414214,0.327118,0.344785
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.205082,0.297977,0.5,0.264066,0.301224


## Model 3 - Compute Final Score

In [1461]:
# Model 3 weights: interests and experience only, skills are ignored
weight_skills = 0
weight_interests = 0.8
weight_experience = 0.2

# Final score = weighted combination of the three similarity scores
df_roles['score-model3'] = (
    df_roles['similarity_score_skills'] * weight_skills
    + df_roles['similarity_score_interests'] * weight_interests
    + df_roles['similarity_score_experience'] * weight_experience
)

df_roles

Unnamed: 0,id,skills,explevel,name,similarity_score_skills,similarity_score_interests,similarity_score_experience,score-model1,score-model2,score-model3
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,Senior Developer,0.976833,1.0,0.414214,0.864309,0.873576,0.882843
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,0.924276,0.414214,0.882843,0.852553,0.822264
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,0.620822,0.641969,0.366025,0.569863,0.578322,0.586781
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,0.646705,0.596432,0.5,0.617364,0.597255,0.577146
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,0.461471,0.496594,0.414214,0.45202,0.466069,0.480118
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,0.453598,0.484687,0.5,0.462878,0.475314,0.48775
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,0.478085,0.396409,0.414214,0.46531,0.43264,0.39997
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,0.406263,0.377443,0.414214,0.407853,0.396325,0.384797
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,0.305344,0.349512,0.414214,0.327118,0.344785,0.362453
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,0.205082,0.297977,0.5,0.264066,0.301224,0.338381


In [1462]:
# Side-by-side view of the final score of each model per role
df_roles.loc[:, ['id','skills', 'explevel','score-model1', 'score-model2', 'score-model3']]

Unnamed: 0,id,skills,explevel,score-model1,score-model2,score-model3
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,0.864309,0.873576,0.882843
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,0.882843,0.852553,0.822264
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,0.569863,0.578322,0.586781
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,0.617364,0.597255,0.577146
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,0.45202,0.466069,0.480118
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,0.462878,0.475314,0.48775
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,0.46531,0.43264,0.39997
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,0.407853,0.396325,0.384797
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,0.327118,0.344785,0.362453
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,0.264066,0.301224,0.338381


# Add Columns for Ordered List

In [1463]:
# Rank the roles within each model: the highest score gets rank 1
for rank_column, score_column in (
    ('rank-model1', 'score-model1'),
    ('rank-model2', 'score-model2'),
    ('rank-model3', 'score-model3'),
):
    df_roles[rank_column] = df_roles[score_column].rank(ascending=False)

# Keep the role description and the ranks; the score columns are dropped here
df_roles = df_roles[['id','skills', 'explevel','name','rank-model1', 'rank-model2', 'rank-model3']]
df_roles


Unnamed: 0,id,skills,explevel,name,rank-model1,rank-model2,rank-model3
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,Senior Developer,2.0,1.0,1.0
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,2.0,2.0
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,4.0,4.0,3.0
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,3.0,3.0,4.0
8,24,"[C#, Kotlin, Python, Rust, Microsoft SQL Serve...",Senior,Senior Software Engineer,7.0,6.0,6.0
2,13,"[HTML/CSS, JavaScript, Python, SQL, MySQL, Pos...",Associate,Full Stack Developer,6.0,5.0,5.0
4,15,"[Python, R, Scala, SQL, MongoDB, MySQL, AWS, G...",Senior,Data Scientist,5.0,7.0,7.0
0,11,"[Bash/Shell, PowerShell, Python, R, SQL, Micro...",Senior,Data Engineer,8.0,8.0,8.0
3,14,"[Bash/Shell, PowerShell, SQL, Microsoft SQL Se...",Senior,Database Administrator,10.0,9.0,9.0
15,41,"[C#, PowerShell, Python, SQL, Cassandra, Maria...",Associate,Data engineer,13.0,13.0,10.0


In [1464]:
# Keep a role when at least one of the three models ranks it in its top 3 (rank below 4)
df_roles = df_roles[
    df_roles[['rank-model1', 'rank-model2', 'rank-model3']].lt(4).any(axis=1)
]
df_roles

Unnamed: 0,id,skills,explevel,name,rank-model1,rank-model2,rank-model3
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,Senior Developer,2.0,1.0,1.0
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,2.0,2.0
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,4.0,4.0,3.0
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,3.0,3.0,4.0


# Export Results

In [1465]:
# Write the shortlisted roles for this person to a CSV file named after the person's id
df_roles.to_csv(path_or_buf=f"../6-results/rating_ITProfessional_{person_id}.csv")
df_roles


Unnamed: 0,id,skills,explevel,name,rank-model1,rank-model2,rank-model3
14,35,"[C, C++, Go, Java, JavaScript, Kotlin, SQL, Sw...",Senior,Senior Developer,2.0,1.0,1.0
10,31,"[Clojure, Go, Java, Python, Microsoft SQL Serv...",Senior,Development manager,1.0,2.0,2.0
9,25,"[C#, Java, JavaScript, Kotlin, Python, Rust, S...",Architect,Software Architect,4.0,4.0,3.0
16,42,"[Kotlin, Python, R, SQL, Cassandra, Elasticsea...",Associate,Data scientist,3.0,3.0,4.0
