# Models for Rating

A single word embedding model is used to model the similarity between skills; the same model is also used for interests.
A score is derived from each Word Mover's Distance (WMD) that is computed, and the final score is a weighted combination of all the scores.

## Import Libraries

In [24]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Load Employees Dataset

In [25]:
# Read the people survey data (StackOverflow dataset)
df_people = pd.read_csv(filepath_or_buffer="../2-data/survey_people.csv", sep=",", encoding="latin1")

# Tokenize the ';'-separated skill strings into lists of skill names
df_people['skills'] = df_people['skills'].apply(lambda x: x.split(';'))
df_people['skills-want'] = df_people['skills-want'].apply(lambda x: x.split(';'))

# Keep only the columns used for rating. The explicit copy makes this an
# independent DataFrame, so the score columns assigned to it later do not
# trigger pandas' SettingWithCopyWarning.
df_people = df_people[['id', 'skills', 'skills-want', 'YearsCodePro']].copy()
df_people



Unnamed: 0,id,skills,skills-want,YearsCodePro
0,1,"[Java, JavaScript, Python, SQL, MySQL, SQLite,...","[Dart, Java, SQL, MySQL, SQLite]",3
1,2,"[C++, HTML/CSS, Java, JavaScript, PHP, Python,...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",4
2,3,"[C++, HTML/CSS, Java, PHP, Python, SQL, MariaD...","[Python, MySQL, SQLite, Unity 3D, Unreal Engine]",4
3,4,"[HTML/CSS, JavaScript, Kotlin, TypeScript, AWS...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",2


# Load Word Embedding Model

In [26]:
# Load the previously trained Word2Vec model from disk; its word vectors
# (model.wv) are what the Word Mover's Distance calls below operate on.
model = Word2Vec.load("../3-word_embedding/model-w2vcombinedfiltered")


# Find People for a Role

## Get Role's Requirements

In [27]:
# Requirements of the role to be filled.
# role_skills: the skill names compared against each person's skills and interests.
role_skills = ['Python', 'Microsoft SQL Server', 'Microsoft Azure']
# Alternative kept for reference: experience expressed in years.
#role_experience = 5 # years
# role_experience is used as a key into the level table inside similarity(),
# so it has to be one of 'Junior', 'Associate', 'Senior' or 'Architect'.
role_experience = 'Associate'

## WMD to get most similar people

In [28]:
# Scaler to normalize scores.
# A single MinMaxScaler instance is shared by the scoring cells; each of them
# calls fit_transform on its own column, so the scaler is refit every time.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

### Score from Skills

In [29]:
# Word Mover's Distance between the role's required skills and each person's skills
similarity_scores = [
    model.wv.wmdistance(role_skills, person_skills)
    for person_skills in df_people['skills']
]

# Store the raw distance for each person (at this point: lower = closer to the role)
df_people['similarity_score_skills'] = similarity_scores

# Largest distance that is still finite
df_temp = df_people[df_people['similarity_score_skills'] < np.inf]
max_score = df_temp['similarity_score_skills'].max()
print(max_score)

# Rows are kept: an infinite distance (no similarity) is capped at the largest finite distance
df_people['similarity_score_skills'] = df_people['similarity_score_skills'].replace(np.inf, max_score)

# Min-max normalize the distances, then flip them into a similarity:
# 0 = most dissimilar, 1 = most similar
df_people['similarity_score_skills'] = 1 - scaler.fit_transform(
    df_people[['similarity_score_skills']]
)

df_people

0.9161147416211604


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills
0,1,"[Java, JavaScript, Python, SQL, MySQL, SQLite,...","[Dart, Java, SQL, MySQL, SQLite]",3,0.204728
1,2,"[C++, HTML/CSS, Java, JavaScript, PHP, Python,...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",4,1.0
2,3,"[C++, HTML/CSS, Java, PHP, Python, SQL, MariaD...","[Python, MySQL, SQLite, Unity 3D, Unreal Engine]",4,0.0
3,4,"[HTML/CSS, JavaScript, Kotlin, TypeScript, AWS...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",2,0.632972


### Score from Interests

In [30]:
# Word Mover's Distance between the role's required skills and each person's
# interests (the skills they want to work with)
similarity_scores = [
    model.wv.wmdistance(role_skills, wanted_skills)
    for wanted_skills in df_people['skills-want']
]

# Store the raw distance for each person (at this point: lower = closer to the role)
df_people['similarity_score_interests'] = similarity_scores

# Largest distance that is still finite
df_temp = df_people[df_people['similarity_score_interests'] < np.inf]
max_score = df_temp['similarity_score_interests'].max()
print(max_score)

# Rows are kept: an infinite distance (no similarity) is capped at the largest finite distance
df_people['similarity_score_interests'] = df_people['similarity_score_interests'].replace(np.inf, max_score)

# Min-max normalize the distances, then flip them into a similarity:
# 0 = most dissimilar, 1 = most similar
df_people['similarity_score_interests'] = 1 - scaler.fit_transform(
    df_people[['similarity_score_interests']]
)

df_people

1.0593094513112602


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests
0,1,"[Java, JavaScript, Python, SQL, MySQL, SQLite,...","[Dart, Java, SQL, MySQL, SQLite]",3,0.204728,0.0
1,2,"[C++, HTML/CSS, Java, JavaScript, PHP, Python,...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",4,1.0,0.375262
2,3,"[C++, HTML/CSS, Java, PHP, Python, SQL, MariaD...","[Python, MySQL, SQLite, Unity 3D, Unreal Engine]",4,0.0,1.0
3,4,"[HTML/CSS, JavaScript, Kotlin, TypeScript, AWS...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",2,0.632972,0.36933


### Score from Experience

In [31]:
# Bin edges, in years, for the experience levels. pd.cut's default intervals
# are closed on the right, so these edges give:
# (-1, 4] -> Junior, (4, 8] -> Associate, (8, 12] -> Senior, (12, 100] -> Architect
bin_edges = [-1, 4, 8, 12, 100]

# One label per interval, ordered from least to most experienced
labels = ['Junior', 'Associate', 'Senior', 'Architect']

# Convert the 'YearsCodePro' column to a categorical experience level.
# NOTE(review): pd.cut leaves values outside the edges (or missing) as NaN, and
# pd.to_numeric raises by default on non-numeric text - confirm the CSV is clean.
df_people['Experience'] = pd.cut(pd.to_numeric(df_people['YearsCodePro']), bins=bin_edges, labels=labels)

# Display the resulting DataFrame
df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience
0,1,"[Java, JavaScript, Python, SQL, MySQL, SQLite,...","[Dart, Java, SQL, MySQL, SQLite]",3,0.204728,0.0,Junior
1,2,"[C++, HTML/CSS, Java, JavaScript, PHP, Python,...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",4,1.0,0.375262,Junior
2,3,"[C++, HTML/CSS, Java, PHP, Python, SQL, MariaD...","[Python, MySQL, SQLite, Unity 3D, Unreal Engine]",4,0.0,1.0,Junior
3,4,"[HTML/CSS, JavaScript, Kotlin, TypeScript, AWS...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",2,0.632972,0.36933,Junior


In [32]:
from scipy.spatial.distance import euclidean

# define a function to compute the similarity metric
def similarity(level1, level2):
    """Return a similarity score in (0, 1] between two experience levels.

    Each level is encoded as a cumulative 0/1 vector (every step up the
    ladder switches one more position to 1). The score is ``1 / (1 + d)``,
    where ``d`` is the Euclidean distance between the two encodings, so
    identical levels score 1.0 and the score shrinks as levels move apart.

    Args:
        level1: 'Junior', 'Associate', 'Senior' or 'Architect'.
        level2: 'Junior', 'Associate', 'Senior' or 'Architect'.

    Raises:
        KeyError: If either argument is not one of the four known levels.
    """
    encodings = {
        'Junior': [0, 0, 0, 0],
        'Associate': [1, 0, 0, 0],
        'Senior': [1, 1, 0, 0],
        'Architect': [1, 1, 1, 0],
    }

    first_vector = encodings[level1]
    second_vector = encodings[level2]

    # Map a distance in [0, inf) onto a similarity in (0, 1]
    return 1 / (1 + euclidean(first_vector, second_vector))


In [33]:

# Score each person's experience level against the level required by the role:
# similarity() is called with role_experience as the first argument and the
# person's 'Experience' label as the second.

df_people['similarity_score_experience'] = df_people["Experience"].apply(lambda x: similarity(role_experience, x))
df_people


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience
0,1,"[Java, JavaScript, Python, SQL, MySQL, SQLite,...","[Dart, Java, SQL, MySQL, SQLite]",3,0.204728,0.0,Junior,0.5
1,2,"[C++, HTML/CSS, Java, JavaScript, PHP, Python,...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",4,1.0,0.375262,Junior,0.5
2,3,"[C++, HTML/CSS, Java, PHP, Python, SQL, MariaD...","[Python, MySQL, SQLite, Unity 3D, Unreal Engine]",4,0.0,1.0,Junior,0.5
3,4,"[HTML/CSS, JavaScript, Kotlin, TypeScript, AWS...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",2,0.632972,0.36933,Junior,0.5


# Present Results from 3 Models

## Model 1 - Compute Final Score

In [34]:
# Model 1 weights: skills and experience only, interests are ignored
weight_skills, weight_interests, weight_experience = 0.8, 0, 0.2

# Final score = weighted combination of the three similarity scores
df_people['score-model1'] = (
    df_people['similarity_score_skills'] * weight_skills
    + df_people['similarity_score_interests'] * weight_interests
    + df_people['similarity_score_experience'] * weight_experience
)

df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1
0,1,"[Java, JavaScript, Python, SQL, MySQL, SQLite,...","[Dart, Java, SQL, MySQL, SQLite]",3,0.204728,0.0,Junior,0.5,0.263783
1,2,"[C++, HTML/CSS, Java, JavaScript, PHP, Python,...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",4,1.0,0.375262,Junior,0.5,0.9
2,3,"[C++, HTML/CSS, Java, PHP, Python, SQL, MariaD...","[Python, MySQL, SQLite, Unity 3D, Unreal Engine]",4,0.0,1.0,Junior,0.5,0.1
3,4,"[HTML/CSS, JavaScript, Kotlin, TypeScript, AWS...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",2,0.632972,0.36933,Junior,0.5,0.606378


## Model 2 - Compute Final Score

In [35]:
# Model 2 weights: skills and interests count equally, plus experience
weight_skills, weight_interests, weight_experience = 0.4, 0.4, 0.2

# Final score = weighted combination of the three similarity scores
df_people['score-model2'] = (
    df_people['similarity_score_skills'] * weight_skills
    + df_people['similarity_score_interests'] * weight_interests
    + df_people['similarity_score_experience'] * weight_experience
)

df_people.head(10)

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1,score-model2
0,1,"[Java, JavaScript, Python, SQL, MySQL, SQLite,...","[Dart, Java, SQL, MySQL, SQLite]",3,0.204728,0.0,Junior,0.5,0.263783,0.181891
1,2,"[C++, HTML/CSS, Java, JavaScript, PHP, Python,...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",4,1.0,0.375262,Junior,0.5,0.9,0.650105
2,3,"[C++, HTML/CSS, Java, PHP, Python, SQL, MariaD...","[Python, MySQL, SQLite, Unity 3D, Unreal Engine]",4,0.0,1.0,Junior,0.5,0.1,0.5
3,4,"[HTML/CSS, JavaScript, Kotlin, TypeScript, AWS...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",2,0.632972,0.36933,Junior,0.5,0.606378,0.500921


## Model 3 - Compute Final Score

In [36]:
# Model 3 weights: interests and experience only, current skills are ignored
weight_skills, weight_interests, weight_experience = 0, 0.8, 0.2

# Final score = weighted combination of the three similarity scores
df_people['score-model3'] = (
    df_people['similarity_score_skills'] * weight_skills
    + df_people['similarity_score_interests'] * weight_interests
    + df_people['similarity_score_experience'] * weight_experience
)

df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1,score-model2,score-model3
0,1,"[Java, JavaScript, Python, SQL, MySQL, SQLite,...","[Dart, Java, SQL, MySQL, SQLite]",3,0.204728,0.0,Junior,0.5,0.263783,0.181891,0.1
1,2,"[C++, HTML/CSS, Java, JavaScript, PHP, Python,...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",4,1.0,0.375262,Junior,0.5,0.9,0.650105,0.40021
2,3,"[C++, HTML/CSS, Java, PHP, Python, SQL, MariaD...","[Python, MySQL, SQLite, Unity 3D, Unreal Engine]",4,0.0,1.0,Junior,0.5,0.1,0.5,0.9
3,4,"[HTML/CSS, JavaScript, Kotlin, TypeScript, AWS...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",2,0.632972,0.36933,Junior,0.5,0.606378,0.500921,0.395464


In [37]:
# Keep only the identifying columns and the three final model scores; the
# intermediate similarity columns and 'Experience' are dropped here.
df_people = df_people[['id','skills','skills-want', 'YearsCodePro', 'score-model1', 'score-model2', 'score-model3']]
df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,score-model1,score-model2,score-model3
0,1,"[Java, JavaScript, Python, SQL, MySQL, SQLite,...","[Dart, Java, SQL, MySQL, SQLite]",3,0.263783,0.181891,0.1
1,2,"[C++, HTML/CSS, Java, JavaScript, PHP, Python,...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",4,0.9,0.650105,0.40021
2,3,"[C++, HTML/CSS, Java, PHP, Python, SQL, MariaD...","[Python, MySQL, SQLite, Unity 3D, Unreal Engine]",4,0.1,0.5,0.9
3,4,"[HTML/CSS, JavaScript, Kotlin, TypeScript, AWS...","[HTML/CSS, JavaScript, TypeScript, AWS, Fireba...",2,0.606378,0.500921,0.395464


# See Results

In [61]:
print("Model 1")
# Rank everyone by the Model 1 score, best first (df_people itself stays re-sorted)
df_people = df_people.sort_values('score-model1', ascending=False)
# The top-ranked row is the suggestion
suggestion = df_people.iloc[:1]

print("The suggestion from Model 1 is employee", suggestion['id'].values, ".")
print("This professional has experience with", suggestion['skills'].values, ".")
print("And is particularly interested in working with", suggestion['skills-want'].values)


Model 1
The suggestion from Model 1 is employee [2] .
This professional has experience with [list(['C++', 'HTML/CSS', 'Java', 'JavaScript', 'PHP', 'Python', 'TypeScript', 'AWS', 'Firebase', 'Google Cloud', 'Next.js', 'Node.js', 'React.js', 'Vue.js', 'Homebrew', 'npm', 'Yarn'])] .
And is particularly interested in working with [list(['HTML/CSS', 'JavaScript', 'TypeScript', 'AWS', 'Firebase', 'Google Cloud', 'Next.js', 'Node.js', 'React.js', 'Vue.js', 'Flutter', 'Homebrew', 'npm', 'Yarn'])]


In [63]:
print("Model 2")
# Rank everyone by the Model 2 score, best first (df_people itself stays re-sorted)
df_people = df_people.sort_values('score-model2', ascending=False)
# The top-ranked row is the suggestion
suggestion = df_people.iloc[:1]

print("The suggestion from Model 2 is employee", suggestion['id'].values, ".")
print("This professional has experience with", suggestion['skills'].values, ".")
print("And is particularly interested in working with", suggestion['skills-want'].values)

Model 2
The suggestion from Model 2 is employee [2] .
This professional has experience with [list(['C++', 'HTML/CSS', 'Java', 'JavaScript', 'PHP', 'Python', 'TypeScript', 'AWS', 'Firebase', 'Google Cloud', 'Next.js', 'Node.js', 'React.js', 'Vue.js', 'Homebrew', 'npm', 'Yarn'])] .
And is particularly interested in working with [list(['HTML/CSS', 'JavaScript', 'TypeScript', 'AWS', 'Firebase', 'Google Cloud', 'Next.js', 'Node.js', 'React.js', 'Vue.js', 'Flutter', 'Homebrew', 'npm', 'Yarn'])]


In [64]:
print("Model 3")
# Rank everyone by the Model 3 score, best first (df_people itself stays re-sorted)
df_people = df_people.sort_values('score-model3', ascending=False)
# The top-ranked row is the suggestion
suggestion = df_people.iloc[:1]

print("The suggestion from Model 3 is employee", suggestion['id'].values, ".")
print("This professional has experience with", suggestion['skills'].values, ".")
print("And is particularly interested in working with", suggestion['skills-want'].values)

Model 3
The suggestion from Model 3 is employee [3] .
This professional has experience with [list(['C++', 'HTML/CSS', 'Java', 'PHP', 'Python', 'SQL', 'MariaDB', 'MySQL', 'Oracle', 'SQLite', 'VMware', 'Unity 3D', 'Unreal Engine'])] .
And is particularly interested in working with [list(['Python', 'MySQL', 'SQLite', 'Unity 3D', 'Unreal Engine'])]


# Other Metrics

In [17]:

# For each matched person, show % of skills matched and % of interests matched


# Also show the % of skills matched and the % of skills not matched
# and for the matched skills, show the % that is only for skills and only for interests

# Skills the metric helpers below compare against. This binds the same list
# object as role_skills (no copy is made), and the helpers read it as a global.
skills_to_compare = role_skills

# Define a function to compute the percentage of skills that are mentioned in the list of skills and also in the DataFrame column skills
def compute_matching_percentage(row, column='skills', required_skills=None):
    """Return the fraction of required skills that appear in ``row[column]``.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            ``column`` entry is an iterable of skill names.
        column: Key of the skill collection to inspect.
        required_skills: Skills to look for. When omitted, the module-level
            ``skills_to_compare`` is used.

    Returns:
        A float between 0 and 1: matched distinct required skills divided by
        the number of distinct required skills. Returns 0.0 when there are no
        required skills at all.
    """
    if required_skills is None:
        required_skills = skills_to_compare

    # Work on distinct skills so a requirement listed twice is not counted
    # twice in the denominator while being matched only once.
    required = set(required_skills)
    if not required:
        # Nothing to match against; avoid dividing by zero.
        return 0.0

    matched_skills = set(row[column]) & required
    return len(matched_skills) / len(required)

# Get matching skills
def compute_matching_list(row, column='skills', required_skills=None):
    """Return the required skills that appear in ``row[column]``.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            ``column`` entry is an iterable of skill names.
        column: Key of the skill collection to inspect.
        required_skills: Skills to look for. When omitted, the module-level
            ``skills_to_compare`` is used.

    Returns:
        A ``set`` (despite the function name) holding the skills present in
        both collections; empty when nothing matches.
    """
    if required_skills is None:
        required_skills = skills_to_compare

    return set(row[column]) & set(required_skills)

# Get non matching skills
def compute_non_matching_list(row, column='skills', required_skills=None):
    """Return the required skills that are missing from ``row[column]``.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            ``column`` entry is an iterable of skill names.
        column: Key of the skill collection to inspect.
        required_skills: Skills to look for. When omitted, the module-level
            ``skills_to_compare`` is used.

    Returns:
        A ``set`` (despite the function name) holding every required skill
        not found in ``row[column]``; empty when all of them are present.
    """
    if required_skills is None:
        required_skills = skills_to_compare

    return set(required_skills) - set(row[column])

# Define a function to compute the percentage of skills that are not mentioned in the DataFrame column skills
def compute_non_matching_percentage(row, column='skills', required_skills=None):
    """Return the fraction of required skills missing from ``row[column]``.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            ``column`` entry is an iterable of skill names.
        column: Key of the skill collection to inspect.
        required_skills: Skills to look for. When omitted, the module-level
            ``skills_to_compare`` is used.

    Returns:
        A float between 0 and 1: missing distinct required skills divided by
        the number of distinct required skills. Returns 0.0 when there are no
        required skills at all.
    """
    if required_skills is None:
        required_skills = skills_to_compare

    # Work on distinct skills so a requirement listed twice is not counted
    # twice in the denominator while being missed only once.
    required = set(required_skills)
    if not required:
        # Nothing is required, so nothing can be missing; avoid dividing by zero.
        return 0.0

    non_matched_skills = required - set(row[column])
    return len(non_matched_skills) / len(required)

# Row-wise metrics: the column to inspect is forwarded to each helper as a keyword argument

# Share of the role's skills covered / not covered by what the person already knows
df_people['matching_percentage_skills'] = df_people.apply(compute_matching_percentage, axis=1, column='skills')
df_people['non_matching_percentage_skills'] = df_people.apply(compute_non_matching_percentage, axis=1, column='skills')

# Same shares, measured against what the person wants to work with
df_people['matching_percentage_interests'] = df_people.apply(compute_matching_percentage, axis=1, column='skills-want')
df_people['non_matching_percentage_interests'] = df_people.apply(compute_non_matching_percentage, axis=1, column='skills-want')

# The actual skill names behind the skills percentages
df_people['matching_list_skills'] = df_people.apply(compute_matching_list, axis=1, column='skills')
df_people['non_matching_list_skills'] = df_people.apply(compute_non_matching_list, axis=1, column='skills')

# The actual skill names behind the interests percentages
# NOTE(review): the last column name is spelled 'insterests'; it is kept as is
# because renaming it would change the resulting DataFrame's schema.
df_people['matching_list_interests'] = df_people.apply(compute_matching_list, axis=1, column='skills-want')
df_people['non_matching_list_insterests'] = df_people.apply(compute_non_matching_list, axis=1, column='skills-want')


# Display the resulting DataFrame
df_people



Unnamed: 0,id,skills,skills-want,Experience,score-model1,score-model2,score-model3,matching_percentage_skills,non_matching_percentage_skills,matching_percentage_interests,non_matching_percentage_interests,matching_list_skills,non_matching_list_skills,matching_list_interests,non_matching_list_insterests
46618,49361,"[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...","[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...",Architect,0.871817,0.875952,0.87733,0.666667,0.333333,0.666667,0.333333,"{Microsoft SQL Server, Python}",{Microsoft Azure},"{Microsoft SQL Server, Python}",{Microsoft Azure}
43159,45696,"[Bash/Shell, HTML/CSS, JavaScript, PHP, Python...","[Bash/Shell, Go, HTML/CSS, JavaScript, PHP, Py...",Architect,0.882843,0.864678,0.858623,0.333333,0.666667,0.333333,0.666667,{Python},"{Microsoft SQL Server, Microsoft Azure}",{Python},"{Microsoft SQL Server, Microsoft Azure}"
33415,35298,"[Go, HTML/CSS, JavaScript, PHP, TypeScript, Dy...","[Elixir, Go, JavaScript, TypeScript, Elasticse...",Architect,0.729396,0.692102,0.679671,0.0,1.0,0.0,1.0,{},"{Microsoft SQL Server, Microsoft Azure, Python}",{},"{Microsoft SQL Server, Microsoft Azure, Python}"
13469,14095,"[C, Go, Java, Python, SQL, SQLite, , Flask]","[C, Go, Java, Lua, Perl, Python, SQL, MySQL, P...",Junior,0.637075,0.611765,0.603329,0.333333,0.666667,0.333333,0.666667,{Python},"{Microsoft SQL Server, Microsoft Azure}",{Python},"{Microsoft SQL Server, Microsoft Azure}"
63882,68104,"[Bash/Shell, C, C#, F#, Java, JavaScript, OCam...","[C#, Elixir, F#, Haskell, OCaml, PHP, Rust, SQ...",Senior,0.1,0.1,0.1,0.0,1.0,0.0,1.0,{},"{Microsoft SQL Server, Microsoft Azure, Python}",{},"{Microsoft SQL Server, Microsoft Azure, Python}"


In [35]:
# Skills recorded for the person with id 22519 (an empty result means no row has that id)
list_skills = df_people.loc[df_people['id'] == 22519, 'skills']
print(list_skills.values)

[]
