# Models for ranking

One single word embedding model is used for modelling the similarities between skills, which is used also for interests.
A score is calculated from each Word Mover's Distance computed and a final score is calculated as a weighted combination of all scores.

## Import Libraries

In [1]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Load Employees Dataset

In [2]:
# Read the StackOverflow dataset
df_people = pd.read_csv(filepath_or_buffer="../2-data/employees.csv", sep=",", encoding="latin1")

# Tokenize the skills
df_people['skills'] = df_people['skills'].apply(lambda x: x.split(';'))

df_people = df_people[['id', 'skills', 'skills-want', 'YearsCodePro']]
df_people



Unnamed: 0,id,skills,skills-want,YearsCodePro
0,2,"[JavaScript, TypeScript]",Rust;TypeScript,0
1,3,"[C#, C++, HTML/CSS, JavaScript, Python, Micros...",C#;C++;HTML/CSS;JavaScript;TypeScript;Microsof...,5
2,4,"[C#, JavaScript, SQL, TypeScript, Microsoft SQ...",C#;SQL;TypeScript;Microsoft SQL Server;;ASP.NE...,17
3,5,"[C#, HTML/CSS, JavaScript, SQL, Swift, TypeScr...",C#;Elixir;F#;Go;JavaScript;Rust;TypeScript;Clo...,3
4,6,"[C++, Lua, , , , , Homebrew]",Lua;;;;;Homebrew,0
...,...,...,...,...
68545,73264,"[Bash/Shell, Dart, JavaScript, PHP, Python, SQ...",Bash/Shell;Go;JavaScript;Python;SQL;TypeScript...,5
68546,73265,"[Bash/Shell, HTML/CSS, JavaScript, Python, SQL...",HTML/CSS;JavaScript;Python;Elasticsearch;Neo4j...,5
68547,73266,"[HTML/CSS, JavaScript, PHP, Python, SQL, Maria...",C#;HTML/CSS;JavaScript;PHP;Python;SQL;MariaDB;...,33
68548,73267,"[C#, Delphi, VBA, Microsoft SQL Server, MongoD...",Delphi,31


# Load Word Embedding Model

In [3]:
model = Word2Vec.load("../3-word_embedding/model-w2vcombined")


# Prepare StackOverflow Dataset as available people

In [4]:

# Get only a sample
df_people = df_people.sample(5)

# Find People to a Role

## Get Role's Requirements

In [5]:
# Get the role requirements
role_skills = ['Python', 'Microsoft SQL Server', 'Microsoft Azure']
#role_experience = 5 # years
role_experience = 'Associate'

## WMD to get most similar people

In [6]:
# Normalize scores
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

### Score from Skills

In [7]:
# Calculate the distance between the person's skills and the role skills
similarity_scores = []
for skills in df_people['skills']:
    similarity_scores.append(model.wv.wmdistance(role_skills, skills))
    #print('---> ')
    #print(role_skills)
    #print(skills)
    #print(model.wv.wmdistance(role_skills, skills))

# Add the dissimilarity score to each role
df_people['similarity_score_skills'] = similarity_scores


# Get the highest distance before inf
df_temp = df_people.loc[(df_people['similarity_score_skills'] < np.inf)]
max_score = df_temp['similarity_score_skills'].max()
print(max_score)

# Remove rows where the score is inf - meaning there is no similarity
df_people['similarity_score_skills'] = df_people['similarity_score_skills'].replace(np.inf, max_score)

#df = df.loc[(df['similarity_score_skills'] < np.inf)]

# Normalize scores and convert distance to similarity - 0 = more dissimilar, 1 = more similar
df_people['similarity_score_skills'] = 1 - scaler.fit_transform(df_people[['similarity_score_skills']])

# Sort the roles by similarity score
df_people = df_people.sort_values(by=['similarity_score_skills'], ascending=False)

df_people

0.9161159129693738


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills
43159,45696,"[Bash/Shell, HTML/CSS, JavaScript, PHP, Python...",Bash/Shell;Go;HTML/CSS;JavaScript;PHP;Python;S...,15,1.0
46618,49361,"[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...",C#;HTML/CSS;Python;SQL;Microsoft SQL Server;;;...,29,0.9862177
33415,35298,"[Go, HTML/CSS, JavaScript, PHP, TypeScript, Dy...",Elixir;Go;JavaScript;TypeScript;Elasticsearch;...,15,0.8081919
13469,14095,"[C, Go, Java, Python, SQL, SQLite, , Flask]",C;Go;Java;Lua;Perl;Python;SQL;MySQL;PostgreSQL...,0,0.671344
63882,68104,"[Bash/Shell, C, C#, F#, Java, JavaScript, OCam...",C#;Elixir;F#;Haskell;OCaml;PHP;Rust;SQL;VBA;Ma...,12,-4.440892e-16


### Score from Interests

In [8]:
# Tokenize the skills
df_people['skills-want'] = df_people['skills-want'].apply(lambda x: x.split(';'))

# Calculate the similarity between the person's skills and the role skills
similarity_scores = []
for skills in df_people['skills-want']:
    similarity_scores.append(model.wv.wmdistance(role_skills, skills))

# Add the similarity scores to each role
df_people['similarity_score_interests'] = similarity_scores

# Get the highest distance before inf
df_temp = df_people.loc[(df_people['similarity_score_interests'] < np.inf)]
max_score = df_temp['similarity_score_interests'].max()
print(max_score)

# Remove rows where the score is inf - meaning there is no similarity
df_people['similarity_score_interests'] = df_people['similarity_score_interests'].replace(np.inf, max_score)

#df = df.loc[(df['similarity_score_interests'] < np.inf)]

# Normalize scores and convert distance to similarity - 0 = more dissimilar, 1 = more similar
df_people['similarity_score_interests'] =  1 - scaler.fit_transform(df_people[['similarity_score_interests']])

# Sort the roles by similarity score
df_people = df_people.sort_values(by=['similarity_score_interests'], ascending=False)

df_people

0.860582452221455


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests
46618,49361,"[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...","[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...",29,0.9862177,1.0
43159,45696,"[Bash/Shell, HTML/CSS, JavaScript, PHP, Python...","[Bash/Shell, Go, HTML/CSS, JavaScript, PHP, Py...",15,1.0,0.9394514
33415,35298,"[Go, HTML/CSS, JavaScript, PHP, TypeScript, Dy...","[Elixir, Go, JavaScript, TypeScript, Elasticse...",15,0.8081919,0.6838789
13469,14095,"[C, Go, Java, Python, SQL, SQLite, , Flask]","[C, Go, Java, Lua, Perl, Python, SQL, MySQL, P...",0,0.671344,0.5869777
63882,68104,"[Bash/Shell, C, C#, F#, Java, JavaScript, OCam...","[C#, Elixir, F#, Haskell, OCaml, PHP, Rust, SQ...",12,-4.440892e-16,8.881784e-16


### Score from Experience

In [9]:


# define the bin edges for each level of experience
bin_edges = [-1, 4, 8, 12, 100]

# define the labels for each level of experience
labels = ['Junior', 'Associate', 'Senior', 'Architect']

# convert the 'years' column to categorical levels of experience
df_people['Experience'] = pd.cut(pd.to_numeric(df_people['YearsCodePro']), bins=bin_edges, labels=labels)

# print the resulting DataFrame
df_people

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience
46618,49361,"[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...","[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...",29,0.9862177,1.0,Architect
43159,45696,"[Bash/Shell, HTML/CSS, JavaScript, PHP, Python...","[Bash/Shell, Go, HTML/CSS, JavaScript, PHP, Py...",15,1.0,0.9394514,Architect
33415,35298,"[Go, HTML/CSS, JavaScript, PHP, TypeScript, Dy...","[Elixir, Go, JavaScript, TypeScript, Elasticse...",15,0.8081919,0.6838789,Architect
13469,14095,"[C, Go, Java, Python, SQL, SQLite, , Flask]","[C, Go, Java, Lua, Perl, Python, SQL, MySQL, P...",0,0.671344,0.5869777,Junior
63882,68104,"[Bash/Shell, C, C#, F#, Java, JavaScript, OCam...","[C#, Elixir, F#, Haskell, OCaml, PHP, Rust, SQ...",12,-4.440892e-16,8.881784e-16,Senior


In [10]:
from scipy.spatial.distance import euclidean

# define a function to compute the similarity metric
def similarity(level1, level2):
    # define the vectors representing the levels of experience
    levels = {'Junior': [0, 0, 0, 0],
              'Associate': [1, 0, 0, 0],
              'Senior': [1, 1, 0, 0],
              'Architect': [1, 1, 1, 0]}
    
    # compute the Euclidean distance between the two vectors
    distance = euclidean(levels[level1], levels[level2])
    
    # return the similarity metric
    return 1 / (1 + distance)

# example usage of the similarity function
#print(similarity('Junior', 'Architect'))

In [11]:

# use function to compute the similarity based on level

df_people['similarity_score_experience'] = df_people["Experience"].apply(lambda x: similarity(role_experience, x))
df_people


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience
46618,49361,"[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...","[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...",29,0.9862177,1.0,Architect,0.414214
43159,45696,"[Bash/Shell, HTML/CSS, JavaScript, PHP, Python...","[Bash/Shell, Go, HTML/CSS, JavaScript, PHP, Py...",15,1.0,0.9394514,Architect,0.414214
33415,35298,"[Go, HTML/CSS, JavaScript, PHP, TypeScript, Dy...","[Elixir, Go, JavaScript, TypeScript, Elasticse...",15,0.8081919,0.6838789,Architect,0.414214
13469,14095,"[C, Go, Java, Python, SQL, SQLite, , Flask]","[C, Go, Java, Lua, Perl, Python, SQL, MySQL, P...",0,0.671344,0.5869777,Junior,0.5
63882,68104,"[Bash/Shell, C, C#, F#, Java, JavaScript, OCam...","[C#, Elixir, F#, Haskell, OCaml, PHP, Rust, SQ...",12,-4.440892e-16,8.881784e-16,Senior,0.5


In [12]:


# Calculate score based on the absolute difference between values
#df['similarity_score_experience'] = df["YearsCodePro"].apply(lambda x: abs(role_experience - int(x)))

# Normalize scores
#df['similarity_score_experience'] =  1 - scaler.fit_transform(df[['similarity_score_experience']])

#df

# Present Results from 3 Models

## Model 1 - Compute Final Score

In [13]:
# Set weights
weight_skills = 0.8
weight_interests = 0
weight_experience = 0.2

# Calculate final score
df_people['score-model1'] = df_people['similarity_score_skills'] * weight_skills + df_people['similarity_score_interests'] * weight_interests + df_people['similarity_score_experience'] * weight_experience

# Get only final columns
#df = df[['id', 'skills', 'score']]

# Sort roles
df_people = df_people.sort_values(by=['score-model1'], ascending=False)

# See top 10 matches
df_people.head(10)

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1
43159,45696,"[Bash/Shell, HTML/CSS, JavaScript, PHP, Python...","[Bash/Shell, Go, HTML/CSS, JavaScript, PHP, Py...",15,1.0,0.9394514,Architect,0.414214,0.882843
46618,49361,"[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...","[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...",29,0.9862177,1.0,Architect,0.414214,0.871817
33415,35298,"[Go, HTML/CSS, JavaScript, PHP, TypeScript, Dy...","[Elixir, Go, JavaScript, TypeScript, Elasticse...",15,0.8081919,0.6838789,Architect,0.414214,0.729396
13469,14095,"[C, Go, Java, Python, SQL, SQLite, , Flask]","[C, Go, Java, Lua, Perl, Python, SQL, MySQL, P...",0,0.671344,0.5869777,Junior,0.5,0.637075
63882,68104,"[Bash/Shell, C, C#, F#, Java, JavaScript, OCam...","[C#, Elixir, F#, Haskell, OCaml, PHP, Rust, SQ...",12,-4.440892e-16,8.881784e-16,Senior,0.5,0.1


## Model 2 - Compute final score

In [14]:
# Set weights
weight_skills = 0.5
weight_interests = 0.3
weight_experience = 0.2

# Calculate final score
df_people['score-model2'] = df_people['similarity_score_skills'] * weight_skills + df_people['similarity_score_interests'] * weight_interests + df_people['similarity_score_experience'] * weight_experience

# Get only final columns
#df = df[['id', 'skills', 'score']]

# Sort roles
df_people = df_people.sort_values(by=['score-model2'], ascending=False)

# See top 10 matches
df_people.head(10)

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1,score-model2
46618,49361,"[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...","[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...",29,0.9862177,1.0,Architect,0.414214,0.871817,0.875952
43159,45696,"[Bash/Shell, HTML/CSS, JavaScript, PHP, Python...","[Bash/Shell, Go, HTML/CSS, JavaScript, PHP, Py...",15,1.0,0.9394514,Architect,0.414214,0.882843,0.864678
33415,35298,"[Go, HTML/CSS, JavaScript, PHP, TypeScript, Dy...","[Elixir, Go, JavaScript, TypeScript, Elasticse...",15,0.8081919,0.6838789,Architect,0.414214,0.729396,0.692102
13469,14095,"[C, Go, Java, Python, SQL, SQLite, , Flask]","[C, Go, Java, Lua, Perl, Python, SQL, MySQL, P...",0,0.671344,0.5869777,Junior,0.5,0.637075,0.611765
63882,68104,"[Bash/Shell, C, C#, F#, Java, JavaScript, OCam...","[C#, Elixir, F#, Haskell, OCaml, PHP, Rust, SQ...",12,-4.440892e-16,8.881784e-16,Senior,0.5,0.1,0.1


## Model 3 - Compute Final Score

In [15]:
# Set weights
weight_skills = 0.4
weight_interests = 0.4
weight_experience = 0.2

# Calculate final score
df_people['score-model3'] = df_people['similarity_score_skills'] * weight_skills + df_people['similarity_score_interests'] * weight_interests + df_people['similarity_score_experience'] * weight_experience

# Get only final columns
#df = df[['id', 'skills', 'score']]

# Sort roles
df_people = df_people.sort_values(by=['score-model3'], ascending=False)

# See top 10 matches
df_people.head(10)

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score-model1,score-model2,score-model3
46618,49361,"[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...","[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...",29,0.9862177,1.0,Architect,0.414214,0.871817,0.875952,0.87733
43159,45696,"[Bash/Shell, HTML/CSS, JavaScript, PHP, Python...","[Bash/Shell, Go, HTML/CSS, JavaScript, PHP, Py...",15,1.0,0.9394514,Architect,0.414214,0.882843,0.864678,0.858623
33415,35298,"[Go, HTML/CSS, JavaScript, PHP, TypeScript, Dy...","[Elixir, Go, JavaScript, TypeScript, Elasticse...",15,0.8081919,0.6838789,Architect,0.414214,0.729396,0.692102,0.679671
13469,14095,"[C, Go, Java, Python, SQL, SQLite, , Flask]","[C, Go, Java, Lua, Perl, Python, SQL, MySQL, P...",0,0.671344,0.5869777,Junior,0.5,0.637075,0.611765,0.603329
63882,68104,"[Bash/Shell, C, C#, F#, Java, JavaScript, OCam...","[C#, Elixir, F#, Haskell, OCaml, PHP, Rust, SQ...",12,-4.440892e-16,8.881784e-16,Senior,0.5,0.1,0.1,0.1


# See Results

In [16]:
df_people = df_people[['id','skills','skills-want', 'Experience', 'score-model1', 'score-model2', 'score-model3']]
df_people.head(10)

Unnamed: 0,id,skills,skills-want,Experience,score-model1,score-model2,score-model3
46618,49361,"[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...","[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...",Architect,0.871817,0.875952,0.87733
43159,45696,"[Bash/Shell, HTML/CSS, JavaScript, PHP, Python...","[Bash/Shell, Go, HTML/CSS, JavaScript, PHP, Py...",Architect,0.882843,0.864678,0.858623
33415,35298,"[Go, HTML/CSS, JavaScript, PHP, TypeScript, Dy...","[Elixir, Go, JavaScript, TypeScript, Elasticse...",Architect,0.729396,0.692102,0.679671
13469,14095,"[C, Go, Java, Python, SQL, SQLite, , Flask]","[C, Go, Java, Lua, Perl, Python, SQL, MySQL, P...",Junior,0.637075,0.611765,0.603329
63882,68104,"[Bash/Shell, C, C#, F#, Java, JavaScript, OCam...","[C#, Elixir, F#, Haskell, OCaml, PHP, Rust, SQ...",Senior,0.1,0.1,0.1


# Other Metrics

In [17]:

# For each matched person, show % of skills matched and % of interests matched


# Also show the % of skills matched and the % of skills not matched
# and for the matched skills, show the % that is only for skills and only for interests

# Define the list of skills to compare
skills_to_compare = role_skills

# Define a function to compute the percentage of skills that are mentioned in the list of skills and also in the DataFrame column skills
def compute_matching_percentage(row, column='skills'):
    matched_skills = set(row[column]) & set(skills_to_compare)
    return len(matched_skills) / len(skills_to_compare)

# Get matching skills
def compute_matching_list(row, column='skills'):
    matched_skills = set(row[column]) & set(skills_to_compare)
    return matched_skills

# Get non matching skills
def compute_non_matching_list(row, column='skills'):
    non_matched_skills = set(skills_to_compare) - set(row[column])
    return non_matched_skills

# Define a function to compute the percentage of skills that are not mentioned in the DataFrame column skills
def compute_non_matching_percentage(row, column='skills'):
    non_matched_skills = set(skills_to_compare) - set(row[column])
    return len(non_matched_skills) / len(skills_to_compare)

# Apply the functions to each row of the DataFrame
df_people['matching_percentage_skills'] = df_people.apply(lambda x: compute_matching_percentage(x,'skills'), axis=1)
df_people['non_matching_percentage_skills'] = df_people.apply(lambda x: compute_non_matching_percentage(x,'skills'), axis=1)

df_people['matching_percentage_interests'] = df_people.apply(lambda x: compute_matching_percentage(x,'skills-want'), axis=1)
df_people['non_matching_percentage_interests'] = df_people.apply(lambda x: compute_non_matching_percentage(x,'skills-want'), axis=1)

df_people['matching_list_skills'] = df_people.apply(lambda x: compute_matching_list(x,'skills'), axis=1)
df_people['non_matching_list_skills'] = df_people.apply(lambda x: compute_non_matching_list(x,'skills'), axis=1)

df_people['matching_list_interests'] = df_people.apply(lambda x: compute_matching_list(x,'skills-want'), axis=1)
df_people['non_matching_list_insterests'] = df_people.apply(lambda x: compute_non_matching_list(x,'skills-want'), axis=1)


# Print the resulting DataFrame
df_people



Unnamed: 0,id,skills,skills-want,Experience,score-model1,score-model2,score-model3,matching_percentage_skills,non_matching_percentage_skills,matching_percentage_interests,non_matching_percentage_interests,matching_list_skills,non_matching_list_skills,matching_list_interests,non_matching_list_insterests
46618,49361,"[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...","[C#, HTML/CSS, Python, SQL, Microsoft SQL Serv...",Architect,0.871817,0.875952,0.87733,0.666667,0.333333,0.666667,0.333333,"{Microsoft SQL Server, Python}",{Microsoft Azure},"{Microsoft SQL Server, Python}",{Microsoft Azure}
43159,45696,"[Bash/Shell, HTML/CSS, JavaScript, PHP, Python...","[Bash/Shell, Go, HTML/CSS, JavaScript, PHP, Py...",Architect,0.882843,0.864678,0.858623,0.333333,0.666667,0.333333,0.666667,{Python},"{Microsoft SQL Server, Microsoft Azure}",{Python},"{Microsoft SQL Server, Microsoft Azure}"
33415,35298,"[Go, HTML/CSS, JavaScript, PHP, TypeScript, Dy...","[Elixir, Go, JavaScript, TypeScript, Elasticse...",Architect,0.729396,0.692102,0.679671,0.0,1.0,0.0,1.0,{},"{Microsoft SQL Server, Microsoft Azure, Python}",{},"{Microsoft SQL Server, Microsoft Azure, Python}"
13469,14095,"[C, Go, Java, Python, SQL, SQLite, , Flask]","[C, Go, Java, Lua, Perl, Python, SQL, MySQL, P...",Junior,0.637075,0.611765,0.603329,0.333333,0.666667,0.333333,0.666667,{Python},"{Microsoft SQL Server, Microsoft Azure}",{Python},"{Microsoft SQL Server, Microsoft Azure}"
63882,68104,"[Bash/Shell, C, C#, F#, Java, JavaScript, OCam...","[C#, Elixir, F#, Haskell, OCaml, PHP, Rust, SQ...",Senior,0.1,0.1,0.1,0.0,1.0,0.0,1.0,{},"{Microsoft SQL Server, Microsoft Azure, Python}",{},"{Microsoft SQL Server, Microsoft Azure, Python}"


In [35]:
list_skills = df_people[df_people['id']==22519]['skills']
print(list_skills.values)

[]
