# Models for ranking

A single word embedding model is used to model the similarity between skills; the same model is also used for interests.
Each computed Word Mover's Distance is converted into a score, and the final score is a weighted combination of all scores.

## Import Libraries

In [797]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Enter Person's Profile

In [798]:
# Get the person's profile
person_id = 8

# Manual setting
#person_skills = ['Python', 'Microsoft SQL Server', 'Microsoft Azure']
#person_interests = ['Python']
#person_experience = 5 # years


def _split_tokens(raw):
    """Split a ';'-separated survey answer into a list of non-empty tokens.

    A missing answer (NaN from an empty CSV cell) yields an empty list, and
    empty tokens are dropped in the same way as for the role skills.
    """
    if pd.isna(raw):
        return []
    return [token for token in raw.split(';') if token != '']


# Get the profile from the collected data from the survey
employee = pd.read_csv(filepath_or_buffer="../2-data/survey_people.csv", sep=",", encoding="latin1")
employee = employee[employee['id'] == person_id]
if employee.empty:
    # Fail with an explicit message instead of an IndexError from .values[0]
    raise ValueError(f"No survey respondent with id {person_id}")

# When several rows share the id, the first one is used
person_skills = _split_tokens(employee['skills'].values[0])
person_interests = _split_tokens(employee['skills-want'].values[0])
person_experience = employee['YearsCodePro'].values[0]

print(person_id)
print(person_skills)
print(person_interests)
print(person_experience)


8
['Bash/Shell', 'C#', 'C++', 'Java', 'PowerShell', 'Python', 'R', 'Scala', 'SQL', 'Swift', 'MariaDB', 'Microsoft SQL Server', 'MongoDB', 'MySQL', 'SQLite', 'Microsoft Azure', 'jQuery', 'Ruby on Rails', 'Apache Kafka', 'Apache Spark', 'Hadoop', 'NumPy', 'Pandas', 'Docker', 'Kubernetes', 'npm']
['C#', 'Python', 'SQL', 'Cassandra', 'MariaDB', 'Microsoft SQL Server', 'MySQL', 'Microsoft Azure', 'Angular', 'Apache Spark', 'Hadoop', 'NumPy', 'Pandas', 'Docker', 'npm', 'Terraform']
6


# Load IT Roles Dataset

In [799]:
# Load the filtered Skill2Vec roles
df_roles = pd.read_csv("../2-data/ITroles.csv", sep=",", encoding="latin1")

# Turn each ';'-separated skills string into a list, dropping empty tokens
df_roles['skills'] = df_roles['skills'].apply(
    lambda raw_skills: [token for token in raw_skills.split(';') if token != '']
)

# Keep only the role id and its skill list
df_roles = df_roles[['id', 'skills']]
df_roles



Unnamed: 0,id,skills
0,19805,"[diploma, machining, cnc m, mould, conventiona..."
1,80208,"[Compensation, Benefits, HR Functions, Alm, Pa..."
2,122729,"[Simulink, stateflow, Matlab developer, target..."
3,4772,"[gis, analysis, geographic_information_system,..."
4,44923,"[Full Stack Developer, AngularJS, SaaS applica..."
...,...,...
10353,91663,"[customer interaction, knowledge, java, androi..."
10354,86050,"[Technical Management, Project Management, MS ..."
10355,54515,"[XCode, IOS, Objective C, Project Management]"
10356,36160,"[Director, NoSQL, Node.js, CTO, SQL, JIRA, Agi..."


# Load Word Embedding Model

In [800]:
# Load the previously saved Word2Vec model; its word vectors (model.wv) are
# used below to compute the Word Mover's Distance for skills and interests
model = Word2Vec.load("../3-word_embedding/model-w2vcombinedfiltered")

# Sample the roles dataset as available roles

In [801]:
# Draw 5 random roles to act as the available roles.
# NOTE(review): no random_state is passed, so every run draws a different sample.
df_roles = df_roles.sample(n=5)
df_roles

Unnamed: 0,id,skills
10144,128248,"[Integration Server, Trading Networks, MWS, We..."
1230,60682,"[Bca, MCA, Languages, Database Architecture, S..."
5545,64177,"[MVVM, waterfall, C#, SQL, MVC, Agile, .Net, S..."
4165,119028,"[Email, Unix, Perl, Java, PLSQL]"
8808,126516,"[JUnit, Selenium, SQL, Automation Testing, Tes..."


# Compute Similarity Scores

In [802]:
# Create the min-max scaler used to normalize the scores; it is re-fitted
# (fit_transform) on each distance column in the cells below
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

## Score from Skills

In [803]:
# Word Mover's Distance between the person's skills and each role's skills
# (a distance: lower means more similar)
distances = df_roles['skills'].apply(
    lambda role_skills: model.wv.wmdistance(person_skills, role_skills)
).astype(float)

# Get the highest finite distance - inf means there is no similarity
finite_distances = distances[np.isfinite(distances)]
max_score = finite_distances.max()
print(max_score)

if finite_distances.empty:
    # No finite distance at all (or no roles): there is nothing to normalize,
    # so every role gets similarity 0 instead of propagating NaN
    df_roles['similarity_score_skills'] = 0.0
else:
    # Replace inf by the largest finite distance so those roles end up at 0
    capped = distances.replace(np.inf, max_score).to_frame('similarity_score_skills')

    # Normalize and convert distance to similarity - 0 = more dissimilar, 1 = more similar
    df_roles['similarity_score_skills'] = 1 - scaler.fit_transform(capped).ravel()

# Sort the roles by similarity score
df_roles = df_roles.sort_values(by=['similarity_score_skills'], ascending=False)

df_roles

1.127273814027629


Unnamed: 0,id,skills,similarity_score_skills
8808,126516,"[JUnit, Selenium, SQL, Automation Testing, Tes...",1.0
5545,64177,"[MVVM, waterfall, C#, SQL, MVC, Agile, .Net, S...",0.806565
4165,119028,"[Email, Unix, Perl, Java, PLSQL]",0.637596
10144,128248,"[Integration Server, Trading Networks, MWS, We...",0.463874
1230,60682,"[Bca, MCA, Languages, Database Architecture, S...",0.0


## Score from Interests

In [804]:
# Word Mover's Distance between the person's interests and each role's skills
# (a distance: lower means more similar)
distances = df_roles['skills'].apply(
    lambda role_skills: model.wv.wmdistance(person_interests, role_skills)
).astype(float)

# Get the highest finite distance - inf means there is no similarity
finite_distances = distances[np.isfinite(distances)]
max_score = finite_distances.max()
print(max_score)

if finite_distances.empty:
    # No finite distance at all (or no roles): there is nothing to normalize,
    # so every role gets similarity 0 instead of propagating NaN
    df_roles['similarity_score_interests'] = 0.0
else:
    # Replace inf by the largest finite distance so those roles end up at 0
    capped = distances.replace(np.inf, max_score).to_frame('similarity_score_interests')

    # Normalize and convert distance to similarity - 0 = more dissimilar, 1 = more similar
    df_roles['similarity_score_interests'] = 1 - scaler.fit_transform(capped).ravel()

# Sort the roles by similarity score
df_roles = df_roles.sort_values(by=['similarity_score_interests'], ascending=False)

df_roles

1.1291442332934587


Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests
5545,64177,"[MVVM, waterfall, C#, SQL, MVC, Agile, .Net, S...",0.806565,1.0
8808,126516,"[JUnit, Selenium, SQL, Automation Testing, Tes...",1.0,0.892029
10144,128248,"[Integration Server, Trading Networks, MWS, We...",0.463874,0.494312
4165,119028,"[Email, Unix, Perl, Java, PLSQL]",0.637596,0.442762
1230,60682,"[Bca, MCA, Languages, Database Architecture, S...",0.0,0.0


## Score from Experience - Data not available in the dataset

In [805]:
# Cannot calculate score for Experience - Skill2Vec dataset has no experience column

# Add Results from all 3 Models

## Model 1 - Compute Final Score

In [806]:
# Model 1 weights: skills only (interests are switched off)
weight_skills, weight_interests = 0.8, 0
#weight_experience = 0.2

# Final score = weighted combination of the normalized similarity scores.
# The experience term (similarity_score_experience * weight_experience) stays
# disabled because no experience score is computed in this notebook.
skills_part = weight_skills * df_roles['similarity_score_skills']
interests_part = weight_interests * df_roles['similarity_score_interests']
df_roles['score-model1'] = skills_part + interests_part

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1
5545,64177,"[MVVM, waterfall, C#, SQL, MVC, Agile, .Net, S...",0.806565,1.0,0.645252
8808,126516,"[JUnit, Selenium, SQL, Automation Testing, Tes...",1.0,0.892029,0.8
10144,128248,"[Integration Server, Trading Networks, MWS, We...",0.463874,0.494312,0.371099
4165,119028,"[Email, Unix, Perl, Java, PLSQL]",0.637596,0.442762,0.510077
1230,60682,"[Bca, MCA, Languages, Database Architecture, S...",0.0,0.0,0.0


## Model 2 - Compute Final Score

In [807]:
# Model 2 weights: skills and interests count equally
weight_skills, weight_interests = 0.4, 0.4
#weight_experience = 0.2

# Final score = weighted combination of the normalized similarity scores.
# The experience term (similarity_score_experience * weight_experience) stays
# disabled because no experience score is computed in this notebook.
skills_part = weight_skills * df_roles['similarity_score_skills']
interests_part = weight_interests * df_roles['similarity_score_interests']
df_roles['score-model2'] = skills_part + interests_part

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1,score-model2
5545,64177,"[MVVM, waterfall, C#, SQL, MVC, Agile, .Net, S...",0.806565,1.0,0.645252,0.722626
8808,126516,"[JUnit, Selenium, SQL, Automation Testing, Tes...",1.0,0.892029,0.8,0.756812
10144,128248,"[Integration Server, Trading Networks, MWS, We...",0.463874,0.494312,0.371099,0.383274
4165,119028,"[Email, Unix, Perl, Java, PLSQL]",0.637596,0.442762,0.510077,0.432143
1230,60682,"[Bca, MCA, Languages, Database Architecture, S...",0.0,0.0,0.0,0.0


## Model 3 - Compute Final Score

In [808]:
# Model 3 weights: interests only (skills are switched off)
weight_skills, weight_interests = 0, 0.8
#weight_experience = 0.2

# Final score = weighted combination of the normalized similarity scores.
# The experience term (similarity_score_experience * weight_experience) stays
# disabled because no experience score is computed in this notebook.
skills_part = weight_skills * df_roles['similarity_score_skills']
interests_part = weight_interests * df_roles['similarity_score_interests']
df_roles['score-model3'] = skills_part + interests_part

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1,score-model2,score-model3
5545,64177,"[MVVM, waterfall, C#, SQL, MVC, Agile, .Net, S...",0.806565,1.0,0.645252,0.722626,0.8
8808,126516,"[JUnit, Selenium, SQL, Automation Testing, Tes...",1.0,0.892029,0.8,0.756812,0.713624
10144,128248,"[Integration Server, Trading Networks, MWS, We...",0.463874,0.494312,0.371099,0.383274,0.395449
4165,119028,"[Email, Unix, Perl, Java, PLSQL]",0.637596,0.442762,0.510077,0.432143,0.354209
1230,60682,"[Bca, MCA, Languages, Database Architecture, S...",0.0,0.0,0.0,0.0,0.0


# Add Columns for Ordered List

In [809]:
# Rank the roles under each model: rank 1 = highest final score
for model_suffix in ('model1', 'model2', 'model3'):
    df_roles[f'rank-{model_suffix}'] = df_roles[f'score-{model_suffix}'].rank(ascending=False)

# Keep only the role identification and the three rankings
ranking_columns = ['id', 'skills', 'rank-model1', 'rank-model2', 'rank-model3']
df_roles = df_roles[ranking_columns]
df_roles


Unnamed: 0,id,skills,rank-model1,rank-model2,rank-model3
5545,64177,"[MVVM, waterfall, C#, SQL, MVC, Agile, .Net, S...",2.0,2.0,1.0
8808,126516,"[JUnit, Selenium, SQL, Automation Testing, Tes...",1.0,1.0,2.0
10144,128248,"[Integration Server, Trading Networks, MWS, We...",4.0,4.0,3.0
4165,119028,"[Email, Unix, Perl, Java, PLSQL]",3.0,3.0,4.0
1230,60682,"[Bca, MCA, Languages, Database Architecture, S...",5.0,5.0,5.0


# Export Results

In [810]:
# Write the rankings to a per-person CSV file (the DataFrame index is written
# as the first column), then display the exported table
df_roles.to_csv(f"../6-results/ranking_ITProfessional_{person_id}.csv")
df_roles


Unnamed: 0,id,skills,rank-model1,rank-model2,rank-model3
5545,64177,"[MVVM, waterfall, C#, SQL, MVC, Agile, .Net, S...",2.0,2.0,1.0
8808,126516,"[JUnit, Selenium, SQL, Automation Testing, Tes...",1.0,1.0,2.0
10144,128248,"[Integration Server, Trading Networks, MWS, We...",4.0,4.0,3.0
4165,119028,"[Email, Unix, Perl, Java, PLSQL]",3.0,3.0,4.0
1230,60682,"[Bca, MCA, Languages, Database Architecture, S...",5.0,5.0,5.0
