# Models for rating based on very best match

A single word embedding model is used to model the similarity between skills; the same model is also used for interests.
Each computed Word Mover's Distance is converted into a score, and the final score is a weighted combination of all the scores.

## Import Libraries

In [30]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Enter Person's Profile

In [31]:
# Get the person's profile
person_id = 6

# Manual setting
#person_skills = ['Python', 'Microsoft SQL Server', 'Microsoft Azure']
#person_interests = ['Python']
#person_experience = 5 # years

# Get the profile from the collected data from the survey
survey = pd.read_csv(filepath_or_buffer="../2-data/survey_people.csv", sep=",", encoding="latin1")
employee = survey[survey['id'] == person_id]

# Fail with a clear message instead of an opaque IndexError further down
# when the requested id is not present in the survey
if employee.empty:
    raise ValueError(f"No survey respondent found with id {person_id}")

# Skills and interests are stored as ';'-separated strings; empty tokens
# (e.g. from a trailing ';') are dropped, as is done for the role skills.
# If several rows share the id, the first one is used.
person_skills = [skill for skill in employee['skills'].iloc[0].split(';') if skill != '']
person_interests = [skill for skill in employee['skills-want'].iloc[0].split(';') if skill != '']
person_experience = employee['YearsCodePro'].iloc[0]

print(person_id)
print(person_skills)
print(person_interests)
print(person_experience)


6
['MATLAB', 'Python', 'R', 'SQL', 'Swift', 'Cassandra', 'CouchDB', 'MariaDB', 'Microsoft SQL Server', 'MySQL', 'Neo4j', 'Firebase Realtime Database', 'Firebase', 'IBM Cloud or Watson', 'Microsoft Azure', 'Angular', 'Angular.js', 'ASP.NET', 'ASP.NET Core', 'Django', 'Flask', 'jQuery', 'React.js', 'Vue.js', '.NET', 'Apache Kafka', 'Apache Spark', 'Keras', 'NumPy', 'Pandas', 'Scikit-learn', 'TensorFlow', 'Torch/PyTorch', 'Docker', 'Homebrew']
['Python', 'Microsoft SQL Server', 'MongoDB', 'Neo4j', 'AWS', 'Google Cloud', 'Microsoft Azure', 'Keras', 'NumPy', 'Pandas', 'Scikit-learn', 'TensorFlow', 'Torch/PyTorch', 'Docker', 'Homebrew', 'Kubernetes']
1


# Load IT Roles Dataset

In [32]:
# Load the IT roles (the filtered Skill2Vec dataset)
df_roles = pd.read_csv("../2-data/ITroles.csv", sep=",", encoding="latin1")


def _tokenize_skills(raw_skills):
    """Split a ';'-separated skill string into a list, dropping empty entries."""
    return [skill for skill in raw_skills.split(';') if skill != '']


# Turn every role's skill string into a list of skill tokens
df_roles['skills'] = df_roles['skills'].apply(_tokenize_skills)

# Only the role id and its skills are needed from here on
df_roles = df_roles[['id', 'skills']]
df_roles



Unnamed: 0,id,skills
0,19805,"[diploma, machining, cnc m, mould, conventiona..."
1,80208,"[Compensation, Benefits, HR Functions, Alm, Pa..."
2,122729,"[Simulink, stateflow, Matlab developer, target..."
3,4772,"[gis, analysis, geographic_information_system,..."
4,44923,"[Full Stack Developer, AngularJS, SaaS applica..."
...,...,...
10353,91663,"[customer interaction, knowledge, java, androi..."
10354,86050,"[Technical Management, Project Management, MS ..."
10355,54515,"[XCode, IOS, Objective C, Project Management]"
10356,36160,"[Director, NoSQL, Node.js, CTO, SQL, JIRA, Agi..."


# Load Word Embedding Model

In [33]:
# Load the saved Word2Vec embedding used for both the skills and the interests scores
embedding_path = "../3-word_embedding/model-w2vcombinedfiltered"
model = Word2Vec.load(embedding_path)

# Sample the roles dataset as available roles

In [34]:
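# Optional: uncomment to score only a small random sample of the roles
# (useful for quick experiments); by default every role is scored
#df_roles = df_roles.sample(5)
#df_roles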

# Compute Similarity Scores

In [35]:
# Min-max scaler used by the scoring cells below to normalise each set of
# distances; it is re-fitted (fit_transform) on every column it is applied to
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

## Score from Skills

In [36]:
# Word Mover's Distance between the person's skills and each role's skills.
# At this point the values are distances: smaller means a closer match.
similarity_scores = [
    model.wv.wmdistance(person_skills, role_skills)
    for role_skills in df_roles['skills']
]

# Store the raw distances; the column is converted to a similarity below
df_roles['similarity_score_skills'] = similarity_scores

# Largest finite distance, used as the cap for infinite distances
is_finite = df_roles['similarity_score_skills'] < np.inf
df_temp = df_roles.loc[is_finite]
max_score = df_temp['similarity_score_skills'].max()
print(max_score)

# Rows with an infinite distance (no similarity) are kept, not removed:
# their distance is replaced by the largest finite one
capped_distances = df_roles['similarity_score_skills'].replace(np.inf, max_score)
df_roles['similarity_score_skills'] = capped_distances

# Min-max normalise the distances and invert them into similarities:
# 0 = most dissimilar, 1 = most similar
scaled_distances = scaler.fit_transform(df_roles[['similarity_score_skills']])
df_roles['similarity_score_skills'] = 1 - scaled_distances

# Most similar roles first
df_roles = df_roles.sort_values(by='similarity_score_skills', ascending=False)

df_roles

1.3295782230824456


Unnamed: 0,id,skills,similarity_score_skills
6343,102795,"[Puppet, Ansible, Docker, Chef]",1.000000e+00
10284,118970,"[3d Unity, Unity Developer, unity programmer, ...",9.560275e-01
2775,86869,"[Django, Python]",9.376067e-01
7800,73728,"[Django, Python]",9.376067e-01
7516,74299,[Node.js],9.164955e-01
...,...,...,...
1636,14874,"[Appian BPM, java]",6.774183e-03
3418,123093,[azure],1.245899e-03
2649,28317,"[Clojurescript, clojure script, clojure, cloju...",3.363155e-04
2675,111990,"[mongodb, mongo dba, mongo database admin, mon...",1.110223e-16


## Score from Interests

In [37]:
# Word Mover's Distance between the person's interests and each role's skills.
# At this point the values are distances: smaller means a closer match.
similarity_scores = [
    model.wv.wmdistance(person_interests, role_skills)
    for role_skills in df_roles['skills']
]

# Store the raw distances; the column is converted to a similarity below
df_roles['similarity_score_interests'] = similarity_scores

# Largest finite distance, used as the cap for infinite distances
is_finite = df_roles['similarity_score_interests'] < np.inf
df_temp = df_roles.loc[is_finite]
max_score = df_temp['similarity_score_interests'].max()
print(max_score)

# Rows with an infinite distance (no similarity) are kept, not removed:
# their distance is replaced by the largest finite one
capped_distances = df_roles['similarity_score_interests'].replace(np.inf, max_score)
df_roles['similarity_score_interests'] = capped_distances

# Min-max normalise the distances and invert them into similarities:
# 0 = most dissimilar, 1 = most similar
scaled_distances = scaler.fit_transform(df_roles[['similarity_score_interests']])
df_roles['similarity_score_interests'] = 1 - scaled_distances

# Most similar roles first
df_roles = df_roles.sort_values(by='similarity_score_interests', ascending=False)

df_roles

1.3492502280053338


Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests
6343,102795,"[Puppet, Ansible, Docker, Chef]",1.000000e+00,1.000000
10284,118970,"[3d Unity, Unity Developer, unity programmer, ...",9.560275e-01,0.943971
1347,35455,"[GIT, AWS, Puppet, Ansible, Chef]",8.816489e-01,0.898031
2775,86869,"[Django, Python]",9.376067e-01,0.890836
7800,73728,"[Django, Python]",9.376067e-01,0.890836
...,...,...,...,...
5952,9006,"[c#, asp, .net]",1.299244e-02,0.012969
9140,76425,"[asp.net, .net]",1.505774e-02,0.009550
3418,123093,[azure],1.245899e-03,0.006306
7229,40220,"[Natural language processing, Clinical NLP eng...",8.639607e-03,0.000000


## Score from Experience - Data not available in the dataset

In [38]:
# Cannot calculate score for Experience - Skill2Vec dataset has no experience column

# Add Results from all 3 Models

## Model 1 - Compute Final Score

In [39]:
# Model 1: skills only (interests carry zero weight).
# The weights sum to 0.8, not 1: the commented-out experience weight (0.2)
# is unused because no experience score is available.
weight_skills = 0.8
weight_interests = 0
#weight_experience = 0.2

# Final score = weighted sum of the normalised similarity scores
df_roles['score-model1'] = (
    weight_skills * df_roles['similarity_score_skills']
    + weight_interests * df_roles['similarity_score_interests']
    #+ weight_experience * df_roles['similarity_score_experience']
)

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1
6343,102795,"[Puppet, Ansible, Docker, Chef]",1.000000e+00,1.000000,8.000000e-01
10284,118970,"[3d Unity, Unity Developer, unity programmer, ...",9.560275e-01,0.943971,7.648220e-01
1347,35455,"[GIT, AWS, Puppet, Ansible, Chef]",8.816489e-01,0.898031,7.053191e-01
2775,86869,"[Django, Python]",9.376067e-01,0.890836,7.500854e-01
7800,73728,"[Django, Python]",9.376067e-01,0.890836,7.500854e-01
...,...,...,...,...,...
5952,9006,"[c#, asp, .net]",1.299244e-02,0.012969,1.039395e-02
9140,76425,"[asp.net, .net]",1.505774e-02,0.009550,1.204619e-02
3418,123093,[azure],1.245899e-03,0.006306,9.967191e-04
7229,40220,"[Natural language processing, Clinical NLP eng...",8.639607e-03,0.000000,6.911685e-03


## Model 2 - Compute final score

In [40]:
# Model 2: skills and interests weighted equally.
# The weights sum to 0.8, not 1: the commented-out experience weight (0.2)
# is unused because no experience score is available.
weight_skills = 0.4
weight_interests = 0.4
#weight_experience = 0.2

# Final score = weighted sum of the normalised similarity scores
df_roles['score-model2'] = (
    weight_skills * df_roles['similarity_score_skills']
    + weight_interests * df_roles['similarity_score_interests']
    #+ weight_experience * df_roles['similarity_score_experience']
)

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1,score-model2
6343,102795,"[Puppet, Ansible, Docker, Chef]",1.000000e+00,1.000000,8.000000e-01,8.000000e-01
10284,118970,"[3d Unity, Unity Developer, unity programmer, ...",9.560275e-01,0.943971,7.648220e-01,7.599996e-01
1347,35455,"[GIT, AWS, Puppet, Ansible, Chef]",8.816489e-01,0.898031,7.053191e-01,7.118719e-01
2775,86869,"[Django, Python]",9.376067e-01,0.890836,7.500854e-01,7.313772e-01
7800,73728,"[Django, Python]",9.376067e-01,0.890836,7.500854e-01,7.313772e-01
...,...,...,...,...,...,...
5952,9006,"[c#, asp, .net]",1.299244e-02,0.012969,1.039395e-02,1.038438e-02
9140,76425,"[asp.net, .net]",1.505774e-02,0.009550,1.204619e-02,9.843039e-03
3418,123093,[azure],1.245899e-03,0.006306,9.967191e-04,3.020687e-03
7229,40220,"[Natural language processing, Clinical NLP eng...",8.639607e-03,0.000000,6.911685e-03,3.455843e-03


## Model 3 - Compute Final Score

In [41]:
# Model 3: interests only (skills carry zero weight).
# The weights sum to 0.8, not 1: the commented-out experience weight (0.2)
# is unused because no experience score is available.
weight_skills = 0
weight_interests = 0.8
#weight_experience = 0.2

# Final score = weighted sum of the normalised similarity scores
df_roles['score-model3'] = (
    weight_skills * df_roles['similarity_score_skills']
    + weight_interests * df_roles['similarity_score_interests']
    #+ weight_experience * df_roles['similarity_score_experience']
)

df_roles

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1,score-model2,score-model3
6343,102795,"[Puppet, Ansible, Docker, Chef]",1.000000e+00,1.000000,8.000000e-01,8.000000e-01,0.800000
10284,118970,"[3d Unity, Unity Developer, unity programmer, ...",9.560275e-01,0.943971,7.648220e-01,7.599996e-01,0.755177
1347,35455,"[GIT, AWS, Puppet, Ansible, Chef]",8.816489e-01,0.898031,7.053191e-01,7.118719e-01,0.718425
2775,86869,"[Django, Python]",9.376067e-01,0.890836,7.500854e-01,7.313772e-01,0.712669
7800,73728,"[Django, Python]",9.376067e-01,0.890836,7.500854e-01,7.313772e-01,0.712669
...,...,...,...,...,...,...,...
5952,9006,"[c#, asp, .net]",1.299244e-02,0.012969,1.039395e-02,1.038438e-02,0.010375
9140,76425,"[asp.net, .net]",1.505774e-02,0.009550,1.204619e-02,9.843039e-03,0.007640
3418,123093,[azure],1.245899e-03,0.006306,9.967191e-04,3.020687e-03,0.005045
7229,40220,"[Natural language processing, Clinical NLP eng...",8.639607e-03,0.000000,6.911685e-03,3.455843e-03,0.000000


# Add Columns for Ordered List

In [42]:
# Rank the roles under each model: rank 1.0 is the highest score, and tied
# scores share the average of the positions they span (pandas' default)
for rank_column, score_column in (
    ('rank-model1', 'score-model1'),
    ('rank-model2', 'score-model2'),
    ('rank-model3', 'score-model3'),
):
    df_roles[rank_column] = df_roles[score_column].rank(ascending=False)

# Keep the identifying columns and the three ranks; the raw scores are dropped
output_columns = ['id', 'skills', 'rank-model1', 'rank-model2', 'rank-model3']
df_roles = df_roles[output_columns]
df_roles


Unnamed: 0,id,skills,rank-model1,rank-model2,rank-model3
6343,102795,"[Puppet, Ansible, Docker, Chef]",1.0,1.0,1.0
10284,118970,"[3d Unity, Unity Developer, unity programmer, ...",2.0,2.0,2.0
1347,35455,"[GIT, AWS, Puppet, Ansible, Chef]",7.0,6.0,3.0
2775,86869,"[Django, Python]",3.5,3.5,4.5
7800,73728,"[Django, Python]",3.5,3.5,4.5
...,...,...,...,...,...
5952,9006,"[c#, asp, .net]",10346.0,10354.0,10354.0
9140,76425,"[asp.net, .net]",10345.0,10355.0,10355.0
3418,123093,[azure],10355.0,10357.0,10356.0
7229,40220,"[Natural language processing, Clinical NLP eng...",10348.0,10356.0,10357.5


In [43]:

# Keep every role whose rank is below 6 under at least one of the three models
rank_columns = ['rank-model1', 'rank-model2', 'rank-model3']
is_top_ranked = (df_roles[rank_columns] < 6).any(axis=1)
df_roles = df_roles[is_top_ranked]
df_roles

Unnamed: 0,id,skills,rank-model1,rank-model2,rank-model3
6343,102795,"[Puppet, Ansible, Docker, Chef]",1.0,1.0,1.0
10284,118970,"[3d Unity, Unity Developer, unity programmer, ...",2.0,2.0,2.0
1347,35455,"[GIT, AWS, Puppet, Ansible, Chef]",7.0,6.0,3.0
2775,86869,"[Django, Python]",3.5,3.5,4.5
7800,73728,"[Django, Python]",3.5,3.5,4.5
7516,74299,[Node.js],5.0,5.0,6.0


# Export Results

In [29]:
# Save the shortlisted roles for this person; the DataFrame index is written
# as the first (unnamed) CSV column
results_path = f"../6-results/best_rating_ITProfessional_{person_id}.csv"
df_roles.to_csv(results_path)
df_roles


Unnamed: 0,id,skills,rank-model1,rank-model2,rank-model3
6343,102795,"[Puppet, Ansible, Docker, Chef]",1.0,1.0,1.0
10284,118970,"[3d Unity, Unity Developer, unity programmer, ...",2.0,2.0,2.0
1347,35455,"[GIT, AWS, Puppet, Ansible, Chef]",7.0,6.0,3.0
2775,86869,"[Django, Python]",3.5,3.5,4.5
7800,73728,"[Django, Python]",3.5,3.5,4.5
7516,74299,[Node.js],5.0,5.0,6.0
