# Models for ranking

A single word embedding model is used to model the similarity between skills; the same model is also used for interests.
Each Word Mover's Distance that is computed is converted into a score, and the final score is a weighted combination of these scores.

## Import Libraries

In [68]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Load IT Roles Dataset

In [69]:
# Load the IT roles dataset (comma-separated, Latin-1 encoded)
df_roles = pd.read_csv("../2-data/ITroles.csv", sep=",", encoding="latin1")

# Each 'skills' cell is a ';'-separated string: turn it into a list of skills
df_roles['skills'] = df_roles['skills'].map(lambda raw: raw.split(';'))

# Keep only the columns used further down
df_roles = df_roles.loc[:, ['id', 'skills']]
df_roles



Unnamed: 0,id,skills
0,19805,"[diploma, machining, cnc m, mould, conventiona..."
1,80208,"[Compensation, Benefits, HR Functions, Alm, Pa..."
2,122729,"[Simulink, stateflow, Matlab developer, target..."
3,4772,"[gis, analysis, geographic_information_system,..."
4,44923,"[Full Stack Developer, AngularJS, SaaS applica..."
...,...,...
10353,91663,"[customer interaction, knowledge, java, androi..."
10354,86050,"[Technical Management, Project Management, MS ..."
10355,54515,"[XCode, IOS, Objective C, Project Management, ..."
10356,36160,"[Director, NoSQL, Node.js, CTO, SQL, JIRA, Agi..."


# Load Word Embedding Model

In [70]:
# Load the saved Word2Vec model from disk; its word vectors (model.wv) are
# what the Word Mover's Distance computations further down rely on
model = Word2Vec.load("../3-word_embedding/model-w2vcombined")


# Prepare IT Roles Dataset as available roles

In [71]:

# Work on a random sample of 5 roles only
# (no random_state is given, so the selected rows differ between runs)
df_roles = df_roles.sample(n=5)

# Find Roles for a Person

## Get Person's Profile

In [72]:
# Profile of the person that the roles are ranked against
person_skills = [
    "Python",
    "Microsoft SQL Server",
    "Microsoft Azure",
]
person_interests = ["Python"]
person_experience = 5  # years

## WMD to get most similar roles

In [73]:
# Shared min-max scaler; the scoring cells below re-fit it on every score
# column to bring the distances into the [0, 1] range
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

### Score from Skills

In [74]:
# Word Mover's Distance between the person's skills and each role's skills
similarity_scores = [
    model.wv.wmdistance(person_skills, role_skills)
    for role_skills in df_roles['skills']
]

# Store the raw distances (still a dissimilarity at this point) on each role
df_roles['similarity_score_skills'] = similarity_scores

# Largest distance that is not inf
is_finite = df_roles['similarity_score_skills'] < np.inf
max_score = df_roles.loc[is_finite, 'similarity_score_skills'].max()
print(max_score)

# inf means there is no similarity: instead of dropping those rows, cap them
# at the largest finite distance
df_roles['similarity_score_skills'] = df_roles['similarity_score_skills'].replace(np.inf, max_score)

# Min-max scale the distances, then flip them into a similarity:
# 0 = more dissimilar, 1 = more similar
scaled = scaler.fit_transform(df_roles[['similarity_score_skills']])
df_roles['similarity_score_skills'] = 1 - scaled

# Most similar roles first
df_roles = df_roles.sort_values(by='similarity_score_skills', ascending=False)

df_roles

1.2798832818087742


Unnamed: 0,id,skills,similarity_score_skills
8759,63559,"[Computer Languages, Solution Design, openstac...",1.0
10226,41886,"[ASP.Net, WCF, C#, MS SQL Server, .NET Framewo...",0.433961
6799,60092,"[Javascript, JQuery, UI Development, Rest, Jas...",0.418377
458,33508,"[tools, proxy, email, server architecture, jav...",0.147947
1258,73451,"[Competitive Analysis, New Product, Product Ma...",0.0


### Score from Interests

In [75]:
# Word Mover's Distance between the person's interests and each role's skills
# (the roles have no separate interests column, so 'skills' is used again)
similarity_scores = [
    model.wv.wmdistance(person_interests, role_skills)
    for role_skills in df_roles['skills']
]

# Store the raw distances on each role
df_roles['similarity_score_interests'] = similarity_scores

# Largest distance that is not inf
is_finite = df_roles['similarity_score_interests'] < np.inf
max_score = df_roles.loc[is_finite, 'similarity_score_interests'].max()
print(max_score)

# inf means there is no similarity: instead of dropping those rows, cap them
# at the largest finite distance
df_roles['similarity_score_interests'] = df_roles['similarity_score_interests'].replace(np.inf, max_score)

# Min-max scale the distances, then flip them into a similarity:
# 0 = more dissimilar, 1 = more similar
scaled = scaler.fit_transform(df_roles[['similarity_score_interests']])
df_roles['similarity_score_interests'] = 1 - scaled

# Most similar roles first
df_roles = df_roles.sort_values(by='similarity_score_interests', ascending=False)

df_roles

1.2283510060778395


Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests
8759,63559,"[Computer Languages, Solution Design, openstac...",1.0,1.0
6799,60092,"[Javascript, JQuery, UI Development, Rest, Jas...",0.418377,0.53137
10226,41886,"[ASP.Net, WCF, C#, MS SQL Server, .NET Framewo...",0.433961,0.409027
1258,73451,"[Competitive Analysis, New Product, Product Ma...",0.0,0.156281
458,33508,"[tools, proxy, email, server architecture, jav...",0.147947,0.0


### Score from Experience - Data not available in the dataset

In [76]:


# define the bin edges for each level of experience
#bin_edges = [-1, 4, 8, 12, 100]

# define the labels for each level of experience
#labels = ['Junior', 'Associate', 'Senior', 'Architect']

# convert the 'years' column to categorical levels of experience
#df_roles['Experience'] = pd.cut(pd.to_numeric(df_people['YearsCodePro']), bins=bin_edges, labels=labels)

# print the resulting DataFrame
#df_people

In [77]:
#from scipy.spatial.distance import euclidean

# define a function to compute the similarity metric
#def similarity(level1, level2):
    # define the vectors representing the levels of experience
#    levels = {'Junior': [0, 0, 0, 0],
#              'Associate': [1, 0, 0, 0],
#              'Senior': [1, 1, 0, 0],
#              'Architect': [1, 1, 1, 0]}
    
    # compute the Euclidean distance between the two vectors
#    distance = euclidean(levels[level1], levels[level2])
    
    # return the similarity metric
#    return 1 / (1 + distance)

# example usage of the similarity function
#print(similarity('Junior', 'Architect'))

In [78]:

# use function to compute the similarity based on level

#df_people['similarity_score_experience'] = df_people["Experience"].apply(lambda x: similarity(role_experience, x))
#df_people


In [79]:


# Calculate score based on the absolute difference between values
#df['similarity_score_experience'] = df["YearsCodePro"].apply(lambda x: abs(role_experience - int(x)))

# Normalize scores
#df['similarity_score_experience'] =  1 - scaler.fit_transform(df[['similarity_score_experience']])

#df

# Present Results from 3 Models

## Model 1 - Compute Final Score

In [80]:
# Model 1 weights: skills only, interests switched off.
# The experience weight stays disabled together with its score term, so the
# best possible final score is 0.8 rather than 1.
weight_skills, weight_interests = 0.8, 0
#weight_experience = 0.2

# Final score = weighted sum of the per-criterion similarity scores
df_roles['score-model1'] = (
    df_roles['similarity_score_skills'].mul(weight_skills)
    .add(df_roles['similarity_score_interests'].mul(weight_interests))
)  # + df_roles['similarity_score_experience'] * weight_experience

# Rank the roles, best score first
df_roles = df_roles.sort_values(by='score-model1', ascending=False)

# Show the top 10 matches
df_roles.head(n=10)

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1
8759,63559,"[Computer Languages, Solution Design, openstac...",1.0,1.0,0.8
10226,41886,"[ASP.Net, WCF, C#, MS SQL Server, .NET Framewo...",0.433961,0.409027,0.347169
6799,60092,"[Javascript, JQuery, UI Development, Rest, Jas...",0.418377,0.53137,0.334701
458,33508,"[tools, proxy, email, server architecture, jav...",0.147947,0.0,0.118358
1258,73451,"[Competitive Analysis, New Product, Product Ma...",0.0,0.156281,0.0


## Model 2 - Compute final score

In [81]:
# Model 2 weights: skills count more than interests.
# The experience weight stays disabled together with its score term, so the
# best possible final score is 0.8 rather than 1.
weight_skills, weight_interests = 0.5, 0.3
#weight_experience = 0.2

# Final score = weighted sum of the per-criterion similarity scores
df_roles['score-model2'] = (
    df_roles['similarity_score_skills'].mul(weight_skills)
    .add(df_roles['similarity_score_interests'].mul(weight_interests))
)  # + df_roles['similarity_score_experience'] * weight_experience

# Rank the roles, best score first
df_roles = df_roles.sort_values(by='score-model2', ascending=False)

# Show the top 10 matches
df_roles.head(n=10)

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1,score-model2
8759,63559,"[Computer Languages, Solution Design, openstac...",1.0,1.0,0.8,0.8
6799,60092,"[Javascript, JQuery, UI Development, Rest, Jas...",0.418377,0.53137,0.334701,0.3686
10226,41886,"[ASP.Net, WCF, C#, MS SQL Server, .NET Framewo...",0.433961,0.409027,0.347169,0.339688
458,33508,"[tools, proxy, email, server architecture, jav...",0.147947,0.0,0.118358,0.073974
1258,73451,"[Competitive Analysis, New Product, Product Ma...",0.0,0.156281,0.0,0.046884


## Model 3 - Compute Final Score

In [82]:
# Model 3 weights: skills and interests count equally.
# The experience weight stays disabled together with its score term, so the
# best possible final score is 0.8 rather than 1.
weight_skills, weight_interests = 0.4, 0.4
#weight_experience = 0.2

# Final score = weighted sum of the per-criterion similarity scores
df_roles['score-model3'] = (
    df_roles['similarity_score_skills'].mul(weight_skills)
    .add(df_roles['similarity_score_interests'].mul(weight_interests))
)  # + df_roles['similarity_score_experience'] * weight_experience

# Rank the roles, best score first
df_roles = df_roles.sort_values(by='score-model3', ascending=False)

# Show the top 10 matches
df_roles.head(n=10)

Unnamed: 0,id,skills,similarity_score_skills,similarity_score_interests,score-model1,score-model2,score-model3
8759,63559,"[Computer Languages, Solution Design, openstac...",1.0,1.0,0.8,0.8,0.8
6799,60092,"[Javascript, JQuery, UI Development, Rest, Jas...",0.418377,0.53137,0.334701,0.3686,0.379899
10226,41886,"[ASP.Net, WCF, C#, MS SQL Server, .NET Framewo...",0.433961,0.409027,0.347169,0.339688,0.337195
1258,73451,"[Competitive Analysis, New Product, Product Ma...",0.0,0.156281,0.0,0.046884,0.062512
458,33508,"[tools, proxy, email, server architecture, jav...",0.147947,0.0,0.118358,0.073974,0.059179


# See Results

In [83]:
# Side-by-side view of the three model scores per role.
# NOTE: despite its name, df_people holds the ranked roles.
df_people = df_roles.loc[
    :, ['id', 'skills', 'score-model1', 'score-model2', 'score-model3']
]
df_people.head(n=10)

Unnamed: 0,id,skills,score-model1,score-model2,score-model3
8759,63559,"[Computer Languages, Solution Design, openstac...",0.8,0.8,0.8
6799,60092,"[Javascript, JQuery, UI Development, Rest, Jas...",0.334701,0.3686,0.379899
10226,41886,"[ASP.Net, WCF, C#, MS SQL Server, .NET Framewo...",0.347169,0.339688,0.337195
1258,73451,"[Competitive Analysis, New Product, Product Ma...",0.0,0.046884,0.062512
458,33508,"[tools, proxy, email, server architecture, jav...",0.118358,0.073974,0.059179


# Other Metrics

In [19]:

# For each matched person, show % of skills matched and % of interests matched


# Also show the % of skills matched and the % of skills not matched
# and for the matched skills, show the % that is only for skills and only for interests

# Define the list of skills to compare
#skills_to_compare = person_skills


# Define a function to compute the percentage of skills that are mentioned in the list of skills and also in the DataFrame column skills
#def compute_matching_percentage(row, column='skills'):
#    matched_skills = set(row[column]) & set(skills_to_compare)
#    return len(matched_skills) / len(skills_to_compare)

# Get matching skills
#def compute_matching_list(row, column='skills'):
#    matched_skills = set(row[column]) & set(skills_to_compare)
#    return matched_skills

# Get non matching skills
#def compute_non_matching_list(row, column='skills'):
#    non_matched_skills = set(skills_to_compare) - set(row[column])
#    return non_matched_skills

# Define a function to compute the percentage of skills that are not mentioned in the DataFrame column skills
#def compute_non_matching_percentage(row, column='skills'):
#    non_matched_skills = set(skills_to_compare) - set(row[column])
#    return len(non_matched_skills) / len(skills_to_compare)

# Apply the functions to each row of the DataFrame
#df_people['matching_percentage_skills'] = df_roles.apply(lambda x: compute_matching_percentage(x,'skills'), axis=1)
#df_people['non_matching_percentage_skills'] = df_roles.apply(lambda x: compute_non_matching_percentage(x,'skills'), axis=1)

#df_people['matching_percentage_interests'] = df_roles.apply(lambda x: compute_matching_percentage(x,'skills-want'), axis=1)
#df_people['non_matching_percentage_interests'] = df_roles.apply(lambda x: compute_non_matching_percentage(x,'skills-want'), axis=1)

#df_people['matching_list_skills'] = df_roles.apply(lambda x: compute_matching_list(x,'skills'), axis=1)
#df_people['non_matching_list_skills'] = df_roles.apply(lambda x: compute_non_matching_list(x,'skills'), axis=1)

#df_people['matching_list_interests'] = df_roles.apply(lambda x: compute_matching_list(x,'skills-want'), axis=1)
#df_people['non_matching_list_insterests'] = df_roles.apply(lambda x: compute_non_matching_list(x,'skills-want'), axis=1)


# Print the resulting DataFrame
#df_roles



In [35]:
# Look up the skills of one specific role by its id.
# The id is hard-coded; when it is not among the sampled rows the selection
# is empty and an empty array is printed.
list_skills = df_people.loc[df_people['id'] == 22519, 'skills']
print(list_skills.values)

[]
