# Model 2 - Word2Vec from Jobs for Skills and Interests, WMD as metric, combined score for ranking

The model is trained using the Word2Vec word embedding technique combined with the Word Mover's Distance metric to identify similarities.


Model using Word2Vec and WMD to find a Position to a Person

The model is trained using the Word2Vec word embedding technique combined with the Word Mover's Distance metric to identify similarities.

One single word embedding model is used for modelling the similarities between skills, which is used also for interests.
The word embedding is built based on data from job postings (Skill2vec dataset).

A score is calculated from each Word Mover's Distance computed and a final score is calculated as a weighted combination of all scores.

## Import Libraries

In [7]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Prepare Skill2Vec dataset

In [8]:
# Read the Skill2Vec dataset with skills requirements to roles
df_roles = pd.read_csv(filepath_or_buffer="../data/skill2vec_50k.csv", sep=",", encoding="latin1", header = None)
df_roles = df_roles.fillna('')

# Join all skills in a same skill column
df_roles['skills'] = df_roles.drop(columns=[0]).apply(lambda x: ';'.join(x.astype(str)), axis=1)
df_roles["id"] = df_roles[0]
df_roles = df_roles[['id', 'skills']]
df_roles

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,id,skills
0,125720,HR Executive;screening;selection;Interview;HR;...
1,112708,Special Teacher;Teaching;Education;;;;;;;;;;;;...
2,115226,consulting;fresher;IT helpdesk;Techincal Troub...
3,19805,diploma;machining;cnc m;mould;conventional mac...
4,80208,Compensation;Benefits;HR Functions;Alm;Payroll...
...,...,...
49995,27374,Chief Engineer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
49996,88457,Receptionist Activities;Front Desk;front offic...
49997,34590,SQL Queries;Log Analysis;Hardware Networking;P...
49998,86171,Quality Analyst;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...


In [9]:
# Get list of all skills available to be selected - Source: Stack Overflow Survey questions
#df_skills = pd.read_csv(filepath_or_buffer="../data/skill-list.csv", sep=",", encoding="latin1")


#df_roles = df_roles.sample(1000) # get sample


# Get only positions with skills that are mapped
#skills = df_skills['skill']

# Account for synonyms
#synonym_skill = df_skills['synonym']
#def find_index(arr, val):
#    for i in range(len(arr)):
#        if arr[i] == val:
#            return i
#    return -1

#count = 0
#for skill in skills:
#    count = count + df_roles["skills"].apply(lambda x: 1 if skill.lower() in [item.lower() for item in x.split(';')] or synonym_skill[find_index(skills, skill)] in [item.lower() for item in x.split(';')] else 0)
#    df_roles['KeepRow'] = count

# Get only roles that have mapped skills
#df_roles = df_roles.loc[(df_roles['KeepRow'] > 0)]

# Tokenize the skills
df_roles['skills'] = df_roles['skills'].apply(lambda x: x.split(';'))

# Get only required columns
df_roles = df_roles[["id", "skills"]]
df_roles

Unnamed: 0,id,skills
0,125720,"[HR Executive, screening, selection, Interview..."
1,112708,"[Special Teacher, Teaching, Education, , , , ,..."
2,115226,"[consulting, fresher, IT helpdesk, Techincal T..."
3,19805,"[diploma, machining, cnc m, mould, conventiona..."
4,80208,"[Compensation, Benefits, HR Functions, Alm, Pa..."
...,...,...
49995,27374,"[Chief Engineer, , , , , , , , , , , , , , , ,..."
49996,88457,"[Receptionist Activities, Front Desk, front of..."
49997,34590,"[SQL Queries, Log Analysis, Hardware Networkin..."
49998,86171,"[Quality Analyst, , , , , , , , , , , , , , , ..."


# Prepare StackOverflow dataset

In [10]:
# Read the StackOverflow dataset
df = pd.concat([pd.read_csv(filepath_or_buffer="../data/survey_results_public_1.csv", sep=",", encoding="latin1"), pd.read_csv(filepath_or_buffer="../data/survey_results_public_2.csv", sep=",", encoding="latin1")])

# Select the required columns and fill nulls
df = df[["ResponseId","YearsCodePro","LanguageHaveWorkedWith","LanguageWantToWorkWith","DatabaseHaveWorkedWith","DatabaseWantToWorkWith","PlatformHaveWorkedWith","PlatformWantToWorkWith","WebframeHaveWorkedWith","WebframeWantToWorkWith","MiscTechHaveWorkedWith","MiscTechWantToWorkWith","ToolsTechHaveWorkedWith","ToolsTechWantToWorkWith"]]
df = df.fillna('')

# Join all "HaveWorked" and "WantToWorkWith" columns
df["skills"] = df["LanguageHaveWorkedWith"] + ";" + df["DatabaseHaveWorkedWith"]+ ";" + df["PlatformHaveWorkedWith"] + ";" + df["WebframeHaveWorkedWith"] + ";" + df["MiscTechHaveWorkedWith"] + ";" + df["ToolsTechHaveWorkedWith"]
df["skills-want"] = df["LanguageWantToWorkWith"] + ";" + df["DatabaseWantToWorkWith"]+ ";" + df["PlatformWantToWorkWith"] + ";" + df["WebframeWantToWorkWith"] + ";" + df["MiscTechWantToWorkWith"] + ";" + df["ToolsTechWantToWorkWith"]
df['id'] = df['ResponseId']

# Remove original columns
df = df[['id', 'skills', 'skills-want', 'YearsCodePro']]

# Remove extra ;s
df['skills'] = df['skills'].str.strip(';')
df['skills-want'] = df['skills-want'].str.strip(';')

# Remove empty skills
df = df[df['skills']!= '']

# Adjust the Experience column
df["YearsCodePro"] = df["YearsCodePro"].apply(lambda x: 0 if x == '' or x == 'Less than 1 year' else x)
df["YearsCodePro"] = df["YearsCodePro"].apply(lambda x: 50 if x == 'More than 50 years' else x)

# Remove people with identical profiles
df = df.drop_duplicates(subset=['skills', 'skills-want', 'YearsCodePro'])

# Tokenize the skills
df['skills'] = df['skills'].apply(lambda x: x.split(';'))

df_people = df[['id', 'skills']]
df_people



Unnamed: 0,id,skills
1,2,"[JavaScript, TypeScript]"
2,3,"[C#, C++, HTML/CSS, JavaScript, Python, Micros..."
3,4,"[C#, JavaScript, SQL, TypeScript, Microsoft SQ..."
4,5,"[C#, HTML/CSS, JavaScript, SQL, Swift, TypeScr..."
5,6,"[C++, Lua, , , , , Homebrew]"
...,...,...
36629,73264,"[Bash/Shell, Dart, JavaScript, PHP, Python, SQ..."
36630,73265,"[Bash/Shell, HTML/CSS, JavaScript, Python, SQL..."
36631,73266,"[HTML/CSS, JavaScript, PHP, Python, SQL, Maria..."
36632,73267,"[C#, Delphi, VBA, Microsoft SQL Server, MongoD..."


# Combine both Datasets

In [11]:
df_combined = pd.concat([df_people, df_roles])
df_combined

Unnamed: 0,id,skills
1,2,"[JavaScript, TypeScript]"
2,3,"[C#, C++, HTML/CSS, JavaScript, Python, Micros..."
3,4,"[C#, JavaScript, SQL, TypeScript, Microsoft SQ..."
4,5,"[C#, HTML/CSS, JavaScript, SQL, Swift, TypeScr..."
5,6,"[C++, Lua, , , , , Homebrew]"
...,...,...
49995,27374,"[Chief Engineer, , , , , , , , , , , , , , , ,..."
49996,88457,"[Receptionist Activities, Front Desk, front of..."
49997,34590,"[SQL Queries, Log Analysis, Hardware Networkin..."
49998,86171,"[Quality Analyst, , , , , , , , , , , , , , , ..."


# Word Embedding relating skills that are seen together

## Word Embedding for Skills

In [12]:
# Train the Word2Vec model
sentences = df_combined['skills'].tolist()
model = Word2Vec(sentences, min_count=1, vector_size=300, window=300, sg=1)


# Prepare StackOverflow Dataset as available people

In [13]:

# Get only a sample
df = df.sample(5)

# Find People to a Role

## Get Role's Requirements

In [14]:
# Get the role requirements
role_skills = ['Python', 'Microsoft SQL Server', 'Microsoft Azure']
#role_experience = 5 # years
role_experience = 'Associate'

## WMD to get most similar people

In [15]:
# Normalize scores
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

### Score from Skills

In [17]:
#df = df.sample(10)

# Calculate the distance between the person's skills and the role skills
similarity_scores = []
for skills in df['skills']:
    similarity_scores.append(model.wv.wmdistance(role_skills, skills))
    #print('---> ')
    #print(role_skills)
    #print(skills)
    #print(model.wv.wmdistance(role_skills, skills))

# Add the dissimilarity score to each role
df['similarity_score_skills'] = similarity_scores


# Get the highest distance before inf
df_temp = df.loc[(df['similarity_score_skills'] < np.inf)]
max_score = df_temp['similarity_score_skills'].max()
print(max_score)

# Remove rows where the score is inf - meaning there is no similarity
df['similarity_score_skills'] = df['similarity_score_skills'].replace(np.inf, max_score)

#df = df.loc[(df['similarity_score_skills'] < np.inf)]

# Normalize scores and convert distance to similarity - 0 = more dissimilar, 1 = more similar
df['similarity_score_skills'] = 1 - scaler.fit_transform(df[['similarity_score_skills']])

# Sort the roles by similarity score
df = df.sort_values(by=['similarity_score_skills'], ascending=False)

df

0.7722246250670713


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills
36498,73133,"[Haskell, HTML/CSS, PHP, Python, SQL, TypeScri...",HTML/CSS;Python;SQL;TypeScript;Microsoft SQL S...,0,1.0
16974,53609,"[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...",C;C#;C++;HTML/CSS;JavaScript;SQLite;AWS;Micros...,22,0.354059
23427,23428,"[Bash/Shell, Go, HTML/CSS, JavaScript, Lua, PH...",Bash/Shell;Dart;Go;HTML/CSS;JavaScript;Lua;PHP...,0,0.274504
25376,62011,"[HTML/CSS, JavaScript, Ruby, SQL, Elasticsearc...",Bash/Shell;C;HTML/CSS;JavaScript;PHP;Ruby;SQL;...,5,0.116891
23054,23055,"[Bash/Shell, Dart, HTML/CSS, SQL, TypeScript, ...",TypeScript;PostgreSQL;Firebase;Node.js;React.j...,0,0.0


### Score from Interests

In [18]:
# Tokenize the skills
df['skills-want'] = df['skills-want'].apply(lambda x: x.split(';'))

# Calculate the similarity between the person's skills and the role skills
similarity_scores = []
for skills in df['skills-want']:
    similarity_scores.append(model.wv.wmdistance(role_skills, skills))

# Add the similarity scores to each role
df['similarity_score_interests'] = similarity_scores

# Get the highest distance before inf
df_temp = df.loc[(df['similarity_score_interests'] < np.inf)]
max_score = df_temp['similarity_score_interests'].max()
print(max_score)

# Remove rows where the score is inf - meaning there is no similarity
df['similarity_score_interests'] = df['similarity_score_interests'].replace(np.inf, max_score)

#df = df.loc[(df['similarity_score_interests'] < np.inf)]

# Normalize scores and convert distance to similarity - 0 = more dissimilar, 1 = more similar
df['similarity_score_interests'] =  1 - scaler.fit_transform(df[['similarity_score_interests']])

# Sort the roles by similarity score
df = df.sort_values(by=['similarity_score_interests'], ascending=False)

df

0.7932613158339935


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests
36498,73133,"[Haskell, HTML/CSS, PHP, Python, SQL, TypeScri...","[HTML/CSS, Python, SQL, TypeScript, Microsoft ...",0,1.0,1.0
16974,53609,"[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...","[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...",22,0.354059,0.385484
23427,23428,"[Bash/Shell, Go, HTML/CSS, JavaScript, Lua, PH...","[Bash/Shell, Dart, Go, HTML/CSS, JavaScript, L...",0,0.274504,0.296562
25376,62011,"[HTML/CSS, JavaScript, Ruby, SQL, Elasticsearc...","[Bash/Shell, C, HTML/CSS, JavaScript, PHP, Rub...",5,0.116891,0.159697
23054,23055,"[Bash/Shell, Dart, HTML/CSS, SQL, TypeScript, ...","[TypeScript, PostgreSQL, Firebase, Node.js, Re...",0,0.0,0.0


### Score from Experience

In [19]:


# define the bin edges for each level of experience
bin_edges = [-1, 4, 8, 12, 100]

# define the labels for each level of experience
labels = ['Junior', 'Associate', 'Senior', 'Architect']

# convert the 'years' column to categorical levels of experience
df['Experience'] = pd.cut(pd.to_numeric(df['YearsCodePro']), bins=bin_edges, labels=labels)

# print the resulting DataFrame
df

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience
36498,73133,"[Haskell, HTML/CSS, PHP, Python, SQL, TypeScri...","[HTML/CSS, Python, SQL, TypeScript, Microsoft ...",0,1.0,1.0,Junior
16974,53609,"[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...","[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...",22,0.354059,0.385484,Architect
23427,23428,"[Bash/Shell, Go, HTML/CSS, JavaScript, Lua, PH...","[Bash/Shell, Dart, Go, HTML/CSS, JavaScript, L...",0,0.274504,0.296562,Junior
25376,62011,"[HTML/CSS, JavaScript, Ruby, SQL, Elasticsearc...","[Bash/Shell, C, HTML/CSS, JavaScript, PHP, Rub...",5,0.116891,0.159697,Associate
23054,23055,"[Bash/Shell, Dart, HTML/CSS, SQL, TypeScript, ...","[TypeScript, PostgreSQL, Firebase, Node.js, Re...",0,0.0,0.0,Junior


In [20]:
from scipy.spatial.distance import euclidean

# define a function to compute the similarity metric
def similarity(level1, level2):
    # define the vectors representing the levels of experience
    levels = {'Junior': [0, 0, 0, 0],
              'Associate': [1, 0, 0, 0],
              'Senior': [1, 1, 0, 0],
              'Architect': [1, 1, 1, 0]}
    
    # compute the Euclidean distance between the two vectors
    distance = euclidean(levels[level1], levels[level2])
    
    # return the similarity metric
    return 1 / (1 + distance)

# example usage of the similarity function
#print(similarity('Junior', 'Architect'))

In [21]:

# use function to compute the similarity based on level

df['similarity_score_experience'] = df["Experience"].apply(lambda x: similarity(role_experience, x))
df


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience
36498,73133,"[Haskell, HTML/CSS, PHP, Python, SQL, TypeScri...","[HTML/CSS, Python, SQL, TypeScript, Microsoft ...",0,1.0,1.0,Junior,0.5
16974,53609,"[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...","[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...",22,0.354059,0.385484,Architect,0.414214
23427,23428,"[Bash/Shell, Go, HTML/CSS, JavaScript, Lua, PH...","[Bash/Shell, Dart, Go, HTML/CSS, JavaScript, L...",0,0.274504,0.296562,Junior,0.5
25376,62011,"[HTML/CSS, JavaScript, Ruby, SQL, Elasticsearc...","[Bash/Shell, C, HTML/CSS, JavaScript, PHP, Rub...",5,0.116891,0.159697,Associate,1.0
23054,23055,"[Bash/Shell, Dart, HTML/CSS, SQL, TypeScript, ...","[TypeScript, PostgreSQL, Firebase, Node.js, Re...",0,0.0,0.0,Junior,0.5


In [63]:


# Calculate score based on the absolute difference between values
#df['similarity_score_experience'] = df["YearsCodePro"].apply(lambda x: abs(role_experience - int(x)))

# Normalize scores
#df['similarity_score_experience'] =  1 - scaler.fit_transform(df[['similarity_score_experience']])

#df

### Compute final score

In [22]:
# Set weights
weight_skills = 0.5
weight_interests = 0.3
weight_experience = 0.2

# Calculate final score
df['score'] = df['similarity_score_skills'] * weight_skills + df['similarity_score_interests'] * weight_interests + df['similarity_score_experience'] * weight_experience

# Get only final columns
#df = df[['id', 'skills', 'score']]

# Sort roles
df = df.sort_values(by=['score'], ascending=False)

# See top 10 matches
df.head(10)

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score
36498,73133,"[Haskell, HTML/CSS, PHP, Python, SQL, TypeScri...","[HTML/CSS, Python, SQL, TypeScript, Microsoft ...",0,1.0,1.0,Junior,0.5,0.9
16974,53609,"[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...","[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...",22,0.354059,0.385484,Architect,0.414214,0.375518
23427,23428,"[Bash/Shell, Go, HTML/CSS, JavaScript, Lua, PH...","[Bash/Shell, Dart, Go, HTML/CSS, JavaScript, L...",0,0.274504,0.296562,Junior,0.5,0.326221
25376,62011,"[HTML/CSS, JavaScript, Ruby, SQL, Elasticsearc...","[Bash/Shell, C, HTML/CSS, JavaScript, PHP, Rub...",5,0.116891,0.159697,Associate,1.0,0.306354
23054,23055,"[Bash/Shell, Dart, HTML/CSS, SQL, TypeScript, ...","[TypeScript, PostgreSQL, Firebase, Node.js, Re...",0,0.0,0.0,Junior,0.5,0.1


# See Results

In [23]:
#df
df.head(10)

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score
36498,73133,"[Haskell, HTML/CSS, PHP, Python, SQL, TypeScri...","[HTML/CSS, Python, SQL, TypeScript, Microsoft ...",0,1.0,1.0,Junior,0.5,0.9
16974,53609,"[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...","[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...",22,0.354059,0.385484,Architect,0.414214,0.375518
23427,23428,"[Bash/Shell, Go, HTML/CSS, JavaScript, Lua, PH...","[Bash/Shell, Dart, Go, HTML/CSS, JavaScript, L...",0,0.274504,0.296562,Junior,0.5,0.326221
25376,62011,"[HTML/CSS, JavaScript, Ruby, SQL, Elasticsearc...","[Bash/Shell, C, HTML/CSS, JavaScript, PHP, Rub...",5,0.116891,0.159697,Associate,1.0,0.306354
23054,23055,"[Bash/Shell, Dart, HTML/CSS, SQL, TypeScript, ...","[TypeScript, PostgreSQL, Firebase, Node.js, Re...",0,0.0,0.0,Junior,0.5,0.1


In [24]:

# For each matched person, show % of skills matched and % of interests matched


# Also show the % of skills matched and the % of skills not matched
# and for the matched skills, show the % that is only for skills and only for interests

# Define the list of skills to compare
skills_to_compare = role_skills

# Define a function to compute the percentage of skills that are mentioned in the list of skills and also in the DataFrame column skills
def compute_matching_percentage(row, column='skills'):
    matched_skills = set(row[column]) & set(skills_to_compare)
    return len(matched_skills) / len(skills_to_compare)

# Get matching skills
def compute_matching_list(row, column='skills'):
    matched_skills = set(row[column]) & set(skills_to_compare)
    return matched_skills

# Get non matching skills
def compute_non_matching_list(row, column='skills'):
    non_matched_skills = set(skills_to_compare) - set(row[column])
    return non_matched_skills

# Define a function to compute the percentage of skills that are not mentioned in the DataFrame column skills
def compute_non_matching_percentage(row, column='skills'):
    non_matched_skills = set(skills_to_compare) - set(row[column])
    return len(non_matched_skills) / len(skills_to_compare)

# Apply the functions to each row of the DataFrame
df['matching_percentage_skills'] = df.apply(lambda x: compute_matching_percentage(x,'skills'), axis=1)
df['non_matching_percentage_skills'] = df.apply(lambda x: compute_non_matching_percentage(x,'skills'), axis=1)

df['matching_percentage_interests'] = df.apply(lambda x: compute_matching_percentage(x,'skills-want'), axis=1)
df['non_matching_percentage_interests'] = df.apply(lambda x: compute_non_matching_percentage(x,'skills-want'), axis=1)

df['matching_list_skills'] = df.apply(lambda x: compute_matching_list(x,'skills'), axis=1)
df['non_matching_list_skills'] = df.apply(lambda x: compute_non_matching_list(x,'skills'), axis=1)

df['matching_list_interests'] = df.apply(lambda x: compute_matching_list(x,'skills-want'), axis=1)
df['non_matching_list_insterests'] = df.apply(lambda x: compute_non_matching_list(x,'skills-want'), axis=1)


# Print the resulting DataFrame
df



Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,Experience,similarity_score_experience,score,matching_percentage_skills,non_matching_percentage_skills,matching_percentage_interests,non_matching_percentage_interests,matching_list_skills,non_matching_list_skills,matching_list_interests,non_matching_list_insterests
36498,73133,"[Haskell, HTML/CSS, PHP, Python, SQL, TypeScri...","[HTML/CSS, Python, SQL, TypeScript, Microsoft ...",0,1.0,1.0,Junior,0.5,0.9,1.0,0.0,1.0,0.0,"{Python, Microsoft SQL Server, Microsoft Azure}",{},"{Python, Microsoft SQL Server, Microsoft Azure}",{}
16974,53609,"[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...","[C, C#, C++, HTML/CSS, JavaScript, SQLite, AWS...",22,0.354059,0.385484,Architect,0.414214,0.375518,0.333333,0.666667,0.333333,0.666667,{Microsoft Azure},"{Python, Microsoft SQL Server}",{Microsoft Azure},"{Python, Microsoft SQL Server}"
23427,23428,"[Bash/Shell, Go, HTML/CSS, JavaScript, Lua, PH...","[Bash/Shell, Dart, Go, HTML/CSS, JavaScript, L...",0,0.274504,0.296562,Junior,0.5,0.326221,0.333333,0.666667,0.333333,0.666667,{Python},"{Microsoft SQL Server, Microsoft Azure}",{Python},"{Microsoft SQL Server, Microsoft Azure}"
25376,62011,"[HTML/CSS, JavaScript, Ruby, SQL, Elasticsearc...","[Bash/Shell, C, HTML/CSS, JavaScript, PHP, Rub...",5,0.116891,0.159697,Associate,1.0,0.306354,0.0,1.0,0.0,1.0,{},"{Python, Microsoft SQL Server, Microsoft Azure}",{},"{Python, Microsoft SQL Server, Microsoft Azure}"
23054,23055,"[Bash/Shell, Dart, HTML/CSS, SQL, TypeScript, ...","[TypeScript, PostgreSQL, Firebase, Node.js, Re...",0,0.0,0.0,Junior,0.5,0.1,0.0,1.0,0.0,1.0,{},"{Python, Microsoft SQL Server, Microsoft Azure}",{},"{Python, Microsoft SQL Server, Microsoft Azure}"


In [179]:
list_skills = df[df['id']==22519]['skills']
print(list_skills.values)

[list(['Go', 'JavaScript', 'Python', 'SQL', 'PostgreSQL', 'Redis', 'SQLite', 'AWS', 'Google Cloud', 'FastAPI', '', 'Ansible', 'Docker', 'Homebrew', 'Kubernetes', 'npm', 'Pulumi', 'Terraform'])]


# Check Word Embedding

In [162]:
model.wv.wmdistance(['SQL'], ['Python', 'SQL'])
#model.wv.wmdistance(['Bash/Shell', 'Python', 'SQL'], ['Python', 'SQL'])

#model.wv.wmdistance(role_skills, ['C#', 'HTML/CSS', 'JavaScript', 'PowerShell', 'Python', 'SQL', 'TypeScript', 'Microsoft SQL Server'])


0.47909408108215323

In [217]:

# Get list of all skills available to be selected - Source: Stack Overflow Survey questions
df_skills = pd.read_csv(filepath_or_buffer="../data/skill-list.csv", sep=",", encoding="latin1")

skill_check = 'Azure'

df_skills["similarity"] = df_skills["skill"].apply(lambda x: model.wv.wmdistance([skill_check], [x]))
df_skills = df_skills.sort_values(by=['similarity'], ascending=True)

#df_skills
print(df_skills.head(10))


            skill                  type synonym  similarity
72         VMware        Cloud Platform     NaN    0.590997
59            AWS        Cloud Platform     NaN    0.680139
39          Swift  Programming Language     NaN    0.689028
80         Drupal         Web Framework     NaN    0.689575
96        Symfony         Web Framework     NaN    0.715944
79         Django         Web Framework     NaN    0.771417
4              C#  Programming Language     NaN    0.772190
55     PostgreSQL              Database     NaN    0.772320
75        ASP.NET         Web Framework     NaN    0.794956
94  Ruby on Rails         Web Framework     NaN    0.809103


In [107]:

##df.loc[df['skills-want']!='SQL' & df['skills']!='SQL']

filtered_df = df[(df['skills-want'] !='SQL') & (df['skills'] != 'SQL')]
filtered_df


Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,similarity_score_experience,score
16741,16742,Swift,Swift,5,0.453604,0.671319,1.000000,0.568948
29850,29851,Swift,Swift,6,0.453604,0.671319,0.977778,0.564504
29728,29729,Swift,Swift,6,0.453604,0.671319,0.977778,0.564504
1458,1459,Swift,Swift,6,0.453604,0.671319,0.977778,0.564504
28157,28158,Python,Python,5,0.445198,0.681646,1.000000,0.562317
...,...,...,...,...,...,...,...,...
31547,31548,Bash/Shell;C;C#;C++;Java;Kotlin;Perl;PHP;Power...,Bash/Shell;C;C#;C++;Java;Kotlin;Perl;PHP;Power...,50,0.000000,inf,0.000000,0.000000
19912,19913,Java;PowerShell;Python;SQL;TypeScript;IBM DB2;...,HTML/CSS;JavaScript;PowerShell;Python;SQL;Type...,50,0.000000,inf,0.000000,0.000000
3644,40279,C#;HTML/CSS;JavaScript;PowerShell;SAS;SQL;Type...,C#;HTML/CSS;JavaScript;PowerShell;SAS;SQL;Type...,50,0.000000,inf,0.000000,0.000000
22678,59313,C#;PHP;SQL;VBA;Microsoft SQL Server;MySQL;VMware,C#;SQL;Microsoft SQL Server;MySQL;VMware,50,0.000000,inf,0.000000,0.000000


In [246]:

df[(df['id'] ==64129)]

Unnamed: 0,id,skills,skills-want,YearsCodePro,similarity_score_skills,similarity_score_interests,similarity_score_experience,score
27494,64129,"[Bash/Shell, Clojure, Python, SQL, PostgreSQL,...","[Bash/Shell, Python, SQL, PostgreSQL, Redis, A...",16,1.0,1.0,0.592593,0.959259
