# Word2Vec Word Embedding using skills from Developers' Profiles

The model is trained using the Word2Vec word embedding technique.

The word embedding is built based on a data combination from people's skills (Stack Overflow Developer Survey).

The created model is saved to a file for later loading.

## Import Libraries

In [4]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Load Employees Dataset

In [5]:
df_people = pd.read_csv(filepath_or_buffer="../2-data/employees.csv", sep=",", encoding="latin1")
df_people = df_people[['id', 'skills']]
df_people

Unnamed: 0,id,skills
0,2,JavaScript;TypeScript
1,3,C#;C++;HTML/CSS;JavaScript;Python;Microsoft SQ...
2,4,C#;JavaScript;SQL;TypeScript;Microsoft SQL Ser...
3,5,C#;HTML/CSS;JavaScript;SQL;Swift;TypeScript;Cl...
4,6,C++;Lua;;;;;Homebrew
...,...,...
68821,73264,Bash/Shell;Dart;JavaScript;PHP;Python;SQL;Type...
68822,73265,Bash/Shell;HTML/CSS;JavaScript;Python;SQL;Elas...
68823,73266,HTML/CSS;JavaScript;PHP;Python;SQL;MariaDB;Mic...
68824,73267,C#;Delphi;VBA;Microsoft SQL Server;MongoDB;Oracle


In [12]:
# Make list out of skills
df_combined = df_people
# Make list out of skills
df_combined['skills'] = df_combined['skills'].apply(lambda x: x.split(';'))

# Remove empty skills
df_combined['skills'] = df_combined['skills'].apply(lambda x: [value for value in x if value != ''])
df_combined


Unnamed: 0,id,skills,NumSkills
0,2,"[JavaScript, TypeScript]",21
1,3,"[C#, C++, HTML/CSS, JavaScript, Python, Micros...",73
2,4,"[C#, JavaScript, SQL, TypeScript, Microsoft SQ...",77
3,5,"[C#, HTML/CSS, JavaScript, SQL, Swift, TypeScr...",200
4,6,"[C++, Lua, Homebrew]",20
...,...,...,...
68821,73264,"[Bash/Shell, Dart, JavaScript, PHP, Python, SQ...",182
68822,73265,"[Bash/Shell, HTML/CSS, JavaScript, Python, SQL...",184
68823,73266,"[HTML/CSS, JavaScript, PHP, Python, SQL, Maria...",172
68824,73267,"[C#, Delphi, VBA, Microsoft SQL Server, MongoD...",49


In [13]:
# Check usual number of skills per row
df_combined['NumSkills'] = df_combined['skills'].apply(len)
max_num = df_combined['NumSkills'].max()
mean_num = df_combined['NumSkills'].mean()
median_num = df_combined['NumSkills'].median()
print("Max:", max_num, "- Mean:",mean_num, "- Median:", median_num)

df_combined = df_combined[['id','skills']]
df_combined

Max: 134 - Mean: 14.746403975241915 - Median: 13.0


Unnamed: 0,id,skills
0,2,"[JavaScript, TypeScript]"
1,3,"[C#, C++, HTML/CSS, JavaScript, Python, Micros..."
2,4,"[C#, JavaScript, SQL, TypeScript, Microsoft SQ..."
3,5,"[C#, HTML/CSS, JavaScript, SQL, Swift, TypeScr..."
4,6,"[C++, Lua, Homebrew]"
...,...,...
68821,73264,"[Bash/Shell, Dart, JavaScript, PHP, Python, SQ..."
68822,73265,"[Bash/Shell, HTML/CSS, JavaScript, Python, SQL..."
68823,73266,"[HTML/CSS, JavaScript, PHP, Python, SQL, Maria..."
68824,73267,"[C#, Delphi, VBA, Microsoft SQL Server, MongoD..."


# Word Embedding relating skills that are seen together

## Word Embedding for Skills

In [14]:
# Create the Word2Vec model
sentences = df_combined['skills'].tolist()
model = Word2Vec(sentences, min_count=1, vector_size=300, window=7, sg=1)


In [36]:
model.save("model-w2v-people")

In [32]:
model = Word2Vec.load("model-w2vcombinedfiltered")

# Check Word Embedding

In [15]:
model.wv.wmdistance(['SQL'], ['Python', 'SQL'])


0.6024587555067721

In [18]:
sims = model.wv.most_similar('C#', topn=20)
sims

#model.wv.similarity('SQL', 'Python')

#df_combined['skills'].max()


[('F#', 0.5623462200164795),
 ('COBOL', 0.4988788366317749),
 ('Crystal', 0.4701869487762451),
 ('Delphi', 0.46900737285614014),
 ('VBA', 0.4425012171268463),
 ('PowerShell', 0.43050915002822876),
 ('Fortran', 0.42202165722846985),
 ('Haskell', 0.3764938414096832),
 ('Blazor', 0.3576357662677765),
 ('Assembly', 0.35495758056640625),
 ('IBM DB2', 0.3505973517894745),
 ('Microsoft SQL Server', 0.3432474136352539),
 ('Lua', 0.34258902072906494),
 ('Perl', 0.34067055583000183),
 ('HTML/CSS', 0.33643677830696106),
 ('C++', 0.3307570815086365),
 ('LISP', 0.32829993963241577),
 ('MATLAB', 0.32751238346099854),
 ('Dart', 0.32698988914489746),
 ('ASP.NET', 0.3254520297050476)]