# Prepare Employees Dataset

Pre-processing steps to prepare the dataset that contains all employees' data for training and evaluation.

Source dataset: Stack Overflow Developer Survey

In [8]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

In [9]:
# Read the StackOverflow dataset
df = pd.concat([pd.read_csv(filepath_or_buffer="../0-rawdata/survey_results_public_1.csv", sep=",", encoding="latin1"), pd.read_csv(filepath_or_buffer="../0-rawdata/survey_results_public_2.csv", sep=",", encoding="latin1")])

# Select the required columns and fill nulls
df = df[["ResponseId","YearsCodePro","LanguageHaveWorkedWith","LanguageWantToWorkWith","DatabaseHaveWorkedWith","DatabaseWantToWorkWith","PlatformHaveWorkedWith","PlatformWantToWorkWith","WebframeHaveWorkedWith","WebframeWantToWorkWith","MiscTechHaveWorkedWith","MiscTechWantToWorkWith","ToolsTechHaveWorkedWith","ToolsTechWantToWorkWith"]]
df = df.fillna('')

# Join all "HaveWorked" and "WantToWorkWith" columns
df["skills"] = df["LanguageHaveWorkedWith"] + ";" + df["DatabaseHaveWorkedWith"]+ ";" + df["PlatformHaveWorkedWith"] + ";" + df["WebframeHaveWorkedWith"] + ";" + df["MiscTechHaveWorkedWith"] + ";" + df["ToolsTechHaveWorkedWith"]
df["skills-want"] = df["LanguageWantToWorkWith"] + ";" + df["DatabaseWantToWorkWith"]+ ";" + df["PlatformWantToWorkWith"] + ";" + df["WebframeWantToWorkWith"] + ";" + df["MiscTechWantToWorkWith"] + ";" + df["ToolsTechWantToWorkWith"]
df['id'] = df['ResponseId']

# Remove original columns
df = df[['id', 'skills', 'skills-want', 'YearsCodePro']]

# Remove extra ;s
df['skills'] = df['skills'].str.strip(';')
df['skills-want'] = df['skills-want'].str.strip(';')

# Remove empty skills
df = df[df['skills']!= '']
df = df[df['skills-want']!= '']

# Adjust the Experience column
df["YearsCodePro"] = df["YearsCodePro"].apply(lambda x: 0 if x == '' or x == 'Less than 1 year' else x)
df["YearsCodePro"] = df["YearsCodePro"].apply(lambda x: 50 if x == 'More than 50 years' else x)

# Remove people with identical profiles
df = df.drop_duplicates(subset=['skills', 'skills-want', 'YearsCodePro'])

# Tokenize the skills
#df['skills'] = df['skills'].apply(lambda x: x.split(';'))

df



Unnamed: 0,id,skills,skills-want,YearsCodePro
1,2,JavaScript;TypeScript,Rust;TypeScript,0
2,3,C#;C++;HTML/CSS;JavaScript;Python;Microsoft SQ...,C#;C++;HTML/CSS;JavaScript;TypeScript;Microsof...,5
3,4,C#;JavaScript;SQL;TypeScript;Microsoft SQL Ser...,C#;SQL;TypeScript;Microsoft SQL Server;;ASP.NE...,17
4,5,C#;HTML/CSS;JavaScript;SQL;Swift;TypeScript;Cl...,C#;Elixir;F#;Go;JavaScript;Rust;TypeScript;Clo...,3
5,6,C++;Lua;;;;;Homebrew,Lua;;;;;Homebrew,0
...,...,...,...,...
36629,73264,Bash/Shell;Dart;JavaScript;PHP;Python;SQL;Type...,Bash/Shell;Go;JavaScript;Python;SQL;TypeScript...,5
36630,73265,Bash/Shell;HTML/CSS;JavaScript;Python;SQL;Elas...,HTML/CSS;JavaScript;Python;Elasticsearch;Neo4j...,5
36631,73266,HTML/CSS;JavaScript;PHP;Python;SQL;MariaDB;Mic...,C#;HTML/CSS;JavaScript;PHP;Python;SQL;MariaDB;...,33
36632,73267,C#;Delphi;VBA;Microsoft SQL Server;MongoDB;Oracle,Delphi,31


In [10]:
# Save data frame to CSV
df.to_csv("../2-data/employees.csv")