In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
# Read the raw survey export into a DataFrame.
data = pd.read_csv(filepath_or_buffer="stackoverflow_full.csv")
# Count the missing (NaN) entries in each column.
missing_data = data.isnull().sum()

In [3]:
# Show the per-column missing-value counts held in `missing_data`.
print(missing_data)

Unnamed: 0         0
Age                0
Accessibility      0
EdLevel            0
Employment         0
Gender             0
MentalHealth       0
MainBranch         0
YearsCode          0
YearsCodePro       0
Country            0
PreviousSalary     0
HaveWorkedWith    63
ComputerSkills     0
Employed           0
dtype: int64


In [20]:
# Preview the first rows of the loaded frame to check column names and value formats.
print(data.head())

   Unnamed: 0  Age Accessibility        EdLevel  Employment Gender  \
0           0  <35            No         Master           1    Man   
1           1  <35            No  Undergraduate           1    Man   
2           2  <35            No         Master           1    Man   
3           3  <35            No  Undergraduate           1    Man   
4           4  >35            No            PhD           0    Man   

  MentalHealth MainBranch  YearsCode  YearsCodePro    Country  PreviousSalary  \
0           No        Dev          7             4     Sweden         51552.0   
1           No        Dev         12             5      Spain         46482.0   
2           No        Dev         15             6    Germany         77290.0   
3           No        Dev          9             6     Canada         46135.0   
4           No     NotDev         40            30  Singapore        160932.0   

                                      HaveWorkedWith  ComputerSkills  Employed  
0          

In [1]:
# NOTE: this cell is intentionally disabled (every statement is commented out).
# It split the raw `data` frame into train/valid/test CSVs; the split at the end
# of the notebook writes the same three file names from `final_dataset` instead.
# Any output shown under this cell comes from an earlier run, not from the cell as saved.
# train_ratio = 0.7
# valid_ratio = 0.15
# test_ratio = 0.15 

# train_data, temp_data = train_test_split(data, test_size = 1 - train_ratio, random_state=42)
# valid_data, test_data = train_test_split(temp_data, test_size=test_ratio / (test_ratio + valid_ratio), random_state=42)

# train_data.to_csv("train_set.csv", index=False)
# valid_data.to_csv("valid_set.csv", index=False)
# test_data.to_csv("test_set.csv", index=False)

# print("Datasets saved successfully.")

Datasets saved successfully.


### Find the top ten computer skills

In [16]:
# Each row of 'HaveWorkedWith' is a ';'-separated string of skills; splitting
# yields a list per row (rows with a missing value stay NaN, not a list).
all_skills = data['HaveWorkedWith'].str.split(';')

# Flatten every row's list into one list of cleaned skill names.
flattened_skills = [
    skill.strip()
    for sublist in all_skills if isinstance(sublist, list)   # skip rows that did not split into a list (NaN)
    # Skip non-strings and empty tokens: '' appears when a row is an empty
    # string (e.g. after NaN has been filled with '') or ends with ';', and
    # must not be counted as a skill.
    for skill in sublist if isinstance(skill, str) and skill.strip()
]

# Frequency of each skill across all respondents.
skills_count = Counter(flattened_skills)
# Names of the ten most frequent skills, most common first.
top_skills = [skill for skill, count in skills_count.most_common(10)]
print(top_skills)

['JavaScript', 'Docker', 'HTML/CSS', 'SQL', 'Git', 'AWS', 'Python', 'PostgreSQL', 'MySQL', 'TypeScript']


Add ten columns to the dataset, one for each of the top ten most popular skills. Each column holds a binary value indicating whether the respondent listed that skill (1) or not (0):

In [19]:
def create_binary_columns(skill_list, top_skills):
    """Return a 0/1 indicator list marking which top skills one row contains.

    Parameters
    ----------
    skill_list : iterable of str, str, None or float
        The skills of a single row. A plain string is split on ';' so that
        matching is done on whole skill names rather than substrings.
        ``None`` or a float (such as NaN for a missing value) is treated as
        "no skills".
    top_skills : iterable of str
        The skills to test for; the result follows this order.

    Returns
    -------
    list of int
        One entry per element of ``top_skills``: 1 if that skill is present
        in ``skill_list``, otherwise 0.
    """
    def _clean(value):
        # Ignore surrounding whitespace on names; leave non-strings untouched.
        return value.strip() if isinstance(value, str) else value

    if skill_list is None or isinstance(skill_list, float):
        # Missing value: nothing to match against.
        skill_list = []
    elif isinstance(skill_list, str):
        # Raw cell value: tokenize so "Java" does not match inside "JavaScript".
        skill_list = skill_list.split(';')

    present = [_clean(entry) for entry in skill_list]
    return [1 if _clean(skill) in present else 0 for skill in top_skills]

# Replace NaN with an empty string so every row splits into a list below.
# Note that this overwrites the column in `data`, so missing-value counts
# computed before this point no longer describe the frame.
data['HaveWorkedWith'] = data['HaveWorkedWith'].fillna('')
# One list of skill names per row.
all_skills = data['HaveWorkedWith'].str.split(';')
# One 0/1 list per row, ordered like `top_skills`.
binary_columns = all_skills.apply(lambda x: create_binary_columns(x, top_skills))
# Give the indicator frame the same index as `data`: pd.concat(axis=1) aligns
# rows by index label, so a fresh 0..n-1 index would misalign (and introduce
# NaN rows) whenever `data` carries a non-default index.
binary_df = pd.DataFrame(binary_columns.tolist(), columns=top_skills, index=data.index)
# Original columns followed by the ten skill indicator columns.
final_dataset = pd.concat([data, binary_df], axis=1)
print(final_dataset.head())

   Unnamed: 0  Age Accessibility        EdLevel  Employment Gender  \
0           0  <35            No         Master           1    Man   
1           1  <35            No  Undergraduate           1    Man   
2           2  <35            No         Master           1    Man   
3           3  <35            No  Undergraduate           1    Man   
4           4  >35            No            PhD           0    Man   

  MentalHealth MainBranch  YearsCode  YearsCodePro  ... JavaScript  Docker  \
0           No        Dev          7             4  ...          0       0   
1           No        Dev         12             5  ...          1       0   
2           No        Dev         15             6  ...          0       0   
3           No        Dev          9             6  ...          1       0   
4           No     NotDev         40            30  ...          0       0   

  HTML/CSS  SQL  Git  AWS  Python  PostgreSQL  MySQL  TypeScript  
0        0    0    1    0       1          

Split the final dataset into three CSV files (train/valid/test):

In [21]:
# Fixed ratios and a fixed random_state keep the split reproducible.

train_ratio = 0.7
valid_ratio = 0.15
test_ratio = 0.15
# The three shares must cover the whole dataset.
assert abs(train_ratio + valid_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# Share held out for validation + test. It is written as a sum rather than
# `1 - train_ratio`: the latter evaluates to 0.30000000000000004, and since
# train_test_split rounds the test row count up, that can add one extra row
# to the hold-out when 30% of the row count is a whole number.
holdout_ratio = valid_ratio + test_ratio

# First split: train vs. everything held out.
train_data, temp_data = train_test_split(final_dataset, test_size=holdout_ratio, random_state=42)
# Second split: divide the held-out rows between validation and test.
valid_data, test_data = train_test_split(temp_data, test_size=test_ratio / holdout_ratio, random_state=42)

# Every row must land in exactly one of the three subsets.
assert len(train_data) + len(valid_data) + len(test_data) == len(final_dataset)

# index=False: do not write the DataFrame index as an extra column.
train_data.to_csv("train_set.csv", index=False)
valid_data.to_csv("valid_set.csv", index=False)
test_data.to_csv("test_set.csv", index=False)

print("Split datasets saved successfully.")

Datasets saved successfully.
