# Predicting Career Satisfaction From Stack Overflow Data



## Read the dataset


In [61]:
import pandas as pd
dataset = pd.read_csv('./survey_results_public.csv')


## Separate career satisfaction data from the rest of the columns


In [62]:
# remove all rows from the dataset where career satisfaction has no data
dataset = dataset[dataset['CareerSatisfaction'].notnull()]
# load the career satisfaction column into y
y = dataset['CareerSatisfaction']
# load the the dataset minus the career satisfaction column into x
print(dataset.shape)
X = dataset.drop('CareerSatisfaction', axis=1)
X = dataset.drop('JobSatisfaction', axis=1)
print(X.shape)


(76504, 129)
(76504, 128)


## Create dummy variables for y


In [63]:
print(y.shape)
y = pd.get_dummies(y)
# notice the shape after creating dummies is (76504, 7), because get_dummies() created a new column for each category of answer in the survey
print(y.shape)


(76504,)
(76504, 7)


## Clean the rest of the dataset


### Get rid of columns which were completed by <60% of respondents


In [64]:
cols_sorted_by_complete_entries = X.count().sort_values()
cols_sorted_by_percent_complete = cols_sorted_by_complete_entries / X.shape[0]
# remove all columns which were completed by less than %60 of respondents
print(cols_sorted_by_percent_complete)
good_cols_mask = cols_sorted_by_percent_complete >= 0.6
good_cols = cols_sorted_by_percent_complete[good_cols_mask].index
print('before % clean: ', X.shape)
print(X.count())
X = X[good_cols]
print('after % clean', X.shape)
print(X.count())


TimeAfterBootcamp              0.083460
MilitaryUS                     0.193872
HackathonReasons               0.319330
ErgonomicDevices               0.409155
AdBlockerReasons               0.440709
StackOverflowJobsRecommend     0.463806
JobEmailPriorities1            0.580166
JobEmailPriorities2            0.580166
JobEmailPriorities3            0.580166
JobEmailPriorities5            0.580166
JobEmailPriorities6            0.580166
JobEmailPriorities7            0.580166
JobEmailPriorities4            0.580166
FrameworkWorkedWith            0.607393
JobContactPriorities4          0.610922
JobContactPriorities3          0.610922
JobContactPriorities2          0.610922
JobContactPriorities1          0.610922
JobContactPriorities5          0.610922
ConvertedSalary                0.613889
AdBlockerDisable               0.636620
FrameworkDesireNextYear        0.646228
Salary                         0.648567
SalaryType                     0.652084
DatabaseDesireNextYear         0.664305


In [70]:
# check how many resp we'd have if we nuked every resp that has nan for the FrameworkWorkedWith question
print('is null before null resp removal: ', X.isna().shape)
print('before: ', X.shape)
# X['FrameworkWorkedWith']
no_na_framework_worked_with = X.dropna(subset=['FrameworkWorkedWith'])
print('is null after null resp removal: ', no_na_framework_worked_with.isna().shape)
print('after', no_na_framework_worked_with.shape)


is null before null resp removal:  (76504, 115)
before:  (76504, 115)
is null after null resp removal:  (46468, 115)
after (46468, 115)


### For the `FrameworkWorkedWith` column, one-hot encode each answer

The responses in the `FrameworkWorkedWith` column correspond to a semicolon separated list (e.g `.NET Core;Spark`).
The goal of this section of the code is twofold:
1. find all the Frameworks which respondents were able to choose from
2. turn each string response into a one-hot encoded vector (e.g. `.NET Core;Spark` becomes `[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]`)


In [34]:
# start by getting every possible category of response
set_of_responses = set()
for resp in X['FrameworkWorkedWith']:
    if isinstance(resp, float):
        # handles the nan case
        continue
    resp_set = set(resp.split(';'))
    set_of_responses.update(resp_set)
# put the frameworks in alphabetical order for prettiness
sorted_set_of_responses = sorted(set_of_responses)

# iterate thru responses once more and build up one-hot encoded responses from the string responses
for resp in X['FrameworkWorkedWith']:
    if isinstance(resp, float):
        # we will impute these datapoints later with 
        continue



['.NET Core', 'Angular', 'Cordova', 'Django', 'Hadoop', 'Node.js', 'React', 'Spark', 'Spring', 'TensorFlow', 'Torch/PyTorch', 'Xamarin']
# of resp:  12


In [None]:
# now drop the original FrameworkWorkedWith column and add in the new one-hot encoded columns



### Manually drop all other open-ended columns


In [72]:
#TODO: zach nuke shit here pls
X = dataset.drop('<unwanted column name here>', axis=1)


## Actually fit a model now that the data is clean


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

MemoryError: 

In [None]:
# print(X.sum(axis=1))
X_smol = X[:500]
print(X_smol.shape)
print(X_smol.columns[X_smol.sum() == 1])
# print(X.head())

In [None]:
sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
sel.fit(X_train, y_train)