# Predicting Career Satisfaction From Stack Overflow Data



# Cleaning 2018 Stack Overflow Developer Survey Data




## Read the dataset


In [13]:
import pandas as pd
# Load the raw survey export; the path is resolved relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='../survey_results_public.csv')


## Clean the dataset


### Choose a subset of variables from the dataset for training data


In [14]:
# Columns kept for modelling: the target (CareerSatisfaction) plus six candidate predictors.
selected_columns = [
    'CareerSatisfaction',
    'FrameworkWorkedWith',
    'OperatingSystem',
    'NumberMonitors',
    'StackOverflowHasAccount',
    'HoursComputer',
    'HoursOutside',
]
X_subset = dataset.loc[:, selected_columns]

print('shape before dropna: ', X_subset.shape)
# Keep only the rows in which every selected column has a value.
X_subset = X_subset.dropna(axis=0, how='any')
print('shape after dropna: ', X_subset.shape)


shape before dropna:  (98855, 7)
shape after dropna:  (43962, 7)


### Separate CareerSatisfaction (what we want to predict) from the rest of the columns


In [15]:
# Split the prediction target off from the feature columns.
y = X_subset.loc[:, 'CareerSatisfaction']
X_subset = X_subset.drop(columns=['CareerSatisfaction'])


### Create dummies / one-hot encode y
Each person could choose 1 of 7 ways to rate their career satisfaction.

To simplify the data for ML algorithms, we will split this single column into 7 different columns, one for each category of response.


In [16]:
shape_before_dummies = y.shape
print(shape_before_dummies)
# get_dummies() adds one indicator column per distinct answer, so the single
# column becomes a (43962, 7) frame for the 7 satisfaction categories
y = pd.get_dummies(data=y)
print(y.shape)


(43962,)
(43962, 7)


### For the `FrameworkWorkedWith` column, one-hot encode each answer

Each response in the `FrameworkWorkedWith` column is a semicolon-separated list of frameworks (e.g. `.NET Core;Spark`).
The goal of this section of the code is twofold:
1. find all the Frameworks which respondents were able to choose from
2. turn each string response into a one-hot encoded vector (e.g. `.NET Core;Spark` becomes `[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]`)



In [17]:
def one_hot_encode(categories, data):
    '''
    Build a 0/1 indicator vector marking which categories occur in data.

    More than one position may be set, because a single record can contain
    several categories.

    :param categories: ordered sequence of hashable category labels; position i
        of the result corresponds to categories[i]
    :param data: iterable of hashable labels observed for one record (e.g. the
        list obtained by splitting a semicolon-separated answer); labels that
        are not in categories are ignored
    :return: list of ints with the same length as categories, holding 1 where
        the category appears in data and 0 everywhere else
    '''
    # A set gives constant-time membership checks, so each category is tested
    # once instead of being compared against every item in data.
    observed = set(data)
    return [1 if category in observed else 0 for category in categories]


In [18]:
# collect every distinct framework that appears in any respondent's answer
set_of_responses = set()
for answer in X_subset['FrameworkWorkedWith']:
    # each answer is a ';'-separated string; fold its parts into the running set
    set_of_responses |= set(answer.split(';'))
# alphabetical order gives the encoded columns a stable, readable ordering
sorted_set_of_responses = sorted(set_of_responses)


Now that we've parsed all the responses from semicolon-separated strings into lists and found all the categories of responses for the `FrameworkWorkedWith` column, we can one-hot encode it easily.


In [19]:
# iterate thru responses once more to map each answer onto a one-hot encoded row,
# keyed by the row's index label in X_subset
# NOTE: Series.iteritems() was removed in pandas 2.0; Series.items() is the replacement
data = {}
for i, resp in X_subset['FrameworkWorkedWith'].items():
    resp_list = resp.split(';')
    data[i] = one_hot_encode(sorted_set_of_responses, resp_list)

# build a new dataframe that will replace the FrameworkWorkedWith column with the dict we just built
# (from_dict is a classmethod, so it is called on the class rather than on an empty instance)
framework_col_replacement = pd.DataFrame.from_dict(data, orient='index', columns=sorted_set_of_responses)
print(framework_col_replacement.head())


   .NET Core  Angular  Cordova  Django  Hadoop  Node.js  React  Spark  Spring  \
0          0        0        0       1       0        0      1      0       0   
1          0        0        0       1       0        0      0      0       0   
5          0        1        0       0       0        1      0      0       0   
6          0        0        0       0       0        1      1      0       0   
7          0        1        0       0       0        1      0      0       0   

   TensorFlow  Torch/PyTorch  Xamarin  
0           0              0        0  
1           0              0        0  
5           0              0        0  
6           0              0        0  
7           0              0        0  


### Now replace the FrameworkWorkedWith column with the new one hot encoded df we just built


In [25]:
# Drop the raw semicolon-separated column, then attach the encoded frame
# column-wise; pd.concat aligns the two frames on their index labels.
X_subset_no_framework_col = X_subset.drop(columns=['FrameworkWorkedWith'])
X_subset_with_one_hot_df = pd.concat(
    [X_subset_no_framework_col, framework_col_replacement],
    axis='columns',
)

### Create dummies for the rest of the categorical variables


In [31]:
# Encode the frame built in the previous step. The original cell referenced
# `X_subset_one_hot`, a name no cell defines, which raises NameError on a
# fresh kernel.
X_cleaned = pd.get_dummies(X_subset_with_one_hot_df)
print('The final, cleaned features we will train on:')
print(X_cleaned.columns.values)

The final, cleaned features we will train on:
['.NET Core' 'Angular' 'Cordova' 'Django' 'Hadoop' 'Node.js' 'React'
 'Spark' 'Spring' 'TensorFlow' 'Torch/PyTorch' 'Xamarin'
 'OperatingSystem_BSD/Unix' 'OperatingSystem_Linux-based'
 'OperatingSystem_MacOS' 'OperatingSystem_Windows' 'NumberMonitors_1'
 'NumberMonitors_2' 'NumberMonitors_3' 'NumberMonitors_4'
 'NumberMonitors_More than 4'
 "StackOverflowHasAccount_I'm not sure / I can't remember"
 'StackOverflowHasAccount_No' 'StackOverflowHasAccount_Yes'
 'HoursComputer_1 - 4 hours' 'HoursComputer_5 - 8 hours'
 'HoursComputer_9 - 12 hours' 'HoursComputer_Less than 1 hour'
 'HoursComputer_Over 12 hours' 'HoursOutside_1 - 2 hours'
 'HoursOutside_3 - 4 hours' 'HoursOutside_30 - 59 minutes'
 'HoursOutside_Less than 30 minutes' 'HoursOutside_Over 4 hours']


### Export new CSV file with cleaned data

In [32]:
# Put the target columns first, followed by the feature columns matched on index.
clean_dataset = y.join(other=X_cleaned)
# index=False leaves the row index out of the exported file.
clean_dataset.to_csv(path_or_buf='../clean_dataset.csv', index=False)