# Predicting Career Satisfaction From Stack Overflow Data


## Read the dataset


In [17]:
import pandas as pd
# Load the public survey results.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns and the
# DtypeWarning pandas emits for them.
dataset = pd.read_csv('./survey_results_public.csv', low_memory=False)


  interactivity=interactivity, compiler=compiler, result=result)


## Clean the dataset


### Choose a subset of variables from the dataset for training data


In [18]:
# Columns kept for modelling. The multi-answer columns below are not used yet:
#   'LanguageWorkedWith', 'DatabaseWorkedWith', 'PlatformWorkedWith'
selected_columns = [
    'CareerSatisfaction',
    'FrameworkWorkedWith',
    'OperatingSystem',
    'NumberMonitors',
    'StackOverflowHasAccount',
    'HoursComputer',
    'HoursOutside',
]
X_subset = dataset[selected_columns]
print('shape before dropna: ', X_subset.shape)

# keep only the respondents who answered every selected question
X_subset = X_subset.dropna()
print('shape after dropna: ', X_subset.shape)


shape before dropna:  (98855, 7)
shape after dropna:  (43962, 7)


### Separate CareerSatisfaction (what we want to predict) from the rest of the columns


In [19]:
# y holds the column we want to predict
target_column = 'CareerSatisfaction'
y = X_subset[target_column]
# everything that is left over is a feature
X_subset = X_subset.drop(columns=[target_column])


Each person could choose 1 of 7 ways to rate their career satisfaction.

To simplify the data for ML algorithms, we will split this single column into 7 different columns, one for each category of response.


In [20]:
# before encoding, y is a single column of answer labels
print(y.shape)
y = pd.get_dummies(y)
# after get_dummies() the shape is (n_rows, n_categories): one indicator column is created for each distinct answer in the survey
print(y.shape)


(43962,)
(43962, 7)


### For the `FrameworkWorkedWith` column, one-hot encode each answer

Each response in the `FrameworkWorkedWith` column is a semicolon-separated list of frameworks (e.g. `.NET Core;Spark`).
The goal of this section of the code is twofold:
1. find all the Frameworks which respondents were able to choose from
2. turn each string response into a one-hot encoded vector (e.g. `.NET Core;Spark` becomes `[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]`)



In [21]:
def one_hot_encode(categories, data):
    '''
    Build a 0/1 indicator vector marking which categories occur in data.

    :param categories: sequence of every possible category value; it fixes the
        length and the order of the returned vector
    :param data: iterable of observed values; values that match no category
        are ignored
    :return: list of ints with one entry per category, 1 where the category
        equals some item of data and 0 otherwise
    '''
    observed = list(data)
    return [
        1 if any(item == category for item in observed) else 0
        for category in categories
    ]


In [22]:
# collect every distinct framework that appears in any response
# TODO: put all this one hot encoding stuff into a function and apply it to the LanguageWorkedWith, DatabaseWorkedWith, & PlatformWorkedWith cols
set_of_responses = set()
for response in X_subset['FrameworkWorkedWith']:
    # each response is a ';'-separated string; update() keeps only the new values
    set_of_responses.update(response.split(';'))
# alphabetical order gives a stable, readable column order
sorted_set_of_responses = sorted(set_of_responses)


Now that we've parsed all the data and found all the categories of responses for the FrameworkWorkedWith column, we can one-hot encode it easily


In [23]:
# Map each respondent's ';'-separated answer to a one-hot row, keyed by the
# row's original index label so the result can be aligned with X_subset.
# Series.items() is used because Series.iteritems() was deprecated in
# pandas 1.5 and removed in pandas 2.0.
data = {
    row_label: one_hot_encode(sorted_set_of_responses, resp.split(';'))
    for row_label, resp in X_subset['FrameworkWorkedWith'].items()
}

# build the dataframe that will replace the FrameworkWorkedWith column:
# one row per respondent, one column per framework
framework_col_replacement = pd.DataFrame.from_dict(
    data, orient='index', columns=sorted_set_of_responses
)
print(framework_col_replacement.head())


   .NET Core  Angular  Cordova  Django  Hadoop  Node.js  React  Spark  Spring  \
0          0        0        0       1       0        0      1      0       0   
1          0        0        0       1       0        0      0      0       0   
5          0        1        0       0       0        1      0      0       0   
6          0        0        0       0       0        1      1      0       0   
7          0        1        0       0       0        1      0      0       0   

   TensorFlow  Torch/PyTorch  Xamarin  
0           0              0        0  
1           0              0        0  
5           0              0        0  
6           0              0        0  
7           0              0        0  


### Now replace the FrameworkWorkedWith column with the new one hot encoded df we just built


In [24]:
# Swap the raw FrameworkWorkedWith strings for their one-hot columns.
# Concatenating along the column axis lines the two frames up by row label.
X_subset_no_framework_col = X_subset.drop(columns=['FrameworkWorkedWith'])
X_subset_one_hot = pd.concat(
    [X_subset_no_framework_col, framework_col_replacement],
    axis='columns',
)
print(X_subset_one_hot.shape)


(43962, 17)


### Create dummies for the rest of the categorical variables


In [25]:
# get_dummies() expands every remaining text/categorical column into
# indicator columns and passes numeric columns through unchanged
X_cleaned = pd.get_dummies(X_subset_one_hot)
# features first, then targets: both must have the same number of rows
for frame in (X_cleaned, y):
    print(frame.shape)


(43962, 34)
(43962, 7)


## Actually fit a model now that the data is clean


In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


In [36]:
# One seed for both the split and the tree, so the model and any score
# derived from it are reproducible across runs.
RANDOM_STATE = 69

# split into train and test sets (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned, y, test_size=0.3, random_state=RANDOM_STATE
)

# NOTE(review): y is a 7-column indicator frame at this point, so presumably
# the tree is fit as a multi-output problem. Confirm that is intended rather
# than a single multi-class target.
clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=69,
            splitter='best')

### We can also visualize the tree


In [47]:
# sklearn.externals.six was removed in scikit-learn 0.23; the standard-library
# StringIO is a drop-in replacement for the in-memory text buffer used here.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# export_graphviz writes the fitted tree in Graphviz DOT format into this buffer
dot_data = StringIO()

# feature_names is currently disabled, so nodes are labelled by feature index;
# re-enable the line below to show column names instead
#feature_names=X_train.columns.values, 
export_graphviz(clf,
                out_file=dot_data,
                filled=True,
                rounded=True,
                special_characters=True)

# render the DOT source to a PNG and display it as the cell output
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
