# Predicting Career Satisfaction From Stack Overflow Data


## Read the dataset


In [2]:
import pandas as pd
dataset = pd.read_csv('./survey_results_public.csv')


  interactivity=interactivity, compiler=compiler, result=result)


## Separate career satisfaction data from the rest of the columns


In [3]:
# remove all rows from the dataset where career satisfaction has no data
dataset = dataset[dataset['CareerSatisfaction'].notnull()]
# load the career satisfaction column into y
y = dataset['CareerSatisfaction']
# load the the dataset minus the career satisfaction column into x
print(dataset.shape)
X = dataset.drop('CareerSatisfaction', axis=1)
print(X.shape)


(76504, 129)
(76504, 128)


## Create dummy variables for y


In [28]:
print(y.shape)
y = pd.get_dummies(y)
# notice the shape after creating dummies is (76504, 7), because get_dummies() created a new column for each category of answer in the survey
print(y.shape)


(76504, 7)
(76504, 7)


## Clean the rest of the dataset


### Choose a subset of variables from the dataset for training data


In [29]:
X_subset = X[[
               'LanguageWorkedWith',
               'DatabaseWorkedWith',
               'PlatformWorkedWith',
               'FrameworkWorkedWith',
               'OperatingSystem',
               'NumberMonitors',
               'StackOverflowHasAccount',
               'HoursComputer',
               'HoursOutside'
]]

print('shape before dropna: ', X_subset.shape)
# drop any rows with null data
X_subset = X_subset.dropna()
print('shape after dropna: ', X_subset.shape)


shape before dropna:  (76504, 9)
shape after dropna:  (36581, 9)


### For the `FrameworkWorkedWith` column, one-hot encode each answer

The responses in the `FrameworkWorkedWith` column correspond to a semicolon separated list (e.g `.NET Core;Spark`).
The goal of this section of the code is twofold:
1. find all the Frameworks which respondents were able to choose from
2. turn each string response into a one-hot encoded vector (e.g. `.NET Core;Spark` becomes `[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]`)



In [30]:
def one_hot_encode(categories, data):
    '''
    takes in a list of categories and a list of data and returns a one-hot encoded vector 
    :param categories: 
    :param data: 
    :return: 
    '''
    one_hot_vector = [0 for _ in range(len(categories))] # initialize a vector with 0s for each category
    for item in data:
        for i, category in enumerate(categories):
            if item == category:
                one_hot_vector[i] = 1
    return one_hot_vector


In [31]:
# start by getting every possible category of response
set_of_responses = set()
for resp in X_subset['FrameworkWorkedWith']:
    # parse the response into a list of values
    resp_list = resp.split(';')
    # replace the response in the dataframe with the parsed list of values
    
    # add to the set of total responses
    resp_set = set(resp_list)
    set_of_responses.update(resp_set)
# put the frameworks in alphabetical order for prettiness
sorted_set_of_responses = sorted(set_of_responses)


Now that we've parsed all the data and found all the categories of responses for the FrameworkWorkedWith column, we can one-hot encode it easily


In [44]:
# iterate thru responses once more to map the list of responses into one-hot encoded columns
X_subset_subset = X_subset[:100]
data = {}
for i, resp in X_subset_subset['FrameworkWorkedWith'].iteritems():
    resp_list = resp.split(';')
    one_hot_vector = one_hot_encode(list(sorted_set_of_responses) , resp_list)
    data[i] = one_hot_vector

# build a new dataframe that will replace the FrameworkWorkedWith column with the dict we just built
framework_col_replacement = pd.DataFrame().from_dict(data, orient='index', columns=list(sorted_set_of_responses))
print(framework_col_replacement.head())

   .NET Core  Angular  Cordova  Django  Hadoop  Node.js  React  Spark  Spring  \
0          0        0        0       1       0        0      1      0       0   
1          0        0        0       1       0        0      0      0       0   
5          0        1        0       0       0        1      0      0       0   
6          0        0        0       0       0        1      1      0       0   
7          0        1        0       0       0        1      0      0       0   

   TensorFlow  Torch/PyTorch  Xamarin  
0           0              0        0  
1           0              0        0  
5           0              0        0  
6           0              0        0  
7           0              0        0  


### Manually drop all other open-ended columns


In [72]:
# just DON'T nuke these columns: LanguageWorkedWith, FrameworkWorkedWith, 
X = dataset.drop('<unwanted column name here>', axis=1)


## Actually fit a model now that the data is clean


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

MemoryError: 

In [None]:
# print(X.sum(axis=1))
X_smol = X[:500]
print(X_smol.shape)
print(X_smol.columns[X_smol.sum() == 1])
# print(X.head())

In [None]:
sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
sel.fit(X_train, y_train)