In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(rc={'figure.figsize': (12, 8)})
sns.set_palette('Set3')
np.random.seed(86)

In [2]:
df = pd.read_csv('./data/mental-health/survey.csv')

In [3]:
df.drop('Timestamp', axis=1, inplace=True)

df.replace(
    {
        'Gender': {
            'male': 'Male',
            'm': 'Male',
            'M': 'Male',
            'female': 'Female',
            'f': 'Female',
            'F': 'Female'
        }
    },
    inplace=True
)
df.loc[~df['Gender'].isin(['Male', 'Female']), 'Gender'] = 'Other'

df.loc[df['Age'] < 0, 'Age'] = np.nan
df.loc[df['Age'] > 200, 'Age'] = np.nan
df.loc[:, 'Age'] = df['Age'].fillna(0).astype(np.float32)

df.loc[:, 'self_employed'] = df['self_employed'].fillna('No')

df.drop('comments', axis=1, inplace=True)

In [4]:
y  = df['treatment'] == 'Yes'
df.drop('treatment', axis=1, inplace=True)

In [5]:
X = pd.concat(
    [df['Age']] + \
    [pd.get_dummies(df[column], prefix=column) for column in df.columns[1:]],
    axis=1
)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()
param_grid = {
    'learning_rate': [1e-1, 1e-2, 1e-3, 1e-4],
    'n_estimators': [64, 96, 120, 130, 150, 180, 240],
    'max_depth': [1, 2, 3, 6, 8, 12]
}
gridsearch = GridSearchCV(clf, param_grid, verbose=1, n_jobs=4)

In [7]:
gridsearch.fit(X, y)

Fitting 3 folds for each of 168 candidates, totalling 504 fits


[Parallel(n_jobs=4)]: Done  73 tasks      | elapsed:   11.7s
[Parallel(n_jobs=4)]: Done 223 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 473 tasks      | elapsed:  3.9min
[Parallel(n_jobs=4)]: Done 504 out of 504 | elapsed:  4.6min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'learning_rate': [0.1, 0.01, 0.001, 0.0001], 'n_estimators': [64, 96, 120, 130, 150, 180, 240], 'max_depth': [1, 2, 3, 6, 8, 12]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [8]:
gridsearch.best_params_

{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 64}

In [9]:
gridsearch.best_score_

0.84034948371723595