In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(rc={'figure.figsize': (12, 8)})
sns.set_palette('Set3')
np.random.seed(86)

In [2]:
df = pd.read_csv('./data/mental-health/survey.csv')

In [3]:
df.drop('Timestamp', axis=1, inplace=True)

df.replace(
    {
        'Gender': {
            'male': 'Male',
            'm': 'Male',
            'M': 'Male',
            'female': 'Female',
            'f': 'Female',
            'F': 'Female'
        }
    },
    inplace=True
)
df.loc[~df['Gender'].isin(['Male', 'Female']), 'Gender'] = 'Other'

df.loc[df['Age'] < 0, 'Age'] = np.nan
df.loc[df['Age'] > 200, 'Age'] = np.nan
df.loc[:, 'Age'] = df['Age'].fillna(0).astype(np.float32)

df.loc[:, 'self_employed'] = df['self_employed'].fillna('No')

df.drop('comments', axis=1, inplace=True)

In [4]:
y  = df['treatment'] == 'Yes'
df.drop('treatment', axis=1, inplace=True)

In [5]:
X = pd.concat(
    [df['Age']] + \
    [pd.get_dummies(df[column], prefix=column) for column in df.columns[1:]],
    axis=1
)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_jobs=4)
param_grid = {
    'n_estimators': [32, 64, 96, 110, 130, 156],
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 8, 12, 15],
    'min_samples_leaf': [1, 2, 3],
}
gridsearch = GridSearchCV(clf, param_grid, verbose=1)

In [7]:
gridsearch.fit(X, y)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed:  3.7min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [32, 64, 96, 110, 130, 156], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 8, 12, 15], 'min_samples_leaf': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [8]:
gridsearch.best_params_

{'criterion': 'entropy',
 'max_depth': 15,
 'min_samples_leaf': 3,
 'n_estimators': 64}

In [9]:
gridsearch.best_score_

0.80142970611596509