In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from time import time
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
def preprocessdataframe (df):
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
    imputer = imputer.fit(df.loc[:,['Age']])
    df.loc[:,'Age'] = imputer.transform(df.loc[:,['Age']])

    imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
    imputer = imputer.fit(df.loc[:,['Fare']])
    df.loc[:,'Fare'] = imputer.transform(df.loc[:,['Fare']])

    df.Embarked = df.Embarked.fillna('S')

    df = pd.get_dummies(data=df, columns=['Embarked', 'Pclass', 'Sex'])

    return df

In [3]:
df = pd.read_csv('~/Documents/GitHub/TiberDataScienceLearning/Data/Titanic/train.csv')
y = df[['Survived']]
x = df[['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
x_train = preprocessdataframe(x_train)
x_test = preprocessdataframe(x_test)

In [5]:
#basic decision tree with no hyperparameters
clf = tree.DecisionTreeClassifier()
cross_val_roc = cross_val_score(clf, X=x_train, y=y_train, cv=10, scoring='roc_auc')
roc_score = np.mean(cross_val_roc)
print("No hyperparameter decision tree: ", roc_score)

No hyperparameter decision tree:  0.7499196351667281


In [6]:
#testing out max_depth parameters with values from 1 to 12
aucs = dict()
for i in range(1,12):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    cross_val_roc = cross_val_score(clf, x_train, y_train, cv=10, scoring='roc_auc')
    roc_score = np.mean(cross_val_roc)
    aucs[i] = roc_score
print("Using the max_depth hyperparameter: ", aucs)

Using the max_depth hyperparameter:  {1: 0.7652214280121256, 2: 0.8065079085427922, 3: 0.8418283751132588, 4: 0.8427579714643668, 5: 0.8421548681723101, 6: 0.8256215952436883, 7: 0.7989714419946978, 8: 0.7906519514077653, 9: 0.7977096197299686, 10: 0.7740012248733179, 11: 0.7609355249616877}


In [7]:
#test out min_samples_split parameter with values of [.01, .05, .1, .2, .5]
aucs = dict()
params = [.01, .05, .1, .2, .5]
for i in params:
    clf = tree.DecisionTreeClassifier(min_samples_split = i)
    cross_val_roc = cross_val_score(clf, X=x_train, y=y_train, cv=10, scoring='roc_auc')
    roc_score = np.mean(cross_val_roc)
    aucs[i] = roc_score
print("Using the min_samples_split hyperparameter: ", aucs)

Using the min_samples_split hyperparameter:  {0.01: 0.7971155351969306, 0.05: 0.8157350078861707, 0.1: 0.8302917044196114, 0.2: 0.8400290627761556, 0.5: 0.804992337550477}


In [8]:
#test out min_samples_leaf parameter with values of [.01, .05, .1, .2, .5]
aucs = dict()
params = [.01, .05, .1, .2, .5]
for i in params:
    clf = tree.DecisionTreeClassifier(min_samples_leaf = i)
    cross_val_roc = cross_val_score(clf, X=x_train, y=y_train, cv=10, scoring='roc_auc')
    roc_score = np.mean(cross_val_roc)
    aucs[i] = roc_score
print("Using the min_samples_leaf hyperparameter: ", aucs)

Using the min_samples_leaf hyperparameter:  {0.01: 0.8288779615087755, 0.05: 0.8297986090137253, 0.1: 0.8305953709296732, 0.2: 0.7927730726422586, 0.5: 0.5233987158405763}


In [9]:
#test out max_features parameter with values of 1 to 8
aucs = dict()
for i in range(1,8):
    clf = tree.DecisionTreeClassifier(max_features=i)
    cross_val_roc = cross_val_score(clf, x_train, y_train, cv=10, scoring='roc_auc')
    roc_score = np.mean(cross_val_roc)
    aucs[i] = roc_score
print("Using the max_features hyperparameter: ", aucs)

Using the max_features hyperparameter:  {1: 0.739331367383693, 2: 0.7160683873619921, 3: 0.7270564867948589, 4: 0.7330205082385315, 5: 0.751907642314619, 6: 0.7753040859872256, 7: 0.7592604827343199}


In [10]:
#test out min_impurity_decrease parameter with values of [.001, .0001, .01, .05, .1, .2, .5]
aucs = dict()
params = [.001, .0001, .01, .05, .1, .2, .5]
for i in params:
    clf = tree.DecisionTreeClassifier(min_impurity_decrease = i)
    cross_val_roc = cross_val_score(clf, X=x_train, y=y_train, cv=10, scoring='roc_auc')
    roc_score = np.mean(cross_val_roc)
    aucs[i] = roc_score
print("Using the min_impurity_decrease hyperparameter: ", aucs)

Using the min_impurity_decrease hyperparameter:  {0.001: 0.7565446827074734, 0.0001: 0.7625792112710716, 0.01: 0.8196643903039252, 0.05: 0.7652214280121256, 0.1: 0.7652214280121256, 0.2: 0.5, 0.5: 0.5}


In [11]:
#use GridSearchCV to find the best hyperparameters
param_grid = [
  {'max_depth': range(1,12), 'min_samples_split': [.01, .05, .1, .2, .5], 'min_samples_leaf': [.01, .05, .1, .2, .5], 'max_features': range(1,8), 'min_impurity_decrease': [.001, .0001, .01, .05, .1, .2, .5]},
 ]
clf = tree.DecisionTreeClassifier()
gscv = GridSearchCV(clf, param_grid, cv=10, scoring='roc_auc')
start = time()
gscv = gscv.fit(x_train , y_train)
stop = time()
print("Best Score: ", gscv.best_score_)
print("Best Parameters: ", gscv.best_params_)
print("Time: ", stop-start)

Best Score:  0.8519964713172512
Best Parameters:  {'max_depth': 10, 'max_features': 6, 'min_impurity_decrease': 0.0001, 'min_samples_leaf': 0.01, 'min_samples_split': 0.1}
Time:  670.9715230464935




In [12]:
#use RandomizedSearchCV to find the best hyperparameters
param_grid = {'max_depth': range(1,12), 'min_samples_split': [.01, .05, .1, .2, .5], 'min_samples_leaf': [.01, .05, .1, .2, .5], 'max_features': range(1,8), 'min_impurity_decrease': [.001, .0001, .01, .05, .1, .2, .5]}
clf = tree.DecisionTreeClassifier()
gscv = RandomizedSearchCV(clf, param_grid, cv=10, scoring='roc_auc')
start = time()
gscv = gscv.fit(x_train , y_train)
stop = time()
print("Best Score: ", gscv.best_score_)
print("Best Parameters: ", gscv.best_params_)
print("Time: ", stop-start)

Best Score:  0.7727468404728659
Best Parameters:  {'min_samples_split': 0.2, 'min_samples_leaf': 0.2, 'min_impurity_decrease': 0.0001, 'max_features': 7, 'max_depth': 11}
Time:  0.5059289932250977


