In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from time import time
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
def preprocessdataframe(df):
    """Impute missing values and one-hot encode the categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``Age``, ``Fare``,
        ``Embarked``, ``Pclass`` and ``Sex``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which missing ``Age`` and ``Fare`` values are replaced
        by that column's mean, missing ``Embarked`` values are replaced by
        ``'S'``, and ``Embarked``, ``Pclass`` and ``Sex`` are expanded into
        indicator columns by ``pd.get_dummies``.

    Notes
    -----
    The fill means are computed from the frame that is passed in, so two
    different frames (e.g. a train and a test split) are filled with their own
    means.
    """
    # Work on a copy: the original wrote into the caller's frame (often a slice
    # of another frame), which only ran quietly because the notebook disables
    # pandas' chained-assignment warning.
    df = df.copy()

    # Mean imputation; Series.mean() skips NaN, matching the 'mean' strategy
    # that was previously applied column by column.
    for column in ('Age', 'Fare'):
        df[column] = df[column].fillna(df[column].mean())

    df['Embarked'] = df['Embarked'].fillna('S')

    return pd.get_dummies(data=df, columns=['Embarked', 'Pclass', 'Sex'])

In [None]:
def showdecisiontree(model, feature_names, name):
    """Export a fitted decision tree to Graphviz and render it to disk.

    Parameters
    ----------
    model
        Fitted tree estimator accepted by ``tree.export_graphviz``.
    feature_names
        Names used to label the split features in the drawing.
    name
        File name handed to ``graphviz.Source.render``.

    Returns
    -------
    bool
        Always ``True``.
    """
    # The notebook's import cell never imports graphviz, so the original body
    # raised NameError on a fresh kernel. Importing here keeps the helper
    # self-contained.
    import graphviz

    dot_data = tree.export_graphviz(
        model,
        out_file=None,  # no output file is passed; the DOT source is taken from the return value
        feature_names=feature_names,
        class_names=['Did not survive', 'Survived'],
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graphviz.Source(dot_data).render(name)
    return True



In [None]:
def showroccurve(fpr, tpr, roc_auc, label):
    """Plot one ROC curve together with the diagonal reference line.

    Parameters
    ----------
    fpr, tpr
        False-positive and true-positive rates to plot on the x and y axes.
    roc_auc
        Area under the curve; shown in the legend with two decimals.
    label
        Name of the curve, shown in the legend.
    """
    # matplotlib is not imported in the notebook's import cell, so the
    # original body raised NameError on a fresh kernel.
    import matplotlib.pyplot as plt

    lw = 2
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=lw,
             label='ROC curve - {0} (area = {1:0.2f})'.format(label, roc_auc))
    # Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()



In [None]:
def showmultiroccurve(params):
    """Plot several ROC curves on one figure with the diagonal reference line.

    Parameters
    ----------
    params : list of dict
        One dictionary per curve with the keys ``"fpr"``, ``"tpr"``,
        ``"roc_auc"``, ``"label"`` and ``"color"``.
    """
    # matplotlib is not imported in the notebook's import cell, so the
    # original body raised NameError on a fresh kernel.
    import matplotlib.pyplot as plt

    lw = 2
    plt.figure()
    for param in params:
        legend_text = 'ROC curve - {0} (area = {1:0.2f})'.format(
            param["label"], param["roc_auc"])
        plt.plot(param["fpr"], param["tpr"], color=param["color"],
                 lw=lw, label=legend_text)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# roccurvevalues = []

# clfkfold = tree.DecisionTreeClassifier()
# kf = StratifiedKFold(n_splits=10)

# for train_index, test_index in kf.split(x_train, y_train):
#     roccurvevalue = dict()
#     clfkfold = tree.DecisionTreeClassifier()

#     cvv_x_train, cvv_x_test = x_train.iloc[train_index], x_train.iloc[test_index]
#     cvv_y_train, cvv_y_test = y_train.iloc[train_index], y_train.iloc[test_index]


#     clfkfold = clfkfold.fit(cvv_x_train, cvv_y_train)
#     cvv_y_predictions = clfkfold.predict(cvv_x_test)

#     cvv_cm = pd.DataFrame(
#         confusion_matrix(cvv_y_test, cvv_y_predictions),
#         columns=['Predicted Not Survival', 'Predicted Survival'],
#         index=['True Not Survival', 'True Survival']
#     )
#     cvv_score = clfkfold.score(cvv_x_test, cvv_y_test)

#     cvv_fpr, cvv_tpr, thresholds = metrics.roc_curve(cvv_y_test, cvv_y_predictions)
#     cvv_roc_auc = metrics.roc_auc_score(cvv_y_test, cvv_y_predictions)

#     fprs.append(cvv_fpr[1])
#     tprs.append(cvv_tpr[1])
#     rocs.append(cvv_roc_auc)

# avgfpr = array.array('f', [0.0, np.mean(fprs), 1])
# avgtpr = array.array('f', [0.0, np.mean(tprs), 1])
# avgroc = np.mean(rocs)

In [None]:
# Read the Titanic training file, then separate the target column from the
# raw predictor columns used by the rest of the notebook.
target_columns = ['Survived']
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

df = pd.read_csv('~/Documents/GitHub/TiberDataScienceLearning/Data/Titanic/train.csv')
y = df[target_columns]
x = df[feature_columns]

In [None]:
# Hold out 20% of the rows with a fixed seed, then preprocess each split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
x_train = preprocessdataframe(x_train)
x_test = preprocessdataframe(x_test)
# The two splits are one-hot encoded independently, so a category that is
# missing from one of them would leave the frames with different columns.
# Align the test frame to the training columns (absent indicators become 0,
# extra ones are dropped) so a model fitted on x_train can score x_test.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

In [None]:
# Baseline: decision tree with default hyperparameters, scored by mean ROC AUC
# over 10-fold cross-validation on the training split.
# random_state is fixed (same seed as the train/test split) so that the score
# is reproducible from one run of the notebook to the next.
clf = tree.DecisionTreeClassifier(random_state=0)
cross_val_roc = cross_val_score(clf, X=x_train, y=y_train, cv=10, scoring='roc_auc')
roc_score = np.mean(cross_val_roc)
print("No hyperparameter decision tree: ", roc_score)

In [None]:
# Sweep max_depth over 1..11 and record the mean 10-fold ROC AUC per value.
# NOTE(review): the original comment said "1 to 12", but range(1, 12) stops at
# 11. The range is left unchanged because the grid search later in this
# notebook uses the same range(1, 12); widen both together if 12 was intended.
aucs = dict()
for i in range(1, 12):
    # Fixed seed so the comparison between depths is reproducible.
    clf = tree.DecisionTreeClassifier(max_depth=i, random_state=0)
    cross_val_roc = cross_val_score(clf, x_train, y_train, cv=10, scoring='roc_auc')
    roc_score = np.mean(cross_val_roc)
    aucs[i] = roc_score
print("Using the max_depth hyperparameter: ", aucs)

In [None]:
# Sweep min_samples_split over [.01, .05, .1, .2, .5] and record the mean
# 10-fold ROC AUC per value. The values are floats, which scikit-learn
# interprets as fractions of the training samples.
aucs = dict()
params = [.01, .05, .1, .2, .5]
for i in params:
    # Fixed seed so the comparison between values is reproducible.
    clf = tree.DecisionTreeClassifier(min_samples_split=i, random_state=0)
    cross_val_roc = cross_val_score(clf, X=x_train, y=y_train, cv=10, scoring='roc_auc')
    roc_score = np.mean(cross_val_roc)
    aucs[i] = roc_score
print("Using the min_samples_split hyperparameter: ", aucs)

In [None]:
# Sweep min_samples_leaf over [.01, .05, .1, .2, .5] and record the mean
# 10-fold ROC AUC per value. The values are floats, which scikit-learn
# interprets as fractions of the training samples.
aucs = dict()
params = [.01, .05, .1, .2, .5]
for i in params:
    # Fixed seed so the comparison between values is reproducible.
    clf = tree.DecisionTreeClassifier(min_samples_leaf=i, random_state=0)
    cross_val_roc = cross_val_score(clf, X=x_train, y=y_train, cv=10, scoring='roc_auc')
    roc_score = np.mean(cross_val_roc)
    aucs[i] = roc_score
print("Using the min_samples_leaf hyperparameter: ", aucs)

In [None]:
# Sweep max_features over 1..7 and record the mean 10-fold ROC AUC per value.
# NOTE(review): the original comment said "1 to 8", but range(1, 8) stops at 7.
# The range is left unchanged because the grid search later in this notebook
# uses the same range(1, 8); widen both together if 8 was intended.
aucs = dict()
for i in range(1, 8):
    # With max_features below the total number of features the candidate
    # features are drawn at random for each split, so the seed is fixed to
    # make the scores reproducible.
    clf = tree.DecisionTreeClassifier(max_features=i, random_state=0)
    cross_val_roc = cross_val_score(clf, x_train, y_train, cv=10, scoring='roc_auc')
    roc_score = np.mean(cross_val_roc)
    aucs[i] = roc_score
print("Using the max_features hyperparameter: ", aucs)

In [None]:
# Sweep min_impurity_decrease over [.0001, .001, .01, .05, .1, .2, .5] and
# record the mean 10-fold ROC AUC per value.
aucs = dict()
params = [.0001, .001, .01, .05, .1, .2, .5]
for i in params:
    # Fixed seed so the comparison between values is reproducible.
    clf = tree.DecisionTreeClassifier(min_impurity_decrease=i, random_state=0)
    cross_val_roc = cross_val_score(clf, X=x_train, y=y_train, cv=10, scoring='roc_auc')
    roc_score = np.mean(cross_val_roc)
    aucs[i] = roc_score
print("Using the min_impurity_decrease hyperparameter: ", aucs)

In [None]:
# Exhaustive search over all five hyperparameters with GridSearchCV, timed.
# The grid has 11 * 5 * 5 * 7 * 7 = 13,475 candidates, each scored with
# 10-fold cross-validation, so this cell is expensive to run.
param_grid = [
    {
        'max_depth': range(1, 12),
        'min_samples_split': [.01, .05, .1, .2, .5],
        'min_samples_leaf': [.01, .05, .1, .2, .5],
        'max_features': range(1, 8),
        'min_impurity_decrease': [.001, .0001, .01, .05, .1, .2, .5],
    },
]
# Fixed seed: max_features makes each tree sample candidate features at
# random, so without it the best score/parameters change from run to run.
clf = tree.DecisionTreeClassifier(random_state=0)
gscv = GridSearchCV(clf, param_grid, cv=10, scoring='roc_auc')
start = time()
gscv = gscv.fit(x_train, y_train)
stop = time()
print("Best Score: ", gscv.best_score_)
print("Best Parameters: ", gscv.best_params_)
print("Time: ", stop - start)

In [None]:
# Randomized search over the same hyperparameter space, timed for comparison
# with the exhaustive grid search. n_iter is left at the library default.
param_grid = {
    'max_depth': range(1, 12),
    'min_samples_split': [.01, .05, .1, .2, .5],
    'min_samples_leaf': [.01, .05, .1, .2, .5],
    'max_features': range(1, 8),
    'min_impurity_decrease': [.001, .0001, .01, .05, .1, .2, .5],
}
# Seed both sources of randomness: the tree (random feature subsets via
# max_features) and the search (which candidates are sampled). Without this
# the reported best score and parameters differ on every run.
clf = tree.DecisionTreeClassifier(random_state=0)
gscv = RandomizedSearchCV(clf, param_grid, cv=10, scoring='roc_auc', random_state=0)
start = time()
gscv = gscv.fit(x_train, y_train)
stop = time()
print("Best Score: ", gscv.best_score_)
print("Best Parameters: ", gscv.best_params_)
print("Time: ", stop - start)