In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn import tree
from sklearn import datasets
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

**Read and clean data**

In [18]:
# Load the passenger records and discard every row that has a missing value.
df = pd.read_csv("titanic.csv").dropna()

# Feature matrix: every column except the name, the target and "Sex".
X = df.drop(columns=["Name", "Survived", "Sex"])

# Encode the passenger class as consecutive integers starting at 1.
# sort=True numbers the labels in sorted order rather than in order of first
# appearance, so the codes do not depend on how the CSV rows happen to be
# ordered (assuming the labels are "1st", "2nd", "3rd" this gives 1, 2, 3).
X["PClass"] = pd.factorize(X["PClass"], sort=True)[0] + 1

# Target vector.
y = df["Survived"]

**Split into train/test** 

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


**Define model type and fit**

In [20]:
# Seed the estimator: scikit-learn permutes the candidate features at every
# split, so an unseeded tree (and every score derived from it) can change from
# one run to the next. 0 matches the seed used for the train/test split.
tree_model = tree.DecisionTreeClassifier(random_state=0)

# Fit on the training partition only; as the last expression of the cell this
# also displays the fitted estimator.
tree_model.fit(X_train, y_train)

DecisionTreeClassifier()

**Scoring**

In [21]:
# Accuracy on the rows the tree was fitted on; the error rate is its complement.
train_score = tree_model.score(X_train, y_train)
train_error = 1 - train_score

print(
    "Score on data used in training =",
    train_score,
    "==>error =",
    train_error,
)

Score on data used in training = 0.8758278145695364 ==>error = 0.1241721854304636


In [22]:
# Accuracy on the held-out rows the tree never saw while fitting.
test_score = tree_model.score(X_test, y_test)
test_error = 1 - test_score

print(
    "Score on test (unseen) data =",
    test_score,
    "==>error =",
    test_error,
)

Score on test (unseen) data = 0.7763157894736842 ==>error = 0.22368421052631582


**The error is much higher on the test data than on the training data, which suggests the model is over-fit.**

**Hyperparameter Tuning**

In [23]:
# Exhaustive search over 2 criteria x 30 depths x 3 feature counts = 180
# candidate settings, each evaluated with 4-fold cross-validation, so this
# cell can be slow.
grid_param_tree = dict(
    criterion=["gini", "entropy"],
    max_depth=range(1, 31),
    max_features=range(1, 4),
)

# Build the search first, then fit it on the training partition only.
grid_search_tree = ms.GridSearchCV(
    estimator=tree_model,
    param_grid=grid_param_tree,
    cv=4,
    scoring="accuracy",
)
grid_search_tree.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print(
    "Best parameters:",
    grid_search_tree.best_params_,
    "\nBest Score:",
    grid_search_tree.best_score_,
)

Best parameters: {'criterion': 'entropy', 'max_depth': 6, 'max_features': 1} 
Best Score: 0.8079470198675497


In [24]:
# The search refits its best parameter combination on the full training
# partition; score that refitted tree on both partitions.
best_tree = grid_search_tree.best_estimator_

grid_train_score = best_tree.score(X_train, y_train)
grid_train_error = 1 - grid_train_score
print(
    "Best estimator score on data used in training =",
    grid_train_score,
    "==>error =",
    grid_train_error,
)

grid_test_score = best_tree.score(X_test, y_test)
grid_test_error = 1 - grid_test_score
print(
    "Best estimator score on test (unseen) data =",
    grid_test_score,
    "==>error =",
    grid_test_error,
)

Best estimator score on data used in training = 0.8344370860927153 ==>error = 0.16556291390728473
Best estimator score on test (unseen) data = 0.8092105263157895 ==>error = 0.1907894736842105


**After tuning, the error on the test data has dropped by more than 3 percentage points, though the exact figure fluctuates from run to run unless the tree's `random_state` is fixed.**