In [56]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support, roc_curve
import matplotlib.pyplot as plt

#load data
data = fetch_openml("titanic", version=1, as_frame=True)
titanic = pd.DataFrame(data.data, columns=data.feature_names)
titanic_target = pd.DataFrame(data.target)
titanic.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [57]:
titanic_target.head()

Unnamed: 0,survived
0,1
1,1
2,0
3,0
4,0


In [58]:
#drop useless columns
titanic.drop(["name", "ticket", "cabin", "embarked", "boat", "body", "home.dest"], axis=1, inplace=True)
titanic

Unnamed: 0,pclass,sex,age,sibsp,parch,fare
0,1,female,29.0000,0,0,211.3375
1,1,male,0.9167,1,2,151.5500
2,1,female,2.0000,1,2,151.5500
3,1,male,30.0000,1,2,151.5500
4,1,female,25.0000,1,2,151.5500
...,...,...,...,...,...,...
1304,3,female,14.5000,1,0,14.4542
1305,3,female,,1,0,14.4542
1306,3,male,26.5000,0,0,7.2250
1307,3,male,27.0000,0,0,7.2250


In [59]:
#make male a column from sex
titanic['male'] = titanic['sex'] == 'male'
titanic.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,male
0,1,female,29.0,0,0,211.3375,False
1,1,male,0.9167,1,2,151.55,True
2,1,female,2.0,1,2,151.55,False
3,1,male,30.0,1,2,151.55,True
4,1,female,25.0,1,2,151.55,False


In [60]:
#merge and extract important columns
merged_titanic = pd.concat([titanic, titanic_target], axis=1)
merged_titanic = merged_titanic.dropna(subset=["age", "fare"])
merged_titanic.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,male,survived
0,1,female,29.0,0,0,211.3375,False,1
1,1,male,0.9167,1,2,151.55,True,1
2,1,female,2.0,1,2,151.55,False,0
3,1,male,30.0,1,2,151.55,True,0
4,1,female,25.0,1,2,151.55,False,0


In [61]:
#split data
X = merged_titanic[["pclass", "male", "age", "sibsp", "parch", "fare"]].values
y = merged_titanic["survived"].values
y = y.astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)


In [62]:
#model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

#test
model.predict([[3, True, 22, 1, 0, 7.25]])

array([0])

In [63]:
#scoring the model of decision tree with logistics regression
kf = KFold(n_splits=5, shuffle=True, random_state=10)
dt_accuracy_scores = []
dt_precision_scores = []
dt_recall_scores = []
lr_accuracy_scores = []
lr_precision_scores = []
lr_recall_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    #dt model
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    dt_y_pred = dt.predict(X_test)
    
    #score dt
    dt_accuracy_scores.append(precision_score(y_test, dt_y_pred))
    dt_precision_scores.append(precision_score(y_test, dt_y_pred))
    dt_recall_scores.append(recall_score(y_test, dt_y_pred))

    #lr model
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    lr_y_pred = lr.predict(X_test)

    #score lr
    lr_accuracy_scores.append(accuracy_score(y_test, lr_y_pred))
    lr_precision_scores.append(precision_score(y_test, lr_y_pred))  
    lr_recall_scores.append(recall_score(y_test, lr_y_pred))

print("Decision Tree")
print("Accuracy: ", np.mean(dt_accuracy_scores))
print("Precision: ", np.mean(dt_precision_scores))
print("Recall: ", np.mean(dt_recall_scores))

print("Logistic Regression")
print("Accuracy: ", np.mean(lr_accuracy_scores))
print("Precision: ", np.mean(lr_precision_scores))
print("Recall: ", np.mean(lr_recall_scores))


Decision Tree
Accuracy:  0.693590592662096
Precision:  0.693590592662096
Recall:  0.6632116258617615
Logistic Regression
Accuracy:  0.7866028708133971
Precision:  0.7531585310642546
Recall:  0.6997278494697599


In [64]:
#gini vs entropy

for criterion in ["gini", "entropy"]:
    print("Decision Tree - {}".format(criterion))
    accuracy = []
    precision = []
    recall = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        dt = DecisionTreeClassifier(criterion=criterion)
        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)

        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))

    print("Accuracy: ", np.mean(accuracy))
    print("Precision: ", np.mean(precision))    
    print("Recall: ", np.mean(recall))

Decision Tree - gini
Accuracy:  0.7464114832535884
Precision:  0.6922917272647525
Recall:  0.6704487480414123
Decision Tree - entropy
Accuracy:  0.7358851674641148
Precision:  0.6835175402591133
Recall:  0.6425132168825745


In [65]:
#visualise
from sklearn.tree import export_graphviz
import graphviz
from IPython.display import Image
feature_names = ["pclass", "male", "age", "sibsp", "parch", "fare"]
X = merged_titanic[feature_names].values
y = merged_titanic["survived"].values
y = y.astype(int)

dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(X, y)

dot_file = export_graphviz(dt, feature_names=feature_names)
graph = graphviz.Source(dot_file)
graph.render(filename="titanic_tree", format="png", cleanup=True)

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH