# Titanic (Decision Trees)

## Load Libraries and Data

In [1]:
# Load packages
import sys
import io
import requests
import warnings
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from matplotlib import pyplot as plt
import seaborn as sns

# If on colab, install dtreeviz
if 'google.colab' in sys.modules:
    !pip install -q dtreeviz
    
import dtreeviz

# Make this notebook's output stable across runs
random_state = 1000
np.random.seed(random_state)

# Options for plots
%matplotlib inline
sns.set()

In [2]:
# Download the Titanic passenger data and parse it into a DataFrame
url = 'https://raw.githubusercontent.com/natecraig/aiml/main/Data/titanic.csv'

# Bound the wait on the network call so the cell cannot hang indefinitely
response = requests.get(url, timeout=30)

# Fail loudly on an HTTP error instead of handing an error page to read_csv
response.raise_for_status()

download = response.content
df = pd.read_csv(io.StringIO(download.decode('utf-8')))
df.head()

Unnamed: 0,Passenger ID,Passenger Class,Survived,Name,Sex,Age,On-Board Siblings or Spouses,On-Board Parents or Children,Ticket Number,Fare,Cabin,Port of Embarkation,Destination
0,1,1,1,"Allen, Miss. Elisabeth Walton",Female,29.0,0,0,24160,211.3375,B5,Southampton,"St Louis, MO"
1,2,1,1,"Allison, Master. Hudson Trevor",Male,0.9167,1,2,113781,151.55,C22 C26,Southampton,"Montreal, PQ / Chesterville, ON"
2,3,1,0,"Allison, Miss. Helen Loraine",Female,2.0,1,2,113781,151.55,C22 C26,Southampton,"Montreal, PQ / Chesterville, ON"
3,4,1,0,"Allison, Mr. Hudson Joshua Creighton",Male,30.0,1,2,113781,151.55,C22 C26,Southampton,"Montreal, PQ / Chesterville, ON"
4,5,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",Female,25.0,1,2,113781,151.55,C22 C26,Southampton,"Montreal, PQ / Chesterville, ON"


In [3]:
# Summary statistics for the numeric columns. `count` is the number of
# non-missing values, so a count below the row total flags missing data.
df.describe()

Unnamed: 0,Passenger ID,Passenger Class,Survived,Age,On-Board Siblings or Spouses,On-Board Parents or Children,Fare
count,1309.0,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479
std,378.020061,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668
min,1.0,1.0,0.0,0.1667,0.0,0.0,0.0
25%,328.0,2.0,0.0,21.0,0.0,0.0,7.8958
50%,655.0,3.0,0.0,28.0,0.0,0.0,14.4542
75%,982.0,3.0,1.0,39.0,1.0,0.0,31.275
max,1309.0,3.0,1.0,80.0,8.0,9.0,512.3292


## Decision Tree

In [4]:
# Classify survived based on passenger class, age, fare, the number of
# on-board siblings/spouses and parents/children, and sex
num_feature_names = ['Passenger Class', 'Age', 'Fare',
                     'On-Board Siblings or Spouses',
                     'On-Board Parents or Children']
cat_feature_names = ['Sex']

# Drop observations missing age or fare
df = df.dropna(subset=['Age', 'Fare'])

# Get numerical features
X_num = df[num_feature_names]

# One-hot encode categorical features (one indicator column per category)
X_cat = pd.get_dummies(df[cat_feature_names])

# Combine numerical and categorical features
X = pd.concat([X_num, X_cat], axis=1)

# Sanity check: the estimators fitted below should not be handed missing values
assert not X.isna().any().any(), 'Feature matrix contains missing values'

# Target and its sorted distinct labels
y = df['Survived']
class_names = sorted(y.unique())

# Split data into training and testing sets, stratifying on the target so
# both sets keep the same class proportions
(X_train, X_test, 
 y_train, y_test) = train_test_split(X, y, test_size=0.15,
                                     stratify=y, random_state=random_state)

In [5]:
# Fit a depth-1 decision tree (a single split) on the training data
tree_clf = DecisionTreeClassifier(random_state=random_state, max_depth=1)
tree_clf.fit(X=X_train, y=y_train)

In [6]:
# Evaluate the fitted tree on the same data it was trained on
y_pred = tree_clf.predict(X_train)
train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.79      0.85      0.82       525
           1       0.76      0.67      0.71       363

    accuracy                           0.78       888
   macro avg       0.77      0.76      0.77       888
weighted avg       0.78      0.78      0.78       888



In [7]:
# Evaluate the fitted tree on the held-out testing data
y_pred = tree_clf.predict(X_test)
test_report = classification_report(y_true=y_test, y_pred=y_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.82      0.81      0.81        93
           1       0.72      0.73      0.73        64

    accuracy                           0.78       157
   macro avg       0.77      0.77      0.77       157
weighted avg       0.78      0.78      0.78       157

