In [None]:
import sys
import io
import requests
import warnings
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
!pip install -q dtreeviz

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/90.5 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.5/90.5 KB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import dtreeviz

In [None]:
# Make this notebook's output stable across runs.
# np.random.seed only seeds NumPy's global random number generator; the same
# `random_state` value is also passed explicitly to train_test_split and to
# every estimator created below.
random_state = 1000
np.random.seed(random_state)

In [None]:
# Download the Titanic passenger data as CSV text and parse it into a DataFrame
url = 'https://raw.githubusercontent.com/natecraig/aiml/main/Data/titanic.csv'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the CSV parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
download = response.content

df = pd.read_csv(io.StringIO(download.decode('utf-8')))
df.head()

Unnamed: 0,Passenger ID,Passenger Class,Survived,Name,Sex,Age,On-Board Siblings or Spouses,On-Board Parents or Children,Ticket Number,Fare,Cabin,Port of Embarkation,Destination
0,1,1,1,"Allen, Miss. Elisabeth Walton",Female,29.0,0,0,24160,211.3375,B5,Southampton,"St Louis, MO"
1,2,1,1,"Allison, Master. Hudson Trevor",Male,0.9167,1,2,113781,151.55,C22 C26,Southampton,"Montreal, PQ / Chesterville, ON"
2,3,1,0,"Allison, Miss. Helen Loraine",Female,2.0,1,2,113781,151.55,C22 C26,Southampton,"Montreal, PQ / Chesterville, ON"
3,4,1,0,"Allison, Mr. Hudson Joshua Creighton",Male,30.0,1,2,113781,151.55,C22 C26,Southampton,"Montreal, PQ / Chesterville, ON"
4,5,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",Female,25.0,1,2,113781,151.55,C22 C26,Southampton,"Montreal, PQ / Chesterville, ON"


In [None]:
# Summary statistics for the numeric columns; the `count` row reveals missing values
df.describe()

Unnamed: 0,Passenger ID,Passenger Class,Survived,Age,On-Board Siblings or Spouses,On-Board Parents or Children,Fare
count,1309.0,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479
std,378.020061,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668
min,1.0,1.0,0.0,0.1667,0.0,0.0,0.0
25%,328.0,2.0,0.0,21.0,0.0,0.0,7.8958
50%,655.0,3.0,0.0,28.0,0.0,0.0,14.4542
75%,982.0,3.0,1.0,39.0,1.0,0.0,31.275
max,1309.0,3.0,1.0,80.0,8.0,9.0,512.3292


In [None]:
# Feature columns, grouped by how they are fed to the models
cat_feature_names = ['Sex']
num_feature_names = [
    'Passenger Class',
    'Age',
    'Fare',
    'On-Board Siblings or Spouses',
    'On-Board Parents or Children',
]

# Keep only the passengers that have both an age and a fare recorded
df = df.dropna(subset=['Age', 'Fare'], how='any')

# Numerical features are used as-is
X_num = df.loc[:, num_feature_names]

X_num

Unnamed: 0,Passenger Class,Age,Fare,On-Board Siblings or Spouses,On-Board Parents or Children
0,1,29.0000,211.3375,0,0
1,1,0.9167,151.5500,1,2
2,1,2.0000,151.5500,1,2
3,1,30.0000,151.5500,1,2
4,1,25.0000,151.5500,1,2
...,...,...,...,...,...
1301,3,45.5000,7.2250,0,0
1304,3,14.5000,14.4542,1,0
1306,3,26.5000,7.2250,0,0
1307,3,27.0000,7.2250,0,0


In [None]:
# Encode categorical features as indicator columns, one per category value
# (for 'Sex' this yields the Sex_Female and Sex_Male columns shown below)
X_cat = pd.get_dummies(df[cat_feature_names])

X_cat

Unnamed: 0,Sex_Female,Sex_Male
0,1,0
1,0,1
2,1,0
3,0,1
4,1,0
...,...,...
1301,0,1
1304,1,0
1306,0,1
1307,0,1


In [None]:
# Combine numerical and categorical features column-wise (axis=1); both frames
# come from the same filtered `df`, so their row indexes line up
X = pd.concat([X_num, X_cat], axis=1)
X

Unnamed: 0,Passenger Class,Age,Fare,On-Board Siblings or Spouses,On-Board Parents or Children,Sex_Female,Sex_Male
0,1,29.0000,211.3375,0,0,1,0
1,1,0.9167,151.5500,1,2,0,1
2,1,2.0000,151.5500,1,2,1,0
3,1,30.0000,151.5500,1,2,0,1
4,1,25.0000,151.5500,1,2,1,0
...,...,...,...,...,...,...,...
1301,3,45.5000,7.2250,0,0,0,1
1304,3,14.5000,14.4542,1,0,1,0
1306,3,26.5000,7.2250,0,0,0,1
1307,3,27.0000,7.2250,0,0,0,1


In [None]:
# Target: the 'Survived' column of the filtered frame (values 0 and 1 in the output below)
y = df['Survived']
y

0       1
1       1
2       0
3       0
4       0
       ..
1301    0
1304    0
1306    0
1307    0
1308    0
Name: Survived, Length: 1045, dtype: int64

In [None]:
# Distinct target labels in ascending order
class_names = sorted(y.unique())

In [None]:
# Hold out 15% of the rows for testing; stratifying on y keeps the class
# proportions the same in the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=random_state,
)

In [None]:
# Fit a decision tree limited to a single split (max_depth=1) as a baseline
tree_clf = DecisionTreeClassifier(max_depth=1, random_state=random_state)
tree_clf.fit(X_train, y_train)

In [None]:
# Assess decision tree on training data (the same rows it was fitted on)
y_pred = tree_clf.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82       525
           1       0.76      0.67      0.71       363

    accuracy                           0.78       888
   macro avg       0.77      0.76      0.77       888
weighted avg       0.78      0.78      0.78       888



In [None]:
# Assess decision tree on testing data (rows held out by train_test_split);
# note that `y_pred` is overwritten with the test-set predictions
y_pred = tree_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.81      0.81        93
           1       0.72      0.73      0.73        64

    accuracy                           0.78       157
   macro avg       0.77      0.77      0.77       157
weighted avg       0.78      0.78      0.78       157



1. The model performs reasonably well on the training data: the precision scores for survived and not survived are 0.76 and 0.79 respectively, indicating that more than three quarters of the model's predictions for each class were correct. Similarly, the high recall score for not survived shows that the model correctly identified 85% of all passengers who did not survive, while the recall score for the survived category (0.67) is noticeably lower and not ideal, since about a third of the actual survivors were missed.
The model also generalizes reasonably well since the accuracy of the model with the test data is 0.78 or 78%, which is a decent indicator of a well-generalized model. Additionally, the high (and similar to train data) precision, recall and f1-score results for the test data shows that the model, on average, generalizes well to new data. 

2. As seen below, the use of GridSearchCV on 'max_depth' to find the best possible model results in a max_depth of 3, providing much higher precision, recall, and f1-scores for both survived and not survived than the original decision tree model. 

In [None]:
# Candidate tree depths for the grid search: 2 through 8
parameters = {'max_depth': range(2, 9)}

# Unfitted base estimator handed to the grid search.
# NOTE: this rebinds `tree`, which the import cell bound to the sklearn.tree module.
tree = DecisionTreeClassifier(random_state=random_state)

In [None]:
# Metrics recorded for every candidate during cross-validation. A list is used
# because GridSearchCV documents str, callable, list, tuple or dict for
# `scoring`; a set is not among them and has no stable ordering.
scoring = ['precision', 'recall', 'f1', 'accuracy']

# 10-fold grid search over `parameters`; with several metrics in play, `refit`
# names the one used to pick the best candidate, which is then refit on the
# full data passed to fit()
grid = GridSearchCV(
    tree,
    param_grid=parameters,
    cv=10,
    scoring=scoring,
    refit='accuracy',
)

In [None]:
# Run the cross-validated search over tree depth using the training data only
grid.fit(X_train, y_train)

In [None]:
# Predict with the refit best estimator and report on the TRAINING data
# (no test-set report is produced for this model in this cell)
y_pred = grid.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       525
           1       0.80      0.73      0.76       363

    accuracy                           0.82       888
   macro avg       0.81      0.80      0.81       888
weighted avg       0.82      0.82      0.81       888



In [None]:
# Winning hyperparameters and the mean cross-validated score of the refit
# metric ('accuracy') for the decision tree search
bestParam = grid.best_params_
bestscore = grid.best_score_

print("Best Parameters: ", bestParam)

Best Parameters:  {'max_depth': 3}


3. Using GridSearchCV on the training data, the best number of trees is 25 and the best maximum depth is 7. On the test data, these hyperparameters provide higher overall scores than the decision tree: the accuracy and the weighted averages of precision, recall and f1-score all increased by about 3 percentage points (from 0.78 to 0.81), indicating that this model performed better than the simple decision tree model.

In [None]:
# Search grid for the random forest: 25 to 150 trees in steps of 25, depths 7 through 12
params = {'n_estimators': range(25, 151, 25), 'max_depth': range(7, 13)}

# Unfitted base forest handed to the grid search
rfc = RandomForestClassifier(random_state=random_state)

In [None]:
# 10-fold grid search for the random forest; the candidate with the best
# accuracy is refit after the search
grid2 = GridSearchCV(
    rfc,
    param_grid=params,
    cv=10,
    scoring=scoring,
    refit='accuracy',
)

In [None]:
# Run the random forest search on the training data
grid2.fit(X_train, y_train)

In [None]:
# Evaluate the refit best random forest on the held-out testing data
y_pred = grid2.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.85        93
           1       0.80      0.70      0.75        64

    accuracy                           0.81       157
   macro avg       0.81      0.79      0.80       157
weighted avg       0.81      0.81      0.81       157



In [None]:
# Hyperparameters selected by the random forest grid search
bestParam2 = grid2.best_params_

print("Best Parameters: ", bestParam2)

Best Parameters:  {'max_depth': 7, 'n_estimators': 25}


4. The best hyperparameters for the XGB classifier are n_estimators: 150 and max_depth: 2, as found via GridSearchCV. The performance is similar to that of the Random Forest, however a difference was present in the precision scores for the not survived passengers, which increased for the XGB classifier by 2%. Additionally, while the precision score for survived passengers decreased, the recall and f1-score both increased in the XGB classifier.

In [None]:
# Search grid for XGBoost: 25 to 250 estimators in steps of 25, depths 1 through 6.
# This reuses (and overwrites) the `params` name from the random forest search.
params = {'n_estimators': range(25, 251, 25), 'max_depth': range(1, 7)}

# Unfitted base classifier handed to the grid search.
# NOTE: this rebinds `xgb`, which the import cell bound to the xgboost module.
xgb = XGBClassifier(random_state=random_state)

In [None]:
# 10-fold grid search for the XGBoost classifier; the candidate with the best
# accuracy is refit after the search
grid3 = GridSearchCV(
    xgb,
    param_grid=params,
    cv=10,
    scoring=scoring,
    refit='accuracy',
)

In [None]:
# Run the XGBoost search on the training data
grid3.fit(X_train, y_train)

In [None]:
# Evaluate the refit best XGBoost model on the held-out testing data
y_pred = grid3.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84        93
           1       0.77      0.75      0.76        64

    accuracy                           0.81       157
   macro avg       0.80      0.80      0.80       157
weighted avg       0.81      0.81      0.81       157



In [None]:
# Winning hyperparameters and the mean cross-validated score of the refit
# metric ('accuracy') for the XGBoost search
bestParam3 = grid3.best_params_
bestscore3 = grid3.best_score_

print("Best Parameters: ", bestParam3)

Best Parameters:  {'max_depth': 2, 'n_estimators': 150}


5. For the random forest classifier, age and fare were the most important features for the model. For the XGB classifier, the most important features were passenger class and sex, although sex was not weighted as heavily as passenger class. According to the random forest classifier, a passenger's age and the fare they paid factored heavily into the survival of the passenger. According to the XGB classifier, the passenger class and sex were the most important factors in the survival of passengers. Note that the models were trained on seven columns (the five numerical features plus the Sex_Female and Sex_Male indicators), so each importance value has to be matched to its own column when reading these rankings.

In [None]:
# Inspect the tuned forest selected by the grid search rather than fitting the
# default-hyperparameter forest: grid2 was built with refit='accuracy', so its
# best_estimator_ is already fitted on the training data and is the model whose
# test-set report is shown above.
rfc = grid2.best_estimator_
rfc

In [None]:
# Take the names from the training frame so that every importance is labeled
# with the column it was computed for. The model was fitted on seven columns;
# zipping against a shorter hand-written list silently drops the extra values
# and attaches the wrong label to the ones that remain.
feature_vars = list(X_train.columns)
name_width = max(len(name) for name in feature_vars)

# Get feature importances
for name, imp in zip(feature_vars, rfc.feature_importances_):
    print('{:<{width}} {:.3f}'.format(name, imp, width=name_width))

Passenger Class  0.079
Age              0.284
Fare             0.281
Sex              0.041


In [None]:
# Inspect the tuned booster selected by the grid search rather than fitting the
# default-hyperparameter classifier: grid3 was built with refit='accuracy', so
# its best_estimator_ is already fitted on the training data and is the model
# whose test-set report is shown above.
xgb = grid3.best_estimator_
xgb

In [None]:
# Label importances with the actual training columns (seven of them) so each
# value is paired with the feature it belongs to and none are dropped by zip
xgb_feature_names = list(X_train.columns)
xgb_name_width = max(len(name) for name in xgb_feature_names)

for name, imp in zip(xgb_feature_names, xgb.feature_importances_):
    print('{:<{width}} {:.3f}'.format(name, imp, width=xgb_name_width))

Passenger Class  0.182
Age              0.033
Fare             0.034
Sex              0.084
