In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

np.random.seed(42)

df = pd.read_csv('data/titanic.csv', index_col='PassengerId')
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
df.shape

(891, 11)

In [3]:
df.isna().any()

Survived    False
Pclass      False
Name        False
Sex         False
Age          True
SibSp       False
Parch       False
Ticket      False
Fare        False
Cabin        True
Embarked     True
dtype: bool

In [4]:
df['Age'].isna().sum()

177

In [5]:
df.dropna(subset=['Age'], inplace=True)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df['Cabin'].isna().sum()

529

In [7]:
pclasses = pd.get_dummies(df['Pclass'], prefix='Pclass')
df = pd.concat([df, pclasses], axis=1)
df = df.drop('Pclass', axis=1)

In [8]:
embarked = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat([df, embarked], axis=1)
df = df.drop('Embarked', axis=1)
df.head()

Unnamed: 0_level_0,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,False,False,True,False,False,True
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,True,False,False,True,False,False
3,1,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,False,False,True,False,False,True
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,True,False,False,False,False,True
5,0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,False,False,True,False,False,True


In [9]:
df = df.replace({False: 0, True: 1, 'male': 0, 'female': 1})
df.head()

Unnamed: 0_level_0,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,0,1,0,0,1
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,0,0,1,0,0
3,1,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,0,0,1
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1,0,0,0,0,1
5,0,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,0,1,0,0,1


In [10]:
features = ['Sex', 'Age', 'SibSp', 'Parch', 'Fare'] + df.columns.tolist()[9:]
df[features].isna().any().any()

False

In [11]:
X = df[features].to_numpy()
y = df['Survived'].to_numpy()
X.shape, y.shape

((714, 11), (714,))

In [12]:
X

array([[ 0., 22.,  1., ...,  0.,  0.,  1.],
       [ 1., 38.,  1., ...,  1.,  0.,  0.],
       [ 1., 26.,  0., ...,  0.,  0.,  1.],
       ...,
       [ 1., 19.,  0., ...,  0.,  0.,  1.],
       [ 0., 26.,  0., ...,  1.,  0.,  0.],
       [ 0., 32.,  0., ...,  0.,  1.,  0.]])

In [13]:
X = scale(X)
X

array([[-0.75905134, -0.53037664,  0.52457013, ..., -0.47180795,
        -0.20203051,  0.53740921],
       [ 1.31743394,  0.57183099,  0.52457013, ...,  2.11950647,
        -0.20203051, -1.86077941],
       [ 1.31743394, -0.25482473, -0.55170307, ..., -0.47180795,
        -0.20203051,  0.53740921],
       ...,
       [ 1.31743394, -0.73704057, -0.55170307, ..., -0.47180795,
        -0.20203051,  0.53740921],
       [-0.75905134, -0.25482473, -0.55170307, ...,  2.11950647,
        -0.20203051, -1.86077941],
       [-0.75905134,  0.15850313, -0.55170307, ..., -0.47180795,
         4.94974747, -1.86077941]])

In [14]:
def get_bootstrap_score(model):
    num_bootstrap_samples = 1000
    e_trains = []
    e_tests = []
    
    for _ in range(num_bootstrap_samples):
        X_train, y_train = resample(X, y, replace=True, random_state=42)
        indices_not_selected = np.setdiff1d(np.arange(len(X)), np.unique(X_train, return_index=True)[1])
        X_test, y_test = X[indices_not_selected], y[indices_not_selected]

        model.fit(X_train, y_train)
        e_train = 1 - model.score(X_train, y_train)
        e_test = 1 - model.score(X_test, y_test)

        e_trains.append(e_train)
        e_tests.append(e_test)

    e_trains = np.array(e_trains)
    e_tests = np.array(e_tests)
    # Calculate the 0.632 bootstrap estimate
    #in_sample_error = 1 - model.score(X_train, y_train)
    #out_of_sample_error = np.mean(bootstrap_scores)
    bootstrap_estimate = 0.368 * np.mean(e_trains) + 0.632 * np.mean(e_tests)
    
    print("0.632 Bootstrap Estimate:", bootstrap_estimate)
    return bootstrap_estimate

In [17]:
def get_kfolds_score(model):
    kf = KFold(n_splits=10)
    kf_scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        kf_scores.append(1 - model.score(X_test, y_test))

    kf_scores = np.array(kf_scores)
    print("Average Cross-Validation Score:", np.mean(kf_scores))
    return np.mean(kf_scores)

In [19]:
lr = LogisticRegression(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)

models = [lr, dt, knn]
b_scores = []
kf_scores = []

for m in models:
    print(m)
    b_scores.append(get_bootstrap_score(m))
    kf_scores.append(get_kfolds_score(m))

LogisticRegression(random_state=42)
0.632 Bootstrap Estimate: 0.18766425446211543
Average Cross-Validation Score: 0.205868544600939
DecisionTreeClassifier(random_state=42)
0.632 Bootstrap Estimate: 0.05532384193346756
Average Cross-Validation Score: 0.22807120500782477
KNeighborsClassifier(n_neighbors=3)
0.632 Bootstrap Estimate: 0.12452220293075911
Average Cross-Validation Score: 0.20154538341158063


In [94]:
df_scores = pd.DataFrame(
    {'name': ['Logistic Regression', 'Decision Tree', 'K-Nearest Neighbor'], 'Bootstrapping Score': b_scores, 'KFold Score': kf_scores}
)
df_scores

Unnamed: 0,name,Bootstrapping Score,KFold Score
0,Logistic Regression,0.187664,0.205869
1,Decision Tree,0.055324,0.228071
2,K-Nearest Neighbor,0.124522,0.201545


In [95]:
px.bar(df_scores, x='name', y=['Bootstrapping Score', 'KFold Score'], barmode='group')

In [110]:
from sklearn.metrics import confusion_matrix
y_pred = lr.predict(X)
cm1 = confusion_matrix(y, y_pred)

y_pred = dt.predict(X)
cm2 = confusion_matrix(y, y_pred)

y_pred = knn.predict(X)
cm3 = confusion_matrix(y, y_pred)

class_labels = ['Died', 'Survived']

import plotly.graph_objects as go
from plotly.subplots import make_subplots
fig = make_subplots(rows=1, cols=3, subplot_titles=["Confusion Matrix 1", "Confusion Matrix 2", "Confusion Matrix 3"])
fig.add_trace(go.Heatmap(z=cm, x=class_labels, y=class_labels, colorscale="Viridis", coloraxis="coloraxis"), row=1, col=1)
fig.add_trace(go.Heatmap(z=cm2, x=class_labels, y=class_labels, colorscale="Viridis", coloraxis="coloraxis"), row=1, col=2)
fig.add_trace(go.Heatmap(z=cm3, x=class_labels, y=class_labels, colorscale="Viridis", coloraxis="coloraxis"), row=1, col=3)

fig.update_layout(
    title_text="Confusion Matrices",
    xaxis=dict(title="Predicted Class"),
    yaxis=dict(title="True Class"),
    coloraxis1=dict(colorscale="Blues")
)

fig.show()