## Modeling Survival on the Titanic

In [1]:
import pandas as pd
import numpy as np
import re
import pickle

In [2]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *

In [3]:
from sklearn import svm
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [4]:
# Define the color palette (17 colors).
Viridis= ['#440154', '#48186a', '#472d7b', '#424086', '#3b528b', '#33638d', '#2c728e', '#26828e', '#21918c', '#1fa088',
          '#28ae80', '#3fbc73', '#5ec962', '#84d44b', '#addc30','#d8e219', '#fde725']   

### Read in the titanic dataset

In [5]:
df = pd.read_csv('00_resources/titanic.csv')
# df = pd.read_csv("https://raw.githubusercontent.com/austinlasseter/plotly_dash_tutorial/master/00%20resources/titanic.csv")
print(df.shape)
df.head()

(712, 8)


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Name,SibSp
0,0,3,male,22.0,7.25,Southampton,"Braund, Mr. Owen Harris",1
1,1,1,female,38.0,71.2833,Cherbourg,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1
2,1,3,female,26.0,7.925,Southampton,"Heikkinen, Miss. Laina",0
3,1,1,female,35.0,53.1,Southampton,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1
4,0,3,male,35.0,8.05,Southampton,"Allen, Mr. William Henry",0


### Features

In [6]:
# Make some dummies for sex
df = pd.get_dummies(df, prefix='', prefix_sep='', columns=['Sex'])

In [7]:
# Make some dummies Passenger's cabin class
df = pd.get_dummies(df, prefix='pclass', prefix_sep='_', columns=['Pclass'])

In [8]:
# Make some dummies Passenger's embarkation
df = pd.get_dummies(df, prefix='', prefix_sep='', columns=['Embarked'])

In [9]:
# Siblings and Spouses
df.SibSp.value_counts()

0    469
1    183
2     25
4     18
3     12
5      5
Name: SibSp, dtype: int64

In [10]:
# Length of name
df['name_length']=df['Name'].apply(lambda x: len(x))

In [11]:
# Check for missing values as they will skew the regression
print(df.shape)
df = df.dropna()
print(df.shape)

(712, 14)
(712, 14)


In [12]:
# What are the possible features?
df.columns

Index(['Survived', 'Age', 'Fare', 'Name', 'SibSp', 'female', 'male',
       'pclass_1', 'pclass_2', 'pclass_3', 'Cherbourg', 'Queenstown',
       'Southampton', 'name_length'],
      dtype='object')

In [13]:
# Select our features
feature_cols = ['Fare', 'Age', 'pclass_2', 'pclass_3', 'female',  'Cherbourg', 'Queenstown','SibSp', 'name_length', 'Name']
X = df[feature_cols]
y = df.Survived

### Modeling

In [14]:
# Train-test split
X_train1, X_test1, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=42)

In [15]:
# Remove the names, we'll save those for later use.
X_train=X_train1.drop('Name', axis=1)
X_test=X_test1.drop('Name', axis=1)

In [16]:
gnb = GaussianNB()
# Fit on the training data
gnb_model = gnb.fit(X_train, y_train)
# Predict on the testing data
predictions=gnb_model.predict(X_test)
probabilities = gnb_model.predict_proba(X_test)[:,1]
# Calculate the roc-auc score
auc_nb=metrics.roc_auc_score(y_test, predictions)
acc_nb = metrics.accuracy_score(y_test, predictions)
f1_nb = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_nb,4))
print('Accuracy', "%.4f" % round(acc_nb,4))
print('AUC Score', "%.4f" % round(auc_nb,4))

F1 Score 0.6783
Accuracy 0.7413
AUC Score 0.7283


In [17]:
knn = KNeighborsClassifier(n_neighbors=7)
# Fit on the training data
knn_model=knn.fit(X_train, y_train)
# Predict on the testing data
predictions=knn_model.predict(X_test)
probabilities = knn_model.predict_proba(X_test)[:,1]
# Calculate the roc-auc score
auc_knn=metrics.roc_auc_score(y_test, predictions)
acc_knn = metrics.accuracy_score(y_test, predictions)
f1_knn = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_knn,4))
print('Accuracy', "%.4f" % round(acc_knn,4))
print('AUC Score', "%.4f" % round(auc_knn,4))

F1 Score 0.6218
Accuracy 0.6853
AUC Score 0.6749


In [18]:
rf = RandomForestClassifier()
# Fit on the training data
rf_model=rf.fit(X_train, y_train)
# Predict on the testing data
predictions=rf_model.predict(X_test)
probabilities = rf_model.predict_proba(X_test)[:,1]
# Calculate the roc-auc score
auc_rf=metrics.roc_auc_score(y_test, predictions)
acc_rf = metrics.accuracy_score(y_test, predictions)
f1_rf = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_rf,4))
print('Accuracy', "%.4f" % round(acc_rf,4))
print('AUC Score', "%.4f" % round(auc_rf,4))

F1 Score 0.7368
Accuracy 0.7902
AUC Score 0.7771


In [19]:
logreg = LogisticRegression()
# Fit on the training data
log_model=logreg.fit(X_train, y_train)
# Predict on the testing data
predictions=log_model.predict(X_test)
probabilities = log_model.predict_proba(X_test)[:,1]
# Calculate the roc-auc score
auc_log=metrics.roc_auc_score(y_test, predictions)
acc_log = metrics.accuracy_score(y_test, predictions)
f1_log = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_log,4))
print('Accuracy', "%.4f" % round(acc_log,4))
print('AUC Score', "%.4f" % round(auc_log,4))

F1 Score 0.7304
Accuracy 0.7832
AUC Score 0.7708


### Comparison of Four Models

In [20]:
# Comparison
f1=[f1_nb, f1_log, f1_knn, f1_rf]
acc=[acc_nb, acc_log, acc_knn, acc_rf]
auc=[auc_nb, auc_log, auc_knn, auc_rf]
models=['naive bayes', 'logistic regression', 'k-nearest neighbors', 'random forest']
index=['F1 score', 'Accuracy', 'AUC score']
results=pd.DataFrame([f1, acc, auc], index=index, columns=models)

In [21]:
# Let's display that with plotly.
mydata1 = go.Bar(
    x=results.loc['F1 score'].index,
    y=results.loc['F1 score'],
    name=results.index[0],
    marker=dict(color=Viridis[16])
)
mydata2 = go.Bar(
    x=results.loc['Accuracy'].index,
    y=results.loc['Accuracy'],
    name=results.index[1],
    marker=dict(color=Viridis[10])
)
mydata3 = go.Bar(
    x=results.loc['AUC score'].index,
    y=results.loc['AUC score'],
    name=results.index[2],
    marker=dict(color=Viridis[0])
)
mylayout = go.Layout(
    title='Comparison of Possible Models',
    xaxis = dict(title = 'Predictive models'), # x-axis label
    yaxis = dict(title = 'Score'), # y-axis label
    
)
fig = go.Figure(data=[mydata1, mydata2, mydata3], layout=mylayout)
iplot(fig)

### Tuning the Logistic Classifier

In [22]:
# Create regularization penalty space (l1=ridge, l2=lasso)
penalty = ['l1', 'l2'] 

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Create grid search using 5-fold cross validation
grid_lr = GridSearchCV(LogisticRegression(), hyperparameters, cv=5,  n_jobs = 1, verbose=0)
grid_lr.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e+00, 2.78256e+00, 7.74264e+00, 2.15443e+01, 5.99484e+01,
       1.66810e+02, 4.64159e+02, 1.29155e+03, 3.59381e+03, 1.00000e+04]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
print(grid_lr.best_params_)

log_model = grid_lr

{'C': 2.7825594022071245, 'penalty': 'l2'}


In [24]:
# Predict on the testing data

predictions=log_model.predict(X_test)
probabilities = log_model.predict_proba(X_test)[:,1]

## Final Model Metrics

In [25]:
# Full list of metrics
def model_metrics(y_test, predictions):
    '''
    Calculate 5 standard model metrics
    Return a dictionary with the metrics
    '''
    f1 = metrics.f1_score(y_test, predictions)
    accuracy = metrics.accuracy_score(y_test, predictions)
    error = 1 - accuracy
    precision = metrics.precision_score(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    rocauc =  metrics.roc_auc_score(y_test, predictions)
    return {'f1 score':f1, 'accuracy score': accuracy, 'error rate': error, 'precision score': precision, 'recall score': recall, 'ROC-AUC score': rocauc}

model_metrics(y_test, predictions)

{'ROC-AUC score': 0.7833333333333333,
 'accuracy score': 0.7972027972027972,
 'error rate': 0.2027972027972028,
 'f1 score': 0.743362831858407,
 'precision score': 0.84,
 'recall score': 0.6666666666666666}

In [26]:
FPR, TPR, _ = roc_curve(y_test, probabilities)
FPR

array([0.    , 0.    , 0.0125, 0.0125, 0.025 , 0.025 , 0.0375, 0.0375,
       0.05  , 0.05  , 0.075 , 0.075 , 0.0875, 0.0875, 0.1375, 0.1375,
       0.15  , 0.15  , 0.175 , 0.175 , 0.2   , 0.2   , 0.225 , 0.225 ,
       0.2375, 0.2375, 0.25  , 0.25  , 0.275 , 0.275 , 0.3625, 0.3625,
       0.425 , 0.425 , 0.45  , 0.45  , 0.475 , 0.475 , 0.775 , 0.775 ,
       0.8   , 0.8   , 0.85  , 0.85  , 0.875 , 0.875 , 0.8875, 0.8875,
       1.    ])

In [27]:
roc_score=round(100*roc_auc_score(y_test, predictions),1)
roc_score

78.3

In [28]:
# ROC-AUC figure
FPR, TPR, _ = roc_curve(y_test, probabilities)
roc_score=round(100*roc_auc_score(y_test, predictions),1)
trace0=go.Scatter(
        x=FPR, 
        y=TPR,
        mode='lines',
        name=f'AUC: {roc_score}'
        )
trace1=go.Scatter(
        x=[0,1], 
        y=[0,1],
        mode='lines',
        name='Baseline Area: 50.0'
        )
layout=go.Layout(
    title='Receiver Operating Characteristic (ROC) Curve',
    xaxis={'title': 'False Positive Rate (100-Specificity)','scaleratio': 1,'scaleanchor': 'y'},
    yaxis={'title': 'True Positive Rate (Sensitivity)'}
    )
data=[trace0, trace1]
fig = dict(data=data, layout=layout)
iplot(fig)

In [29]:
len(y_test)

143

In [30]:
# A confusion matrix tells us our false positives and false negatives:
matrix=confusion_matrix(y_test, predictions)
cm=pd.DataFrame(matrix, columns=['predict: +', 'pred: -'])
cm[f'n={len(y_test)}']=['ground: +', 'ground: -']
cm=cm[[f'n={len(y_test)}', 'predict: +', 'pred: -']]
cm

Unnamed: 0,n=143,predict: +,pred: -
0,ground: +,72,8
1,ground: -,21,42


In [31]:
# Display the confusion matrix as a formatted table with Plotly
trace = go.Table(
    header=dict(values=cm.columns,
                line = dict(color='#7D7F80'),
                fill = dict(color=Viridis[11]),
                align = ['left'] * 5),
    cells=dict(values=[cm[f'n={len(y_test)}'], cm['predict: +'], cm['pred: -']],
               line = dict(color='#7D7F80'),
               fill = dict(color='white'),
               align = ['left'] * 5))

layout = go.Layout(
    title = f'Confusion Matrix: Logistic Regression Model', 
    width=500,
    height=400, 
)

data = [trace]
fig = dict(data=data, layout=layout)
iplot(fig)

In [32]:
# Confusion Matrix as a Donut Chart

In [33]:
# Feature importance (Logistic Regression)
results=pd.DataFrame(list(zip(feature_cols, logreg.coef_[0])), columns=['feature', 'coefficient'])
results

Unnamed: 0,feature,coefficient
0,Fare,0.001377
1,Age,-0.036138
2,pclass_2,-0.976368
3,pclass_3,-2.176333
4,female,2.257882
5,Cherbourg,0.339154
6,Queenstown,-0.059023
7,SibSp,-0.414378
8,name_length,0.038485


In [34]:
# Let's display that with Plotly.
mydata = [go.Bar(
    x=results['feature'],
    y=results['coefficient']
)]

mylayout = go.Layout(
    title='Younger passengers who paid higher fares were more likely to survive',
    xaxis = {'title': 'Passenger Features'},
    yaxis = {'title': 'Odds of Survival'}, 

)
fig = go.Figure(data=mydata, layout=mylayout)
iplot(fig)

In [35]:
print(len(probabilities))
print(len(predictions))
print(len(y_test))
print(len(X_test1))

143
143
143
143


In [36]:
X_test1=X_test1.reset_index(drop=True)
y_test=y_test.reset_index(drop=True)
probs=pd.DataFrame(probabilities, columns=['survival_prob'])

In [37]:
# Merge back in the names
final=pd.concat([X_test1, y_test, probs], axis=1)

In [38]:
final.shape

(143, 12)

In [39]:
final.head()

Unnamed: 0,Fare,Age,pclass_2,pclass_3,female,Cherbourg,Queenstown,SibSp,name_length,Name,Survived,survival_prob
0,69.3,24.0,0,0,1,1,0,0,20,"Sagesser, Mlle. Emma",1,0.944555
1,78.2667,54.0,0,0,1,1,0,1,30,"Eustis, Miss. Elizabeth Mussey",1,0.836021
2,79.65,52.0,0,0,0,0,0,1,17,"Taussig, Mr. Emil",0,0.183791
3,262.375,18.0,0,0,1,1,0,2,26,"Ryerson, Miss. Emily Borie",1,0.919267
4,26.0,27.0,1,0,0,0,0,0,27,"Sharp, Mr. Percival James R",0,0.279542


In [40]:
final.to_csv('00_resources/final_probs.csv', index=False)

In [43]:
final.columns

Index(['Fare', 'Age', 'pclass_2', 'pclass_3', 'female', 'Cherbourg',
       'Queenstown', 'SibSp', 'name_length', 'Name', 'Survived',
       'survival_prob'],
      dtype='object')

In [47]:
value='Sharp, Mr. Percival James R'
survival=final.loc[3, 'survival_prob']
survival

0.9192673632810157

In [49]:
names=df['Name'].values
names[:5]

array(['Braund, Mr. Owen Harris',
       'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
       'Heikkinen, Miss. Laina',
       'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
       'Allen, Mr. William Henry'], dtype=object)

In [50]:
indexs=df['Name'].index.values
indexs[:5]

array([0, 1, 2, 3, 4], dtype=int64)

In [56]:
names=df['Name'].values
index=df['Name'].index.values
nameslist = list(zip(indexs, names))
print(nameslist[5])
print(nameslist[5][0])
print(nameslist[5][1])

(5, 'McCarthy, Mr. Timothy J')
5
McCarthy, Mr. Timothy J


In [60]:
options=[{'label': k, 'value': i} for i,k in nameslist]
options[0]

{'label': 'Braund, Mr. Owen Harris', 'value': 0}

In [65]:
value=nameslist[0][0]
value

0

In [71]:
survival=final.loc[value, 'survival_prob']
round(survival*100)

94.0

In [None]:
round(sur)