# Using `sklearn` to predict whether a passenger survived the Titanic shipwreck

## Imports

In [50]:
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML

pd.set_option("display.max_colwidth", 200)

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import (
    FunctionTransformer,
    Normalizer,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    normalize,
    scale
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Get the data
### Raw Training data

In [2]:
# Load the raw training set from the project-relative data folder and preview it.
raw_df = pd.read_csv(filepath_or_buffer='data/titanic/train.csv')
raw_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### EDA
#### Shape and null values

In [3]:
# (rows, columns) of the raw training set.
raw_df.shape

(891, 12)

In [4]:
# Per-column dtype and non-null count; used to spot columns with missing values.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### Dropping features

I don't think I can effectively use `Ticket`, and `Cabin` is mostly null (only 204 of the 891 rows have a value). I am dropping both, along with `PassengerId`, which is just a row identifier.

In [5]:
# Build a new frame without the identifier, the mostly-null cabin and the
# free-form ticket columns; `raw_df` itself is left untouched.
relevant_df = raw_df.drop(['PassengerId', 'Cabin', 'Ticket'], axis='columns')

relevant_df.shape

(891, 9)

In [6]:
# Re-check dtypes and non-null counts on the reduced frame.
relevant_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [21]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# Features are every column except the `Survived` target.
X_train, X_test, y_train, y_test = train_test_split(
    relevant_df.drop('Survived', axis='columns'),
    relevant_df['Survived'],
    test_size=0.20,
    random_state=2020,
)
X_train

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
131,3,"Coelho, Mr. Domingos Fernandeo",male,20.0,0,0,7.0500,S
70,2,"Jenkin, Mr. Stephen Curnow",male,32.0,0,0,10.5000,S
781,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17.0,1,0,57.0000,S
508,3,"Olsen, Mr. Henry Margido",male,28.0,0,0,22.5250,S
116,3,"Connors, Mr. Patrick",male,70.5,0,0,7.7500,Q
...,...,...,...,...,...,...,...,...
707,1,"Calderhead, Mr. Edward Pennington",male,42.0,0,0,26.2875,S
630,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,30.0000,S
323,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22.0,1,1,29.0000,S
392,3,"Gustafsson, Mr. Johan Birger",male,28.0,2,0,7.9250,S


## Feature Engineering

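I'm splitting the feature columns into groups according to how each one will be preprocessed: numeric, unordered categorical, and ordered categorical. (`Name` is handled separately as a bag of words.)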

In [30]:
# Column groups; each list feeds its own branch of the ColumnTransformer.
num_features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
unordered_features = ['Sex']
# NOTE(review): `Embarked` is grouped as "ordered" here, but nothing in this
# notebook establishes a natural order between ports — confirm this is intended.
ordered_features = ['Embarked']

# Distinct embarkation codes present in the training split (including NaN).
X_train['Embarked'].unique()

array(['S', 'Q', 'C', nan], dtype=object)

In [32]:
# Embarkation codes in the category order handed to the OrdinalEncoder.
ports = [
    'S',
    'C',
    'Q',
]

In [33]:
# Numeric branch: fill gaps with the column median, then standardise.
# Scaling matters because the downstream classifier is distance-based (KNN).
numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Unordered-categorical branch: fill gaps with a constant placeholder, then
# one-hot encode. Categories unseen during fit are ignored at transform time
# instead of raising.
ohe_pipe = make_pipeline(SimpleImputer(strategy='constant'), OneHotEncoder(handle_unknown='ignore'))

# Ordered-categorical branch for `Embarked`: impute a port code, then map the
# codes to integers in the order given by `ports`.
#
# The imputer needs a default fill value that is itself one of `ports`. With
# strategy='constant' and no fill_value, SimpleImputer fills string columns
# with the placeholder 'missing_value', which is not in the encoder's fixed
# category list, so OrdinalEncoder raises on fit whenever this pipeline is used
# outside the grid search. The grid search still overrides this default through
# the `pre__ord__simpleimputer__fill_value` parameter, so its results are unchanged.
ord_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=ports[0]),
    OrdinalEncoder(categories=[ports], dtype='int')
)

# Route each column group to its own preprocessing branch. Columns not listed
# in any branch are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_features),
    ('ohe', ohe_pipe, unordered_features),
    # 'Name' is passed as a bare string (not a list) so that CountVectorizer
    # receives a 1-D sequence of documents rather than a 2-D frame.
    ('bow', CountVectorizer(), 'Name'),
    ('ord', ord_pipe, ordered_features),
])

# Full model: preprocessing followed by a k-nearest-neighbours classifier.
# The step names 'pre' and 'clf' are the prefixes used by the grid-search keys.
iceberg = Pipeline([
    ('pre', preprocessor),
    ('clf', KNeighborsClassifier()),
])

In [45]:
# Hyperparameter grid; keys follow the `<step>__<sub-step>__<param>` path
# through the `iceberg` pipeline.
param_grid = {
    # vocabulary size of the bag-of-words on `Name`
    'pre__bow__max_features': [10, 100, 1000],
    # KNN neighbourhood size and vote weighting
    'clf__n_neighbors': [3, 5, 11, 19],
    'clf__weights': ['uniform', 'distance'],
    # which port code to impute for missing `Embarked` values
    'pre__ord__simpleimputer__fill_value': ['S', 'Q', 'C'],
}

In [52]:
# Exhaustive search over `param_grid`; n_jobs=-1 uses all available CPU cores.
grid = GridSearchCV(estimator=iceberg, param_grid=param_grid, n_jobs=-1)

In [53]:
# Cross-validate every combination in `param_grid` (3 * 4 * 2 * 3 = 72 candidates)
# on the training split only; the test split is not touched here.
grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                    

In [54]:
# The pipeline configured with the best-scoring hyperparameter combination.
grid.best_estimator_

Pipeline(memory=None,
         steps=[('pre',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                                

In [55]:
# Mean cross-validated score of the best candidate (no `scoring` was passed,
# so this is the pipeline's own default score).
grid.best_score_

0.824386880724909

In [56]:
# Score of the refit best pipeline on the held-out test split. With no
# `scoring` argument, GridSearchCV.score delegates to the best estimator's
# own `score` method.
grid.score(X_test, y_test)

0.7821229050279329