## Pipeline for Predicting Survival on the Titanic



Variables are as follows:

- PassengerId
- Survived: 0 for no, 1 for yes
- Pclass: Ticket class. 1,2,3 for 1st, 2nd and 3rd class, respectively
- Name
- Sex
- Age
- SibSp: number of siblings/spouses on board
- Parch: number of parents/children on board
- Ticket: ticket number
- Fare: passenger fare
- Cabin: cabin number
- Embarked: the port of embarkation. S = Southampton, C = Cherbourg, Q = Queenstown

In [1]:
import re

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from feature_engine.encoding import (
    RareLabelEncoder,
    OneHotEncoder
)
from feature_engine.imputation import (
    CategoricalImputer,
    AddMissingIndicator,
    MeanMedianImputer)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Prepare the data set

In [2]:
# Load the Titanic training data from the project-relative CSV and preview the first rows.
df = pd.read_csv('data/titanic_train.csv')

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### To use in classification_model/processing/data_manager.py

In [4]:
# Treat the literal '?' placeholder as a missing value (NaN).
df = df.replace('?', np.nan)

In [5]:
def get_first_cabin(row):
    """Return the first whitespace-separated token of a Cabin entry.

    Parameters
    ----------
    row : str or missing value
        A single value from the ``Cabin`` column, e.g. ``'C23 C25 C27'``.

    Returns
    -------
    str or float
        The first token (``'C23'`` for the example above), or ``np.nan`` when
        the value is not a string (e.g. NaN / None) or contains no tokens.
    """
    try:
        return row.split()[0]
    except (AttributeError, IndexError):
        # AttributeError: the value has no .split (missing cabins are float NaN / None).
        # IndexError: the string is empty or whitespace-only, so split() yields no tokens.
        # Anything else (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        return np.nan
    
# Keep only the first cabin code for each passenger; missing cabins stay NaN.
df['Cabin'] = df['Cabin'].apply(get_first_cabin)

In [6]:
def get_title(passenger):
    """Map a passenger name to a coarse title label.

    The name is searched for each known title as a plain substring pattern,
    in a fixed priority order, and the first hit wins.

    Parameters
    ----------
    passenger : str
        Full passenger name, e.g. ``'Braund, Mr. Owen Harris'``.

    Returns
    -------
    str
        One of ``'Mrs'``, ``'Mr'``, ``'Miss'``, ``'Master'`` or ``'Other'``.
    """
    # Order matters: 'Mrs' has to be tried before 'Mr', which is a prefix of it.
    for title in ('Mrs', 'Mr', 'Miss', 'Master'):
        if re.search(title, passenger):
            return title
    return 'Other'
    
# Derive the Title feature from each passenger's full name.
df['Title'] = df['Name'].apply(get_title)

In [7]:
# Cast the three numeric columns to float in one call
# (Pclass arrives as an integer column; Fare and Age are cast for uniformity).
df = df.astype({'Fare': 'float', 'Age': 'float', 'Pclass': 'float'})

In [8]:
# Discard the columns that are not fed to the model
# (Name has already been condensed into Title).
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'])

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,0,3.0,male,22.0,1,0,7.25,,S,Mr
1,1,1.0,female,38.0,1,0,71.2833,C85,C,Mrs
2,1,3.0,female,26.0,0,0,7.925,,S,Miss
3,1,1.0,female,35.0,1,0,53.1,C123,S,Mrs
4,0,3.0,male,35.0,0,0,8.05,,S,Mr


In [9]:
# Re-check dtypes and non-null counts after the preparation steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    float64
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
 9   Title     891 non-null    object 
dtypes: float64(3), int64(3), object(4)
memory usage: 69.7+ KB


In [10]:
# Optionally save the prepared data set:

# df.to_csv('titanic.csv', index=False)

## Configuration

#### Define variables for classification_model/config.yaml

In [11]:
# Numeric inputs: passed to the missing-indicator and median-imputation pipeline steps.
NUMERICAL_VARIABLES = ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']

# Categorical inputs: passed to the categorical imputer, rare-label encoder and one-hot encoder.
CATEGORICAL_VARIABLES = ['Sex', 'Cabin', 'Embarked', 'Title']
# Alternative kept for reference: additionally treat Pclass as categorical.
# CATEGORICAL_VARIABLES = ['Sex', 'Cabin', 'Embarked', 'Title', 'Pclass']

# Columns that ExtractLetterTransformer reduces to their first character.
CABIN = ['Cabin']

## Separate data into train and test

#### For classification_model/train_pipeline.py

In [30]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Survived']),  # predictors
    df['Survived'],                 # target
    test_size=0.2,
    random_state=0,
)

X_train.shape, X_test.shape

((712, 9), (179, 9))

In [99]:
# Inspect the container type of the held-out target.
type(y_test)

pandas.core.series.Series

In [98]:
# Display the held-out feature frame (still un-imputed and un-encoded at this point).
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
495,3.0,male,,0,0,14.4583,,C,Mr
648,3.0,male,,0,0,7.5500,,S,Mr
278,3.0,male,7.0,4,1,29.1250,,Q,Master
31,1.0,female,,1,0,146.5208,B78,C,Mrs
255,3.0,female,29.0,0,2,15.2458,,C,Mrs
...,...,...,...,...,...,...,...,...,...
780,3.0,female,13.0,0,0,7.2292,,C,Miss
837,3.0,male,,0,0,8.0500,,S,Mr
215,1.0,female,31.0,1,0,113.2750,D36,C,Miss
833,3.0,male,23.0,0,0,7.8542,,S,Mr


## Preprocessors

#### Class to extract the letter from the variable Cabin. 
#### Use in classification_model/processing/features.py

In [13]:
class ExtractLetterTransformer(BaseEstimator, TransformerMixin):
    """Replace each configured string column with its first character.

    Parameters
    ----------
    variables : list
        Names of the columns to shorten. Anything other than a list is rejected.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables):
        if isinstance(variables, list):
            self.variables = variables
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y=None):
        """Nothing is learned from the data; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column cut to its first character."""
        shortened = X.copy()  # leave the caller's frame untouched
        for column in self.variables:
            shortened[column] = shortened[column].str.get(0)
        return shortened

## Pipeline

- Impute categorical variables with string missing
- Add a binary missing indicator to numerical variables with missing data
- Fill NA in original numerical variable with the median
- Extract first letter from cabin
- Group rare Categories
- Perform One hot encoding
- Scale features with standard scaler
- Fit a Logistic regression

#### Use in classification_model/pipeline.py

In [78]:
titanic_pipe = Pipeline(
    steps=[
        # ----- Imputation -----
        # Categorical columns: fill gaps using the imputer's 'missing' method.
        (
            'categorical_imputation',
            CategoricalImputer(
                variables=CATEGORICAL_VARIABLES,
                imputation_method='missing',
            ),
        ),
        # Numerical columns: add missing-value indicator features ...
        (
            'missing_indicator',
            AddMissingIndicator(variables=NUMERICAL_VARIABLES),
        ),
        # ... then fill the gaps in the original columns with the median.
        (
            'median_imputation',
            MeanMedianImputer(
                variables=NUMERICAL_VARIABLES,
                imputation_method='median',
            ),
        ),

        # ----- Feature extraction -----
        # Reduce Cabin to its first character.
        (
            'extract_letter',
            ExtractLetterTransformer(variables=CABIN),
        ),

        # ----- Categorical encoding -----
        # Group categories seen in fewer than 5% of observations (tol=0.05).
        (
            'rare_label_encoder',
            RareLabelEncoder(
                variables=CATEGORICAL_VARIABLES,
                tol=0.05,
                n_categories=1,
            ),
        ),
        # One-hot encode, dropping the last category (k-1 dummies per variable).
        (
            'categorical_encoder',
            OneHotEncoder(
                variables=CATEGORICAL_VARIABLES,
                drop_last=True,
            ),
        ),

        # ----- Scaling and model -----
        ('scaler', StandardScaler()),
        ('Logit', LogisticRegression(C=1.0, random_state=0)),
    ]
)

#### For classification_model/train_pipeline.py

In [79]:
# Fit every preprocessing step and the final classifier on the training split only.
titanic_pipe.fit(X_train, y_train)

## Make predictions and evaluate model performance

#### Use to test persisted model's performance in tests/test_prediction.py


In [80]:
# Training split: hard labels feed accuracy, column 1 of predict_proba feeds ROC-AUC
# (presumably the probability of Survived == 1; confirm via titanic_pipe.classes_).
class_ = titanic_pipe.predict(X_train)
pred = titanic_pipe.predict_proba(X_train)[:,1]

print('train roc-auc: {}'.format(roc_auc_score(y_train, pred)))
print('train accuracy: {}'.format(accuracy_score(y_train, class_)))
print()

# Held-out split. NOTE: `class_` and `pred` are rebound here, so once this cell
# has run they hold the test-set predictions, not the training ones.
class_ = titanic_pipe.predict(X_test)
pred = titanic_pipe.predict_proba(X_test)[:,1]

print('test roc-auc: {}'.format(roc_auc_score(y_test, pred)))
print('test accuracy: {}'.format(accuracy_score(y_test, class_)))
print()

train roc-auc: 0.8751700084274115
train accuracy: 0.824438202247191

test roc-auc: 0.8542819499341239
test accuracy: 0.8044692737430168



In [101]:
# Test accuracy as a bare value; relies on `class_` still holding the test-set
# predictions assigned last in the evaluation cell.
accuracy_score(y_test, class_)

0.8044692737430168

In [96]:
# Hard class predictions for every row of the training split.
titanic_pipe.predict(X_train)

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,

In [94]:
# Column 1 of the predicted probabilities for the training split
# (presumably P(Survived == 1); confirm via titanic_pipe.classes_).
titanic_pipe.predict_proba(X_train)[:,1]

array([6.95033763e-01, 1.70219216e-01, 1.35386815e-01, 1.74325863e-01,
       1.20248961e-01, 1.64811778e-01, 1.16650927e-01, 6.82479805e-01,
       1.31737459e-01, 1.01778033e-01, 6.85459091e-01, 5.82423599e-01,
       2.10106204e-01, 8.16939367e-01, 6.82345042e-01, 1.10814579e-01,
       1.31617306e-01, 6.77485009e-02, 6.97628479e-01, 2.98152368e-01,
       4.37276926e-02, 1.33235065e-01, 5.32166947e-01, 4.45776617e-02,
       7.34149311e-01, 1.28910849e-01, 1.66395466e-01, 6.61161156e-01,
       8.29543104e-02, 8.30565385e-01, 4.36172240e-02, 1.08625005e-01,
       8.84039623e-01, 1.94906655e-01, 7.23392210e-01, 1.60808310e-01,
       1.28834883e-01, 1.57138155e-01, 5.17583883e-01, 2.95864776e-01,
       9.32795925e-01, 1.28775398e-01, 8.98205289e-01, 8.47758215e-01,
       9.50881091e-02, 1.60552276e-01, 5.58013268e-02, 1.16289669e-01,
       8.05988058e-01, 1.02130698e-01, 6.80715301e-02, 8.04797775e-02,
       6.70110346e-01, 6.61527186e-01, 6.97065336e-01, 1.55583828e-01,
      

In [81]:
# Per-class precision, recall and F1 for each split, computed from the fitted
# pipeline's hard predictions.

# Train set
class_train = titanic_pipe.predict(X_train)

train_report = classification_report(y_train, class_train)
print('Train Classification Report:')
print(train_report)

# Test set
class_test = titanic_pipe.predict(X_test)

test_report = classification_report(y_test, class_test)
print('Test Classification Report:')
print(test_report)

Train Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       439
           1       0.78      0.76      0.77       273

    accuracy                           0.82       712
   macro avg       0.82      0.81      0.81       712
weighted avg       0.82      0.82      0.82       712

Test Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       110
           1       0.74      0.77      0.75        69

    accuracy                           0.80       179
   macro avg       0.79      0.80      0.80       179
weighted avg       0.81      0.80      0.81       179

