<a href="https://colab.research.google.com/github/zhh25/Titanic/blob/main/titanic_data_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive in the Colab runtime; the data files read below live
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os
import sklearn
# Project folder on the mounted Google Drive.
path_root = '/content/drive/MyDrive/titanic'

# Read the three CSV files in one pass, in the order they are unpacked.
train, test, sample_submission = (
    pd.read_csv(os.path.join(path_root, relative_path))
    for relative_path in (
        'data/train.csv',
        'data/test.csv',
        'data/sample_submission.csv',
    )
)

# Separate the predictors from the target column of the training frame.
titanic = train.drop(columns='Survived')
titanic_label = train['Survived'].copy()

# Transformation Pipeline




In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import StandardScaler, Binarizer, OneHotEncoder, FunctionTransformer

from sklearn.compose import ColumnTransformer

from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import SimpleImputer, IterativeImputer

## Pipelines for individual attributes

### Pipeline for Name


In [None]:
# Lookup table that collapses the raw titles parsed from Name into six
# groups: Officer, Royalty, Mrs, Miss, Mr and Master.
newtitles = {
    **dict.fromkeys(["Capt", "Col", "Major", "Dr", "Rev"], "Officer"),
    **dict.fromkeys(
        ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"],
        "Royalty",
    ),
    **dict.fromkeys(["Mme", "Ms", "Mrs"], "Mrs"),
    **dict.fromkeys(["Mlle", "Miss"], "Miss"),
    "Mr": "Mr",
    "Master": "Master",
}

In [None]:
def name_title(x):
    """Return a one-column ``Title`` frame derived from passenger names.

    Each name is split on ``', '``; the second piece is then cut at its
    first ``'.'`` to obtain the raw title, which is mapped through
    ``newtitles``. Raw titles that are not keys of ``newtitles`` become NaN.
    """
    names = pd.DataFrame(x, columns=['Name'])
    after_comma = names['Name'].str.split(', ', expand=True)[1]
    raw_title = after_comma.str.split('.', expand=True)[0]
    names['Title'] = raw_title.map(newtitles)
    return names[['Title']]


In [None]:
# Name -> grouped title (name_title) -> one-hot encoded title columns.
pipeline_name = Pipeline(steps=[
    ('title', FunctionTransformer(func=name_title)),
    ('encode', OneHotEncoder()),
])

In [None]:
# Fit the Name pipeline on the training names and display the encoded matrix.
pipeline_name.fit_transform(titanic['Name'])


<891x6 sparse matrix of type '<class 'numpy.float64'>'
	with 891 stored elements in Compressed Sparse Row format>

In [None]:
# Labels for the one-hot title columns, in the order the fitted encoder
# stores its categories.
cols_name = [
    'name_' + title
    for title in pipeline_name.named_steps['encode'].categories_[0]
]
cols_name

['name_Master',
 'name_Miss',
 'name_Mr',
 'name_Mrs',
 'name_Officer',
 'name_Royalty']

### pipeline for Age

The following transformer creates two attributes using Age:
    - whether Age <= 8.3
    - whether Age > 64.1

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
#Age_ix, Name_ix = 0, 1
class AdderAge(BaseEstimator, TransformerMixin):
    """Add child / senior indicator columns computed from age values.

    Parameters
    ----------
    threshold_child : values less than or equal to this are flagged as child.
    threshold_senior : values strictly greater than this are flagged as senior.
    drop_age : when True, the input values are left out of the output and
        only the two indicator columns are returned.
    """

    def __init__(self, threshold_child=15, threshold_senior=48, drop_age=False):
        self.threshold_child = threshold_child
        self.threshold_senior = threshold_senior
        self.drop_age = drop_age

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` (unless ``drop_age``) followed by the two indicators."""
        is_child = X <= self.threshold_child
        is_senior = X > self.threshold_senior

        if self.drop_age:
            pieces = (is_child, is_senior)
        else:
            pieces = (X, is_child, is_senior)
        # np.c_ stacks the pieces side by side as columns of one array.
        return np.c_[pieces]
        

The following pipeline:
*   Fills missing values in Age. Using the median age of the whole data set is not a good choice; the median age of Pclass groups is the best choice because of Pclass's high correlation with Age (0.408106) and Survived (0.338481). Note that the code below actually imputes with the mean age of each Title group.
*   Creates two new attributes (child and senior).



In [None]:


def _age_with_title(x):
    """Return a frame with the Age column next to the grouped Title."""
    return pd.concat([x['Age'], name_title(x['Name'])], axis=1)


def _impute_age_by_title(x):
    """Fill missing Age values with the mean Age of the row's Title group.

    ``groupby(...).transform('mean')`` always returns a result aligned with
    the input index, so the row order is preserved on every pandas version
    (``groupby(...).apply`` returns a group-keyed, group-sorted result on
    pandas >= 2.0, which would scramble the rows).

    NOTE: the group means are computed from whatever frame is passed in, so
    at ``transform`` time they come from that data, not from the fitted data.
    """
    title_mean_age = x.groupby('Title')['Age'].transform('mean')
    return x['Age'].fillna(title_mean_age)


# Named functions (instead of lambdas) keep the pipeline picklable.
pipeline_age = Pipeline([
    ('title', FunctionTransformer(_age_with_title)),
    ('imputer_age', FunctionTransformer(_impute_age_by_title)),
    ('adder_age', AdderAge(threshold_child=8.3, threshold_senior=64.1)),
])

In [None]:
# Fit the Age pipeline on the training Age/Name columns and show the first
# 30 rows of the result.
x = pipeline_age.fit_transform(titanic[['Age', 'Name']])
x[:30,:]


array([[22.        ,  0.        ,  0.        ],
       [38.        ,  0.        ,  0.        ],
       [26.        ,  0.        ,  0.        ],
       [35.        ,  0.        ,  0.        ],
       [35.        ,  0.        ,  0.        ],
       [32.36809045,  0.        ,  0.        ],
       [54.        ,  0.        ,  0.        ],
       [ 2.        ,  1.        ,  0.        ],
       [27.        ,  0.        ,  0.        ],
       [14.        ,  0.        ,  0.        ],
       [ 4.        ,  1.        ,  0.        ],
       [58.        ,  0.        ,  0.        ],
       [20.        ,  0.        ,  0.        ],
       [39.        ,  0.        ,  0.        ],
       [14.        ,  0.        ,  0.        ],
       [55.        ,  0.        ,  0.        ],
       [ 2.        ,  1.        ,  0.        ],
       [32.36809045,  0.        ,  0.        ],
       [31.        ,  0.        ,  0.        ],
       [35.71818182,  0.        ,  0.        ],
       [35.        ,  0.        ,  0.   

In [None]:
# Labels for the three columns produced by pipeline_age.
cols_age = ['age', 'child', 'senior']

### Transformer SibSp and Parch

In [None]:
class AdderSibSpParch(BaseEstimator, TransformerMixin):
    """Build family-size features from the ``SibSp`` and ``Parch`` columns.

    ``transform`` returns four columns, in this order: the sum
    ``SibSp + Parch``, a recoded ``Parch``, a recoded ``SibSp`` and a recoded
    sum. Only the values 0..13 are listed in the recoding tables.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the family size and the three recoded columns as an array."""
        levels = list(range(14))  # the values 0..13 that get recoded
        family = X['SibSp'] + X['Parch']

        # Parch: 0 -> 0, 1-2 -> 1, 3-13 -> 2
        parch_codes = X['Parch'].replace(
            to_replace=levels,
            value=[0, 1, 1] + [2] * 11,
        )
        # SibSp: 0 -> 0, 1-2 -> 1, 3-13 -> -1
        sibsp_codes = X['SibSp'].replace(
            to_replace=levels,
            value=[0, 1, 1] + [-1] * 11,
        )
        # family: 0 -> 0, 1-3 -> 1, 4-13 -> -1
        family_codes = family.replace(
            to_replace=levels,
            value=[0, 1, 1, 1] + [-1] * 10,
        )

        return np.c_[family, parch_codes, sibsp_codes, family_codes]

In [None]:
# Run the transformer on the training SibSp/Parch columns and display the result.
AdderSibSpParch().fit_transform(titanic[['SibSp', 'Parch']])

array([[1, 0, 1, 1],
       [1, 0, 1, 1],
       [0, 0, 0, 0],
       ...,
       [3, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])

In [None]:
# Labels for the four columns returned by AdderSibSpParch.transform, in order.
cols_SibSp_Parch = ['family', 'Parch_recode', 'SibSp_recode', 'family_recode']

### Pipeline for Fare

In [None]:
# Fare: fill gaps with the median, apply log(1 + x), then standardize.
pipeline_fare = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(np.log1p)),
    ('standadize', StandardScaler()),
])

In [None]:
# Label for the single column produced by pipeline_fare.
cols_fare = ['fare']

### Pipeline for Cabin

In [None]:
def cabin_group(x):
    """Collapse single cabin letters into coarser groups.

    Letters A, B, C and T become ``'ABC'``, D and E become ``'DE'``, F and G
    become ``'FG'`` and ``'M'`` stays ``'M'``. Values that are not listed are
    returned unchanged.

    The result is a new DataFrame. The input is never modified, even when it
    is itself a DataFrame; the previous in-place ``replace`` could write
    through to the caller's data.
    """
    letter_groups = {
        'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
        'D': 'DE', 'E': 'DE',
        'F': 'FG', 'G': 'FG',
        'M': 'M',
        'T': 'ABC',
    }
    return pd.DataFrame(x).replace(letter_groups)
def _cabin_first_letter(x):
    """Return a one-column frame holding the first character of each row's first entry."""
    return pd.DataFrame([row[0][0] for row in x])


# Cabin: fill missing values with 'M', keep the first letter, merge letters
# into groups (cabin_group), then one-hot encode.
# A named function (instead of a lambda) keeps the pipeline picklable.
pipeline_cabin = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='M')),
    ('1st_letter', FunctionTransformer(_cabin_first_letter)),
    ('group', FunctionTransformer(cabin_group)),
    ('encoder', OneHotEncoder(drop='if_binary')),
])

In [None]:
# Disabled exploration: mean and count of Survived per cabin group.
#temp = train.copy()
#temp['Cabin'] = pipeline_cabin.fit_transform(titanic[['Cabin']])
#temp.groupby('Cabin')['Survived'].agg(['mean', 'count'])
# Fit the Cabin pipeline on the training data and display the encoded matrix.
pipeline_cabin.fit_transform(titanic[['Cabin']])

<891x4 sparse matrix of type '<class 'numpy.float64'>'
	with 891 stored elements in Compressed Sparse Row format>

In [None]:
# Labels for the one-hot cabin-group columns, in the order the fitted
# encoder stores its categories.
cols_cabin = [
    'cabin_' + group
    for group in pipeline_cabin.named_steps['encoder'].categories_[0]
]
cols_cabin

['cabin_ABC', 'cabin_DE', 'cabin_FG', 'cabin_M']


### Pipeline for Embarked

In [None]:
# Embarked: fill missing values with the most frequent one, then one-hot encode.
pipeline_embarked = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('1hot', OneHotEncoder()),
])

In [None]:
# Fit the Embarked pipeline on the training data and display the encoded matrix.
pipeline_embarked.fit_transform(titanic[['Embarked']])

<891x3 sparse matrix of type '<class 'numpy.float64'>'
	with 891 stored elements in Compressed Sparse Row format>

In [None]:
# Labels for the one-hot Embarked columns, in the order the fitted encoder
# stores its categories. The prefix was previously misspelled 'embarled_';
# any code selecting columns by the old names must be updated.
cols_embarked = list(pipeline_embarked.named_steps['1hot'].categories_[0])
cols_embarked = ['embarked_' + col for col in cols_embarked]
cols_embarked

['embarled_C', 'embarled_Q', 'embarled_S']

### Pipeline for Ticket

How is this feature different from Family_Size? Many passengers travelled in groups. Those groups consisted of friends, nannies, maids, etc. They weren't counted as family, but they used the same ticket.

In [None]:
# Count how many passengers share each ticket, over train and test together,
# so every ticket in either set has an entry.
titanic_all = pd.concat([titanic, test], ignore_index=True)
ticket_counts = dict(titanic_all['Ticket'].value_counts())


def _ticket_group_size(x):
    """Replace each ticket with the number of passengers sharing it.

    Uses ``Series.map`` (one dictionary lookup per row) rather than
    ``DataFrame.replace`` with a large dict, which is slow and depends on
    implicit downcasting to produce a numeric column. A ticket missing from
    ``ticket_counts`` becomes NaN.
    """
    return pd.DataFrame(x).apply(lambda col: col.map(ticket_counts))


# A named function (instead of a lambda) keeps the pipeline picklable.
pipeline_ticket = Pipeline([
    ('count', FunctionTransformer(_ticket_group_size)),
])

In [None]:
# Run the Ticket pipeline on the training tickets and display the counts.
pipeline_ticket.fit_transform(titanic['Ticket'])

Unnamed: 0,Ticket
0,1
1,2
2,1
3,2
4,1
...,...
886,1
887,1
888,4
889,1


In [None]:
# Label for the single column produced by pipeline_ticket.
cols_ticket = ['ticket']

## Full Pipeline for whole dataset

In [None]:
# Columns removed by the 'drop' entry of full_pipeline.
attribs_drop = ['PassengerId']
# Columns forwarded unchanged by the 'passthrough' entry of full_pipeline.
attribs_pass = ['SibSp', 'Parch', 'Pclass']

In [None]:
# One ColumnTransformer for the whole data set. The order of the entries
# fixes the order of the output columns.
full_pipeline = ColumnTransformer(transformers=[
    # columns removed / forwarded as they are
    ('drop', 'drop', attribs_drop),
    ('pass', 'passthrough', attribs_pass),
    # per-attribute feature engineering
    ('sex_1hot', OneHotEncoder(drop='if_binary'), ['Sex']),
    ('age', pipeline_age, ['Age', 'Name']),
    ('family', AdderSibSpParch(), ['SibSp', 'Parch']),
    ('fare', pipeline_fare, ['Fare']),
    ('cabin', pipeline_cabin, ['Cabin']),
    ('embarked_1hot', pipeline_embarked, ['Embarked']),
    ('ticket_count', pipeline_ticket, ['Ticket']),
    ('title', pipeline_name, ['Name']),
])

In [None]:
# Fit the full pipeline on the training features, then apply the fitted
# pipeline to the test set.
titanic_prepared = full_pipeline.fit_transform(titanic)
test_prepared = full_pipeline.transform(test)

In [None]:
# Display (rows, columns) of the prepared training matrix.
titanic_prepared.shape

(891, 26)

In [None]:
def attribs_names():
    """Return the labels of the prepared columns as one flat list.

    The groups are concatenated in the same order as the transformers of
    ``full_pipeline`` (the dropped columns contribute no label).
    """
    return [
        *attribs_pass,
        'sex',
        *cols_age,
        *cols_SibSp_Parch,
        *cols_fare,
        *cols_cabin,
        *cols_embarked,
        *cols_ticket,
        *cols_name,
    ]

In [None]:
# Display the full list of output column labels.
attribs_names()

['SibSp',
 'Parch',
 'Pclass',
 'sex',
 'age',
 'child',
 'senior',
 'family',
 'Parch_recode',
 'SibSp_recode',
 'family_recode',
 'fare',
 'cabin_ABC',
 'cabin_DE',
 'cabin_FG',
 'cabin_M',
 'embarled_C',
 'embarled_Q',
 'embarled_S',
 'ticket',
 'name_Master',
 'name_Miss',
 'name_Mr',
 'name_Mrs',
 'name_Officer',
 'name_Royalty']