# Pipelines

In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier

In [2]:
from sklearn.pipeline import Pipeline, make_pipeline

In [3]:
# Numeric-only baseline: fill missing values with the column mean,
# standardize every feature, then classify with k-nearest neighbours.
# make_pipeline names each step after its lowercased class name.
baseline_steps = (
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    KNeighborsClassifier(),
)
pipeline = make_pipeline(*baseline_steps)

In [4]:
# Load the Titanic training set, then separate the features from the target.
df = pd.read_csv('../data/titanic_train.csv')

# Columns excluded from the feature matrix (the target is among them).
non_feature_cols = ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin']
df_X = df.drop(columns=non_feature_cols)
df_Y = df['Survived']

In [5]:
# Keep only the numeric feature columns: the baseline pipeline imputes with
# the column mean and standardizes, which is only defined for numbers.
# select_dtypes(include='number') names the wanted dtypes explicitly, so text
# columns are left out whether pandas stores them as `object` or as a
# dedicated string dtype (a `dtypes != object` mask lets the latter through).
numeric_X = df_X.select_dtypes(include='number')

X_train, X_test, y_train, y_test = train_test_split(
    numeric_X,
    df_Y,
    random_state=42,  # fixed seed so the split is reproducible
)

In [6]:
# Preview the first rows of the training features (non-object columns only at this point).
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
298,1,,0,0,30.5
884,3,25.0,0,0,7.05
247,2,24.0,0,2,14.5
478,3,22.0,0,0,7.5208
305,1,0.92,1,2,151.55


In [7]:
# Fit on the DataFrame itself rather than on `.values`: the evaluation cell
# calls predict() with a DataFrame, and scikit-learn (>= 1.0) warns when the
# input at predict time has feature names but the input at fit time did not.
# Passing the same container type to fit() and predict() keeps them consistent.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [8]:
# Score the baseline on the held-out split and print the per-class metrics.
baseline_pred = pipeline.predict(X_test)
baseline_report = classification_report(y_test, baseline_pred)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.74      0.76      0.75       134
           1       0.63      0.61      0.62        89

    accuracy                           0.70       223
   macro avg       0.69      0.68      0.68       223
weighted avg       0.70      0.70      0.70       223



# ColumnTransformer

In [9]:
from sklearn.compose import ColumnTransformer

In [10]:
# Re-split, this time keeping every feature column (numeric and text alike);
# the random_state matches the one used for the earlier numeric-only split.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_Y,
    random_state=42,
)

In [11]:
# List the feature columns that remain after the drop performed when df_X was built.
df_X.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [12]:
# Preview the full feature frame, including the text-valued Sex and Embarked columns.
df_X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [13]:
# Per-column preprocessing: each sub-pipeline below is applied only to the
# columns it is paired with in the ColumnTransformer.

# Fare: log1p transform (expm1 registered as its inverse), then standardize.
fare_pipe = make_pipeline(
    FunctionTransformer(func=np.log1p, inverse_func=np.expm1, validate=False),
    StandardScaler(),
)

# Other numeric columns: mean-impute, then standardize.
numeric_pipe = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# Sex: fill gaps with the most frequent value, then encode as ordinal codes.
sex_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OrdinalEncoder())

# Embarked: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
embarked_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

column_transformer = ColumnTransformer(
    transformers=[
        ('fare', fare_pipe, ['Fare']),
        ('numeric_cols', numeric_pipe, ['Pclass', 'Age', 'SibSp', 'Parch']),
        ('sex', sex_pipe, ['Sex']),
        ('embarked', embarked_pipe, ['Embarked']),
    ]
)

# Full model: column-wise preprocessing followed by a k-nearest-neighbours
# classifier. This rebinds `pipeline`, replacing the numeric-only baseline.
knn_classifier = KNeighborsClassifier()
pipeline_steps = [('transformers', column_transformer), ('classifier', knn_classifier)]

pipeline = Pipeline(steps=pipeline_steps)

In [14]:
# Train on the mixed-type DataFrame (the ColumnTransformer picks columns by
# name), then print the per-class metrics for the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85       134
           1       0.79      0.75      0.77        89

    accuracy                           0.82       223
   macro avg       0.81      0.81      0.81       223
weighted avg       0.82      0.82      0.82       223

