<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span></li><li><span><a href="#Data-loading" data-toc-modified-id="Data-loading-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data loading</a></span></li><li><span><a href="#Pipeline-building" data-toc-modified-id="Pipeline-building-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Pipeline building</a></span></li><li><span><a href="#Model-training" data-toc-modified-id="Model-training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model training</a></span></li><li><span><a href="#Grid-Search" data-toc-modified-id="Grid-Search-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Grid Search</a></span></li><li><span><a href="#An-even-easier-API-make_pipeline" data-toc-modified-id="An-even-easier-API-make_pipeline-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>An even easier API <code>make_pipeline</code></a></span></li></ul></div>

# Introduction

Using an ML pipeline has major benefits: it drastically simplifies an ML project by combining data preprocessing, feature engineering, and the ML model into a single, unified framework.


In [1]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid

# Data loading

In [2]:
# `np` and `fetch_openml` are already imported in the import cell at the top of
# the notebook, so they are not imported again here.

# Seed NumPy's global random number generator for repeatable runs.
np.random.seed(0)

# Load the Titanic data from https://www.openml.org/d/40945
# X holds the feature columns as a DataFrame; y holds the prediction target.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Preview the four columns that the pipeline below uses as model inputs.
X[["age", "fare", "sex", "pclass"]]

Unnamed: 0,age,fare,sex,pclass
0,29.0000,211.3375,female,1.0
1,0.9167,151.5500,male,1.0
2,2.0000,151.5500,female,1.0
3,30.0000,151.5500,male,1.0
4,25.0000,151.5500,female,1.0
...,...,...,...,...
1304,14.5000,14.4542,female,3.0
1305,,14.4542,female,3.0
1306,26.5000,7.2250,male,3.0
1307,27.0000,7.2250,male,3.0


In [3]:
# Summarize each column's dtype and non-null count to see where values are missing.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   float64 
 1   name       1309 non-null   object  
 2   sex        1309 non-null   category
 3   age        1046 non-null   float64 
 4   sibsp      1309 non-null   float64 
 5   parch      1309 non-null   float64 
 6   ticket     1309 non-null   object  
 7   fare       1308 non-null   float64 
 8   cabin      295 non-null    object  
 9   embarked   1307 non-null   category
 10  boat       486 non-null    object  
 11  body       121 non-null    float64 
 12  home.dest  745 non-null    object  
dtypes: category(2), float64(6), object(5)
memory usage: 115.4+ KB


# Pipeline building

In [4]:
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: one-hot encode; handle_unknown="ignore" keeps the encoder
# from raising on categories it did not see while fitting.
categorical_features = ["sex", "pclass"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Send each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    [
        ("num_transform", numeric_transformer, numeric_features),
        ("cat_transform", categorical_transformer, categorical_features),
    ]
)

# Chain preprocessing and the classifier into one estimator.
# NOTE: the step name is spelled "preprocesser"; later cells look the step up
# by exactly this key, so it must stay as is.
pipeline = Pipeline(
    [
        ("preprocesser", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)
pipeline

Pipeline(steps=[('preprocesser',
                 ColumnTransformer(transformers=[('num_transform',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
                                                 ('cat_transform',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['sex', 'pclass'])])),
                ('classifier', LogisticRegression())])

# Model training

In [5]:
from sklearn import set_config
# Switch scikit-learn's estimator display to 'diagram' mode, then show the pipeline again.
set_config(display='diagram')
pipeline

In [6]:
# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit the whole pipeline on the training rows, then score it on the held-out rows.
score = pipeline.fit(X_train, y_train).score(X_test, y_test)
print(score)

0.7977099236641222


In [7]:
# To persist the fitted pipeline, serialize it to disk with joblib.
import joblib

pipeline_path = 'your_pipeline.pkl'
joblib.dump(pipeline, pipeline_path)

# Load it back and score it again to verify the round trip.
restored_pipeline = joblib.load(pipeline_path)
restored_pipeline.score(X_test, y_test)


0.7977099236641222

In [9]:
# Look up the preprocessing step by the name it was given when the pipeline was built.
pipeline.named_steps["preprocesser"]

In [10]:
# The preprocessor was already fitted on X_train by pipeline.fit(), so only
# transform() is needed to inspect its output. Calling fit_transform() here would
# re-fit (and so mutate) a step of the trained pipeline for no benefit.
transformed_data = pipeline.named_steps["preprocesser"].transform(X_train)

In [11]:
import pandas as pd

# Hand-written labels for the transformed matrix: the two scaled numeric columns
# come first, followed by the one-hot columns for sex (2) and pclass (3).
# NOTE(review): these labels assume that column order and count — confirm if the
# preprocessor or the data changes.
transformed_columns = ["age", "fare","sex1","sex2", "pclass1", "pclass2", "pclass3"]
pd.DataFrame(transformed_data, columns=transformed_columns)

Unnamed: 0,age,fare,sex1,sex2,pclass1,pclass2,pclass3
0,-0.403163,-0.505177,0.0,1.0,0.0,0.0,1.0
1,0.873724,1.971329,1.0,0.0,1.0,0.0,0.0
2,0.000000,-0.508928,0.0,1.0,0.0,0.0,1.0
3,-0.961801,-0.508112,1.0,0.0,0.0,0.0,1.0
4,-0.083941,-0.249358,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...
1042,-2.384996,-0.257673,1.0,0.0,0.0,0.0,1.0
1043,0.000000,-0.502731,0.0,1.0,0.0,0.0,1.0
1044,0.000000,-0.508928,1.0,0.0,0.0,0.0,1.0
1045,-0.802191,0.058799,1.0,0.0,0.0,1.0,0.0


In [12]:
# Show the raw training rows for the same four features, to compare with the transformed values.
X_train[["age", "fare","sex", "pclass"]]

Unnamed: 0,age,fare,sex,pclass
1118,25.0000,7.9250,male,3.0
44,41.0000,134.5000,female,1.0
1072,,7.7333,male,3.0
1130,18.0000,7.7750,female,3.0
574,29.0000,21.0000,male,2.0
...,...,...,...,...
763,0.1667,20.5750,female,3.0
835,,8.0500,male,3.0
1216,,7.7333,female,3.0
559,20.0000,36.7500,female,2.0


# Grid Search

In [13]:
# Two alternative sub-grids. Each one swaps a different estimator into the
# "classifier" step and tunes that estimator's own hyperparameter together with
# the imputation strategy of the numeric branch.
param_grid = [
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "classifier__C": [0.1, 1.0, 10.0, 100.0],
        "classifier": [LogisticRegression()]
    },
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "classifier__n_estimators": [10, 100, 1000],
        # Pin the forest's seed so repeated searches give the same scores. The
        # global np.random.seed() call is not a dependable substitute,
        # particularly when the fits run in parallel worker processes.
        "classifier": [RandomForestClassifier(random_state=0)]
    }
]

In [None]:
# Evaluate every candidate in param_grid with 10-fold cross-validation,
# running the fits in parallel (n_jobs=-1) and reporting progress (verbose=1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    4.1s


In [None]:
# Report the winning parameter combination and its cross-validation score.
print("Best params:")
print(grid_search.best_params_)
print("Best score in grid search:")
print(grid_search.best_score_)
# The search covers both logistic regression and random forest, so the winner is
# not necessarily a logistic regression; the value below is its score on the
# held-out test set.
print("Test score of best estimator from grid search:")
print(grid_search.score(X_test, y_test))

# An even easier API `make_pipeline`

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# The make_* helpers take the estimators directly; no step names are passed here,
# unlike the explicit (name, estimator) tuples used earlier in the notebook.
categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
numeric_transformer = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

# Pair each transformer with the columns it should process.
col_transformer = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
)

# Rebuild the full model (this rebinds `pipeline`), train it, and score it on the test split.
pipeline = make_pipeline(col_transformer, LogisticRegression())
pipeline.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Display the pipeline that was built with make_pipeline.
pipeline