In [16]:
from ds_utils.ds_preamble import *
from ds_utils.ds_plotting import * 
from ds_utils.ds_helper import *
from time import time
from sklearn.compose import make_column_selector
import xgboost 

sklearn.set_config(transform_output="pandas", display='diagram')
# from sklearn.datasets import fetch_openml

# comprehensive example on pipeline

In [60]:
target = 'survived'
df = sns.load_dataset('titanic').drop(['alive','who', 'embark_town', 'deck', 'pclass', 'adult_male'], axis=1)
# y = df.survived
# df_train, df_test, y_train, y_test = train_test_split(df, y, test_size=0.3, random_state=1234)

# in fact, we don't need y, if y is a column of X
df_train, df_test = train_test_split(df, test_size=0.3, random_state=1234)

In [66]:
# EDA
df[:2]

df.dtypes
df.describe(include=['object','category'])

df.describe(exclude=['number'])

# df.parch.value_counts()
# df.sibsp.value_counts()

# df.sibsp.value_counts().plot.bar()
# plt.figure()
# df.parch.value_counts().plot.bar()

Unnamed: 0,survived,sex,age,sibsp,parch,fare,embarked,class,alone
0,0,male,22.0,1,0,7.25,S,Third,False
1,1,female,38.0,1,0,71.2833,C,First,False


survived       int64
sex           object
age          float64
sibsp          int64
parch          int64
fare         float64
embarked      object
class       category
alone           bool
dtype: object

Unnamed: 0,sex,embarked,class
count,891,889,891
unique,2,3,3
top,male,S,Third
freq,577,644,491


Unnamed: 0,sex,embarked,class,alone
count,891,889,891,891
unique,2,3,3,2
top,male,S,Third,True
freq,577,644,491,537


In [69]:
pipe_numeric = make_pipeline(
    SimpleImputer(strategy='median'),
)
pipe_nominal = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='if_binary', sparse_output=False),
)
pipe_ordinal = make_pipeline(
    OrdinalEncoder(categories=[['First','Second','Third']]),
)

col_transform = make_column_transformer(
    # (pipe_numeric, make_column_selector(dtype_include='number')),   # this will select `survived` too!
    (pipe_numeric, [col for col in df.select_dtypes('number') if col!='survived']),    
    (pipe_nominal, ['sex','embarked', 'alone']),
    (pipe_ordinal, ['class']),
    ('drop', ['survived']),
    verbose_feature_names_out=False,
    remainder='passthrough'
)

pipe = make_pipeline(
    col_transform,
    RFECV(RandomForestClassifier(max_depth=3, n_estimators=150), cv=2, verbose=10, min_features_to_select=5),
    StandardScaler(),
)

# pipe_xgb = make_pipeline(
#     pipe,
#     xgboost.XGBClassifier()
# )
# pipe

pipe
pipe.steps # show the name of each step

pipe.fit_transform(df_train, df_train[target])[:2] # pipe will be fitted afterwards


[('columntransformer',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('pipeline-1',
                                   Pipeline(steps=[('simpleimputer',
                                                    SimpleImputer(strategy='median'))]),
                                   ['age', 'sibsp', 'parch', 'fare']),
                                  ('pipeline-2',
                                   Pipeline(steps=[('simpleimputer',
                                                    SimpleImputer(strategy='most_frequent')),
                                                   ('onehotencoder',
                                                    OneHotEncoder(drop='if_binary',
                                                                  sparse_output=False))]),
                                   ['sex', 'embarked', 'alone']),
                                  ('pipeline-3',
                                   Pipeline(steps=[('ordinalencoder',
             

Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 10 features.


Unnamed: 0,age,sibsp,parch,fare,sex_male,embarked_C,embarked_S,alone_True,class
28,-0.089611,-0.488795,-0.465805,-0.488003,-1.320793,-0.468165,-1.672956,0.802371,0.817109
270,-0.089611,-0.488795,-0.465805,-0.017899,0.757121,-0.468165,0.597744,0.802371,-1.567241


In [70]:
# access steps after fitting

# ------------------- get feature names after preprocessing ------------------- #
feat_b4 = pipe['rfecv'].feature_names_in_
feat_after = pipe['rfecv'].get_feature_names_out()
print(f"before feature selection: \n{feat_b4}")
print(f"after feature selection: \n{feat_after}")
print(f"removed features:\n {[feat for feat in feat_b4 if feat not in feat_after]}")

# ------------------------ access statistics of a step ----------------------- #
pipe['standardscaler'].mean_
pipe['standardscaler'].scale_

# ------------ access statistics of a step of a columntransformer ------------ #
pipe['columntransformer'].named_transformers_['pipeline-2']['onehotencoder'].categories_

before feature selection: 
['age' 'sibsp' 'parch' 'fare' 'sex_male' 'embarked_C' 'embarked_Q'
 'embarked_S' 'alone_True' 'class']
after feature selection: 
['age' 'sibsp' 'parch' 'fare' 'sex_male' 'embarked_C' 'embarked_S'
 'alone_True' 'class']
removed features:
 ['embarked_Q']


array([29.18767255,  0.51203852,  0.39165329, 31.8803366 ,  0.63563403,
        0.17977528,  0.73675762,  0.60834671,  1.31460674])

array([13.25369124,  1.04755292,  0.84080963, 49.18240037,  0.48125192,
        0.38400017,  0.44039281,  0.48811985,  0.8388032 ])

[array(['female', 'male'], dtype=object),
 array(['C', 'Q', 'S'], dtype=object),
 array([False, True], dtype=object)]

In [54]:
# ------------------ add a new step to an existing pipeline ------------------ #
pipe.steps.append(['xgb',xgboost.XGBClassifier()])

In [87]:
# ------------------------------- access a step ------------------------------ #
pipe.steps
pipe.named_steps
type(pipe.steps)
type(pipe.named_steps)


pipe[-1]
pipe.steps[-1]
pipe.named_steps['standardscaler']
pipe.named_steps.standardscaler


[('columntransformer',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('pipeline-1',
                                   Pipeline(steps=[('simpleimputer',
                                                    SimpleImputer(strategy='median'))]),
                                   ['age', 'sibsp', 'parch', 'fare']),
                                  ('pipeline-2',
                                   Pipeline(steps=[('simpleimputer',
                                                    SimpleImputer(strategy='most_frequent')),
                                                   ('onehotencoder',
                                                    OneHotEncoder(drop='if_binary',
                                                                  sparse_output=False))]),
                                   ['sex', 'embarked', 'alone']),
                                  ('pipeline-3',
                                   Pipeline(steps=[('ordinalencoder',
             

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline-1',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='median'))]),
                                  ['age', 'sibsp', 'parch', 'fare']),
                                 ('pipeline-2',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('onehotencoder',
                                                   OneHotEncoder(drop='if_binary',
                                                                 sparse_output=False))]),
                                  ['sex', 'embarked', 'alone']),
                                 ('pipeline-3',
                                  Pipeline(steps=[('ordinalencoder',
                             

list

sklearn.utils._bunch.Bunch

('standardscaler', StandardScaler())

In [73]:
# -------------------------- disable StandardScaler -------------------------- #
pipe.set_params(standardscaler='passthrough').transform(df_train)[:2] # for disable step, no need to fit again
pipe.steps[-1] # standardscaler step will be changed to 'passthrough'
pipe['standardscaler'] # standardscaler step will be changed to 'passthrough'

# pipe.set_params(standardscaler=StandardScaler()).transform(df_train) # error, since StandardScaler is not fitted

# -------------------------- re-enable StandardScaler ------------------------- #
pipe.set_params(standardscaler=StandardScaler()).fit_transform(df_train, df_train[target])[:2]
pipe.steps[-1] # standardscaler step will be changed to 'passthrough'

Unnamed: 0,age,sibsp,parch,fare,sex_male,embarked_C,embarked_Q,embarked_S,alone_True,class
28,28.0,0.0,0.0,7.8792,0.0,0.0,1.0,0.0,1.0,2.0
270,28.0,0.0,0.0,31.0,1.0,0.0,0.0,1.0,1.0,0.0


('standardscaler', 'passthrough')

'passthrough'

Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.


Unnamed: 0,age,sibsp,parch,fare,sex_male,embarked_C,embarked_Q,embarked_S,alone_True,class
28,-0.089611,-0.488795,-0.465805,-0.488003,-1.320793,-0.468165,3.313724,-1.672956,0.802371,0.817109
270,-0.089611,-0.488795,-0.465805,-0.017899,0.757121,-0.468165,-0.301775,0.597744,0.802371,-1.567241


('standardscaler', StandardScaler())