In [123]:
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline, FunctionTransformer, FeatureUnion, make_union
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2,SelectKBest
import numpy as  np
import pandas as pd

In [None]:
df = pd.DataFrame(
        {
            'Name':['Trump', 'Swift', 'Wenlei'] ,   
            'Sex': ['M', 'F', 'M'],
            'Age':[80, 20, 50],
            'Motto':["Tariff is beautiful", "Be good to people", "Everyone should have a dream"]            

        }
)
df.head()

Unnamed: 0,Name,Sex,Age,Motto
0,Trump,M,80,Tariff is beautiful
1,Swift,F,20,Be good to people
2,Wenlei,M,50,Everyone should have a dream


In [9]:
#test create post ETL process function,  then num_pipeline(Age), cat_pipeline (Name, Sex), txt_pipeline(Motto)
def update_age(X,y=None):
    X['Age'] = X['Age'] + 1
    return X

#make it transformer

transformer_update_age = FunctionTransformer(update_age)

In [10]:
#test transformer
transformer_update_age.fit_transform(df)

Unnamed: 0,Name,Sex,Age,Motto
0,Trump,M,81,Tariff is beautiful
1,Swift,F,21,Be good to people
2,Wenlei,M,51,Everyone should have a dream


In [11]:
#define 
num_columns = ['Age']
cat_columns =['Name', "Sex"]
Txt_columns =['Motto']

In [None]:
#build each pipeline and test
num_pipeline = make_pipeline(  SimpleImputer(strategy='median'), StandardScaler())     
num_pipeline.fit_transform(df[num_columns])

array([[ 1.22474487],
       [-1.22474487],
       [ 0.        ]])

In [15]:
cat_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value='Missing'), OneHotEncoder(handle_unknown='ignore'))
cat_pipeline.fit_transform(df[cat_columns])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6 stored elements and shape (3, 5)>

In [17]:
#first check if it works if I do this
pipe1 = make_pipeline(transformer_update_age, make_column_transformer(
        (num_pipeline, num_columns),
        (cat_pipeline, cat_columns),
))

In [18]:
pipe1.fit_transform(df)

array([[ 1.22474487,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ],
       [-1.22474487,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ]])

In [19]:
df.head()

Unnamed: 0,Name,Sex,Age,Motto
0,Trump,M,82,Tariff is beautiful
1,Swift,F,22,Be good to people
2,Wenlei,M,52,Everyone should have a dream


In [None]:
#sometime we need dynamic choose column, for example, when tuning the model, we might want to choose which column to use to flow downstream
#use index
pipe2 = make_pipeline(transformer_update_age, make_column_transformer(
        (num_pipeline, [2]),
        (cat_pipeline, [1]),
))
pipe2.fit_transform(df)

#use slice
pipe2 = make_pipeline(transformer_update_age, make_column_transformer(
        (num_pipeline, [2]),
        (cat_pipeline, slice(0,2)),
))
pipe2.fit_transform(df)

#use mask as a boolean list
cat_mask =  [  c for c in df.columns if c =='Name'] 
pipe2 = make_pipeline(transformer_update_age, make_column_transformer(
        (num_pipeline, [2]),
        (cat_pipeline, cat_mask),
))
pipe2.fit_transform(df)
#use name regex pattern
pipe2 = make_pipeline(transformer_update_age, make_column_transformer(
        (num_pipeline, [2]),
        (cat_pipeline, make_column_selector(pattern = 'N.*')),
))
pipe2.fit_transform(df)
#use datatype include, exclude 
pipe2 = make_pipeline(transformer_update_age, make_column_transformer(
        (num_pipeline, make_column_selector(dtype_include='number')),
        (cat_pipeline, cat_columns),
))
pipe2.fit_transform(df)
#exclude
pipe2 = make_pipeline(transformer_update_age, make_column_transformer(
        (num_pipeline, make_column_selector(dtype_exclude='object')),
        (cat_pipeline, cat_columns),
))
pipe2.fit_transform(df)


array([[ 1.22474487,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ],
       [-1.22474487,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ]])

In [39]:
df['Motto'].to_numpy().reshape(-1, 1).ravel()

array(['Tariff is beautiful', 'Be good to people',
       'Everyone should have a dream'], dtype=object)

In [49]:
#what if we add txt feature
#https://stackoverflow.com/questions/71805720/using-countvectorizer-with-pipeline-and-columntransformer-and-getting-attributee
txt1_pipeline =  make_pipeline ( 
                    SimpleImputer(strategy='constant', fill_value='Missing'),                   #title   
                    #CountVectorizer(),
                    # SelectKBest(chi2,k=5), 
                    # TfidfTransformer(use_idf=True)
                    )

txt1_pipeline.fit_transform(df['Motto'].to_frame())

array([['Tariff is beautiful'],
       ['Be good to people'],
       ['Everyone should have a dream']], dtype=object)

In [52]:
txt1_pipeline =  make_pipeline ( 
                    SimpleImputer(strategy='constant', fill_value='Missing'),                   #title   
                    CountVectorizer(),
                    # SelectKBest(chi2,k=5), 
                    # TfidfTransformer(use_idf=True)
                    )

txt1_pipeline.fit_transform(df['Motto'].to_frame())

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [56]:
impute_result = SimpleImputer(strategy='constant', fill_value='Missing').fit_transform(df['Motto'].to_frame())
impute_result

array([['Tariff is beautiful'],
       ['Be good to people'],
       ['Everyone should have a dream']], dtype=object)

In [57]:
#https://stackoverflow.com/questions/26367075/countvectorizer-attributeerror-numpy-ndarray-object-has-no-attribute-lower
CountVectorizer().fit_transform(impute_result.ravel())

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 11 stored elements and shape (3, 11)>

In [64]:
#create 1DWrapper
class oneDwrapper ( BaseEstimator, TransformerMixin):
     #convert to 1d only one bracket#     
     def __init__(self, transformer):
        self.transformer = transformer

     def fit (self, X, y=None ):
         self.transformer.fit(X, y)
         return self
     
     def transform(self, X):
         return np.array(self.transformer.transform(X)).ravel()       
    
    


In [None]:
txt1_pipeline =  make_pipeline ( 
                    oneDwrapper(SimpleImputer(strategy='constant', fill_value='Missing')),                   
                    CountVectorizer(),
                    # SelectKBest(chi2,k=5), 
                    # TfidfTransformer(use_idf=True)
                    )

txt1_pipeline.fit_transform(df['Motto'].to_frame())

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 11 stored elements and shape (3, 11)>

In [70]:
print(txt1_pipeline.fit_transform(df['Motto'].to_frame()))

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 11 stored elements and shape (3, 11)>
  Coords	Values
  (0, 9)	1
  (0, 6)	1
  (0, 1)	1
  (1, 0)	1
  (1, 4)	1
  (1, 10)	1
  (1, 7)	1
  (2, 3)	1
  (2, 8)	1
  (2, 5)	1
  (2, 2)	1


In [75]:
#https://github.com/scikit-learn/scikit-learn/issues/28791 
txt1_pipeline =  make_pipeline ( 
                    oneDwrapper(SimpleImputer(strategy='constant', fill_value='Missing')),                   #title   
                    CountVectorizer(),
                    SelectKBest(chi2,k=5), 
                    
                    )

txt1_pipeline.fit_transform(df['Motto'].to_frame(), y=[1, 2, 3])  #chi2 need y, I put abitary y value here

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 5 stored elements and shape (3, 5)>

In [None]:
txt1_pipeline =  make_pipeline ( 
                    oneDwrapper(SimpleImputer(strategy='constant', fill_value='Missing')),                   #title   
                    CountVectorizer(),
                    SelectKBest(chi2,k=5), 
                    TfidfTransformer(use_idf=True)
                    )

txt1_pipeline.fit_transform(df['Motto'].to_frame(), y=[1, 2, 3])    #single pipeline work, it also take df, then convert to single array in oneDwrapper

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5 stored elements and shape (3, 5)>

In [79]:
#put together with other component
#make pipeline with txt_pipe
pipe3 = make_pipeline(transformer_update_age, make_column_transformer(
        (num_pipeline, num_columns),
        (cat_pipeline, cat_columns),
        (txt1_pipeline, Txt_columns)
))
pipe3.fit_transform(df, [1, 2, 3])  #[1, 2, 3] simulated y, which required when training

array([[ 1.22474487,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.70710678,  0.        ,  0.        ,  0.70710678,
         0.        ],
       [-1.22474487,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.70710678,  0.        ,  0.        ,
         0.70710678],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ]])

In [81]:
#now by default, transform output is array, what impact would be if I turn the deault to pandas
#change global 
from sklearn import set_config
set_config(transform_output="pandas")
pipe3.fit_transform(df, [1, 2, 3]) 


ValueError: Pandas output does not support sparse data. Set sparse_output=False to output pandas dataframes or disable Pandas output via` ohe.set_output(transform="default").

In [82]:
#what if I change onehot encode to dense
cat_pipeline1 = make_pipeline(SimpleImputer(strategy='constant', fill_value='Missing'), 
                             OneHotEncoder(handle_unknown='ignore', sparse_output=False))
pipe4 = make_pipeline(transformer_update_age, make_column_transformer(
        (num_pipeline, num_columns),
        (cat_pipeline1, cat_columns),
        (txt1_pipeline, Txt_columns)
))
pipe4.fit_transform(df, [1, 2, 3])  #[1, 2, 3] simulated y, which required when training


ValueError: The transformer outputs a scipy sparse matrix. Try to set the transformer output to a dense array or disable Pandas output with set_output(transform='default').

In [None]:
#looks it is also from txt_pipeline, since it is not easily changable, I will keep use default format
#the good about use pandas format is quick way to retrieve the input feature name
#assume pipeline end is clf, use clf[-1].feature_names_in_

In [83]:
#convert back to default
set_config(transform_output="default")
pipe3.fit_transform(df, [1, 2, 3]) 


array([[ 1.22474487,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.70710678,  0.        ,  0.        ,  0.70710678,
         0.        ],
       [-1.22474487,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.70710678,  0.        ,  0.        ,
         0.70710678],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ]])

In [85]:
#pipe3 structure
pipe3 

In [90]:
pipe3[-1].feature_names_in_

array(['Name', 'Sex', 'Age', 'Motto'], dtype=object)

In [86]:
#how to retrieve featur name
pipe3.get_feature_names_out()

AttributeError: Estimator functiontransformer does not provide get_feature_names_out. Did you mean to call pipeline[:-1].get_feature_names_out()?

In [95]:
print({k:v for k, v in pipe3.named_steps.items()})

{'functiontransformer': FunctionTransformer(func=<function update_age at 0x000002248F29FCE0>), 'columntransformer': ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['Age']),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='Missing',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['Name', 'Sex']),
     

In [101]:
#https://stackoverflow.com/questions/54646709/sklearn-pipeline-get-feature-names-after-onehotencode-in-columntransformer
pipe3.named_steps['columntransformer']

In [108]:
pipe3.named_steps['columntransformer'].get_params()

{'force_int_remainder_cols': True,
 'n_jobs': None,
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'transformer_weights': None,
 'transformers': [('pipeline-1',
   Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                   ('standardscaler', StandardScaler())]),
   ['Age']),
  ('pipeline-2',
   Pipeline(steps=[('simpleimputer',
                    SimpleImputer(fill_value='Missing', strategy='constant')),
                   ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))]),
   ['Name', 'Sex']),
  ('pipeline-3',
   Pipeline(steps=[('onedwrapper',
                    oneDwrapper(transformer=SimpleImputer(fill_value='Missing',
                                                          strategy='constant'))),
                   ('countvectorizer', CountVectorizer()),
                   ('selectkbest',
                    SelectKBest(k=5,
                                score_func=<function chi2 at 0x00000224907996C0>)),
                   ('tfidftran

In [109]:
pipe3.named_steps['columntransformer'].named_transformers_['pipeline-1']

In [110]:
pipe3.named_steps['columntransformer'].named_transformers_['pipeline-1'].named_steps['simpleimputer'].get_feature_names_out()

array(['Age'], dtype=object)

In [112]:
pipe3.named_steps['columntransformer'].named_transformers_['pipeline-3']

In [115]:
pipe3.named_steps['columntransformer'].named_transformers_['pipeline-3'].get_params()

{'memory': None,
 'steps': [('onedwrapper',
   oneDwrapper(transformer=SimpleImputer(fill_value='Missing',
                                         strategy='constant'))),
  ('countvectorizer', CountVectorizer()),
  ('selectkbest',
   SelectKBest(k=5, score_func=<function chi2 at 0x00000224907996C0>)),
  ('tfidftransformer', TfidfTransformer())],
 'verbose': False,
 'onedwrapper': oneDwrapper(transformer=SimpleImputer(fill_value='Missing',
                                       strategy='constant')),
 'countvectorizer': CountVectorizer(),
 'selectkbest': SelectKBest(k=5, score_func=<function chi2 at 0x00000224907996C0>),
 'tfidftransformer': TfidfTransformer(),
 'onedwrapper__transformer__add_indicator': False,
 'onedwrapper__transformer__copy': True,
 'onedwrapper__transformer__fill_value': 'Missing',
 'onedwrapper__transformer__keep_empty_features': False,
 'onedwrapper__transformer__missing_values': nan,
 'onedwrapper__transformer__strategy': 'constant',
 'onedwrapper__transformer':

In [None]:
pipe3.named_steps['columntransformer'].named_transformers_['pipeline-3'].named_steps['selectkbest']

In [117]:
pipe3.named_steps['columntransformer'].named_transformers_['pipeline-3'].named_steps['selectkbest'].get_feature_names_out()

array(['x6', 'x7', 'x8', 'x9', 'x10'], dtype=object)

In [118]:
pipe3.named_steps['columntransformer'].named_transformers_['pipeline-3'].named_steps['tfidftransformer'].get_feature_names_out()

array(['x0', 'x1', 'x2', 'x3', 'x4'], dtype=object)

In [119]:
class ExcludeColumnSelector(BaseEstimator, TransformerMixin):
    '''select specific columns of a given dataset
    
    num_pipeline = make_pipeline (
    ExcludeColumnSelector(oh_columns),
    SimpleImputer(strategy="median"),    
    Pipeline_winsorize(), #powertransform()
    StandardScaler())

    oh_pipeline = make_pipeline(
        IncludeColumnSelector(oh_columns),
        SimpleImputer(strategy="median"),
    )

    preprocessor = make_union (num_pipeline, oh_pipeline)
        
    '''
    def __init__(self, subset):
        self.subset = subset
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.loc[:, ~X.columns.isin(self.subset)]
    

class IncludeColumnSelector(BaseEstimator, TransformerMixin):
    '''select specific columns of a given dataset'''
    def __init__(self, subset):
        self.subset = subset
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        #check if X contain the column in subset 
        column_list = [ i for i in X.columns if i in self.subset]
        return X.loc[:, column_list]

In [120]:
#more complicated senarior, first select text column use make uniono

txt_pipeline2 = make_pipeline(IncludeColumnSelector(['Motto']), txt1_pipeline)
txt_pipeline2

In [122]:
non_txt_pipeline2 = make_pipeline(ExcludeColumnSelector(['Motto']), make_column_transformer((num_pipeline, num_columns),(cat_pipeline, cat_columns)))
non_txt_pipeline2

In [124]:
#combine_all
pipe_all = make_pipeline(transformer_update_age, make_union(txt_pipeline2, non_txt_pipeline2))
pipe_all 

In [125]:
pipe_all.fit_transform(df, [1, 2, 3]) 

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 13 stored elements and shape (3, 11)>