In [2]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.validation import check_is_fitted

In [3]:
# Toy training frame: one categorical and one numeric column, each with one missing cell.
X_train = pd.DataFrame(
    {
        'col1': ['a', np.nan, 'b'],
        'col2': [1, 2, np.nan],
    }
)
# Single test row whose categorical value is missing.
X_test = pd.DataFrame({'col1': [np.nan], 'col2': [2]})
# One target label per training row.
y_train = pd.Series([10, 20, 30])

X_train

Unnamed: 0,col1,col2
0,a,1.0
1,,2.0
2,b,


In [4]:
# Numeric branch: replace missing values with the column mean learned during fit.
numeric_preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        #("scaler", StandardScaler()),
    ]
)

# Categorical branch: turn missing values into the explicit category "missing",
# then one-hot encode; categories unseen during fit are encoded as all zeros.
categorical_preprocessor = Pipeline(
    steps=[
        ("imputation_constant", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column to its branch; the outputs are stacked in the order listed here.
preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_preprocessor, ["col1"]),
        ("numerical", numeric_preprocessor, ["col2"]),
    ]
)

# The one-hot column names only exist once the encoder is fitted, so fit and transform
# in a single pass (instead of fitting twice) and read the names back afterwards.
X_train_transformed = preprocessor.fit_transform(X_train)
if hasattr(X_train_transformed, "toarray"):
    # ColumnTransformer may return a sparse matrix when the stacked output is mostly
    # zeros; densify it so it can be wrapped in a DataFrame.
    X_train_transformed = X_train_transformed.toarray()
cat_col_names_final = preprocessor.named_transformers_['categorical']['onehot'].get_feature_names_out(['col1'])
# Same order as the ColumnTransformer output: one-hot columns first, then the numeric column.
column_names = np.concatenate([cat_col_names_final, ['col2']])
pd.DataFrame(X_train_transformed, columns=column_names)

Unnamed: 0,col1_a,col1_b,col1_missing,col2
0,1.0,0.0,0.0,1.0
1,0.0,0.0,1.0,2.0
2,0.0,1.0,0.0,1.5


In [5]:
# Chain the preprocessing with the classifier so both are fitted together.
pipe = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=500),
)
pipe

In [6]:
# Pipeline.fit returns the fitted pipeline itself, so the prediction is chained onto it.
pipe.fit(X_train, y_train).predict(X_test)

array([20], dtype=int64)

### Custom Transformer

In [21]:
# Ten training rows: col1 has two missing categories, col2 has a single observed value.
X_train = pd.DataFrame(
    {
        'col1': ['a', 'b', 'c', np.nan, 'e', 'f', 'g', np.nan, 'i', 'j'],
        'col2': [1] + [np.nan] * 9,
    }
)
# Single test row with a category that also appears in the training data.
X_test = pd.DataFrame({'col1': ['j'], 'col2': [2]})
# Ten distinct labels: 1, 11, ..., 91.
y_train = pd.Series(range(1, 100, 10))

X_train

Unnamed: 0,col1,col2
0,a,1.0
1,b,
2,c,
3,,
4,e,
5,f,
6,g,
7,,
8,i,
9,j,


In [4]:
from sklearn.base import BaseEstimator, TransformerMixin # the former provides get_params() and set_params() methods and the latter provides the fit_transform() method

class handle_high_count_zero_columns(BaseEstimator, TransformerMixin):
  """Drop the columns whose share of zero-valued cells reaches a threshold.

  The columns to drop are decided once, in ``fit``, and the very same columns
  are removed by every later ``transform`` call. Deciding at transform time
  instead would let training and prediction data lose different columns
  (e.g. a one-row frame holding a single 0 is "100% zeros"), which breaks any
  downstream step that expects a fixed set of columns.

  Parameters
  ----------
  zero_pct_thresh : float, default=90
      Percentage (0-100) of zero cells at or above which a column is dropped.

  Attributes
  ----------
  columns_to_drop_ : list
      Labels of the columns found at or above the threshold during ``fit``.
  """

  def __init__(self, zero_pct_thresh=90):  # no *args or **kargs
    self.zero_pct_thresh = zero_pct_thresh

  def fit(self, X, y=None):
    """Learn which columns of ``X`` have too many zeros.

    Parameters
    ----------
    X : pandas.DataFrame
        Training data; the code relies on DataFrame comparison and ``.sum()``.
    y : ignored
        Present only for pipeline compatibility.

    Returns
    -------
    self
    """
    # Percentage of cells equal to zero in each column. NaN == 0 is False, so
    # missing cells are never counted as zeros.
    zero_pct_X = (X == 0).sum() * 100 / len(X)
    self.columns_to_drop_ = [
      col for col, pct in zero_pct_X.items() if pct >= self.zero_pct_thresh
    ]
    return self

  def transform(self, X, y=None):
    """Return ``X`` without the columns selected during ``fit``.

    Raises
    ------
    sklearn.exceptions.NotFittedError
        If ``fit`` has not been called yet.
    """
    check_is_fitted(self, "columns_to_drop_")
    return X[[col for col in X.columns if col not in self.columns_to_drop_]]

In [16]:
# Numeric branch: mean-impute missing values (the scaling step is currently disabled).
numeric_preprocessor = Pipeline(
    [
        ("imputation_mean", SimpleImputer(strategy="mean", missing_values=np.nan)),
        #("scaler", StandardScaler()),
    ]
)

# Categorical branch: label missing values as "missing", then one-hot encode.
categorical_preprocessor = Pipeline(
    [
        ("imputation_constant", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Send col1 through the categorical branch and col2 through the numeric one.
ColumnTransformers = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, ["col1"]),
        ("numerical", numeric_preprocessor, ["col2"]),
    ]
)

# Full preprocessing: the custom column filter runs first, then the per-column transformers.
preprocessors = Pipeline(
    [
        ('exclude high zero-count cols', handle_high_count_zero_columns()),
        ('column transformers', ColumnTransformers),
    ]
)

# Final estimator: preprocessing followed by the classifier.
pipe = make_pipeline(
    preprocessors,
    LogisticRegression(max_iter=500),
)
pipe

In [22]:
# Pipeline.fit returns the fitted pipeline itself, so the prediction is chained onto it.
pipe.fit(X_train, y_train).predict(X_test)

array([91], dtype=int64)