# In [1]:
from collections.abc import Callable
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# In [2]:
@dataclass
class PreprocessConfig:
    drop_cols: list | None = None
    corr_threshold: float | None = None # None if no correlation dropping
    expect_numeric: bool = True # False if categorical exist

def build_preprocessor(
    df: pd.DataFrame, cfg: PreprocessConfig
) -> tuple[ColumnTransformer, Callable[[], list[str]]]:
    """Build an unfitted ``ColumnTransformer`` matched to the columns of ``df``.

    Numeric columns are median-imputed and standardised.  When
    ``cfg.expect_numeric`` is False, columns of dtype ``object``, ``category``
    or ``bool`` are additionally imputed with the most frequent value and
    one-hot encoded (dense output, unknown categories ignored, one column
    dropped for binary features).  Every other column is dropped by the
    transformer, which means non-numeric columns are silently discarded when
    ``cfg.expect_numeric`` is True.

    ``cfg.corr_threshold`` is not applied here; correlation-based dropping,
    if wanted, has to happen on the transformed output.

    Args:
        df: Frame whose column names and dtypes decide the column groups.
            It is only inspected, never modified or fitted on.
        cfg: Preprocessing options; ``drop_cols`` entries that are not
            present in ``df`` are ignored.

    Returns:
        A ``(transformer, get_feature_names)`` pair.  ``transformer`` is the
        unfitted ``ColumnTransformer``.  ``get_feature_names`` is a
        zero-argument callable returning the output column names in
        transform order (numeric first, then one-hot columns); when
        categorical columns are present it reads fitted state, so call it
        only after ``transformer`` has been fitted.
    """
    # Only names and dtypes are inspected, so no defensive copy is needed:
    # ``drop`` already returns a new frame and ``df`` is never mutated.
    X = df
    if cfg.drop_cols:
        present = [c for c in cfg.drop_cols if c in df.columns]
        X = df.drop(columns=present)

    num_cols: list[str] = X.select_dtypes(include=[np.number]).columns.tolist()
    # Must be a plain list of names: it is truth-tested below and handed to
    # ColumnTransformer as a column selector.
    cat_cols: list[str] = (
        []
        if cfg.expect_numeric
        else X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    )

    # Pipelines
    num_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False)),
    ])
    ct = ColumnTransformer(
        transformers=[
            ('num', num_pipe, num_cols),
            ('cat', cat_pipe, cat_cols),
        ],
        remainder='drop',
    )

    def get_feature_names() -> list[str]:
        """Return the transformed column names; requires ``ct`` to be fitted."""
        # NOTE(review): names come from the column lists, not from the fitted
        # imputers, so this assumes the imputers keep every input column --
        # confirm for columns that are entirely missing at fit time.
        names = list(num_cols)
        if cat_cols:
            # ``named_transformers_`` only exists after ``ct.fit``.
            ohe = ct.named_transformers_['cat'].named_steps['ohe']
            names += ohe.get_feature_names_out(cat_cols).tolist()
        return names

    return ct, get_feature_names