feat: add categorical feature encoding support to preprocessing pipeline

artemisTurintech · artemisTurintech · commit 0b6b5664af82 · 2025-10-23T06:36:49.000Z
diff --git a/preprocessing.py b/preprocessing.py
@@ -0,0 +1,240 @@
+"""Preprocessing module for feature transformation and normalization.
+
+This module provides a scikit-learn Pipeline for preprocessing both numeric and 
+categorical features, including handling missing values, feature scaling, and 
+categorical encoding. The pipeline is designed to be reusable for both training 
+and prediction phases.
+"""
+
+import pandas as pd
+import numpy as np
+from sklearn.pipeline import Pipeline
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from typing import Tuple
+
+
+def create_preprocessing_pipeline(numeric_features: list = None, categorical_features: list = None) -> Pipeline:
+    """
+    Build an unfitted pipeline that preprocesses numeric and categorical columns.
+
+    A ColumnTransformer routes each group of columns to its own sub-pipeline:
+
+    - numeric columns: SimpleImputer(strategy='mean') followed by
+      StandardScaler()
+    - categorical columns: SimpleImputer(strategy='most_frequent') followed by
+      OneHotEncoder(handle_unknown='ignore', sparse_output=False)
+
+    Parameters
+    ----------
+    numeric_features : list, optional
+        Names of the numeric columns. When None or empty, the numeric branch
+        is left out of the ColumnTransformer.
+    categorical_features : list, optional
+        Names of the categorical columns. When None or empty, the categorical
+        branch is left out of the ColumnTransformer.
+
+    Returns
+    -------
+    Pipeline
+        An unfitted Pipeline whose single step, named 'preprocessor', is the
+        ColumnTransformer described above.
+
+    Examples
+    --------
+    >>> pipeline = create_preprocessing_pipeline(['age', 'income'], ['gender', 'city'])
+    >>> X_train_transformed = pipeline.fit_transform(X_train)  # fit on training data
+    >>> X_test_transformed = pipeline.transform(X_test)        # reuse on test data
+
+    Notes
+    -----
+    - Nothing is fitted here; call fit() / fit_transform() on training data first.
+    - Columns listed in neither argument are dropped (remainder='drop').
+    - When both arguments are None or empty, the ColumnTransformer is built
+      with an empty transformer list.
+    - The same list objects passed in are handed to the ColumnTransformer;
+      they are not copied here.
+    - NOTE(review): sparse_output is a fairly recent OneHotEncoder keyword;
+      confirm the minimum scikit-learn version the project supports.
+    """
+    from sklearn.compose import ColumnTransformer
+    from sklearn.preprocessing import OneHotEncoder
+
+    def _has_columns(columns) -> bool:
+        # A branch is only built for a non-None, non-empty column list.
+        return columns is not None and len(columns) > 0
+
+    branches = []
+
+    if _has_columns(numeric_features):
+        # Mean-impute missing values, then standardize.
+        impute_then_scale = Pipeline(steps=[
+            ('imputer', SimpleImputer(strategy='mean')),
+            ('scaler', StandardScaler()),
+        ])
+        branches.append(('numeric', impute_then_scale, numeric_features))
+
+    if _has_columns(categorical_features):
+        # Fill gaps with the most frequent value, then one-hot encode densely.
+        impute_then_encode = Pipeline(steps=[
+            ('imputer', SimpleImputer(strategy='most_frequent')),
+            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
+        ])
+        branches.append(('categorical', impute_then_encode, categorical_features))
+
+    # remainder='drop' discards every column that no branch claims.
+    column_router = ColumnTransformer(transformers=branches, remainder='drop')
+
+    # Wrapped in a Pipeline so callers always get the same object type back.
+    return Pipeline(steps=[('preprocessor', column_router)])
+
+
+def fit_and_transform(pipeline: Pipeline, X: pd.DataFrame) -> Tuple[Pipeline, np.ndarray]:
+    """
+    Fit ``pipeline`` on training data and return it together with the result.
+
+    ``X`` is first checked with ``_validate_dataframe``; the pipeline's own
+    ``fit_transform`` is then called on it. The pipeline is fitted in place,
+    so the object returned is the same one that was passed in.
+
+    Parameters
+    ----------
+    pipeline : Pipeline
+        Pipeline to fit, typically one built by
+        create_preprocessing_pipeline().
+    X : pd.DataFrame
+        Training features. Must be a non-empty DataFrame; whether missing
+        values are tolerated depends on the steps inside ``pipeline``.
+
+    Returns
+    -------
+    Tuple[Pipeline, np.ndarray]
+        - the pipeline that was passed in, now fitted
+        - whatever ``pipeline.fit_transform(X)`` returned (annotated as a
+          numpy array; NOTE(review): confirm for pipelines not built by
+          create_preprocessing_pipeline())
+
+    Raises
+    ------
+    TypeError
+        If X is not a pandas DataFrame.
+    ValueError
+        If X has no rows or no columns.
+
+    Any exception raised by ``pipeline.fit_transform`` propagates unchanged.
+
+    Examples
+    --------
+    >>> num_features = ['age', 'income']
+    >>> cat_features = ['gender', 'city']
+    >>> pipeline = create_preprocessing_pipeline(num_features, cat_features)
+    >>> fitted_pipeline, X_train_transformed = fit_and_transform(pipeline, X_train)
+
+    Notes
+    -----
+    - Call this on training data only; use transform_only() for test or
+      prediction data so that nothing is learned from it.
+    - ``fitted_pipeline is pipeline`` holds for the returned tuple.
+    """
+    # Reject non-DataFrame or empty input before touching the pipeline.
+    _validate_dataframe(X)
+
+    # fit_transform fits in place, so `pipeline` itself is returned as the
+    # fitted object alongside the transformed features.
+    return pipeline, pipeline.fit_transform(X)
+
+
+def transform_only(pipeline: Pipeline, X: pd.DataFrame) -> np.ndarray:
+    """
+    Transform new data with an already-fitted pipeline, without refitting it.
+
+    ``X`` is checked with ``_validate_dataframe`` and then handed to
+    ``pipeline.transform``. Only ``transform`` is called here, so the
+    pipeline keeps whatever state it learned when it was fitted and nothing
+    is learned from ``X``.
+
+    Parameters
+    ----------
+    pipeline : Pipeline
+        A pipeline that has already been fitted, e.g. the one returned by
+        fit_and_transform().
+    X : pd.DataFrame
+        Test or prediction features. Must be a non-empty DataFrame and is
+        expected to carry the columns the pipeline was fitted with.
+        NOTE(review): whether the column order must also match depends on
+        how the pipeline selects columns -- confirm before relying on it.
+
+    Returns
+    -------
+    np.ndarray
+        Whatever ``pipeline.transform(X)`` returns. For pipelines built by
+        create_preprocessing_pipeline() this is expected to be a 2D array of
+        imputed, scaled and encoded features.
+
+    Raises
+    ------
+    TypeError
+        If X is not a pandas DataFrame.
+    ValueError
+        If X has no rows or no columns. scikit-learn may also raise this
+        when X does not match the data the pipeline was fitted on.
+    sklearn.exceptions.NotFittedError
+        If the pipeline has not been fitted yet (raised by scikit-learn).
+
+    Examples
+    --------
+    >>> # Fit once on the training data
+    >>> num_features = ['age', 'income']
+    >>> cat_features = ['gender', 'city']
+    >>> pipeline = create_preprocessing_pipeline(num_features, cat_features)
+    >>> fitted_pipeline, X_train_transformed = fit_and_transform(pipeline, X_train)
+    >>>
+    >>> # Reuse the fitted pipeline for test data ...
+    >>> X_test_transformed = transform_only(fitted_pipeline, X_test)
+    >>>
+    >>> # ... and for prediction data
+    >>> X_pred_transformed = transform_only(fitted_pipeline, X_pred)
+
+    Notes
+    -----
+    - Fit the pipeline first; this function never calls fit().
+    - Because no fitting happens here, test and prediction data cannot leak
+      into the statistics the pipeline learned from the training data.
+    - Missing values in X are handled by the pipeline's own steps, using
+      what those steps learned during fitting.
+    """
+    # Reject non-DataFrame or empty input before touching the pipeline.
+    _validate_dataframe(X)
+
+    # transform() applies the fitted steps as-is; no refitting takes place.
+    return pipeline.transform(X)
+
+
+def _validate_dataframe(X: pd.DataFrame) -> None:
+    """
+    Validate that input is a DataFrame with at least one row and one column.
+
+    Parameters
+    ----------
+    X : pd.DataFrame
+        DataFrame to validate.
+
+    Raises
+    ------
+    TypeError
+        If X is not a pandas DataFrame.
+    ValueError
+        If the DataFrame has no rows, or has rows but no columns.
+    """
+    if not isinstance(X, pd.DataFrame):
+        raise TypeError(
+            f"Input must be a pandas DataFrame, got {type(X).__name__} instead."
+        )
+
+    # DataFrame.empty is True when *either* axis has length 0, so testing it
+    # first would report a column-less frame as "no rows". Check each axis.
+    n_rows, n_cols = X.shape
+    if n_rows == 0:
+        raise ValueError("Input DataFrame is empty (no rows).")
+    if n_cols == 0:
+        raise ValueError("Input DataFrame has no columns.")