Skip to content

Commit 0b6b566

Browse files
feat: add categorical feature encoding support to preprocessing pipeline
1 parent cf4287e commit 0b6b566

File tree

1 file changed

+240
-0
lines changed

1 file changed

+240
-0
lines changed

preprocessing.py

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
"""Preprocessing module for feature transformation and normalization.
2+
3+
This module provides a scikit-learn Pipeline for preprocessing both numeric and
4+
categorical features, including handling missing values, feature scaling, and
5+
categorical encoding. The pipeline is designed to be reusable for both training
6+
and prediction phases.
7+
"""
8+
9+
import pandas as pd
10+
import numpy as np
11+
from sklearn.pipeline import Pipeline
12+
from sklearn.impute import SimpleImputer
13+
from sklearn.preprocessing import StandardScaler
14+
from typing import Tuple
15+
16+
17+
def create_preprocessing_pipeline(numeric_features: list = None, categorical_features: list = None) -> Pipeline:
    """
    Create an unfitted preprocessing pipeline for numeric and categorical features.

    A ColumnTransformer routes each group of columns to its own sub-pipeline:

    1. Numeric features:
       - SimpleImputer(strategy='mean'): fills missing values with the column mean
       - StandardScaler: standardizes each column
    2. Categorical features:
       - SimpleImputer(strategy='most_frequent'): fills missing values with the
         most frequent value
       - OneHotEncoder(handle_unknown='ignore', sparse_output=False): one-hot
         encodes the categories and returns a dense array

    Columns that are not listed in either argument are dropped
    (``remainder='drop'``).

    Parameters
    ----------
    numeric_features : list, optional
        Column names to treat as numeric. A single column name given as a
        bare string is treated as a one-element list. If None or empty, no
        numeric transformer is added.
    categorical_features : list, optional
        Column names to treat as categorical. A single column name given as a
        bare string is treated as a one-element list. If None or empty, no
        categorical transformer is added.

    Returns
    -------
    Pipeline
        An unfitted scikit-learn Pipeline with a single step named
        ``'preprocessor'`` holding the ColumnTransformer.

    Examples
    --------
    >>> pipeline = create_preprocessing_pipeline(['age', 'income'], ['gender', 'city'])
    >>> # Fit on training data
    >>> X_train_transformed = pipeline.fit_transform(X_train)
    >>> # Apply to test data
    >>> X_test_transformed = pipeline.transform(X_test)

    Notes
    -----
    - Transformers are registered numeric first, then categorical.
    - If neither argument names any column, the ColumnTransformer is built
      with no transformers; combined with ``remainder='drop'`` it then selects
      no input columns at all.
    - The feature lists are copied, so mutating the caller's lists afterwards
      does not change the returned pipeline.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder

    def _as_column_list(features):
        # Normalise the argument to a fresh list of column selectors.
        if features is None:
            return []
        if isinstance(features, str):
            # ColumnTransformer treats a scalar string as "pass this column as
            # a 1-D vector", which the imputers below cannot consume; wrap it
            # so the column is passed as a 2-D, single-column selection.
            return [features]
        # Copy so later mutation of the caller's list cannot alter the pipeline.
        return list(features)

    numeric_columns = _as_column_list(numeric_features)
    categorical_columns = _as_column_list(categorical_features)

    transformers = []

    # Numeric branch: mean imputation followed by standardization.
    if numeric_columns:
        numeric_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])
        transformers.append(('numeric', numeric_transformer, numeric_columns))

    # Categorical branch: most-frequent imputation followed by one-hot encoding.
    if categorical_columns:
        categorical_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
        transformers.append(('categorical', categorical_transformer, categorical_columns))

    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder='drop'  # Drop any columns not specified
    )

    # Wrapped in a Pipeline so callers always get the same object type back.
    pipeline = Pipeline([
        ('preprocessor', preprocessor)
    ])

    return pipeline
90+
91+
92+
def fit_and_transform(pipeline: Pipeline, X: pd.DataFrame) -> Tuple[Pipeline, np.ndarray]:
    """
    Fit ``pipeline`` on ``X`` and return it together with the transformed data.

    The input is validated first, then ``pipeline.fit_transform(X)`` is called.
    The pipeline object that is returned is the very same object that was
    passed in, so the caller may keep using either reference afterwards.

    Parameters
    ----------
    pipeline : Pipeline
        A scikit-learn Pipeline, typically the one built by
        create_preprocessing_pipeline().
    X : pd.DataFrame
        Training features. Must be a non-empty pandas DataFrame.

    Returns
    -------
    Tuple[Pipeline, np.ndarray]
        ``(pipeline, X_transformed)`` where ``X_transformed`` is whatever
        ``pipeline.fit_transform(X)`` returned.

    Raises
    ------
    TypeError
        If X is not a pandas DataFrame.
    ValueError
        If X fails the emptiness checks in _validate_dataframe().

    Examples
    --------
    >>> num_features = ['age', 'income']
    >>> cat_features = ['gender', 'city']
    >>> pipeline = create_preprocessing_pipeline(num_features, cat_features)
    >>> fitted_pipeline, X_train_transformed = fit_and_transform(pipeline, X_train)

    Notes
    -----
    - Call this on training data only; use transform_only() for test or
      prediction data so nothing is learned from it.
    - The layout of the output columns is decided by the pipeline, not by the
      column order of ``X``.
    """
    _validate_dataframe(X)

    # fit_transform fits the caller's pipeline object, which is handed back as-is.
    return pipeline, pipeline.fit_transform(X)
145+
146+
147+
def transform_only(pipeline: Pipeline, X: pd.DataFrame) -> np.ndarray:
    """
    Transform ``X`` with an already-fitted pipeline, without refitting it.

    The input is validated first, then ``pipeline.transform(X)`` is called and
    its result returned unchanged. Because only ``transform`` is invoked,
    nothing is learned from ``X``.

    Parameters
    ----------
    pipeline : Pipeline
        A scikit-learn Pipeline that has already been fitted, e.g. by
        fit_and_transform().
    X : pd.DataFrame
        Features to transform (test or prediction data). Must be a non-empty
        pandas DataFrame. It is expected to provide the columns the pipeline
        was fitted on; how strictly names and order must match is decided by
        the pipeline's own transformers.

    Returns
    -------
    np.ndarray
        The result of ``pipeline.transform(X)``.

    Raises
    ------
    TypeError
        If X is not a pandas DataFrame.
    ValueError
        If X fails the emptiness checks in _validate_dataframe(). Column
        mismatches are not checked here; any such error would come from
        scikit-learn.
    sklearn.exceptions.NotFittedError
        Presumably raised by scikit-learn when the pipeline has not been
        fitted; this function performs no fitted-state check of its own.

    Examples
    --------
    >>> # First fit on training data
    >>> num_features = ['age', 'income']
    >>> cat_features = ['gender', 'city']
    >>> pipeline = create_preprocessing_pipeline(num_features, cat_features)
    >>> fitted_pipeline, X_train_transformed = fit_and_transform(pipeline, X_train)
    >>>
    >>> # Later, transform test data using the same fitted pipeline
    >>> X_test_transformed = transform_only(fitted_pipeline, X_test)
    >>>
    >>> # Or transform prediction data
    >>> X_pred_transformed = transform_only(fitted_pipeline, X_pred)
    """
    _validate_dataframe(X)

    # transform() only applies what was learned during fitting.
    return pipeline.transform(X)
211+
212+
213+
def _validate_dataframe(X: pd.DataFrame) -> None:
    """
    Validate that input is a pandas DataFrame with at least one row and one column.

    Parameters
    ----------
    X : pd.DataFrame
        DataFrame to validate.

    Raises
    ------
    TypeError
        If X is not a pandas DataFrame.
    ValueError
        If the DataFrame has no rows, or has rows but no columns. The message
        says which of the two is the problem.
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError(
            f"Input must be a pandas DataFrame, got {type(X).__name__} instead."
        )

    # Inspect each axis explicitly. DataFrame.empty is True when *either* axis
    # has length 0, so using it for the row check would report a frame with
    # rows but no columns as "no rows" and make the column check unreachable.
    n_rows, n_columns = X.shape

    if n_rows == 0:
        raise ValueError("Input DataFrame is empty (no rows).")

    if n_columns == 0:
        raise ValueError("Input DataFrame has no columns.")

0 commit comments

Comments
 (0)