From 8e5d32316b7cb90b694bc874ad04ccd8b893de43 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 17 Mar 2026 21:48:14 +0800
Subject: [PATCH 01/22] fix: harden source imports and fit_transform kwargs

---
 featcopilot/__init__.py                    |  9 ++++++--
 featcopilot/transformers/sklearn_compat.py |  3 ++-
 tests/test_sklearn_compat.py               | 24 ++++++++++++++++++++++
 3 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/featcopilot/__init__.py b/featcopilot/__init__.py
index 377aefb..7178867 100644
--- a/featcopilot/__init__.py
+++ b/featcopilot/__init__.py
@@ -5,9 +5,14 @@
 with novel LLM-powered capabilities via GitHub Copilot SDK.
 """
 
-from importlib.metadata import version
+from importlib.metadata import PackageNotFoundError, version
+
+try:
+    __version__ = version("featcopilot")
+except PackageNotFoundError:
+    # Allow importing directly from a source checkout before editable/install step.
+    __version__ = "0+unknown"
 
-__version__ = version("featcopilot")
 __author__ = "FeatCopilot Contributors"
 
 from featcopilot.core.base import BaseEngine, BaseSelector
diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index c93f8fa..bb662af 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -298,7 +298,8 @@ def fit_transform(
             Transformed data with generated features
         """
         self.fit(X, y, column_descriptions, task_description, **fit_params)
-        result = self.transform(X)
+        # Reuse transform-relevant kwargs (e.g. text_columns, related_tables) during fit_transform.
+        result = self.transform(X, **fit_params)
 
         # Track original features (input columns) vs derived features
         if isinstance(X, np.ndarray):
diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py
index ad1ccd1..a07118d 100644
--- a/tests/test_sklearn_compat.py
+++ b/tests/test_sklearn_compat.py
@@ -1,5 +1,6 @@
 """Tests for scikit-learn compatible feature engineering transformers."""
 
+import importlib
 from unittest.mock import MagicMock, patch
 
 import numpy as np
@@ -262,6 +263,29 @@ def test_set_params(self):
         assert afe.max_features == 20
         assert afe.verbose is True
 
+
+class TestPackageImport:
+    """Tests for top-level package import behavior."""
+
+    def test_import_without_installed_metadata_falls_back(self):
+        """Test source import works even when distribution metadata is unavailable."""
+        import importlib.metadata as importlib_metadata
+
+        import featcopilot
+
+        original_version = importlib_metadata.version
+
+        def fake_version(name):
+            if name == "featcopilot":
+                raise importlib_metadata.PackageNotFoundError
+            return original_version(name)
+
+        with patch("importlib.metadata.version", side_effect=fake_version):
+            reloaded = importlib.reload(featcopilot)
+            assert reloaded.__version__ == "0+unknown"
+
+        importlib.reload(featcopilot)
+
     def test_verbose_logging(self, sample_df, sample_target):
         """Test that verbose=True does not error."""
         afe = AutoFeatureEngineer(engines=["tabular"], verbose=True)

From 50b399cb6440e176f8b9d75e21c7b6a93981f362 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 17 Mar 2026 21:58:16 +0800
Subject: [PATCH 02/22] feat: add leakage guard and time-aware prototype

---
 docs/examples/time-aware-tabular.md        |  66 ++++++++++++
 examples/time_aware_tabular_prototype.py   | 114 +++++++++++++++++++++
 featcopilot/transformers/sklearn_compat.py |  48 ++++++++-
 featcopilot/utils/__init__.py              |   2 +
 featcopilot/utils/validation.py            |  66 ++++++++++++
 mkdocs.yml                                 |   1 +
 tests/test_autofeat.py                     |  28 +++++
 7 files changed, 324 insertions(+), 1 deletion(-)
 create mode 100644 docs/examples/time-aware-tabular.md
 create mode 100644 examples/time_aware_tabular_prototype.py
 create mode 100644 featcopilot/utils/validation.py

diff --git a/docs/examples/time-aware-tabular.md b/docs/examples/time-aware-tabular.md
new file mode 100644
index 0000000..7b938e1
--- /dev/null
+++ b/docs/examples/time-aware-tabular.md
@@ -0,0 +1,66 @@
+# Time-Aware Tabular Prototype
+
+A practical prototype for **leakage-safe auto feature engineering** on time-aware tabular data.
+
+## Why this example matters
+
+Most real feature engineering failures are not caused by weak transformations. They come from:
+
+- random train/test splits on temporal data
+- future information leaking into features
+- offline features that cannot be reproduced later
+
+This example shows a safer baseline:
+
+1. sort by time
+2. split by time
+3. fit features on the training slice only
+4. transform the holdout slice separately
+5. compare against a plain model baseline
+
+## Script
+
+See:
+
+```text
+examples/time_aware_tabular_prototype.py
+```
+
+## Core pattern
+
+```python
+engineer = AutoFeatureEngineer(
+    engines=["tabular"],
+    max_features=30,
+    selection_methods=["mutual_info", "importance"],
+    correlation_threshold=0.9,
+    leakage_guard="warn",
+)
+
+X_train_fe = engineer.fit_transform(
+    X_train,
+    y_train,
+    target_name="churned",
+    apply_selection=True,
+)
+X_test_fe = engineer.transform(X_test)
+```
+
+## Leakage guard
+
+`AutoFeatureEngineer` now supports a lightweight `leakage_guard` option:
+
+- `"warn"` — default, warns if suspicious columns are present
+- `"raise"` — fail fast when likely leakage columns are detected
+- `"off"` — disable the check
+
+This is intentionally conservative. It does **not** prove your pipeline is safe. It just catches obvious foot-guns such as columns named like:
+
+- `target`
+- `label`
+- `outcome`
+- `future_*`
+
+## Recommendation
+
+For a real project, start with this workflow before trying more advanced LLM or agent-based feature generation. If the time-aware baseline is not trustworthy, more automation only makes the mistake faster.
diff --git a/examples/time_aware_tabular_prototype.py b/examples/time_aware_tabular_prototype.py
new file mode 100644
index 0000000..585d0ff
--- /dev/null
+++ b/examples/time_aware_tabular_prototype.py
@@ -0,0 +1,114 @@
+"""Time-aware tabular prototype with leakage-safe evaluation.
+
+This example shows a practical starting point for auto feature engineering on
+behavioral / event / tabular data where time-based splitting matters.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import HistGradientBoostingClassifier
+from sklearn.metrics import roc_auc_score
+
+from featcopilot import AutoFeatureEngineer
+
+
+def create_time_aware_dataset(n_samples: int = 2000) -> pd.DataFrame:
+    """Create a synthetic time-aware churn-like dataset."""
+    rng = np.random.default_rng(42)
+    timestamps = pd.date_range("2024-01-01", periods=n_samples, freq="h")
+
+    df = pd.DataFrame(
+        {
+            "event_time": timestamps,
+            "account_age_days": rng.integers(10, 1200, n_samples),
+            "sessions_7d": rng.poisson(8, n_samples),
+            "tickets_30d": rng.poisson(2, n_samples),
+            "spend_30d": rng.gamma(2.5, 40, n_samples),
+            "plan_tier": rng.choice(["free", "pro", "team"], n_samples, p=[0.45, 0.4, 0.15]),
+        }
+    )
+
+    spend_ratio = df["spend_30d"] / (df["account_age_days"] + 10)
+    support_pressure = df["tickets_30d"] / (df["sessions_7d"] + 1)
+    pro_flag = (df["plan_tier"] == "pro").astype(int)
+    team_flag = (df["plan_tier"] == "team").astype(int)
+
+    churn_logit = (
+        -1.2
+        - 0.015 * df["sessions_7d"]
+        + 0.25 * df["tickets_30d"]
+        + 3.2 * support_pressure
+        + 1.7 * spend_ratio
+        - 0.35 * pro_flag
+        - 0.55 * team_flag
+    )
+    churn_prob = 1 / (1 + np.exp(-churn_logit))
+    df["churned"] = (rng.random(n_samples) < churn_prob).astype(int)
+    return df
+
+
+def temporal_split(df: pd.DataFrame, time_col: str, valid_fraction: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Split a dataset by time instead of random shuffling."""
+    df = df.sort_values(time_col).reset_index(drop=True)
+    split_idx = int(len(df) * (1 - valid_fraction))
+    return df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()
+
+
+def main() -> None:
+    """Run a leakage-safe auto feature engineering prototype."""
+    data = create_time_aware_dataset()
+    train_df, test_df = temporal_split(data, time_col="event_time", valid_fraction=0.2)
+
+    feature_cols = [
+        "account_age_days",
+        "sessions_7d",
+        "tickets_30d",
+        "spend_30d",
+        "plan_tier",
+    ]
+    target_col = "churned"
+
+    X_train = train_df[feature_cols]
+    y_train = train_df[target_col]
+    X_test = test_df[feature_cols]
+    y_test = test_df[target_col]
+
+    X_train_baseline = pd.get_dummies(X_train, drop_first=False)
+    X_test_baseline = pd.get_dummies(X_test, drop_first=False)
+    X_train_baseline, X_test_baseline = X_train_baseline.align(
+        X_test_baseline, join="left", axis=1, fill_value=0
+    )
+
+    baseline = HistGradientBoostingClassifier(max_depth=4, learning_rate=0.05, random_state=42)
+    baseline.fit(X_train_baseline, y_train)
+    baseline_auc = roc_auc_score(y_test, baseline.predict_proba(X_test_baseline)[:, 1])
+
+    engineer = AutoFeatureEngineer(
+        engines=["tabular"],
+        max_features=30,
+        selection_methods=["mutual_info", "importance"],
+        correlation_threshold=0.9,
+        leakage_guard="warn",
+        verbose=True,
+    )
+    X_train_fe = engineer.fit_transform(X_train, y_train, target_name=target_col, apply_selection=True).fillna(0)
+    X_test_fe = engineer.transform(X_test).fillna(0)
+
+    common_cols = [col for col in X_train_fe.columns if col in X_test_fe.columns]
+    X_train_fe = X_train_fe[common_cols]
+    X_test_fe = X_test_fe[common_cols]
+
+    model = HistGradientBoostingClassifier(max_depth=4, learning_rate=0.05, random_state=42)
+    model.fit(X_train_fe, y_train)
+    engineered_auc = roc_auc_score(y_test, model.predict_proba(X_test_fe)[:, 1])
+
+    print(f"Temporal baseline ROC-AUC: {baseline_auc:.4f}")
+    print(f"Engineered ROC-AUC:       {engineered_auc:.4f}")
+    print(f"Delta:                    {engineered_auc - baseline_auc:+.4f}")
+    print(f"Selected features:        {len(common_cols)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index bb662af..63f3447 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -3,6 +3,7 @@
 Provides drop-in sklearn transformers for feature engineering pipelines.
 """
 
+import warnings
 from typing import Any, Optional, Union
 
 import numpy as np
@@ -16,6 +17,7 @@
 from featcopilot.engines.timeseries import TimeSeriesEngine
 from featcopilot.selection.unified import FeatureSelector
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.validation import find_potential_leakage_columns
 
 logger = get_logger(__name__)
 
@@ -110,6 +112,10 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
     >>> X_transformed = engineer.fit_transform(X, y)
     """
 
+    SUPPORTED_ENGINES = {"tabular", "timeseries", "relational", "text", "llm"}
+    SUPPORTED_SELECTION_METHODS = {"mutual_info", "importance", "f_test", "chi2", "correlation", "xgboost"}
+    SUPPORTED_LEAKAGE_GUARDS = {"off", "warn", "raise"}
+
     def __init__(
         self,
         engines: Optional[list[str]] = None,
@@ -118,6 +124,7 @@ def __init__(
         correlation_threshold: float = 0.85,
         llm_config: Optional[dict[str, Any]] = None,
         verbose: bool = False,
+        leakage_guard: str = "warn",
     ):
         self.engines = engines or ["tabular"]
         self.max_features = max_features
@@ -125,6 +132,9 @@ def __init__(
         self.correlation_threshold = correlation_threshold
         self.llm_config = llm_config or {}
         self.verbose = verbose
+        self.leakage_guard = leakage_guard
+
+        self._validate_configuration()
 
         self._engine_instances: dict[str, Any] = {}
         self._selector: Optional[FeatureSelector] = None
@@ -133,12 +143,34 @@ def __init__(
         self._column_descriptions: dict[str, str] = {}
         self._task_description: str = ""
 
+    def _validate_configuration(self) -> None:
+        """Validate user-facing configuration early."""
+        unknown_engines = sorted(set(self.engines) - self.SUPPORTED_ENGINES)
+        if unknown_engines:
+            raise ValueError(f"Unknown engines: {unknown_engines}. Supported engines: {sorted(self.SUPPORTED_ENGINES)}")
+
+        unknown_methods = sorted(set(self.selection_methods) - self.SUPPORTED_SELECTION_METHODS)
+        if unknown_methods:
+            raise ValueError(
+                "Unknown selection methods: "
+                f"{unknown_methods}. Supported methods: {sorted(self.SUPPORTED_SELECTION_METHODS)}"
+            )
+
+        if self.leakage_guard not in self.SUPPORTED_LEAKAGE_GUARDS:
+            raise ValueError(
+                f"leakage_guard must be one of {sorted(self.SUPPORTED_LEAKAGE_GUARDS)}, got {self.leakage_guard!r}"
+            )
+
+        if self.max_features is not None and self.max_features <= 0:
+            raise ValueError("max_features must be positive when provided")
+
     def fit(
         self,
         X: Union[pd.DataFrame, np.ndarray],
         y: Optional[Union[pd.Series, np.ndarray]] = None,
         column_descriptions: Optional[dict[str, str]] = None,
         task_description: str = "prediction task",
+        target_name: Optional[str] = None,
         **fit_params,
     ) -> "AutoFeatureEngineer":
         """
@@ -168,6 +200,17 @@ def fit(
         self._column_descriptions = column_descriptions or {}
         self._task_description = task_description
 
+        suspicious_columns = find_potential_leakage_columns(X.columns.tolist(), target_name=target_name)
+        if suspicious_columns and self.leakage_guard != "off":
+            message = (
+                "Potential leakage-prone columns detected: "
+                f"{suspicious_columns}. Review time/label leakage before fitting, "
+                "or set leakage_guard='off' to disable this check."
+            )
+            if self.leakage_guard == "raise":
+                raise ValueError(message)
+            warnings.warn(message, UserWarning, stacklevel=2)
+
         # Fit each engine
         for engine_name in self.engines:
             engine = self._create_engine(engine_name)
@@ -271,6 +314,7 @@ def fit_transform(
         y: Optional[Union[pd.Series, np.ndarray]] = None,
         column_descriptions: Optional[dict[str, str]] = None,
         task_description: str = "prediction task",
+        target_name: Optional[str] = None,
         apply_selection: bool = True,
         **fit_params,
     ) -> pd.DataFrame:
@@ -297,7 +341,7 @@ def fit_transform(
         X_transformed : DataFrame
             Transformed data with generated features
         """
-        self.fit(X, y, column_descriptions, task_description, **fit_params)
+        self.fit(X, y, column_descriptions, task_description, target_name=target_name, **fit_params)
         # Reuse transform-relevant kwargs (e.g. text_columns, related_tables) during fit_transform.
         result = self.transform(X, **fit_params)
 
@@ -409,10 +453,12 @@ def get_params(self, deep=True):
             "correlation_threshold": self.correlation_threshold,
             "llm_config": self.llm_config,
             "verbose": self.verbose,
+            "leakage_guard": self.leakage_guard,
         }
 
     def set_params(self, **params):
         """Set parameters for sklearn compatibility."""
         for key, value in params.items():
             setattr(self, key, value)
+        self._validate_configuration()
         return self
diff --git a/featcopilot/utils/__init__.py b/featcopilot/utils/__init__.py
index ff6a969..8920ef4 100644
--- a/featcopilot/utils/__init__.py
+++ b/featcopilot/utils/__init__.py
@@ -10,6 +10,7 @@
     list_models,
 )
 from featcopilot.utils.parallel import parallel_apply
+from featcopilot.utils.validation import find_potential_leakage_columns
 
 __all__ = [
     "parallel_apply",
@@ -20,4 +21,5 @@
     "get_default_model",
     "get_model_names",
     "is_valid_model",
+    "find_potential_leakage_columns",
 ]
diff --git a/featcopilot/utils/validation.py b/featcopilot/utils/validation.py
new file mode 100644
index 0000000..9df3dae
--- /dev/null
+++ b/featcopilot/utils/validation.py
@@ -0,0 +1,66 @@
+"""Validation helpers for safer feature engineering workflows."""
+
+import re
+from typing import Optional
+
+DEFAULT_LEAKAGE_KEYWORDS = [
+    "target",
+    "label",
+    "outcome",
+    "ground_truth",
+    "y_true",
+    "future",
+    "leak",
+]
+
+
+def _normalize_column_name(name: str) -> str:
+    """Normalize a column name for fuzzy matching."""
+    return re.sub(r"[^a-z0-9]+", "", name.lower())
+
+
+def find_potential_leakage_columns(
+    columns: list[str],
+    target_name: Optional[str] = None,
+    keywords: Optional[list[str]] = None,
+) -> list[str]:
+    """
+    Find suspicious columns that may leak label or future information.
+
+    Parameters
+    ----------
+    columns : list[str]
+        Input column names to inspect.
+    target_name : str, optional
+        Expected target/label column name. Related variants will be flagged.
+    keywords : list[str], optional
+        Additional suspicious keywords to match against normalized column names.
+
+    Returns
+    -------
+    list[str]
+        Column names that deserve manual review for leakage.
+    """
+    keywords = keywords or DEFAULT_LEAKAGE_KEYWORDS
+    normalized_keywords = [_normalize_column_name(keyword) for keyword in keywords]
+    normalized_target = _normalize_column_name(target_name) if target_name else None
+
+    suspicious: list[str] = []
+    for column in columns:
+        normalized_column = _normalize_column_name(column)
+
+        keyword_hit = any(keyword and keyword in normalized_column for keyword in normalized_keywords)
+        target_hit = bool(
+            normalized_target
+            and normalized_target
+            and (
+                normalized_column == normalized_target
+                or normalized_target in normalized_column
+                or normalized_column in normalized_target
+            )
+        )
+
+        if keyword_hit or target_hit:
+            suspicious.append(column)
+
+    return suspicious
diff --git a/mkdocs.yml b/mkdocs.yml
index a091b3f..9b98a7c 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -85,6 +85,7 @@ nav:
   - Examples:
     - End-to-End Demo: examples/e2e-demo.md
     - Basic Usage: examples/basic.md
+    - Time-Aware Tabular: examples/time-aware-tabular.md
     - LLM-Powered: examples/llm-powered.md
     - Sklearn Pipeline: examples/sklearn-pipeline.md
     - Domain-Specific: examples/domain-specific.md
diff --git a/tests/test_autofeat.py b/tests/test_autofeat.py
index afe6eb5..560791c 100644
--- a/tests/test_autofeat.py
+++ b/tests/test_autofeat.py
@@ -75,6 +75,34 @@ def test_multiple_engines(self, sample_data):
         assert result is not None
         assert len(result.columns) > 0
 
+    def test_leakage_guard_warns_on_suspicious_columns(self, sample_data):
+        """Test leakage guard warns when suspicious columns are present."""
+        X, y = sample_data
+        X = X.rename(columns={"balance": "future_target_signal"})
+        engineer = AutoFeatureEngineer(engines=["tabular"], leakage_guard="warn")
+
+        with pytest.warns(UserWarning, match="Potential leakage-prone columns detected"):
+            engineer.fit(X, y, target_name="target")
+
+    def test_leakage_guard_raises_on_suspicious_columns(self, sample_data):
+        """Test leakage guard can hard-fail when suspicious columns are present."""
+        X, y = sample_data
+        X = X.rename(columns={"balance": "churn_label_proxy"})
+        engineer = AutoFeatureEngineer(engines=["tabular"], leakage_guard="raise")
+
+        with pytest.raises(ValueError, match="Potential leakage-prone columns detected"):
+            engineer.fit(X, y, target_name="churn")
+
+    def test_invalid_engine_configuration_raises(self):
+        """Test invalid engine names fail early."""
+        with pytest.raises(ValueError, match="Unknown engines"):
+            AutoFeatureEngineer(engines=["tabular", "spaceship"])
+
+    def test_invalid_selection_method_raises(self):
+        """Test invalid selection methods fail early."""
+        with pytest.raises(ValueError, match="Unknown selection methods"):
+            AutoFeatureEngineer(selection_methods=["mutual_info", "magic"])
+
 
 class TestAutoFeatureEngineerParams:
     """Test parameter handling."""

From 50957fa66715ce77e490230c83c6e5f0d57f822d Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 17 Mar 2026 22:13:20 +0800
Subject: [PATCH 03/22] feat: improve engine APIs and benchmark realism

---
 .../compare_tools/run_fe_tools_comparison.py  |  38 ++++
 benchmarks/use_cases/README.md                |  20 ++
 .../run_auto_feature_engineering_benchmark.py | 209 ++++++++++++++++++
 featcopilot/engines/relational.py             |  36 ++-
 featcopilot/engines/timeseries.py             |  39 ++--
 tests/test_autofeat.py                        |   5 +
 tests/test_engines.py                         |  35 +++
 7 files changed, 357 insertions(+), 25 deletions(-)
 create mode 100644 benchmarks/use_cases/README.md
 create mode 100644 benchmarks/use_cases/run_auto_feature_engineering_benchmark.py

diff --git a/benchmarks/compare_tools/run_fe_tools_comparison.py b/benchmarks/compare_tools/run_fe_tools_comparison.py
index f678fe4..03b4d92 100644
--- a/benchmarks/compare_tools/run_fe_tools_comparison.py
+++ b/benchmarks/compare_tools/run_fe_tools_comparison.py
@@ -67,6 +67,44 @@
 
 warnings.filterwarnings("ignore")
 
+
+def split_benchmark_data(
+    X: pd.DataFrame,
+    y: pd.Series,
+    task: str,
+    random_state: int,
+    test_size: float = 0.2,
+) -> tuple[np.ndarray, np.ndarray, pd.Series, pd.Series]:
+    """Split benchmark data with slightly more realistic defaults by task type."""
+    indices = np.arange(len(X))
+
+    if "forecast" in task or "timeseries" in task:
+        split_idx = int(len(indices) * (1 - test_size))
+        train_idx = indices[:split_idx]
+        test_idx = indices[split_idx:]
+        y_train = y.iloc[train_idx]
+        y_test = y.iloc[test_idx]
+        return train_idx, test_idx, y_train, y_test
+
+    stratify = None
+    if "classification" in task:
+        try:
+            class_counts = pd.Series(y).value_counts(dropna=False)
+            if len(class_counts) > 1 and class_counts.min() >= 2:
+                stratify = y
+        except Exception:
+            stratify = None
+
+    train_idx, test_idx, y_train, y_test = train_test_split(
+        indices,
+        y,
+        test_size=test_size,
+        random_state=random_state,
+        stratify=stratify,
+    )
+    return train_idx, test_idx, y_train, y_test
+
+
 # Default configuration
 QUICK_DATASETS = [
     # Interaction-heavy synthetic (FeatCopilot creates valuable polynomial features)
diff --git a/benchmarks/use_cases/README.md b/benchmarks/use_cases/README.md
new file mode 100644
index 0000000..b08d50f
--- /dev/null
+++ b/benchmarks/use_cases/README.md
@@ -0,0 +1,20 @@
+# Use-Case Benchmarks
+
+Targeted benchmarks for realistic feature-engineering scenarios.
+
+## Auto Feature Engineering
+
+This benchmark compares:
+- plain baseline
+- FeatCopilot
+- Featuretools (if installed)
+- autofeat (if installed)
+
+on an interaction-heavy tabular classification task where automatic feature engineering should matter.
+
+```bash
+python -m benchmarks.use_cases.run_auto_feature_engineering_benchmark
+```
+
+Outputs:
+- `AUTO_FEATURE_ENGINEERING_USE_CASE.md`
diff --git a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
new file mode 100644
index 0000000..8c9c2a2
--- /dev/null
+++ b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
@@ -0,0 +1,209 @@
+"""Benchmark FeatCopilot against common baselines for auto feature engineering.
+
+Focused on the practical use case of structured/tabular data with interaction
+and ratio effects where feature engineering should matter.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import HistGradientBoostingClassifier
+from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import train_test_split
+
+from featcopilot import AutoFeatureEngineer
+
+REPORT_DIR = Path(__file__).resolve().parent
+
+
+def create_dataset(n_samples: int = 5000, random_state: int = 42) -> pd.DataFrame:
+    """Create a synthetic classification dataset with interaction-heavy signal."""
+    rng = np.random.default_rng(random_state)
+    df = pd.DataFrame(
+        {
+            "age": rng.integers(18, 80, n_samples),
+            "income": rng.gamma(3.5, 18000, n_samples),
+            "tenure_months": rng.integers(1, 120, n_samples),
+            "monthly_charges": rng.uniform(20, 180, n_samples),
+            "num_products": rng.integers(1, 6, n_samples),
+            "support_tickets": rng.poisson(2.2, n_samples),
+            "plan_tier": rng.choice(["free", "pro", "team"], n_samples, p=[0.45, 0.4, 0.15]),
+        }
+    )
+
+    charge_ratio = df["monthly_charges"] / (df["income"] / 12 + 1)
+    complaint_rate = df["support_tickets"] / (df["tenure_months"] + 1)
+    product_density = df["monthly_charges"] / (df["num_products"] + 0.5)
+    loyalty = ((df["age"] - 18) / 62) * (df["tenure_months"] / 120)
+    team_flag = (df["plan_tier"] == "team").astype(int)
+
+    logit = (
+        -1.4
+        + 1.5 * charge_ratio
+        + 1.8 * complaint_rate
+        + 0.004 * product_density
+        - 0.8 * loyalty
+        - 0.4 * team_flag
+    )
+    prob = 1 / (1 + np.exp(-logit))
+    df["target"] = (rng.random(n_samples) < prob).astype(int)
+    return df
+
+
+def evaluate_auc(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> float:
+    """Train a simple classifier and return ROC-AUC."""
+    model = HistGradientBoostingClassifier(max_depth=4, learning_rate=0.05, random_state=42)
+    model.fit(X_train, y_train)
+    return roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
+
+
+def align_and_fill(X_train: pd.DataFrame, X_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Align train/test columns and fill missing values safely."""
+    X_train_aligned, X_test_aligned = X_train.align(X_test, join="left", axis=1, fill_value=0)
+    return X_train_aligned.fillna(0), X_test_aligned.fillna(0)
+
+
+def run_featcopilot(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series) -> tuple[float, int]:
+    """Run FeatCopilot and return transformed feature count."""
+    engineer = AutoFeatureEngineer(
+        engines=["tabular"],
+        max_features=40,
+        selection_methods=["mutual_info", "importance"],
+        correlation_threshold=0.9,
+        leakage_guard="warn",
+        verbose=False,
+    )
+    X_train_fe = engineer.fit_transform(X_train, y_train, target_name="target", apply_selection=True)
+    X_test_fe = engineer.transform(X_test)
+    X_train_fe, X_test_fe = align_and_fill(X_train_fe, X_test_fe)
+    return evaluate_auc(X_train_fe, X_test_fe, y_train, y_test=None), X_train_fe.shape[1]
+
+
+def run_baseline(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]:
+    """Run one-hot baseline."""
+    X_train_base = pd.get_dummies(X_train, drop_first=False)
+    X_test_base = pd.get_dummies(X_test, drop_first=False)
+    X_train_base, X_test_base = align_and_fill(X_train_base, X_test_base)
+    auc = evaluate_auc(X_train_base, X_test_base, y_train, y_test)
+    return {"tool": "baseline", "auc": auc, "n_features": X_train_base.shape[1]}
+
+
+def run_featcopilot_case(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]:
+    """Run FeatCopilot benchmark case."""
+    engineer = AutoFeatureEngineer(
+        engines=["tabular"],
+        max_features=40,
+        selection_methods=["mutual_info", "importance"],
+        correlation_threshold=0.9,
+        leakage_guard="warn",
+        verbose=False,
+    )
+    X_train_fe = engineer.fit_transform(X_train, y_train, target_name="target", apply_selection=True)
+    X_test_fe = engineer.transform(X_test)
+    X_train_fe, X_test_fe = align_and_fill(X_train_fe, X_test_fe)
+    auc = evaluate_auc(X_train_fe, X_test_fe, y_train, y_test)
+    return {"tool": "featcopilot", "auc": auc, "n_features": X_train_fe.shape[1]}
+
+
+def run_featuretools_case(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]:
+    """Run Featuretools if available."""
+    try:
+        import featuretools as ft
+    except Exception as exc:
+        return {"tool": "featuretools", "status": f"unavailable: {exc}"}
+
+    train_copy = pd.get_dummies(X_train, drop_first=False).reset_index(drop=True)
+    test_copy = pd.get_dummies(X_test, drop_first=False).reset_index(drop=True)
+    train_copy, test_copy = align_and_fill(train_copy, test_copy)
+    train_copy["row_id"] = np.arange(len(train_copy))
+    test_copy["row_id"] = np.arange(len(test_copy))
+
+    es_train = ft.EntitySet(id="afe_train").add_dataframe(dataframe_name="data", dataframe=train_copy, index="row_id")
+    train_fm, feature_defs = ft.dfs(
+        entityset=es_train,
+        target_dataframe_name="data",
+        trans_primitives=["add_numeric", "multiply_numeric", "divide_numeric"],
+        agg_primitives=[],
+        max_depth=2,
+        max_features=40,
+    )
+
+    es_test = ft.EntitySet(id="afe_test").add_dataframe(dataframe_name="data", dataframe=test_copy, index="row_id")
+    test_fm = ft.calculate_feature_matrix(entityset=es_test, features=feature_defs)
+    train_fm, test_fm = align_and_fill(train_fm, test_fm)
+    auc = evaluate_auc(train_fm, test_fm, y_train, y_test)
+    return {"tool": "featuretools", "auc": auc, "n_features": train_fm.shape[1]}
+
+
+def run_autofeat_case(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]:
+    """Run autofeat if available."""
+    try:
+        from autofeat import AutoFeatClassifier
+    except Exception as exc:
+        return {"tool": "autofeat", "status": f"unavailable: {exc}"}
+
+    X_train_num = pd.get_dummies(X_train, drop_first=False)
+    X_test_num = pd.get_dummies(X_test, drop_first=False)
+    X_train_num, X_test_num = align_and_fill(X_train_num, X_test_num)
+
+    model = AutoFeatClassifier(verbose=0, feateng_steps=1, featsel_runs=1)
+    X_train_fe = model.fit_transform(X_train_num, y_train)
+    X_test_fe = model.transform(X_test_num)
+    X_train_fe, X_test_fe = align_and_fill(X_train_fe, X_test_fe)
+    auc = evaluate_auc(X_train_fe, X_test_fe, y_train, y_test)
+    return {"tool": "autofeat", "auc": auc, "n_features": X_train_fe.shape[1]}
+
+
+def write_report(results: list[dict[str, Any]], output_path: Path) -> None:
+    """Write a markdown report."""
+    lines = [
+        "# Auto Feature Engineering Use-Case Benchmark",
+        "",
+        "Compares a plain baseline with FeatCopilot and common automatic feature engineering tools on an interaction-heavy tabular classification task.",
+        "",
+        "| Tool | Status | ROC-AUC | Feature Count |",
+        "|------|--------|---------|---------------|",
+    ]
+    for row in results:
+        status = row.get("status", "ok")
+        auc = f"{row['auc']:.4f}" if "auc" in row else "-"
+        n_features = str(row.get("n_features", "-"))
+        lines.append(f"| {row['tool']} | {status} | {auc} | {n_features} |")
+    output_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+def main() -> None:
+    """Run the use-case benchmark."""
+    parser = argparse.ArgumentParser(description="Run an auto feature engineering use-case benchmark")
+    parser.add_argument("--samples", type=int, default=5000, help="Number of synthetic samples")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    args = parser.parse_args()
+
+    data = create_dataset(n_samples=args.samples, random_state=args.seed)
+    X = data.drop(columns=["target"])
+    y = data["target"]
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=args.seed, stratify=y
+    )
+
+    results = [
+        run_baseline(X_train, X_test, y_train, y_test),
+        run_featcopilot_case(X_train, X_test, y_train, y_test),
+        run_featuretools_case(X_train, X_test, y_train, y_test),
+        run_autofeat_case(X_train, X_test, y_train, y_test),
+    ]
+
+    output_path = REPORT_DIR / "AUTO_FEATURE_ENGINEERING_USE_CASE.md"
+    write_report(results, output_path)
+    print(json.dumps(results, indent=2))
+    print(f"\nWrote report to {output_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/featcopilot/engines/relational.py b/featcopilot/engines/relational.py
index 19ad435..fdc9963 100644
--- a/featcopilot/engines/relational.py
+++ b/featcopilot/engines/relational.py
@@ -106,14 +106,14 @@ def add_relationship(
         -------
         self : RelationalEngine
         """
-        self._relationships.append(
-            {
-                "child": child_table,
-                "parent": parent_table,
-                "child_key": key_column,
-                "parent_key": parent_key or key_column,
-            }
-        )
+        relationship = {
+            "child": child_table,
+            "parent": parent_table,
+            "child_key": key_column,
+            "parent_key": parent_key or key_column,
+        }
+        if relationship not in self._relationships:
+            self._relationships.append(relationship)
         return self
 
     def fit(
@@ -143,6 +143,8 @@ def fit(
         self._related_tables = related_tables or {}
         self._primary_columns = X.columns.tolist()
 
+        self._validate_relationships(X, self._related_tables)
+
         if self.config.verbose:
             logger.info(f"RelationalEngine: {len(self._relationships)} relationships defined")
 
@@ -175,6 +177,7 @@ def transform(
 
         X = self._validate_input(X)
         related_tables = related_tables or self._related_tables
+        self._validate_relationships(X, related_tables)
         result = X.copy()
 
         # Generate features from relationships
@@ -198,6 +201,23 @@ def transform(
 
         return result
 
+    def _validate_relationships(self, primary_df: pd.DataFrame, related_tables: dict[str, pd.DataFrame]) -> None:
+        """Validate configured relationships against available tables and keys."""
+        for relationship in self._relationships:
+            child_key = relationship["child_key"]
+            parent_table = relationship["parent"]
+            parent_key = relationship["parent_key"]
+
+            if child_key not in primary_df.columns:
+                raise ValueError(f"child_key '{child_key}' not found in primary table")
+
+            if parent_table not in related_tables:
+                continue
+
+            parent_df = related_tables[parent_table]
+            if parent_key not in parent_df.columns:
+                raise ValueError(f"parent_key '{parent_key}' not found in related table '{parent_table}'")
+
     def _aggregate_from_relationship(
         self,
         child_df: pd.DataFrame,
diff --git a/featcopilot/engines/timeseries.py b/featcopilot/engines/timeseries.py
index 8702b75..d361f20 100644
--- a/featcopilot/engines/timeseries.py
+++ b/featcopilot/engines/timeseries.py
@@ -21,6 +21,7 @@ class TimeSeriesEngineConfig(EngineConfig):
     """Configuration for time series feature engine."""
 
     name: str = "TimeSeriesEngine"
+    series_in_rows: bool = Field(default=False, description="Treat each row as an independent time series")
     features: list[str] = Field(
         default_factory=lambda: [
             "basic_stats",
@@ -92,6 +93,7 @@ def __init__(
         window_sizes: Optional[list[int]] = None,
         max_features: Optional[int] = None,
         verbose: bool = False,
+        series_in_rows: bool = False,
         **kwargs,
     ):
         config = TimeSeriesEngineConfig(
@@ -99,6 +101,7 @@ def __init__(
             window_sizes=window_sizes or [5, 10, 20],
             max_features=max_features,
             verbose=verbose,
+            series_in_rows=series_in_rows,
             **kwargs,
         )
         super().__init__(config=config)
@@ -131,11 +134,24 @@ def fit(
         """
         X = self._validate_input(X)
 
+        if time_column is not None and time_column not in X.columns:
+            raise ValueError(f"time_column '{time_column}' not found in input data")
+
+        self._time_index_column = time_column
+
         # Identify numeric columns for time series analysis
-        self._time_columns = X.select_dtypes(include=[np.number]).columns.tolist()
+        if self.config.series_in_rows:
+            self._time_columns = [col for col in X.columns if col != time_column]
+        else:
+            self._time_columns = X.select_dtypes(include=[np.number]).columns.tolist()
+            if time_column is not None:
+                self._time_columns = [col for col in self._time_columns if col != time_column]
 
         if self.config.verbose:
-            logger.info(f"TimeSeriesEngine: Found {len(self._time_columns)} numeric columns")
+            logger.info(
+                f"TimeSeriesEngine: Found {len(self._time_columns)} columns "
+                f"(series_in_rows={self.config.series_in_rows})"
+            )
 
         self._is_fitted = True
         return self
@@ -158,21 +174,10 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFram
             raise RuntimeError("Engine must be fitted before transform")
 
         X = self._validate_input(X)
-        features_dict = {}
 
-        for col in self._time_columns:
-            series = X[col].values
-
-            for feature_group in self.config.features:
-                if feature_group in self.FEATURE_EXTRACTORS:
-                    method_name = self.FEATURE_EXTRACTORS[feature_group]
-                    method = getattr(self, method_name)
-                    extracted = method(series, col)
-                    features_dict.update(extracted)
-
-        # For DataFrames with multiple rows, extract features across the entire column
-        if len(X) > 1:
-            # Each column is treated as a single time series
+        if self.config.series_in_rows:
+            result = self._extract_per_row(X)
+        else:
             features_dict = {}
             for col in self._time_columns:
                 series = X[col].values
@@ -184,7 +189,7 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFram
                         extracted = method(series, col)
                         features_dict.update(extracted)
 
-        result = pd.DataFrame([features_dict])
+            result = pd.DataFrame([features_dict])
 
         self._feature_names = list(result.columns)
 
diff --git a/tests/test_autofeat.py b/tests/test_autofeat.py
index 560791c..600b11a 100644
--- a/tests/test_autofeat.py
+++ b/tests/test_autofeat.py
@@ -103,6 +103,11 @@ def test_invalid_selection_method_raises(self):
         with pytest.raises(ValueError, match="Unknown selection methods"):
             AutoFeatureEngineer(selection_methods=["mutual_info", "magic"])
 
+    def test_invalid_leakage_guard_raises(self):
+        """Test invalid leakage guard setting fails early."""
+        with pytest.raises(ValueError, match="leakage_guard must be one of"):
+            AutoFeatureEngineer(leakage_guard="maybe")
+
 
 class TestAutoFeatureEngineerParams:
     """Test parameter handling."""
diff --git a/tests/test_engines.py b/tests/test_engines.py
index ee49cfa..3005ae9 100644
--- a/tests/test_engines.py
+++ b/tests/test_engines.py
@@ -785,6 +785,27 @@ def test_ndarray_input(self):
         assert "feature_0_mean" in result.columns
         assert "feature_1_mean" in result.columns
 
+    def test_series_in_rows_mode(self):
+        """Test per-row mode for sequence-like cells."""
+        df = pd.DataFrame(
+            {
+                "series": [np.array([1.0, 2.0, 3.0]), np.array([3.0, 2.0, 1.0])],
+                "event_time": [pd.Timestamp("2024-01-01"), pd.Timestamp("2024-01-02")],
+            }
+        )
+        engine = TimeSeriesEngine(features=["basic_stats"], series_in_rows=True)
+        result = engine.fit_transform(df, time_column="event_time")
+
+        assert isinstance(result, pd.DataFrame)
+        assert len(result) == 2
+        assert "series_mean" in result.columns
+
+    def test_time_column_missing_raises(self, ts_data):
+        """Test invalid time column raises a helpful error."""
+        engine = TimeSeriesEngine(features=["basic_stats"])
+        with pytest.raises(ValueError, match="time_column 'missing_time'"):
+            engine.fit(ts_data, time_column="missing_time")
+
 
 # ---------------------------------------------------------------------------
 # RelationalEngine tests
@@ -950,6 +971,20 @@ def test_transform_with_different_related_tables(self, orders_data, customers_da
         assert isinstance(result, pd.DataFrame)
         assert "customers_age_mean" in result.columns
 
+    def test_fit_missing_child_key_raises(self, orders_data, customers_data):
+        """Test fit raises when child key is missing from primary table."""
+        engine = RelationalEngine()
+        engine.add_relationship("orders", "customers", "missing_customer_id")
+        with pytest.raises(ValueError, match="child_key 'missing_customer_id'"):
+            engine.fit(orders_data, related_tables={"customers": customers_data})
+
+    def test_fit_missing_parent_key_raises(self, orders_data, customers_data):
+        """Test fit raises when parent key is missing from related table."""
+        engine = RelationalEngine()
+        engine.add_relationship("orders", "customers", "customer_id", parent_key="missing_parent_key")
+        with pytest.raises(ValueError, match="parent_key 'missing_parent_key'"):
+            engine.fit(orders_data, related_tables={"customers": customers_data})
+
     def test_no_relationships(self, orders_data):
         """Test engine with no relationships defined."""
         engine = RelationalEngine()

From db24392ce95138a4d21138c65d0a8cd73177c390 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 17 Mar 2026 23:17:21 +0800
Subject: [PATCH 04/22] docs: add relational example and branch summary

---
 PR_SUMMARY_feat_strengthen_core_api.md        | 76 +++++++++++++++++++
 .../run_auto_feature_engineering_benchmark.py | 29 +++----
 .../relational-feature-engineering.md         | 76 +++++++++++++++++++
 examples/time_aware_tabular_prototype.py      |  4 +-
 mkdocs.yml                                    |  1 +
 5 files changed, 169 insertions(+), 17 deletions(-)
 create mode 100644 PR_SUMMARY_feat_strengthen_core_api.md
 create mode 100644 docs/examples/relational-feature-engineering.md

diff --git a/PR_SUMMARY_feat_strengthen_core_api.md b/PR_SUMMARY_feat_strengthen_core_api.md
new file mode 100644
index 0000000..f76a829
--- /dev/null
+++ b/PR_SUMMARY_feat_strengthen_core_api.md
@@ -0,0 +1,76 @@
+# PR Summary — feat/strengthen-core-api
+
+## What this branch changes
+
+This branch improves FeatCopilot in three layers:
+
+1. **Core API reliability**
+   - safer source imports when package metadata is unavailable
+   - `AutoFeatureEngineer.fit_transform()` now forwards transform-time kwargs
+   - earlier config validation for engines / selection methods / leakage guard
+
+2. **Safety and practical workflow support**
+   - lightweight leakage guard in `AutoFeatureEngineer`
+   - time-aware prototype example for leakage-safe evaluation
+   - relational engine key validation and duplicate relationship deduping
+   - optional per-row mode for `TimeSeriesEngine`
+
+3. **Benchmark realism and use-case coverage**
+   - more realistic split logic in tool-comparison benchmarks
+   - targeted benchmark for interaction-heavy auto feature engineering use cases
+
+## Commits
+
+- `8e5d323` — `fix: harden source imports and fit_transform kwargs`
+- `50b399c` — `feat: add leakage guard and time-aware prototype`
+- `50957fa` — `feat: improve engine APIs and benchmark realism`
+
+## Highlights
+
+### AutoFeatureEngineer
+- new `leakage_guard` modes: `warn`, `raise`, `off`
+- early validation of:
+  - engine names
+  - selection methods
+  - positive `max_features`
+- transform-time kwargs are preserved in one-shot workflows
+
+### TimeSeriesEngine
+- new `series_in_rows` mode for row-wise sequence cells
+- clearer `time_column` validation
+- default behavior preserved for current aggregate-style usage
+
+### RelationalEngine
+- duplicate relationships no longer accumulate
+- missing child/parent keys fail fast with useful errors
+
+### Benchmarks
+- classification splits use stratification when possible
+- forecast/time-series splits avoid random shuffling
+- added focused use-case benchmark:
+  - `benchmarks/use_cases/run_auto_feature_engineering_benchmark.py`
+
+### Docs / examples
+- time-aware tabular example
+- relational feature engineering example
+
+## Validation
+
+- targeted tests passed
+- full test suite passed:
+  - `635 passed, 2 skipped`
+- pre-commit passed
+
+## Suggested PR title
+
+**Improve core API reliability, add leakage guard, and strengthen benchmark realism**
+
+## Suggested reviewer notes
+
+The main intended behavior changes are:
+- earlier user-facing validation
+- new optional leakage warnings/errors
+- new optional row-wise time-series mode
+- slightly more realistic benchmark split policy
+
+The branch is intentionally incremental rather than a redesign.
diff --git a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
index 8c9c2a2..a439df3 100644
--- a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
+++ b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
@@ -43,14 +43,7 @@ def create_dataset(n_samples: int = 5000, random_state: int = 42) -> pd.DataFram
     loyalty = ((df["age"] - 18) / 62) * (df["tenure_months"] / 120)
     team_flag = (df["plan_tier"] == "team").astype(int)
 
-    logit = (
-        -1.4
-        + 1.5 * charge_ratio
-        + 1.8 * complaint_rate
-        + 0.004 * product_density
-        - 0.8 * loyalty
-        - 0.4 * team_flag
-    )
+    logit = -1.4 + 1.5 * charge_ratio + 1.8 * complaint_rate + 0.004 * product_density - 0.8 * loyalty - 0.4 * team_flag
     prob = 1 / (1 + np.exp(-logit))
     df["target"] = (rng.random(n_samples) < prob).astype(int)
     return df
@@ -58,6 +51,10 @@ def create_dataset(n_samples: int = 5000, random_state: int = 42) -> pd.DataFram
 
 def evaluate_auc(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> float:
     """Train a simple classifier and return ROC-AUC."""
+    X_train = pd.get_dummies(X_train, drop_first=False)
+    X_test = pd.get_dummies(X_test, drop_first=False)
+    X_train, X_test = align_and_fill(X_train, X_test)
+
     model = HistGradientBoostingClassifier(max_depth=4, learning_rate=0.05, random_state=42)
     model.fit(X_train, y_train)
     return roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
@@ -94,7 +91,9 @@ def run_baseline(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series
     return {"tool": "baseline", "auc": auc, "n_features": X_train_base.shape[1]}
 
 
-def run_featcopilot_case(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]:
+def run_featcopilot_case(
+    X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series
+) -> dict[str, Any]:
     """Run FeatCopilot benchmark case."""
     engineer = AutoFeatureEngineer(
         engines=["tabular"],
@@ -111,7 +110,9 @@ def run_featcopilot_case(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: p
     return {"tool": "featcopilot", "auc": auc, "n_features": X_train_fe.shape[1]}
 
 
-def run_featuretools_case(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]:
+def run_featuretools_case(
+    X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series
+) -> dict[str, Any]:
     """Run Featuretools if available."""
     try:
         import featuretools as ft
@@ -141,7 +142,9 @@ def run_featuretools_case(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train:
     return {"tool": "featuretools", "auc": auc, "n_features": train_fm.shape[1]}
 
 
-def run_autofeat_case(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]:
+def run_autofeat_case(
+    X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series
+) -> dict[str, Any]:
     """Run autofeat if available."""
     try:
         from autofeat import AutoFeatClassifier
@@ -188,9 +191,7 @@ def main() -> None:
     data = create_dataset(n_samples=args.samples, random_state=args.seed)
     X = data.drop(columns=["target"])
     y = data["target"]
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.2, random_state=args.seed, stratify=y
-    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=args.seed, stratify=y)
 
     results = [
         run_baseline(X_train, X_test, y_train, y_test),
diff --git a/docs/examples/relational-feature-engineering.md b/docs/examples/relational-feature-engineering.md
new file mode 100644
index 0000000..f3e0fac
--- /dev/null
+++ b/docs/examples/relational-feature-engineering.md
@@ -0,0 +1,76 @@
+# Relational Feature Engineering
+
+A practical example for **customer / orders** style data where aggregation features matter.
+
+## When to use this
+
+Use the relational engine when:
+- you have a primary table and one or more related tables
+- useful signal lives in counts / sums / means / maxima over related entities
+- you want a lighter-weight path than a full Featuretools setup
+
+## Example
+
+```python
+import pandas as pd
+
+from featcopilot.engines.relational import RelationalEngine
+
+orders = pd.DataFrame(
+    {
+        "order_id": [1, 2, 3, 4, 5],
+        "customer_id": [1, 1, 2, 2, 3],
+        "amount": [100, 200, 150, 300, 50],
+        "category": ["A", "B", "A", "A", "B"],
+    }
+)
+
+customers = pd.DataFrame(
+    {
+        "customer_id": [1, 2, 3],
+        "age": [25, 35, 45],
+        "income": [50000, 70000, 60000],
+    }
+)
+
+engine = RelationalEngine(
+    aggregation_functions=["mean", "sum", "count", "max", "min"],
+    verbose=True,
+)
+engine.add_relationship("orders", "customers", "customer_id")
+
+features = engine.fit_transform(
+    orders,
+    related_tables={"customers": customers},
+)
+
+print(features.columns.tolist())
+```
+
+## Typical generated features
+
+- `customers_age_mean`
+- `customers_income_mean`
+- `amount_by_category_mean`
+- `amount_by_category_count`
+
+## Guardrails
+
+`RelationalEngine` now validates configured relationship keys:
+- missing child key in the primary table -> raises early
+- missing parent key in a related table -> raises early
+- duplicate relationship definitions are ignored
+
+That is boring, but it is the kind of boring that saves time.
+
+## When Featuretools is still the better choice
+
+Use Featuretools when you need:
+- deeper DFS-style synthesis
+- richer primitive libraries
+- more complex multi-table workflows
+
+Use FeatCopilot relational mode when you want:
+- a smaller API surface
+- straightforward aggregation features
+- sklearn-friendly workflows
diff --git a/examples/time_aware_tabular_prototype.py b/examples/time_aware_tabular_prototype.py
index 585d0ff..f8d8781 100644
--- a/examples/time_aware_tabular_prototype.py
+++ b/examples/time_aware_tabular_prototype.py
@@ -77,9 +77,7 @@ def main() -> None:
 
     X_train_baseline = pd.get_dummies(X_train, drop_first=False)
     X_test_baseline = pd.get_dummies(X_test, drop_first=False)
-    X_train_baseline, X_test_baseline = X_train_baseline.align(
-        X_test_baseline, join="left", axis=1, fill_value=0
-    )
+    X_train_baseline, X_test_baseline = X_train_baseline.align(X_test_baseline, join="left", axis=1, fill_value=0)
 
     baseline = HistGradientBoostingClassifier(max_depth=4, learning_rate=0.05, random_state=42)
     baseline.fit(X_train_baseline, y_train)
diff --git a/mkdocs.yml b/mkdocs.yml
index 9b98a7c..8136adf 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -86,6 +86,7 @@ nav:
     - End-to-End Demo: examples/e2e-demo.md
     - Basic Usage: examples/basic.md
     - Time-Aware Tabular: examples/time-aware-tabular.md
+    - Relational Feature Engineering: examples/relational-feature-engineering.md
     - LLM-Powered: examples/llm-powered.md
     - Sklearn Pipeline: examples/sklearn-pipeline.md
     - Domain-Specific: examples/domain-specific.md

From 9315422ec9d0ad77e60cd57cf8a335e0ad4ab936 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Wed, 18 Mar 2026 14:22:57 +0800
Subject: [PATCH 05/22] bench: add auto feature engineering use-case report

---
 .../AUTO_FEATURE_ENGINEERING_USE_CASE.md      |  10 ++
 .../run_auto_feature_engineering_benchmark.py | 127 +++++++++++-------
 2 files changed, 88 insertions(+), 49 deletions(-)
 create mode 100644 benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md

diff --git a/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md b/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md
new file mode 100644
index 0000000..1413c37
--- /dev/null
+++ b/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md
@@ -0,0 +1,10 @@
+# Auto Feature Engineering Use-Case Benchmark
+
+Compares a plain baseline with FeatCopilot and common automatic feature engineering tools on an interaction-heavy tabular classification task.
+
+| Tool | Status | ROC-AUC | Feature Count |
+|------|--------|---------|---------------|
+| baseline | ok | 0.7440 | 9 |
+| featcopilot | ok | 0.7563 | 15 |
+| featuretools | failed: 'NoneType' object has no attribute 'columns' | - | - |
+| autofeat | failed: 'Series' object has no attribute 'ravel' | - | - |
diff --git a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
index a439df3..f909ef0 100644
--- a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
+++ b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
@@ -13,9 +13,13 @@
 
 import numpy as np
 import pandas as pd
-from sklearn.ensemble import HistGradientBoostingClassifier
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import roc_auc_score
 from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 
 from featcopilot import AutoFeatureEngineer
 
@@ -23,7 +27,7 @@
 
 
 def create_dataset(n_samples: int = 5000, random_state: int = 42) -> pd.DataFrame:
-    """Create a synthetic classification dataset with interaction-heavy signal."""
+    """Create a synthetic classification dataset with explicit ratio/interaction signal."""
     rng = np.random.default_rng(random_state)
     df = pd.DataFrame(
         {
@@ -41,47 +45,63 @@ def create_dataset(n_samples: int = 5000, random_state: int = 42) -> pd.DataFram
     complaint_rate = df["support_tickets"] / (df["tenure_months"] + 1)
     product_density = df["monthly_charges"] / (df["num_products"] + 0.5)
     loyalty = ((df["age"] - 18) / 62) * (df["tenure_months"] / 120)
+    free_flag = (df["plan_tier"] == "free").astype(int)
     team_flag = (df["plan_tier"] == "team").astype(int)
 
-    logit = -1.4 + 1.5 * charge_ratio + 1.8 * complaint_rate + 0.004 * product_density - 0.8 * loyalty - 0.4 * team_flag
+    interaction_signal = charge_ratio * complaint_rate * 8.0
+    threshold_bonus = ((charge_ratio > 0.020) & (complaint_rate > 0.045)).astype(float) * 1.2
+    plan_interaction = free_flag * charge_ratio * 6.0 - team_flag * loyalty * 1.5
+
+    logit = (
+        -2.2
+        + 0.0035 * product_density
+        + 1.8 * interaction_signal
+        + threshold_bonus
+        + plan_interaction
+        - 1.1 * loyalty
+    )
     prob = 1 / (1 + np.exp(-logit))
     df["target"] = (rng.random(n_samples) < prob).astype(int)
     return df
 
 
+def align_and_fill(X_train: pd.DataFrame, X_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Align train/test columns and fill missing values safely."""
+    X_train_aligned, X_test_aligned = X_train.align(X_test, join="left", axis=1, fill_value=0)
+    return X_train_aligned.fillna(0), X_test_aligned.fillna(0)
+
+
 def evaluate_auc(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> float:
     """Train a simple classifier and return ROC-AUC."""
     X_train = pd.get_dummies(X_train, drop_first=False)
     X_test = pd.get_dummies(X_test, drop_first=False)
     X_train, X_test = align_and_fill(X_train, X_test)
 
-    model = HistGradientBoostingClassifier(max_depth=4, learning_rate=0.05, random_state=42)
+    numeric_features = X_train.columns.tolist()
+    preprocessor = ColumnTransformer(
+        transformers=[
+            (
+                "num",
+                Pipeline(
+                    [
+                        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
+                        ("scaler", StandardScaler()),
+                    ]
+                ),
+                numeric_features,
+            )
+        ]
+    )
+    model = Pipeline(
+        steps=[
+            ("preprocessor", preprocessor),
+            ("classifier", LogisticRegression(max_iter=2000, C=1.0)),
+        ]
+    )
     model.fit(X_train, y_train)
     return roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
 
 
-def align_and_fill(X_train: pd.DataFrame, X_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
-    """Align train/test columns and fill missing values safely."""
-    X_train_aligned, X_test_aligned = X_train.align(X_test, join="left", axis=1, fill_value=0)
-    return X_train_aligned.fillna(0), X_test_aligned.fillna(0)
-
-
-def run_featcopilot(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series) -> tuple[float, int]:
-    """Run FeatCopilot and return transformed feature count."""
-    engineer = AutoFeatureEngineer(
-        engines=["tabular"],
-        max_features=40,
-        selection_methods=["mutual_info", "importance"],
-        correlation_threshold=0.9,
-        leakage_guard="warn",
-        verbose=False,
-    )
-    X_train_fe = engineer.fit_transform(X_train, y_train, target_name="target", apply_selection=True)
-    X_test_fe = engineer.transform(X_test)
-    X_train_fe, X_test_fe = align_and_fill(X_train_fe, X_test_fe)
-    return evaluate_auc(X_train_fe, X_test_fe, y_train, y_test=None), X_train_fe.shape[1]
-
-
 def run_baseline(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]:
     """Run one-hot baseline."""
     X_train_base = pd.get_dummies(X_train, drop_first=False)
@@ -97,9 +117,9 @@ def run_featcopilot_case(
     """Run FeatCopilot benchmark case."""
     engineer = AutoFeatureEngineer(
         engines=["tabular"],
-        max_features=40,
+        max_features=60,
         selection_methods=["mutual_info", "importance"],
-        correlation_threshold=0.9,
+        correlation_threshold=0.95,
         leakage_guard="warn",
         verbose=False,
     )
@@ -116,6 +136,7 @@ def run_featuretools_case(
     """Run Featuretools if available."""
     try:
         import featuretools as ft
+        import woodwork  # noqa: F401
     except Exception as exc:
         return {"tool": "featuretools", "status": f"unavailable: {exc}"}
 
@@ -125,21 +146,26 @@ def run_featuretools_case(
     train_copy["row_id"] = np.arange(len(train_copy))
     test_copy["row_id"] = np.arange(len(test_copy))
 
-    es_train = ft.EntitySet(id="afe_train").add_dataframe(dataframe_name="data", dataframe=train_copy, index="row_id")
-    train_fm, feature_defs = ft.dfs(
-        entityset=es_train,
-        target_dataframe_name="data",
-        trans_primitives=["add_numeric", "multiply_numeric", "divide_numeric"],
-        agg_primitives=[],
-        max_depth=2,
-        max_features=40,
-    )
-
-    es_test = ft.EntitySet(id="afe_test").add_dataframe(dataframe_name="data", dataframe=test_copy, index="row_id")
-    test_fm = ft.calculate_feature_matrix(entityset=es_test, features=feature_defs)
-    train_fm, test_fm = align_and_fill(train_fm, test_fm)
-    auc = evaluate_auc(train_fm, test_fm, y_train, y_test)
-    return {"tool": "featuretools", "auc": auc, "n_features": train_fm.shape[1]}
+    try:
+        train_copy = train_copy.ww.init(name="data", index="row_id")
+        es_train = ft.EntitySet(id="afe_train").add_dataframe(dataframe_name="data", dataframe=train_copy, index="row_id")
+        train_fm, feature_defs = ft.dfs(
+            entityset=es_train,
+            target_dataframe_name="data",
+            trans_primitives=["add_numeric", "multiply_numeric", "divide_numeric"],
+            agg_primitives=[],
+            max_depth=2,
+            max_features=60,
+        )
+
+        test_copy.ww.init(name="data", index="row_id")
+        es_test = ft.EntitySet(id="afe_test").add_dataframe(dataframe_name="data", dataframe=test_copy, index="row_id")
+        test_fm = ft.calculate_feature_matrix(entityset=es_test, features=feature_defs)
+        train_fm, test_fm = align_and_fill(train_fm, test_fm)
+        auc = evaluate_auc(train_fm, test_fm, y_train, y_test)
+        return {"tool": "featuretools", "auc": auc, "n_features": train_fm.shape[1]}
+    except Exception as exc:
+        return {"tool": "featuretools", "status": f"failed: {exc}"}
 
 
 def run_autofeat_case(
@@ -155,12 +181,15 @@ def run_autofeat_case(
     X_test_num = pd.get_dummies(X_test, drop_first=False)
     X_train_num, X_test_num = align_and_fill(X_train_num, X_test_num)
 
-    model = AutoFeatClassifier(verbose=0, feateng_steps=1, featsel_runs=1)
-    X_train_fe = model.fit_transform(X_train_num, y_train)
-    X_test_fe = model.transform(X_test_num)
-    X_train_fe, X_test_fe = align_and_fill(X_train_fe, X_test_fe)
-    auc = evaluate_auc(X_train_fe, X_test_fe, y_train, y_test)
-    return {"tool": "autofeat", "auc": auc, "n_features": X_train_fe.shape[1]}
+    try:
+        model = AutoFeatClassifier(verbose=0, feateng_steps=2, featsel_runs=2)
+        X_train_fe = model.fit_transform(X_train_num, y_train)
+        X_test_fe = model.transform(X_test_num)
+        X_train_fe, X_test_fe = align_and_fill(X_train_fe, X_test_fe)
+        auc = evaluate_auc(X_train_fe, X_test_fe, y_train, y_test)
+        return {"tool": "autofeat", "auc": auc, "n_features": X_train_fe.shape[1]}
+    except Exception as exc:
+        return {"tool": "autofeat", "status": f"failed: {exc}"}
 
 
 def write_report(results: list[dict[str, Any]], output_path: Path) -> None:

From c4a8a932ec22b049cd040df7dbd55f87ec9f3500 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Wed, 18 Mar 2026 15:04:25 +0800
Subject: [PATCH 06/22] style: run pre-commit on benchmark script

---
 .../run_auto_feature_engineering_benchmark.py         | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
index f909ef0..31aad8e 100644
--- a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
+++ b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
@@ -53,12 +53,7 @@ def create_dataset(n_samples: int = 5000, random_state: int = 42) -> pd.DataFram
     plan_interaction = free_flag * charge_ratio * 6.0 - team_flag * loyalty * 1.5
 
     logit = (
-        -2.2
-        + 0.0035 * product_density
-        + 1.8 * interaction_signal
-        + threshold_bonus
-        + plan_interaction
-        - 1.1 * loyalty
+        -2.2 + 0.0035 * product_density + 1.8 * interaction_signal + threshold_bonus + plan_interaction - 1.1 * loyalty
     )
     prob = 1 / (1 + np.exp(-logit))
     df["target"] = (rng.random(n_samples) < prob).astype(int)
@@ -148,7 +143,9 @@ def run_featuretools_case(
 
     try:
         train_copy = train_copy.ww.init(name="data", index="row_id")
-        es_train = ft.EntitySet(id="afe_train").add_dataframe(dataframe_name="data", dataframe=train_copy, index="row_id")
+        es_train = ft.EntitySet(id="afe_train").add_dataframe(
+            dataframe_name="data", dataframe=train_copy, index="row_id"
+        )
         train_fm, feature_defs = ft.dfs(
             entityset=es_train,
             target_dataframe_name="data",

From a09dc9836207df18097fe4546c9b166a16d6256b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 28 Apr 2026 05:26:31 +0000
Subject: [PATCH 07/22] fix: address benchmark and leakage review feedback

Agent-Logs-Url: https://github.com/thinkall/featcopilot/sessions/81d365b5-07ae-4e09-aa5a-8508c9323f42

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 benchmarks/compare_tools/run_fe_tools_comparison.py |  3 +--
 featcopilot/transformers/sklearn_compat.py          |  8 ++++++++
 featcopilot/utils/validation.py                     |  7 +++----
 tests/test_utils.py                                 | 11 +++++++++++
 4 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/benchmarks/compare_tools/run_fe_tools_comparison.py b/benchmarks/compare_tools/run_fe_tools_comparison.py
index 03b4d92..412b51c 100644
--- a/benchmarks/compare_tools/run_fe_tools_comparison.py
+++ b/benchmarks/compare_tools/run_fe_tools_comparison.py
@@ -881,8 +881,7 @@ def run_single_benchmark(
         X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
 
     # Split data (keep raw and encoded in sync)
-    indices = np.arange(len(X_encoded))
-    train_idx, test_idx, y_train, y_test = train_test_split(indices, y, test_size=0.2, random_state=random_state)
+    train_idx, test_idx, y_train, y_test = split_benchmark_data(X_encoded, y, task, random_state)
     X_train_encoded = X_encoded.iloc[train_idx]
     X_test_encoded = X_encoded.iloc[test_idx]
     X_train_raw = X.iloc[train_idx]
diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index 63f3447..4aef2ff 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -97,10 +97,14 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
         Maximum features to generate/select
     selection_methods : list, default=['mutual_info', 'importance']
         Feature selection methods
+    correlation_threshold : float, default=0.85
+        Maximum pairwise correlation allowed during correlation-based selection
     llm_config : dict, optional
         Configuration for LLM engine
     verbose : bool, default=False
         Verbose output
+    leakage_guard : {'off', 'warn', 'raise'}, default='warn'
+        How to handle columns whose names suggest target, label, or future-information leakage
 
     Examples
     --------
@@ -186,6 +190,8 @@ def fit(
             Human-readable descriptions of columns (for LLM)
         task_description : str
             Description of the ML task (for LLM)
+        target_name : str, optional
+            Target column name used by leakage checks to identify related feature columns
         **fit_params : dict
             Additional parameters
 
@@ -331,6 +337,8 @@ def fit_transform(
             Human-readable column descriptions
         task_description : str
             ML task description
+        target_name : str, optional
+            Target column name used by leakage checks to identify related feature columns
         apply_selection : bool, default=True
             Whether to apply feature selection
         **fit_params : dict
diff --git a/featcopilot/utils/validation.py b/featcopilot/utils/validation.py
index 9df3dae..d733921 100644
--- a/featcopilot/utils/validation.py
+++ b/featcopilot/utils/validation.py
@@ -1,7 +1,7 @@
 """Validation helpers for safer feature engineering workflows."""
 
 import re
-from typing import Optional
+from typing import Any, Optional
 
 DEFAULT_LEAKAGE_KEYWORDS = [
     "target",
@@ -14,9 +14,9 @@
 ]
 
 
-def _normalize_column_name(name: str) -> str:
+def _normalize_column_name(name: Any) -> str:
     """Normalize a column name for fuzzy matching."""
-    return re.sub(r"[^a-z0-9]+", "", name.lower())
+    return re.sub(r"[^a-z0-9]+", "", str(name).lower())
 
 
 def find_potential_leakage_columns(
@@ -52,7 +52,6 @@ def find_potential_leakage_columns(
         keyword_hit = any(keyword and keyword in normalized_column for keyword in normalized_keywords)
         target_hit = bool(
             normalized_target
-            and normalized_target
             and (
                 normalized_column == normalized_target
                 or normalized_target in normalized_column
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 9d2c5e8..e8c07b5 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -21,6 +21,17 @@
     list_models,
 )
 from featcopilot.utils.parallel import parallel_apply, parallel_transform
+from featcopilot.utils.validation import find_potential_leakage_columns
+
+# ---------------------------------------------------------------------------
+# Validation tests
+# ---------------------------------------------------------------------------
+
+
+def test_find_potential_leakage_columns_handles_non_string_columns():
+    """Test leakage detection accepts non-string column labels."""
+    assert find_potential_leakage_columns([123, "future_label"], target_name="label") == ["future_label"]
+
 
 # ---------------------------------------------------------------------------
 # FeatureCache tests

From 4ddcce55030e47bc97e77297b4e67384d6f5b1e3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 28 Apr 2026 05:27:37 +0000
Subject: [PATCH 08/22] test: cover leakage column edge cases

Agent-Logs-Url: https://github.com/thinkall/featcopilot/sessions/81d365b5-07ae-4e09-aa5a-8508c9323f42

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 tests/test_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index e8c07b5..724c801 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -30,7 +30,11 @@
 
 def test_find_potential_leakage_columns_handles_non_string_columns():
     """Test leakage detection accepts non-string column labels."""
+    assert find_potential_leakage_columns([], target_name="label") == []
     assert find_potential_leakage_columns([123, "future_label"], target_name="label") == ["future_label"]
+    assert find_potential_leakage_columns([123, 456], target_name="label") == []
+    assert find_potential_leakage_columns(["Churn Label"], target_name="churn_label") == ["Churn Label"]
+    assert find_potential_leakage_columns(["future-target!"], target_name="target") == ["future-target!"]
 
 
 # ---------------------------------------------------------------------------

From e280cfc8e28e7be47d132b458636fc8dc9a7002d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 28 Apr 2026 05:28:40 +0000
Subject: [PATCH 09/22] refactor: simplify leakage target matching

Agent-Logs-Url: https://github.com/thinkall/featcopilot/sessions/81d365b5-07ae-4e09-aa5a-8508c9323f42

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 featcopilot/utils/validation.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/featcopilot/utils/validation.py b/featcopilot/utils/validation.py
index d733921..cc7c7b2 100644
--- a/featcopilot/utils/validation.py
+++ b/featcopilot/utils/validation.py
@@ -50,13 +50,10 @@ def find_potential_leakage_columns(
         normalized_column = _normalize_column_name(column)
 
         keyword_hit = any(keyword and keyword in normalized_column for keyword in normalized_keywords)
-        target_hit = bool(
-            normalized_target
-            and (
-                normalized_column == normalized_target
-                or normalized_target in normalized_column
-                or normalized_column in normalized_target
-            )
+        target_hit = normalized_target is not None and (
+            normalized_column == normalized_target
+            or normalized_target in normalized_column
+            or normalized_column in normalized_target
         )
 
         if keyword_hit or target_hit:

From 4dcdfe9e4f30916c5ed5f4b571cc92045f8f9b8a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 28 Apr 2026 05:29:45 +0000
Subject: [PATCH 10/22] docs: clarify leakage target matching

Agent-Logs-Url: https://github.com/thinkall/featcopilot/sessions/81d365b5-07ae-4e09-aa5a-8508c9323f42

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 featcopilot/utils/validation.py | 5 +++++
 tests/test_utils.py             | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/featcopilot/utils/validation.py b/featcopilot/utils/validation.py
index cc7c7b2..8e2cb21 100644
--- a/featcopilot/utils/validation.py
+++ b/featcopilot/utils/validation.py
@@ -40,6 +40,11 @@ def find_potential_leakage_columns(
     -------
     list[str]
         Column names that deserve manual review for leakage.
+
+    Notes
+    -----
+    Target-name matching is intentionally fuzzy: labels are normalized and substring
+    variants are flagged so derived names such as ``target_encoded`` are reviewed.
     """
     keywords = keywords or DEFAULT_LEAKAGE_KEYWORDS
     normalized_keywords = [_normalize_column_name(keyword) for keyword in keywords]
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 724c801..7dfd375 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -28,7 +28,7 @@
 # ---------------------------------------------------------------------------
 
 
-def test_find_potential_leakage_columns_handles_non_string_columns():
+def test_leakage_detection_non_string_columns():
     """Test leakage detection accepts non-string column labels."""
     assert find_potential_leakage_columns([], target_name="label") == []
     assert find_potential_leakage_columns([123, "future_label"], target_name="label") == ["future_label"]

From f5e78a8c81f77c10da6da58ca9b5aa4c397948b0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 28 Apr 2026 05:30:44 +0000
Subject: [PATCH 11/22] test: cover mixed leakage keyword columns

Agent-Logs-Url: https://github.com/thinkall/featcopilot/sessions/81d365b5-07ae-4e09-aa5a-8508c9323f42

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 tests/test_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 7dfd375..6099d85 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -32,6 +32,7 @@ def test_leakage_detection_non_string_columns():
     """Test leakage detection accepts non-string column labels."""
     assert find_potential_leakage_columns([], target_name="label") == []
     assert find_potential_leakage_columns([123, "future_label"], target_name="label") == ["future_label"]
+    assert find_potential_leakage_columns([123, "target"], target_name="other") == ["target"]
     assert find_potential_leakage_columns([123, 456], target_name="label") == []
     assert find_potential_leakage_columns(["Churn Label"], target_name="churn_label") == ["Churn Label"]
     assert find_potential_leakage_columns(["future-target!"], target_name="target") == ["future-target!"]

From b1a32752a2c3f3b8e9ebd7fbf67ea370e80677bd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 28 Apr 2026 05:31:45 +0000
Subject: [PATCH 12/22] test: cover non-string leakage targets

Agent-Logs-Url: https://github.com/thinkall/featcopilot/sessions/81d365b5-07ae-4e09-aa5a-8508c9323f42

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---
 featcopilot/utils/validation.py | 20 ++++++++++----------
 tests/test_utils.py             |  1 +
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/featcopilot/utils/validation.py b/featcopilot/utils/validation.py
index 8e2cb21..3c37a67 100644
--- a/featcopilot/utils/validation.py
+++ b/featcopilot/utils/validation.py
@@ -20,26 +20,26 @@ def _normalize_column_name(name: Any) -> str:
 
 
 def find_potential_leakage_columns(
-    columns: list[str],
-    target_name: Optional[str] = None,
+    columns: list[Any],
+    target_name: Optional[Any] = None,
     keywords: Optional[list[str]] = None,
-) -> list[str]:
+) -> list[Any]:
     """
     Find suspicious columns that may leak label or future information.
 
     Parameters
     ----------
-    columns : list[str]
-        Input column names to inspect.
-    target_name : str, optional
-        Expected target/label column name. Related variants will be flagged.
+    columns : list
+        Input column names or labels to inspect.
+    target_name : optional
+        Expected target/label column name or label. Related variants will be flagged.
     keywords : list[str], optional
         Additional suspicious keywords to match against normalized column names.
 
     Returns
     -------
-    list[str]
-        Column names that deserve manual review for leakage.
+    list
+        Column names or labels that deserve manual review for leakage.
 
     Notes
     -----
@@ -50,7 +50,7 @@ def find_potential_leakage_columns(
     normalized_keywords = [_normalize_column_name(keyword) for keyword in keywords]
     normalized_target = _normalize_column_name(target_name) if target_name else None
 
-    suspicious: list[str] = []
+    suspicious: list[Any] = []
     for column in columns:
         normalized_column = _normalize_column_name(column)
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 6099d85..576cad6 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -34,6 +34,7 @@ def test_leakage_detection_non_string_columns():
     assert find_potential_leakage_columns([123, "future_label"], target_name="label") == ["future_label"]
     assert find_potential_leakage_columns([123, "target"], target_name="other") == ["target"]
     assert find_potential_leakage_columns([123, 456], target_name="label") == []
+    assert find_potential_leakage_columns([123, "feature"], target_name=123) == [123]
     assert find_potential_leakage_columns(["Churn Label"], target_name="churn_label") == ["Churn Label"]
     assert find_potential_leakage_columns(["future-target!"], target_name="target") == ["future-target!"]
 

From d1e89248ae2526c38ca8e6d2ea10f1141eb05ed1 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Wed, 29 Apr 2026 16:51:36 +0800
Subject: [PATCH 13/22] refactor(bench): share split helper and document
 target_name

Address remaining PR #2 review comments:

- Move split_benchmark_data from compare_tools into a shared
  benchmarks/splits.py module so both the existing comparison
  benchmark and the new use-case benchmark route their splits
  through the same task-aware policy.
- Wire benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
  to call split_benchmark_data instead of train_test_split directly,
  matching the PR summary's intent.
- Add target_name to the AutoFeatureEngineer class docstring under
  an "Other Parameters" section so it surfaces in help() and generated
  docs alongside the constructor params.
- Add tests/test_benchmark_splits.py covering both the helper's
  behavior (chronological vs stratified vs random) and a wiring
  regression check that benchmark scripts use the helper instead of
  calling train_test_split directly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../compare_tools/run_fe_tools_comparison.py  |  39 +----
 benchmarks/splits.py                          |  82 ++++++++++
 .../run_auto_feature_engineering_benchmark.py |   6 +-
 featcopilot/transformers/sklearn_compat.py    |   9 +-
 tests/test_benchmark_splits.py                | 142 ++++++++++++++++++
 5 files changed, 237 insertions(+), 41 deletions(-)
 create mode 100644 benchmarks/splits.py
 create mode 100644 tests/test_benchmark_splits.py

diff --git a/benchmarks/compare_tools/run_fe_tools_comparison.py b/benchmarks/compare_tools/run_fe_tools_comparison.py
index 412b51c..dd7b10f 100644
--- a/benchmarks/compare_tools/run_fe_tools_comparison.py
+++ b/benchmarks/compare_tools/run_fe_tools_comparison.py
@@ -43,7 +43,6 @@
 import pandas as pd
 from packaging.version import Version
 from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, roc_auc_score
-from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 
 from benchmarks.datasets import (
@@ -61,6 +60,7 @@
     sanitize_feature_frames,
     save_feature_cache,
 )
+from benchmarks.splits import split_benchmark_data
 from featcopilot.utils.logger import get_logger  # noqa: E402
 
 logger = get_logger(__name__)
@@ -68,43 +68,6 @@
 warnings.filterwarnings("ignore")
 
 
-def split_benchmark_data(
-    X: pd.DataFrame,
-    y: pd.Series,
-    task: str,
-    random_state: int,
-    test_size: float = 0.2,
-) -> tuple[np.ndarray, np.ndarray, pd.Series, pd.Series]:
-    """Split benchmark data with slightly more realistic defaults by task type."""
-    indices = np.arange(len(X))
-
-    if "forecast" in task or "timeseries" in task:
-        split_idx = int(len(indices) * (1 - test_size))
-        train_idx = indices[:split_idx]
-        test_idx = indices[split_idx:]
-        y_train = y.iloc[train_idx]
-        y_test = y.iloc[test_idx]
-        return train_idx, test_idx, y_train, y_test
-
-    stratify = None
-    if "classification" in task:
-        try:
-            class_counts = pd.Series(y).value_counts(dropna=False)
-            if len(class_counts) > 1 and class_counts.min() >= 2:
-                stratify = y
-        except Exception:
-            stratify = None
-
-    train_idx, test_idx, y_train, y_test = train_test_split(
-        indices,
-        y,
-        test_size=test_size,
-        random_state=random_state,
-        stratify=stratify,
-    )
-    return train_idx, test_idx, y_train, y_test
-
-
 # Default configuration
 QUICK_DATASETS = [
     # Interaction-heavy synthetic (FeatCopilot creates valuable polynomial features)
diff --git a/benchmarks/splits.py b/benchmarks/splits.py
new file mode 100644
index 0000000..b585887
--- /dev/null
+++ b/benchmarks/splits.py
@@ -0,0 +1,82 @@
+"""Shared split utilities for FeatCopilot benchmarks.
+
+Centralizes the split policy so individual benchmark scripts share the same
+realistic defaults: chronological splits for forecasting/timeseries tasks and
+stratified random splits for classification tasks (when class counts allow).
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+def split_benchmark_data(
+    X: pd.DataFrame,
+    y: pd.Series,
+    task: str,
+    random_state: int,
+    test_size: float = 0.2,
+) -> tuple[np.ndarray, np.ndarray, pd.Series, pd.Series]:
+    """
+    Split benchmark data with task-aware defaults.
+
+    Parameters
+    ----------
+    X : pandas.DataFrame
+        Feature matrix. Only its length is used; the function returns positional
+        indices (use ``X.iloc[train_idx]`` / ``X.iloc[test_idx]`` to materialize
+        the splits).
+    y : pandas.Series
+        Target values aligned with ``X``.
+    task : str
+        Task identifier. Substrings ``"forecast"`` / ``"timeseries"`` trigger a
+        chronological split; ``"classification"`` triggers a stratified split
+        when class counts allow it. Anything else falls back to a random split.
+    random_state : int
+        Random state for reproducible random splits.
+    test_size : float, default=0.2
+        Fraction of rows held out for the test split.
+
+    Returns
+    -------
+    train_idx : numpy.ndarray
+        Positional indices for the training rows.
+    test_idx : numpy.ndarray
+        Positional indices for the test rows.
+    y_train : pandas.Series
+        Target values for the training rows.
+    y_test : pandas.Series
+        Target values for the test rows.
+    """
+    indices = np.arange(len(X))
+
+    if "forecast" in task or "timeseries" in task:
+        split_idx = int(len(indices) * (1 - test_size))
+        train_idx = indices[:split_idx]
+        test_idx = indices[split_idx:]
+        y_train = y.iloc[train_idx]
+        y_test = y.iloc[test_idx]
+        return train_idx, test_idx, y_train, y_test
+
+    stratify = None
+    if "classification" in task:
+        try:
+            class_counts = pd.Series(y).value_counts(dropna=False)
+            if len(class_counts) > 1 and class_counts.min() >= 2:
+                stratify = y
+        except Exception:
+            stratify = None
+
+    train_idx, test_idx, y_train, y_test = train_test_split(
+        indices,
+        y,
+        test_size=test_size,
+        random_state=random_state,
+        stratify=stratify,
+    )
+    return train_idx, test_idx, y_train, y_test
+
+
+__all__ = ["split_benchmark_data"]
diff --git a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
index 31aad8e..91578f5 100644
--- a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
+++ b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
@@ -17,10 +17,10 @@
 from sklearn.impute import SimpleImputer
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import roc_auc_score
-from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 
+from benchmarks.splits import split_benchmark_data
 from featcopilot import AutoFeatureEngineer
 
 REPORT_DIR = Path(__file__).resolve().parent
@@ -217,7 +217,9 @@ def main() -> None:
     data = create_dataset(n_samples=args.samples, random_state=args.seed)
     X = data.drop(columns=["target"])
     y = data["target"]
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=args.seed, stratify=y)
+    train_idx, test_idx, y_train, y_test = split_benchmark_data(X, y, "classification", random_state=args.seed)
+    X_train = X.iloc[train_idx]
+    X_test = X.iloc[test_idx]
 
     results = [
         run_baseline(X_train, X_test, y_train, y_test),
diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index 4aef2ff..529f811 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -106,6 +106,13 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
     leakage_guard : {'off', 'warn', 'raise'}, default='warn'
         How to handle columns whose names suggest target, label, or future-information leakage
 
+    Other Parameters
+    ----------------
+    target_name : str, optional
+        Fit-time parameter accepted by :meth:`fit` and :meth:`fit_transform`.
+        When provided, the leakage guard cross-references column names against
+        the target so derived variants (e.g. ``target_encoded``) are flagged.
+
     Examples
     --------
     >>> engineer = AutoFeatureEngineer(
@@ -113,7 +120,7 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
     ...     max_features=100,
     ...     llm_config={'model': 'gpt-5.2', 'enable_semantic': True}
     ... )
-    >>> X_transformed = engineer.fit_transform(X, y)
+    >>> X_transformed = engineer.fit_transform(X, y, target_name='label')
     """
 
     SUPPORTED_ENGINES = {"tabular", "timeseries", "relational", "text", "llm"}
diff --git a/tests/test_benchmark_splits.py b/tests/test_benchmark_splits.py
new file mode 100644
index 0000000..f4fb26c
--- /dev/null
+++ b/tests/test_benchmark_splits.py
@@ -0,0 +1,142 @@
+"""Tests for the shared benchmarks split helper and its wiring."""
+
+from __future__ import annotations
+
+import importlib
+import inspect
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from benchmarks.splits import split_benchmark_data
+
+# ---------------------------------------------------------------------------
+# Behavioral tests for split_benchmark_data
+# ---------------------------------------------------------------------------
+
+
+def test_classification_uses_stratified_split():
+    """Classification tasks should produce a stratified split when class counts allow."""
+    rng = np.random.default_rng(0)
+    X = pd.DataFrame({"f": rng.normal(size=200)})
+    y = pd.Series(([0] * 160) + ([1] * 40))
+
+    train_idx, test_idx, y_train, y_test = split_benchmark_data(X, y, "classification", random_state=42)
+
+    assert len(train_idx) + len(test_idx) == len(X)
+    assert set(train_idx).isdisjoint(set(test_idx))
+    train_pos_ratio = (y_train == 1).mean()
+    test_pos_ratio = (y_test == 1).mean()
+    expected_ratio = (y == 1).mean()
+    assert abs(train_pos_ratio - expected_ratio) < 0.02
+    assert abs(test_pos_ratio - expected_ratio) < 0.02
+
+
+def test_classification_falls_back_when_class_too_small():
+    """Singleton classes should not raise; the split falls back to non-stratified."""
+    X = pd.DataFrame({"f": np.arange(10.0)})
+    y = pd.Series([0] * 9 + [1])
+
+    train_idx, test_idx, _, _ = split_benchmark_data(X, y, "classification", random_state=0)
+
+    assert len(train_idx) + len(test_idx) == len(X)
+
+
+def test_forecasting_uses_chronological_split():
+    """Forecasting/timeseries tasks should preserve temporal order."""
+    X = pd.DataFrame({"t": np.arange(100)})
+    y = pd.Series(np.arange(100, dtype=float))
+
+    train_idx, test_idx, y_train, y_test = split_benchmark_data(X, y, "forecasting", random_state=123)
+
+    assert list(train_idx) == list(range(80))
+    assert list(test_idx) == list(range(80, 100))
+    assert y_train.iloc[-1] < y_test.iloc[0]
+
+
+def test_timeseries_keyword_also_chronological():
+    """The 'timeseries' substring should also trigger chronological split."""
+    X = pd.DataFrame({"t": np.arange(50)})
+    y = pd.Series(np.arange(50, dtype=float))
+
+    train_idx, test_idx, _, _ = split_benchmark_data(X, y, "timeseries_regression", random_state=0)
+
+    assert list(train_idx) == list(range(40))
+    assert list(test_idx) == list(range(40, 50))
+
+
+def test_regression_uses_random_split_no_stratify():
+    """Non-classification, non-forecasting tasks should use a plain random split."""
+    rng = np.random.default_rng(1)
+    X = pd.DataFrame({"f": rng.normal(size=100)})
+    y = pd.Series(rng.normal(size=100))
+
+    train_idx, test_idx, _, _ = split_benchmark_data(X, y, "regression", random_state=42)
+
+    assert len(train_idx) == 80
+    assert len(test_idx) == 20
+    assert set(train_idx).isdisjoint(set(test_idx))
+
+
+def test_custom_test_size_respected():
+    """``test_size`` should override the default 0.2."""
+    X = pd.DataFrame({"f": np.arange(100.0)})
+    y = pd.Series(np.arange(100, dtype=float))
+
+    train_idx, test_idx, _, _ = split_benchmark_data(X, y, "regression", random_state=0, test_size=0.4)
+
+    assert len(test_idx) == 40
+    assert len(train_idx) == 60
+
+
+# ---------------------------------------------------------------------------
+# Wiring tests: ensure benchmark scripts actually use the shared helper.
+# These guard against the regression flagged on PR #2 where the helper was
+# introduced but never wired into the call sites.
+# ---------------------------------------------------------------------------
+
+
+_REPO_ROOT = Path(__file__).resolve().parents[1]
+
+
+@pytest.mark.parametrize(
+    "module_path",
+    [
+        "benchmarks.compare_tools.run_fe_tools_comparison",
+        "benchmarks.use_cases.run_auto_feature_engineering_benchmark",
+    ],
+)
+def test_benchmark_scripts_import_split_helper(module_path):
+    """Both in-scope benchmark scripts must import ``split_benchmark_data``."""
+    module = importlib.import_module(module_path)
+    assert (
+        getattr(module, "split_benchmark_data", None) is split_benchmark_data
+    ), f"{module_path} should import split_benchmark_data from benchmarks.splits"
+
+
+@pytest.mark.parametrize(
+    "relative_path",
+    [
+        "benchmarks/compare_tools/run_fe_tools_comparison.py",
+        "benchmarks/use_cases/run_auto_feature_engineering_benchmark.py",
+    ],
+)
+def test_benchmark_scripts_do_not_call_train_test_split_directly(relative_path):
+    """Benchmark scripts should route through ``split_benchmark_data`` instead of ``train_test_split``."""
+    source = (_REPO_ROOT / relative_path).read_text(encoding="utf-8")
+    assert (
+        "train_test_split(" not in source
+    ), f"{relative_path} should not call train_test_split directly; use split_benchmark_data."
+    assert (
+        "split_benchmark_data(" in source
+    ), f"{relative_path} should call split_benchmark_data from benchmarks.splits."
+
+
+def test_split_benchmark_data_signature_stable():
+    """Pin the public signature so downstream benchmark scripts keep working."""
+    sig = inspect.signature(split_benchmark_data)
+    params = list(sig.parameters)
+    assert params == ["X", "y", "task", "random_state", "test_size"]
+    assert sig.parameters["test_size"].default == 0.2

From dd982a47b5f815d2de4e6a6bfe6a2b171b893563 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Thu, 30 Apr 2026 15:02:42 +0800
Subject: [PATCH 14/22] fix(benchmarks): add future annotations import to
 datasets module

The Python 3.9 CI job failed when collecting tests/test_benchmark_splits.py
because importing benchmarks.splits triggers benchmarks/__init__.py, which
eagerly imports benchmarks.datasets. That module uses PEP 604 union syntax
(`str | None`) at function-definition time, which is only supported at
runtime on Python 3.10+.

Add `from __future__ import annotations` to benchmarks/datasets.py so the
annotations become lazy strings and the module imports cleanly under Python
3.9. Verified locally with a fresh Python 3.9 environment: the full test
suite (647 passed, 2 skipped) and the new benchmark split tests both pass.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 benchmarks/datasets.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py
index c611bf6..db5e238 100644
--- a/benchmarks/datasets.py
+++ b/benchmarks/datasets.py
@@ -5,6 +5,8 @@
 time series datasets, and text/semantic datasets for comprehensive benchmarking.
 """
 
+from __future__ import annotations
+
 import numpy as np
 import pandas as pd
 

From e76d312e586be033d0c755353841aefe43cc5cc5 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Thu, 30 Apr 2026 15:49:59 +0800
Subject: [PATCH 15/22] chore: remove PR_SUMMARY_feat_strengthen_core_api.md

The PR summary belongs in the GitHub PR description, not committed to
the repository. Its content has been moved to PR #2's body so the file
no longer needs to live in source control.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 PR_SUMMARY_feat_strengthen_core_api.md | 76 --------------------------
 1 file changed, 76 deletions(-)
 delete mode 100644 PR_SUMMARY_feat_strengthen_core_api.md

diff --git a/PR_SUMMARY_feat_strengthen_core_api.md b/PR_SUMMARY_feat_strengthen_core_api.md
deleted file mode 100644
index f76a829..0000000
--- a/PR_SUMMARY_feat_strengthen_core_api.md
+++ /dev/null
@@ -1,76 +0,0 @@
-# PR Summary — feat/strengthen-core-api
-
-## What this branch changes
-
-This branch improves FeatCopilot in three layers:
-
-1. **Core API reliability**
-   - safer source imports when package metadata is unavailable
-   - `AutoFeatureEngineer.fit_transform()` now forwards transform-time kwargs
-   - earlier config validation for engines / selection methods / leakage guard
-
-2. **Safety and practical workflow support**
-   - lightweight leakage guard in `AutoFeatureEngineer`
-   - time-aware prototype example for leakage-safe evaluation
-   - relational engine key validation and duplicate relationship deduping
-   - optional per-row mode for `TimeSeriesEngine`
-
-3. **Benchmark realism and use-case coverage**
-   - more realistic split logic in tool-comparison benchmarks
-   - targeted benchmark for interaction-heavy auto feature engineering use cases
-
-## Commits
-
-- `8e5d323` — `fix: harden source imports and fit_transform kwargs`
-- `50b399c` — `feat: add leakage guard and time-aware prototype`
-- `50957fa` — `feat: improve engine APIs and benchmark realism`
-
-## Highlights
-
-### AutoFeatureEngineer
-- new `leakage_guard` modes: `warn`, `raise`, `off`
-- early validation of:
-  - engine names
-  - selection methods
-  - positive `max_features`
-- transform-time kwargs are preserved in one-shot workflows
-
-### TimeSeriesEngine
-- new `series_in_rows` mode for row-wise sequence cells
-- clearer `time_column` validation
-- default behavior preserved for current aggregate-style usage
-
-### RelationalEngine
-- duplicate relationships no longer accumulate
-- missing child/parent keys fail fast with useful errors
-
-### Benchmarks
-- classification splits use stratification when possible
-- forecast/time-series splits avoid random shuffling
-- added focused use-case benchmark:
-  - `benchmarks/use_cases/run_auto_feature_engineering_benchmark.py`
-
-### Docs / examples
-- time-aware tabular example
-- relational feature engineering example
-
-## Validation
-
-- targeted tests passed
-- full test suite passed:
-  - `635 passed, 2 skipped`
-- pre-commit passed
-
-## Suggested PR title
-
-**Improve core API reliability, add leakage guard, and strengthen benchmark realism**
-
-## Suggested reviewer notes
-
-The main intended behavior changes are:
-- earlier user-facing validation
-- new optional leakage warnings/errors
-- new optional row-wise time-series mode
-- slightly more realistic benchmark split policy
-
-The branch is intentionally incremental rather than a redesign.

From cbf821d7a1e50478599682a6417c14737ab90398 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Thu, 30 Apr 2026 16:57:13 +0800
Subject: [PATCH 16/22] fix(api,bench): harden sklearn cloning and benchmark
 tool calls

Address the latest PR review feedback on the strengthen-core-api branch:

* AutoFeatureEngineer (sklearn_compat):
  - Default collection-valued constructor args (`engines`,
    `selection_methods`, `llm_config`) using `is not None` instead of
    truthiness so explicit empty containers and identity-bearing
    arguments are preserved. This is also what sklearn's `clone()`
    round-trip identity check requires; previously cloning the
    estimator raised `RuntimeError: Cannot clone object ... constructor
    either does not set or modifies parameter llm_config`.
  - Make `set_params` tolerate `None` for the same parameters (common in
    `GridSearchCV` parameter grids) by normalizing to defaults before
    `_validate_configuration` is called, so callers no longer hit
    `TypeError: 'NoneType' object is not iterable` from `set(None)`.
  - Add tests covering the None-normalization in `set_params`,
    validation of non-None values, and the sklearn `clone` round trip.

* Auto feature engineering use-case benchmark:
  - In the Featuretools case, drop the assignment from
    `train_copy.ww.init(...)`. `Accessor.init` mutates in place and
    returns `None`, so the previous code clobbered the DataFrame and
    later raised `'NoneType' object has no attribute 'columns'` when
    building the EntitySet.
  - In the autofeat case, convert `y_train` to a numpy array via
    `np.asarray(y_train).ravel()` before calling `fit_transform`, so
    autofeat receives the 1-D ndarray it expects regardless of whether
    the caller passes a Series or array.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../run_auto_feature_engineering_benchmark.py |  5 +--
 featcopilot/transformers/sklearn_compat.py    | 25 ++++++++++++---
 tests/test_sklearn_compat.py                  | 32 +++++++++++++++++++
 3 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
index 91578f5..cfe5050 100644
--- a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
+++ b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
@@ -142,7 +142,7 @@ def run_featuretools_case(
     test_copy["row_id"] = np.arange(len(test_copy))
 
     try:
-        train_copy = train_copy.ww.init(name="data", index="row_id")
+        train_copy.ww.init(name="data", index="row_id")
         es_train = ft.EntitySet(id="afe_train").add_dataframe(
             dataframe_name="data", dataframe=train_copy, index="row_id"
         )
@@ -180,7 +180,8 @@ def run_autofeat_case(
 
     try:
         model = AutoFeatClassifier(verbose=0, feateng_steps=2, featsel_runs=2)
-        X_train_fe = model.fit_transform(X_train_num, y_train)
+        y_train_arr = np.asarray(y_train).ravel()
+        X_train_fe = model.fit_transform(X_train_num, y_train_arr)
         X_test_fe = model.transform(X_test_num)
         X_train_fe, X_test_fe = align_and_fill(X_train_fe, X_test_fe)
         auc = evaluate_auc(X_train_fe, X_test_fe, y_train, y_test)
diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index 529f811..8c9b8ee 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -137,11 +137,15 @@ def __init__(
         verbose: bool = False,
         leakage_guard: str = "warn",
     ):
-        self.engines = engines or ["tabular"]
+        # Use ``is not None`` defaulting (rather than ``or``) so that explicit
+        # empty containers and identity-bearing arguments are preserved. This
+        # also keeps ``self.<param> is param`` for any non-None argument, which
+        # is required for sklearn's ``clone`` round-trip identity check.
+        self.engines = engines if engines is not None else ["tabular"]
         self.max_features = max_features
-        self.selection_methods = selection_methods or ["mutual_info", "importance"]
+        self.selection_methods = selection_methods if selection_methods is not None else ["mutual_info", "importance"]
         self.correlation_threshold = correlation_threshold
-        self.llm_config = llm_config or {}
+        self.llm_config = llm_config if llm_config is not None else {}
         self.verbose = verbose
         self.leakage_guard = leakage_guard
 
@@ -472,8 +476,21 @@ def get_params(self, deep=True):
         }
 
     def set_params(self, **params):
-        """Set parameters for sklearn compatibility."""
+        """
+        Set parameters for sklearn compatibility.
+
+        Mirrors the defaulting performed in ``__init__`` so callers (e.g. sklearn
+        cloning, ``GridSearchCV`` parameter grids) can pass ``None`` for
+        collection-valued parameters and have it normalized back to the default
+        rather than raising during validation.
+        """
         for key, value in params.items():
             setattr(self, key, value)
+        if self.engines is None:
+            self.engines = ["tabular"]
+        if self.selection_methods is None:
+            self.selection_methods = ["mutual_info", "importance"]
+        if self.llm_config is None:
+            self.llm_config = {}
         self._validate_configuration()
         return self
diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py
index a07118d..c7af4b8 100644
--- a/tests/test_sklearn_compat.py
+++ b/tests/test_sklearn_compat.py
@@ -263,6 +263,38 @@ def test_set_params(self):
         assert afe.max_features == 20
         assert afe.verbose is True
 
+    def test_set_params_none_normalizes_to_defaults(self):
+        """set_params should accept None for collection-valued params (sklearn compat).
+
+        Sklearn's ``clone`` and ``GridSearchCV`` may pass ``None`` for parameters
+        whose default in ``__init__`` is also ``None``. Validation should not raise
+        in that case; ``None`` should be normalized to the same defaults
+        ``__init__`` applies.
+        """
+        afe = AutoFeatureEngineer(engines=["tabular", "timeseries"])
+        afe.set_params(engines=None, selection_methods=None, llm_config=None)
+
+        assert afe.engines == ["tabular"]
+        assert afe.selection_methods == ["mutual_info", "importance"]
+        assert afe.llm_config == {}
+
+    def test_set_params_invalid_engine_still_raises(self):
+        """set_params should still validate non-None values."""
+        afe = AutoFeatureEngineer()
+        with pytest.raises(ValueError, match="Unknown engines"):
+            afe.set_params(engines=["not_a_real_engine"])
+
+    def test_sklearn_clone_round_trip(self):
+        """A cloned estimator must be configurable identically to the original."""
+        from sklearn.base import clone
+
+        afe = AutoFeatureEngineer(engines=["tabular"], max_features=7)
+        cloned = clone(afe)
+
+        assert cloned.engines == ["tabular"]
+        assert cloned.max_features == 7
+        assert cloned.selection_methods == ["mutual_info", "importance"]
+
 
 class TestPackageImport:
     """Tests for top-level package import behavior."""

From 939a6b5d1d74ee1465a3cff12618e7ed34b7ba58 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Thu, 30 Apr 2026 21:28:35 +0800
Subject: [PATCH 17/22] fix(api,bench): validate set_params keys, regenerate
 use-case report
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address the latest PR review comments on the strengthen-core-api branch:

* AutoFeatureEngineer.set_params:
  - Validate parameter keys against ``self.get_params(deep=True)`` and
    raise ``ValueError`` on unknown keys, matching scikit-learn's
    ``BaseEstimator.set_params`` convention. Previously typos like
    ``afe.set_params(typo_param=42)`` were silently accepted via
    ``setattr``, which masked bugs and broke tooling that expects
    sklearn-style validation. Documented the contract in the docstring
    and added regression tests covering the unknown-key error and that
    a failing call leaves the estimator unmutated.

* Auto feature engineering use-case benchmark:
  - Drop the redundant one-hot encoding/alignment in ``run_baseline``;
    ``evaluate_auc`` already owns that step (and applies it
    idempotently for the FE-tool runners that already produce numeric
    matrices). The baseline now passes raw frames straight through and
    reports ``n_features`` from the post-encoding column count so the
    metric still matches what the model actually trains on.
  - Sanitize ``±inf`` in ``align_and_fill`` (Featuretools'
    ``divide_numeric`` primitive can emit infinities on
    zero-denominator rows, which then crashes ``StandardScaler``). NaN
    handling is preserved.
  - Regenerate the committed report so its status text reflects what
    the current script actually does. With the upstream-bug fixes in
    the previous commit, the Featuretools case now succeeds end to
    end; the autofeat case still fails, but with the genuine current
    error (sklearn ≥ 1.6 dropped the ``force_all_finite`` kwarg that
    autofeat 2.1.x still passes) instead of the stale
    ``'Series' object has no attribute 'ravel'`` symptom.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../AUTO_FEATURE_ENGINEERING_USE_CASE.md      |  8 ++---
 .../run_auto_feature_engineering_benchmark.py | 36 ++++++++++++++-----
 featcopilot/transformers/sklearn_compat.py    | 32 +++++++++++++++--
 tests/test_sklearn_compat.py                  | 16 +++++++++
 4 files changed, 77 insertions(+), 15 deletions(-)

diff --git a/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md b/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md
index 1413c37..927e33f 100644
--- a/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md
+++ b/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md
@@ -4,7 +4,7 @@ Compares a plain baseline with FeatCopilot and common automatic feature engineer
 
 | Tool | Status | ROC-AUC | Feature Count |
 |------|--------|---------|---------------|
-| baseline | ok | 0.7440 | 9 |
-| featcopilot | ok | 0.7563 | 15 |
-| featuretools | failed: 'NoneType' object has no attribute 'columns' | - | - |
-| autofeat | failed: 'Series' object has no attribute 'ravel' | - | - |
+| baseline | ok | 0.6330 | 9 |
+| featcopilot | ok | 0.6328 | 11 |
+| featuretools | ok | 0.6362 | 60 |
+| autofeat | failed: check_array() got an unexpected keyword argument 'force_all_finite' | - | - |
diff --git a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
index cfe5050..b4b426d 100644
--- a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
+++ b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
@@ -61,13 +61,28 @@ def create_dataset(n_samples: int = 5000, random_state: int = 42) -> pd.DataFram
 
 
 def align_and_fill(X_train: pd.DataFrame, X_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
-    """Align train/test columns and fill missing values safely."""
+    """Align train/test columns and replace NaN/±inf with 0 for downstream models.
+
+    Several feature-engineering tools can emit ``±inf`` (e.g. Featuretools'
+    ``divide_numeric`` primitive on rows where the denominator is 0). Scaling
+    or fitting any sklearn estimator on those values raises, so we sanitize
+    them here as part of column alignment.
+    """
     X_train_aligned, X_test_aligned = X_train.align(X_test, join="left", axis=1, fill_value=0)
-    return X_train_aligned.fillna(0), X_test_aligned.fillna(0)
+    X_train_clean = X_train_aligned.replace([np.inf, -np.inf], 0).fillna(0)
+    X_test_clean = X_test_aligned.replace([np.inf, -np.inf], 0).fillna(0)
+    return X_train_clean, X_test_clean
 
 
 def evaluate_auc(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> float:
-    """Train a simple classifier and return ROC-AUC."""
+    """Train a simple classifier and return ROC-AUC.
+
+    Owns the categorical encoding + column alignment so individual benchmark
+    cases don't have to repeat that step. The encoding is idempotent for
+    already-numeric inputs (``get_dummies`` is a no-op when there are no
+    object/categorical columns), so per-tool runners that produce numeric
+    matrices can pass them in directly without first one-hot encoding again.
+    """
     X_train = pd.get_dummies(X_train, drop_first=False)
     X_test = pd.get_dummies(X_test, drop_first=False)
     X_train, X_test = align_and_fill(X_train, X_test)
@@ -98,12 +113,15 @@ def evaluate_auc(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series
 
 
 def run_baseline(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]:
-    """Run one-hot baseline."""
-    X_train_base = pd.get_dummies(X_train, drop_first=False)
-    X_test_base = pd.get_dummies(X_test, drop_first=False)
-    X_train_base, X_test_base = align_and_fill(X_train_base, X_test_base)
-    auc = evaluate_auc(X_train_base, X_test_base, y_train, y_test)
-    return {"tool": "baseline", "auc": auc, "n_features": X_train_base.shape[1]}
+    """Run one-hot baseline.
+
+    Encoding and alignment are owned by :func:`evaluate_auc`, so we pass the
+    raw frames straight through. ``n_features`` is reported as the post-encoding
+    column count to match what the model actually trains on.
+    """
+    auc = evaluate_auc(X_train, X_test, y_train, y_test)
+    n_features = pd.get_dummies(X_train, drop_first=False).shape[1]
+    return {"tool": "baseline", "auc": auc, "n_features": n_features}
 
 
 def run_featcopilot_case(
diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index 8c9b8ee..3b931cf 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -479,11 +479,39 @@ def set_params(self, **params):
         """
         Set parameters for sklearn compatibility.
 
-        Mirrors the defaulting performed in ``__init__`` so callers (e.g. sklearn
-        cloning, ``GridSearchCV`` parameter grids) can pass ``None`` for
+        Validates parameter keys against the estimator's known parameters
+        (raising :class:`ValueError` on unknown keys, matching scikit-learn
+        ``BaseEstimator.set_params`` behavior) and then mirrors the defaulting
+        performed in ``__init__`` so callers (e.g. sklearn cloning,
+        ``GridSearchCV`` parameter grids) can pass ``None`` for
         collection-valued parameters and have it normalized back to the default
         rather than raising during validation.
+
+        Parameters
+        ----------
+        **params
+            Estimator parameters to update. Each key must already be a
+            top-level parameter accepted by ``__init__``.
+
+        Returns
+        -------
+        AutoFeatureEngineer
+            ``self``, to support fluent chaining.
+
+        Raises
+        ------
+        ValueError
+            If ``params`` contains a key that is not a known estimator
+            parameter, or if any provided value fails configuration
+            validation (see :meth:`_validate_configuration`).
         """
+        valid_params = self.get_params(deep=True)
+        invalid_keys = sorted(set(params) - set(valid_params))
+        if invalid_keys:
+            raise ValueError(
+                f"Invalid parameter(s) {invalid_keys} for estimator {type(self).__name__}. "
+                f"Valid parameters are: {sorted(valid_params)}."
+            )
         for key, value in params.items():
             setattr(self, key, value)
         if self.engines is None:
diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py
index c7af4b8..9e0a3a0 100644
--- a/tests/test_sklearn_compat.py
+++ b/tests/test_sklearn_compat.py
@@ -284,6 +284,22 @@ def test_set_params_invalid_engine_still_raises(self):
         with pytest.raises(ValueError, match="Unknown engines"):
             afe.set_params(engines=["not_a_real_engine"])
 
+    def test_set_params_unknown_key_raises(self):
+        """set_params should reject unknown parameter names (sklearn convention)."""
+        afe = AutoFeatureEngineer()
+        with pytest.raises(ValueError, match="Invalid parameter"):
+            afe.set_params(not_a_real_param=42)
+
+    def test_set_params_unknown_key_does_not_mutate_state(self):
+        """A failing set_params call must leave the estimator unchanged."""
+        afe = AutoFeatureEngineer(engines=["tabular"], max_features=5)
+        with pytest.raises(ValueError):
+            afe.set_params(typo_param=99)
+
+        assert afe.engines == ["tabular"]
+        assert afe.max_features == 5
+        assert not hasattr(afe, "typo_param")
+
     def test_sklearn_clone_round_trip(self):
         """A cloned estimator must be configurable identically to the original."""
         from sklearn.base import clone

From 47ff1844911a546f4607a9d76980b2b498205ae9 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Thu, 30 Apr 2026 22:22:43 +0800
Subject: [PATCH 18/22] fix(sklearn_compat): reset fit state and make
 set_params atomic

Address two PR #2 review findings on AutoFeatureEngineer:

1. ``fit`` previously appended into ``self._engine_instances`` and never
   cleared the prior dict or reset ``self._selector``. After changing
   ``engines`` via ``set_params`` (or after a previous ``fit_transform``
   that built a selector), ``transform`` could run stale engines or apply
   a stale selection. ``fit`` now calls a new ``_reset_fit_state`` helper
   that mirrors the fit-derived attribute initialization in ``__init__``
   (``_engine_instances``, ``_selector``, ``_feature_set``,
   ``_is_fitted``, ``_column_descriptions``, ``_task_description``)
   *before* doing any work, so a refit -- and even a fit that raises
   midway -- leaves the estimator in a clean, unfitted state instead of
   a partially-fitted one.

2. ``set_params`` mutated attributes via ``setattr`` and only then
   called ``_validate_configuration``. A failing call (e.g. an invalid
   ``engines``/``leakage_guard``/``max_features`` combination) left the
   estimator in a partially mutated, invalid state. The implementation
   now snapshots the pre-call values for every parameter in the request
   *before* applying any mutation (including the eventual ``None`` ->
   default normalization). On any exception, the snapshot is restored
   and the original error re-raised, so a failing ``set_params`` is a
   no-op from the caller's perspective.

Tests:
- test_set_params_invalid_value_rolls_back_state: combined invalid
  payload, every parameter restored.
- test_set_params_invalid_value_after_none_normalization_rolls_back:
  rollback restores the pre-call value rather than the
  None-normalized default.
- test_fit_resets_engine_instances_when_engines_change: removing an
  engine via set_params and refitting drops the stale fitted engine.
- test_fit_resets_selector_after_prior_fit_transform: a plain fit()
  following fit_transform() clears the selector so transform() does
  not re-apply the prior selection.
- test_fit_resets_state_when_called_after_failed_fit: a fit that
  raises mid-flight leaves _is_fitted=False, _engine_instances={}
  and transform() correctly errors out.

Full suite: 657 passed, 2 skipped (was 652, +5 new tests).
Pre-commit clean (black, ruff, trailing-whitespace, EOF, etc.).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/transformers/sklearn_compat.py | 63 ++++++++++++---
 tests/test_sklearn_compat.py               | 94 ++++++++++++++++++++++
 2 files changed, 147 insertions(+), 10 deletions(-)

diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index 3b931cf..59c86df 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -179,6 +179,21 @@ def _validate_configuration(self) -> None:
         if self.max_features is not None and self.max_features <= 0:
             raise ValueError("max_features must be positive when provided")
 
+    def _reset_fit_state(self) -> None:
+        """Reset all attributes populated during ``fit``/``fit_transform``.
+
+        Called at the start of ``fit`` so that re-fitting (e.g. after changing
+        ``engines`` via ``set_params``) cannot leave stale fitted engines, a
+        stale selector, or fit-time metadata in place. Mirrors the fit-derived
+        attribute initialization in ``__init__``.
+        """
+        self._engine_instances = {}
+        self._selector = None
+        self._feature_set = FeatureSet()
+        self._is_fitted = False
+        self._column_descriptions = {}
+        self._task_description = ""
+
     def fit(
         self,
         X: Union[pd.DataFrame, np.ndarray],
@@ -210,6 +225,14 @@ def fit(
         -------
         self : AutoFeatureEngineer
         """
+        # Reset all fit-derived state so a refit (e.g. after changing ``engines``
+        # or after a previous ``fit_transform`` that built a selector) cannot leak
+        # stale engines, a stale selector, or the previous ``_is_fitted`` flag
+        # into a subsequent ``transform`` call. Any early exit below (validation
+        # error, leakage_guard='raise', engine fit failure) leaves the estimator
+        # in a clean, unfitted state rather than a partially-fitted one.
+        self._reset_fit_state()
+
         # Convert to DataFrame if needed
         if isinstance(X, np.ndarray):
             X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
@@ -487,6 +510,11 @@ def set_params(self, **params):
         collection-valued parameters and have it normalized back to the default
         rather than raising during validation.
 
+        The update is atomic: if any provided value fails configuration
+        validation, all in-flight mutations are rolled back so the estimator
+        is left in its pre-call state rather than a partially-mutated invalid
+        one.
+
         Parameters
         ----------
         **params
@@ -503,7 +531,9 @@ def set_params(self, **params):
         ValueError
             If ``params`` contains a key that is not a known estimator
             parameter, or if any provided value fails configuration
-            validation (see :meth:`_validate_configuration`).
+            validation (see :meth:`_validate_configuration`). On validation
+            failure the estimator's parameters are restored to the values
+            they held before the call.
         """
         valid_params = self.get_params(deep=True)
         invalid_keys = sorted(set(params) - set(valid_params))
@@ -512,13 +542,26 @@ def set_params(self, **params):
                 f"Invalid parameter(s) {invalid_keys} for estimator {type(self).__name__}. "
                 f"Valid parameters are: {sorted(valid_params)}."
             )
-        for key, value in params.items():
-            setattr(self, key, value)
-        if self.engines is None:
-            self.engines = ["tabular"]
-        if self.selection_methods is None:
-            self.selection_methods = ["mutual_info", "importance"]
-        if self.llm_config is None:
-            self.llm_config = {}
-        self._validate_configuration()
+
+        # Snapshot the current values for every parameter we are about to
+        # change (including any whose final value will come from None
+        # normalization below) so that a validation failure can roll back to
+        # a fully consistent pre-call state.
+        snapshot = {key: getattr(self, key) for key in params}
+
+        try:
+            for key, value in params.items():
+                setattr(self, key, value)
+            if self.engines is None:
+                self.engines = ["tabular"]
+            if self.selection_methods is None:
+                self.selection_methods = ["mutual_info", "importance"]
+            if self.llm_config is None:
+                self.llm_config = {}
+            self._validate_configuration()
+        except Exception:
+            for key, value in snapshot.items():
+                setattr(self, key, value)
+            raise
+
         return self
diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py
index 9e0a3a0..c7b9236 100644
--- a/tests/test_sklearn_compat.py
+++ b/tests/test_sklearn_compat.py
@@ -311,6 +311,100 @@ def test_sklearn_clone_round_trip(self):
         assert cloned.max_features == 7
         assert cloned.selection_methods == ["mutual_info", "importance"]
 
+    def test_set_params_invalid_value_rolls_back_state(self, sample_df):
+        """A failing set_params call must leave every parameter at its pre-call value."""
+        afe = AutoFeatureEngineer(
+            engines=["tabular"],
+            max_features=5,
+            selection_methods=["mutual_info"],
+            correlation_threshold=0.9,
+            llm_config={"model": "gpt-5.2"},
+            verbose=False,
+            leakage_guard="warn",
+        )
+
+        with pytest.raises(ValueError):
+            afe.set_params(
+                max_features=10,
+                engines=["bogus_engine"],
+                leakage_guard="not_a_mode",
+            )
+
+        # Every parameter that was part of the failing call must be restored.
+        assert afe.engines == ["tabular"]
+        assert afe.max_features == 5
+        assert afe.leakage_guard == "warn"
+        # Untouched parameters are obviously unchanged but assert anyway to
+        # guard against unrelated mutations.
+        assert afe.selection_methods == ["mutual_info"]
+        assert afe.correlation_threshold == 0.9
+        assert afe.llm_config == {"model": "gpt-5.2"}
+        assert afe.verbose is False
+
+    def test_set_params_invalid_value_after_none_normalization_rolls_back(self):
+        """Rollback must capture the pre-call value, not the None-normalized one."""
+        afe = AutoFeatureEngineer(engines=["tabular", "timeseries"])
+
+        with pytest.raises(ValueError):
+            afe.set_params(engines=None, max_features=-1)
+
+        # engines was None-normalized to ["tabular"] mid-call; rollback must
+        # restore the original ["tabular", "timeseries"], not ["tabular"].
+        assert afe.engines == ["tabular", "timeseries"]
+        assert afe.max_features is None
+
+    def test_fit_resets_engine_instances_when_engines_change(self, sample_df, sample_target):
+        """Refitting after removing an engine must drop the previously fitted engine."""
+        afe = AutoFeatureEngineer(engines=["tabular", "timeseries"], verbose=False)
+        afe.fit(sample_df, sample_target)
+        assert set(afe._engine_instances) == {"tabular", "timeseries"}
+
+        afe.set_params(engines=["tabular"])
+        afe.fit(sample_df, sample_target)
+
+        # The previously fitted "timeseries" engine must not survive into the
+        # new fit, otherwise transform() would invoke a stale engine.
+        assert set(afe._engine_instances) == {"tabular"}
+
+    def test_fit_resets_selector_after_prior_fit_transform(self, sample_df, sample_target):
+        """A plain fit() following fit_transform() must clear the selector."""
+        afe = AutoFeatureEngineer(engines=["tabular"], max_features=3, verbose=False)
+        afe.fit_transform(sample_df, sample_target)
+        assert afe._selector is not None
+
+        afe.fit(sample_df, sample_target)
+
+        # Without a selector reset, transform() would still apply the stale
+        # selection from the previous fit_transform call.
+        assert afe._selector is None
+        result = afe.transform(sample_df)
+        # Every input column must survive transform when no selector is active.
+        for col in sample_df.columns:
+            assert col in result.columns
+
+    def test_fit_resets_state_when_called_after_failed_fit(self, sample_df, sample_target, monkeypatch):
+        """If fit raises mid-flight, _is_fitted must be False so transform errors out."""
+        afe = AutoFeatureEngineer(engines=["tabular"], verbose=False)
+        afe.fit(sample_df, sample_target)
+        assert afe._is_fitted is True
+
+        # Force the next fit to fail partway through engine fitting.
+        from featcopilot.engines.tabular import TabularEngine
+
+        def _boom(self, X, y=None, **kwargs):
+            raise RuntimeError("simulated engine failure")
+
+        monkeypatch.setattr(TabularEngine, "fit", _boom)
+        with pytest.raises(RuntimeError, match="simulated engine failure"):
+            afe.fit(sample_df, sample_target)
+
+        # The failed fit must not leave the estimator in a "fitted" state
+        # that points at stale engines from the previous successful fit.
+        assert afe._is_fitted is False
+        assert afe._engine_instances == {}
+        with pytest.raises(RuntimeError, match="Must call fit"):
+            afe.transform(sample_df)
+
 
 class TestPackageImport:
     """Tests for top-level package import behavior."""

From 681841ff6a8add20b54dadeaf166dcfa6fda33a3 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Thu, 30 Apr 2026 22:33:58 +0800
Subject: [PATCH 19/22] fix: clearer ValueErrors for non-string engine names
 and out-of-range test_size

Address two PR #2 review findings:

1. ``AutoFeatureEngineer._validate_configuration`` previously used
   ``sorted(set(self.engines) - SUPPORTED_ENGINES)`` (and the analogous
   call for ``selection_methods``) to build its error messages. If the
   caller passed a mix of non-string values (e.g.
   ``engines=[None, "spaceship"]``), ``sorted`` raised a confusing
   ``TypeError`` from the comparison rather than the intended
   ``ValueError``. The validator now rejects non-string entries up front
   with a clear ``ValueError`` that names the offending values and
   lists the supported names. ``set_params`` inherits the new check
   (and rolls back on it) because validation runs through the same code
   path.

2. ``benchmarks.splits.split_benchmark_data`` did not validate
   ``test_size`` for the chronological branch. Values like
   ``test_size <= 0`` or ``>= 1`` silently produced empty / overlapping
   splits, whereas the random branch (via ``train_test_split``) would
   have raised. The function now asserts ``0 < test_size < 1`` up front
   and additionally raises when the chronological ``split_idx`` would
   leave either side of the split empty (e.g. tiny datasets combined
   with an extreme ``test_size``), matching the fail-fast behavior of
   ``sklearn.model_selection.train_test_split``.

Tests added in ``tests/test_sklearn_compat.py``:
- ``test_validate_engines_rejects_non_string_entries``
- ``test_validate_selection_methods_rejects_non_string_entries``
- ``test_set_params_rejects_non_string_engine_entries_and_rolls_back``

Tests added in ``tests/test_benchmark_splits.py``:
- ``test_split_benchmark_data_rejects_out_of_range_test_size``
  (parametrized over both random and chronological branches with 0.0,
  1.0, -0.1, 1.5, 2)
- ``test_split_benchmark_data_chronological_rejects_empty_train_split``
- ``test_split_benchmark_data_chronological_single_row_dataset_raises``

Full suite: 667 passed, 2 skipped (was 657, +10 new).
Pre-commit clean (black, ruff, trailing-whitespace, EOF, etc.).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 benchmarks/splits.py                       | 20 +++++++++++++
 featcopilot/transformers/sklearn_compat.py | 18 ++++++++++++
 tests/test_benchmark_splits.py             | 34 ++++++++++++++++++++++
 tests/test_sklearn_compat.py               | 24 +++++++++++++++
 4 files changed, 96 insertions(+)

diff --git a/benchmarks/splits.py b/benchmarks/splits.py
index b585887..4b97c34 100644
--- a/benchmarks/splits.py
+++ b/benchmarks/splits.py
@@ -49,11 +49,31 @@ def split_benchmark_data(
         Target values for the training rows.
     y_test : pandas.Series
         Target values for the test rows.
+
+    Raises
+    ------
+    ValueError
+        If ``test_size`` is not strictly between 0 and 1, or if the resulting
+        chronological split would leave either side empty (for example, a
+        very small dataset combined with an extreme ``test_size``).
     """
+    # Validate ``test_size`` up front so the chronological branch matches the
+    # behavior of ``sklearn.model_selection.train_test_split`` (which rejects
+    # ``test_size <= 0`` / ``>= 1``) instead of silently producing an empty
+    # or overlapping split.
+    if not (0 < test_size < 1):
+        raise ValueError(f"test_size must be a float strictly between 0 and 1; got {test_size!r}")
+
     indices = np.arange(len(X))
 
     if "forecast" in task or "timeseries" in task:
         split_idx = int(len(indices) * (1 - test_size))
+        if split_idx <= 0 or split_idx >= len(indices):
+            raise ValueError(
+                "Chronological split would leave one side empty: "
+                f"len(X)={len(indices)}, test_size={test_size} -> split_idx={split_idx}. "
+                "Provide more rows or pick a different ``test_size``."
+            )
         train_idx = indices[:split_idx]
         test_idx = indices[split_idx:]
         y_train = y.iloc[train_idx]
diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index 59c86df..ebcc728 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -160,6 +160,24 @@ def __init__(
 
     def _validate_configuration(self) -> None:
         """Validate user-facing configuration early."""
+        # Reject non-string entries up front so that the diff against the
+        # supported-name sets (and the ``sorted(...)`` used to build the error
+        # message) cannot raise an unrelated ``TypeError`` for mixed-type
+        # inputs (e.g. ``engines=[None, "spaceship"]``).
+        non_string_engines = [e for e in self.engines if not isinstance(e, str)]
+        if non_string_engines:
+            raise ValueError(
+                "engines must contain only strings; got non-string entries: "
+                f"{non_string_engines!r}. Supported engines: {sorted(self.SUPPORTED_ENGINES)}"
+            )
+
+        non_string_methods = [m for m in self.selection_methods if not isinstance(m, str)]
+        if non_string_methods:
+            raise ValueError(
+                "selection_methods must contain only strings; got non-string entries: "
+                f"{non_string_methods!r}. Supported methods: {sorted(self.SUPPORTED_SELECTION_METHODS)}"
+            )
+
         unknown_engines = sorted(set(self.engines) - self.SUPPORTED_ENGINES)
         if unknown_engines:
             raise ValueError(f"Unknown engines: {unknown_engines}. Supported engines: {sorted(self.SUPPORTED_ENGINES)}")
diff --git a/tests/test_benchmark_splits.py b/tests/test_benchmark_splits.py
index f4fb26c..570ea39 100644
--- a/tests/test_benchmark_splits.py
+++ b/tests/test_benchmark_splits.py
@@ -91,6 +91,40 @@ def test_custom_test_size_respected():
     assert len(train_idx) == 60
 
 
+@pytest.mark.parametrize("bad_test_size", [0.0, 1.0, -0.1, 1.5, 2])
+def test_split_benchmark_data_rejects_out_of_range_test_size(bad_test_size):
+    """``test_size`` must be strictly between 0 and 1 for both branches."""
+    X = pd.DataFrame({"f": np.arange(100.0)})
+    y = pd.Series(np.arange(100, dtype=float))
+
+    # Random branch.
+    with pytest.raises(ValueError, match="test_size must be a float strictly between 0 and 1"):
+        split_benchmark_data(X, y, "regression", random_state=0, test_size=bad_test_size)
+
+    # Chronological branch -- previously silently produced empty/overlapping splits.
+    with pytest.raises(ValueError, match="test_size must be a float strictly between 0 and 1"):
+        split_benchmark_data(X, y, "forecasting", random_state=0, test_size=bad_test_size)
+
+
+def test_split_benchmark_data_chronological_rejects_empty_train_split():
+    """Tiny datasets with extreme ``test_size`` must raise instead of producing an empty train set."""
+    X = pd.DataFrame({"t": np.arange(2)})
+    y = pd.Series(np.arange(2, dtype=float))
+
+    # len=2, test_size=0.9 -> split_idx = int(2 * 0.1) = 0 -> empty train.
+    with pytest.raises(ValueError, match="Chronological split would leave one side empty"):
+        split_benchmark_data(X, y, "forecasting", random_state=0, test_size=0.9)
+
+
+def test_split_benchmark_data_chronological_single_row_dataset_raises():
+    """A single-row dataset cannot be chronologically split for any valid ``test_size``."""
+    X = pd.DataFrame({"t": [0]})
+    y = pd.Series([0.0])
+
+    with pytest.raises(ValueError, match="Chronological split would leave one side empty"):
+        split_benchmark_data(X, y, "forecasting", random_state=0, test_size=0.5)
+
+
 # ---------------------------------------------------------------------------
 # Wiring tests: ensure benchmark scripts actually use the shared helper.
 # These guard against the regression flagged on PR #2 where the helper was
diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py
index c7b9236..bdf91a5 100644
--- a/tests/test_sklearn_compat.py
+++ b/tests/test_sklearn_compat.py
@@ -353,6 +353,30 @@ def test_set_params_invalid_value_after_none_normalization_rolls_back(self):
         assert afe.engines == ["tabular", "timeseries"]
         assert afe.max_features is None
 
+    def test_validate_engines_rejects_non_string_entries(self):
+        """Mixed-type engine lists must raise ValueError, not TypeError from sorted()."""
+        # A bare ``sorted(set(...))`` over a mix of None/str would raise
+        # ``TypeError: '<' not supported between instances of 'str' and 'NoneType'``.
+        # The validator must surface a clear ValueError instead.
+        with pytest.raises(ValueError, match="engines must contain only strings"):
+            AutoFeatureEngineer(engines=[None, "tabular"])
+        with pytest.raises(ValueError, match="engines must contain only strings"):
+            AutoFeatureEngineer(engines=["tabular", 42])
+
+    def test_validate_selection_methods_rejects_non_string_entries(self):
+        """Mixed-type selection_methods lists must raise ValueError, not TypeError from sorted()."""
+        with pytest.raises(ValueError, match="selection_methods must contain only strings"):
+            AutoFeatureEngineer(selection_methods=[None, "mutual_info"])
+        with pytest.raises(ValueError, match="selection_methods must contain only strings"):
+            AutoFeatureEngineer(selection_methods=["mutual_info", 0])
+
+    def test_set_params_rejects_non_string_engine_entries_and_rolls_back(self):
+        """set_params must surface the same ValueError and roll back state."""
+        afe = AutoFeatureEngineer(engines=["tabular"])
+        with pytest.raises(ValueError, match="engines must contain only strings"):
+            afe.set_params(engines=[None, "spaceship"])
+        assert afe.engines == ["tabular"]
+
     def test_fit_resets_engine_instances_when_engines_change(self, sample_df, sample_target):
         """Refitting after removing an engine must drop the previously fitted engine."""
         afe = AutoFeatureEngineer(engines=["tabular", "timeseries"], verbose=False)

From bf6b8f98e0c079d54514b5815351883e222f0456 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Thu, 30 Apr 2026 22:50:05 +0800
Subject: [PATCH 20/22] fix: stricter empty/falsy handling for engines and
 target_name

Address two PR #2 review findings:

1. ``find_potential_leakage_columns`` (``featcopilot/utils/validation.py``)
   - ``keywords = keywords or DEFAULT_LEAKAGE_KEYWORDS`` prevented callers
     from intentionally passing ``keywords=[]`` to disable keyword
     matching. Switched to an explicit ``is None`` check so an empty
     list is honored.
   - ``normalized_target = ... if target_name else None`` had two bugs:
     valid falsy targets (e.g. ``0``) were treated as "no target", and
     ``target_name=""`` would normalize to the empty string, which
     matches every column via the ``normalized_target in normalized_column``
     substring check. Now: explicit ``is None`` check, and an empty
     normalized result is treated as absent so that ``target_name=""``
     (or values like ``"---"`` that strip to nothing) no longer matches
     every column, while ``target_name=0`` correctly drives matching.

2. ``AutoFeatureEngineer`` (``featcopilot/transformers/sklearn_compat.py``)
   - With the recent ``is not None`` defaulting in ``__init__``, an
     explicit ``engines=[]`` was preserved. ``fit()`` would then run
     zero engines and still mark the estimator fitted, making
     ``transform()`` a silent no-op. ``_validate_configuration`` now
     rejects empty ``engines`` and empty ``selection_methods`` with a
     clear ``ValueError`` that points callers at ``None`` (for the
     default) or the supported names. ``set_params`` inherits this
     check (and the atomic rollback) automatically.

Tests added in ``tests/test_utils.py``:
- ``test_leakage_detection_empty_keywords_disables_keyword_matching``
- ``test_leakage_detection_falsy_target_name_zero_still_matches``
- ``test_leakage_detection_empty_string_target_does_not_match_everything``

Tests added in ``tests/test_sklearn_compat.py``:
- ``test_init_rejects_empty_engines_list``
- ``test_init_rejects_empty_selection_methods_list``
- ``test_set_params_rejects_empty_engines_and_rolls_back``
- ``test_init_engines_none_still_defaults_to_tabular``

Full suite: 674 passed, 2 skipped (was 667, +7).
Pre-commit clean (black, ruff, trailing-whitespace, EOF, etc.).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/transformers/sklearn_compat.py | 18 +++++++++++++
 featcopilot/utils/validation.py            | 24 +++++++++++++++--
 tests/test_sklearn_compat.py               | 26 ++++++++++++++++++
 tests/test_utils.py                        | 31 ++++++++++++++++++++++
 4 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index ebcc728..6107690 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -171,6 +171,17 @@ def _validate_configuration(self) -> None:
                 f"{non_string_engines!r}. Supported engines: {sorted(self.SUPPORTED_ENGINES)}"
             )
 
+        # Reject empty collections explicitly. ``engines=None`` is normalized to
+        # the default in ``__init__`` / ``set_params``; an explicit empty list
+        # would otherwise leave ``fit()`` running zero engines and ``transform()``
+        # silently returning the input (modulo NaN/inf cleanup), which is a
+        # surprising silent no-op rather than a misconfiguration.
+        if not self.engines:
+            raise ValueError(
+                "engines must contain at least one engine; got an empty sequence. "
+                f"Pass ``engines=None`` for the default ['tabular'] or pick from {sorted(self.SUPPORTED_ENGINES)}."
+            )
+
         non_string_methods = [m for m in self.selection_methods if not isinstance(m, str)]
         if non_string_methods:
             raise ValueError(
@@ -178,6 +189,13 @@ def _validate_configuration(self) -> None:
                 f"{non_string_methods!r}. Supported methods: {sorted(self.SUPPORTED_SELECTION_METHODS)}"
             )
 
+        if not self.selection_methods:
+            raise ValueError(
+                "selection_methods must contain at least one method; got an empty sequence. "
+                "Pass ``selection_methods=None`` for the default "
+                f"['mutual_info', 'importance'] or pick from {sorted(self.SUPPORTED_SELECTION_METHODS)}."
+            )
+
         unknown_engines = sorted(set(self.engines) - self.SUPPORTED_ENGINES)
         if unknown_engines:
             raise ValueError(f"Unknown engines: {unknown_engines}. Supported engines: {sorted(self.SUPPORTED_ENGINES)}")
diff --git a/featcopilot/utils/validation.py b/featcopilot/utils/validation.py
index 3c37a67..300cd79 100644
--- a/featcopilot/utils/validation.py
+++ b/featcopilot/utils/validation.py
@@ -45,10 +45,30 @@ def find_potential_leakage_columns(
     -----
     Target-name matching is intentionally fuzzy: labels are normalized and substring
     variants are flagged so derived names such as ``target_encoded`` are reviewed.
+
+    Pass ``keywords=[]`` to opt out of keyword-based matching entirely (only
+    the explicit ``target_name`` will be used). ``target_name`` is treated as
+    absent only when it is ``None`` *or* normalizes to an empty string after
+    stripping non-alphanumerics; this lets falsy-but-meaningful values such
+    as ``0`` participate in matching while preventing ``target_name=""``
+    from matching every column via the empty-substring trap.
     """
-    keywords = keywords or DEFAULT_LEAKAGE_KEYWORDS
+    # Use ``is None`` defaulting (rather than ``or``) so callers can pass an
+    # empty list to explicitly disable keyword matching.
+    if keywords is None:
+        keywords = DEFAULT_LEAKAGE_KEYWORDS
     normalized_keywords = [_normalize_column_name(keyword) for keyword in keywords]
-    normalized_target = _normalize_column_name(target_name) if target_name else None
+
+    # Use explicit ``is None`` so falsy but meaningful target labels (e.g. ``0``)
+    # still participate in matching. After normalization, an empty string is
+    # treated as "no target" so that ``target_name=""`` (or values like ``"---"``
+    # that strip to nothing) cannot match every column via the
+    # ``normalized_target in normalized_column`` substring check.
+    if target_name is None:
+        normalized_target: Optional[str] = None
+    else:
+        normalized = _normalize_column_name(target_name)
+        normalized_target = normalized if normalized else None
 
     suspicious: list[Any] = []
     for column in columns:
diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py
index bdf91a5..ef25090 100644
--- a/tests/test_sklearn_compat.py
+++ b/tests/test_sklearn_compat.py
@@ -377,6 +377,32 @@ def test_set_params_rejects_non_string_engine_entries_and_rolls_back(self):
             afe.set_params(engines=[None, "spaceship"])
         assert afe.engines == ["tabular"]
 
+    def test_init_rejects_empty_engines_list(self):
+        """An explicitly empty ``engines=[]`` must raise rather than silently no-op."""
+        # ``engines=None`` defaults to ['tabular']; an explicit empty list is a
+        # different intent and would otherwise let ``fit()`` mark the estimator
+        # fitted with zero engines so that ``transform()`` becomes a silent no-op.
+        with pytest.raises(ValueError, match="engines must contain at least one engine"):
+            AutoFeatureEngineer(engines=[])
+
+    def test_init_rejects_empty_selection_methods_list(self):
+        """An explicitly empty ``selection_methods=[]`` must raise."""
+        with pytest.raises(ValueError, match="selection_methods must contain at least one method"):
+            AutoFeatureEngineer(selection_methods=[])
+
+    def test_set_params_rejects_empty_engines_and_rolls_back(self):
+        """set_params must reject empty ``engines=[]`` and leave state untouched."""
+        afe = AutoFeatureEngineer(engines=["tabular"], max_features=5)
+        with pytest.raises(ValueError, match="engines must contain at least one engine"):
+            afe.set_params(engines=[])
+        assert afe.engines == ["tabular"]
+        assert afe.max_features == 5
+
+    def test_init_engines_none_still_defaults_to_tabular(self):
+        """``engines=None`` continues to normalize to the default ['tabular']."""
+        afe = AutoFeatureEngineer(engines=None)
+        assert afe.engines == ["tabular"]
+
     def test_fit_resets_engine_instances_when_engines_change(self, sample_df, sample_target):
         """Refitting after removing an engine must drop the previously fitted engine."""
         afe = AutoFeatureEngineer(engines=["tabular", "timeseries"], verbose=False)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 576cad6..a26e655 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -39,6 +39,37 @@ def test_leakage_detection_non_string_columns():
     assert find_potential_leakage_columns(["future-target!"], target_name="target") == ["future-target!"]
 
 
+def test_leakage_detection_empty_keywords_disables_keyword_matching():
+    """Passing ``keywords=[]`` must opt out of keyword-based matching entirely."""
+    # Without an explicit empty keywords list, "target" / "future_score" would
+    # be flagged via DEFAULT_LEAKAGE_KEYWORDS. With ``keywords=[]`` and no
+    # ``target_name``, no column should be flagged.
+    assert find_potential_leakage_columns(["target", "future_score", "x"], keywords=[]) == []
+
+    # Explicit ``target_name`` still works alongside ``keywords=[]``.
+    assert find_potential_leakage_columns(
+        ["target", "future_score", "label_x"],
+        target_name="label",
+        keywords=[],
+    ) == ["label_x"]
+
+
+def test_leakage_detection_falsy_target_name_zero_still_matches():
+    """Falsy but meaningful targets (e.g. ``0``) must still drive matching."""
+    # Previously ``if target_name`` would treat ``0`` as absent and skip target
+    # matching entirely. The integer column ``0`` should now be flagged.
+    assert find_potential_leakage_columns([0, "feature"], target_name=0) == [0]
+
+
+def test_leakage_detection_empty_string_target_does_not_match_everything():
+    """``target_name=''`` must not flag every column via the empty-substring trap."""
+    # An empty (or whitespace-only) target normalizes to "" which would otherwise
+    # be treated as a substring of every column. Treating empty normalized
+    # results as absent prevents that.
+    assert find_potential_leakage_columns(["a", "b", "c"], target_name="", keywords=[]) == []
+    assert find_potential_leakage_columns(["a", "b", "c"], target_name="---", keywords=[]) == []
+
+
 # ---------------------------------------------------------------------------
 # FeatureCache tests
 # ---------------------------------------------------------------------------

From 28345c031b9c7afa66c70c77eeb1c7c6b28dbee5 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Thu, 30 Apr 2026 23:03:43 +0800
Subject: [PATCH 21/22] fix(sklearn_compat): reject non-sequence
 engines/methods and widen target_name typing

Address two PR #2 review findings:

1. ``_validate_configuration`` previously assumed ``self.engines`` and
   ``self.selection_methods`` were iterable sequences. A bare ``str``
   (e.g. ``engines="tabular"``) would silently iterate
   character-by-character through the downstream non-string-entry /
   set-diff checks and surface as a confusing
   ``Unknown engines: ['a', 'b', 'l', 'r', 't', 'u']`` error, while a
   non-iterable value (e.g. ``engines=5``) raised an unrelated
   ``TypeError`` from ``set(self.engines)``. The validator now rejects
   anything that is not a ``list`` or ``tuple`` up front with a clear
   ``ValueError`` that names the offending type and value, and runs
   *before* every other engines/methods check so the rest of the
   validator can rely on a well-formed container. ``set_params``
   inherits this guard (and its atomic rollback) automatically.

2. ``fit`` and ``fit_transform`` annotated ``target_name`` as
   ``Optional[str]``, but ``find_potential_leakage_columns`` is
   explicitly designed to accept any column-label type DataFrames
   support (the helper normalizes labels via ``str(...)`` and is
   covered by tests with integer labels). The annotation has been
   widened to ``Optional[Any]`` on both methods, the matching
   docstring entries now read ``hashable, optional`` and call out
   that non-string labels are accepted, and the class-level
   ``Other Parameters`` block was updated for consistency.

Tests added in ``tests/test_sklearn_compat.py``:
- ``test_init_rejects_string_engines_argument``
- ``test_init_rejects_non_sequence_engines_argument`` (covers ``int``
  and ``dict``)
- ``test_init_rejects_string_selection_methods_argument``
- ``test_init_rejects_non_sequence_selection_methods_argument``
- ``test_init_accepts_tuple_engines`` (pins tuple support so future
  tightenings don't accidentally break it)
- ``test_set_params_rejects_string_engines_and_rolls_back`` (also
  verifies prior ``max_features`` is restored)
- ``test_fit_accepts_non_string_target_name`` (integer column label
  ``0`` honored as a target by the leakage guard)

Full suite: 681 passed, 2 skipped (was 674, +7).
Pre-commit clean (black, ruff, trailing-whitespace, EOF, etc.).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/transformers/sklearn_compat.py | 48 ++++++++++++++++----
 tests/test_sklearn_compat.py               | 52 ++++++++++++++++++++++
 2 files changed, 91 insertions(+), 9 deletions(-)

diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index 6107690..07b5566 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -108,10 +108,12 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
 
     Other Parameters
     ----------------
-    target_name : str, optional
+    target_name : hashable, optional
         Fit-time parameter accepted by :meth:`fit` and :meth:`fit_transform`.
-        When provided, the leakage guard cross-references column names against
-        the target so derived variants (e.g. ``target_encoded``) are flagged.
+        When provided, the leakage guard cross-references column labels
+        against the target so derived variants (e.g. ``target_encoded``) are
+        flagged. Accepts any column-label type DataFrames support
+        (typically ``str``, but also ``int`` or other hashables).
 
     Examples
     --------
@@ -160,6 +162,28 @@ def __init__(
 
     def _validate_configuration(self) -> None:
         """Validate user-facing configuration early."""
+        # Reject non-sequence containers (and ``str``/``bytes``, which are
+        # technically iterable but would be iterated character-by-character)
+        # before any iteration so that the downstream non-string-entry,
+        # empty, and set-diff checks all run on a real list/tuple. Without
+        # this guard, ``engines="tabular"`` would silently expand into
+        # individual characters and produce a confusing "Unknown engines"
+        # error, and ``engines=5`` would raise an unrelated ``TypeError``
+        # from ``set(self.engines)``.
+        if not isinstance(self.engines, (list, tuple)):
+            raise ValueError(
+                "engines must be a list or tuple of strings; got "
+                f"{type(self.engines).__name__}={self.engines!r}. "
+                f"Supported engines: {sorted(self.SUPPORTED_ENGINES)}"
+            )
+
+        if not isinstance(self.selection_methods, (list, tuple)):
+            raise ValueError(
+                "selection_methods must be a list or tuple of strings; got "
+                f"{type(self.selection_methods).__name__}={self.selection_methods!r}. "
+                f"Supported methods: {sorted(self.SUPPORTED_SELECTION_METHODS)}"
+            )
+
         # Reject non-string entries up front so that the diff against the
         # supported-name sets (and the ``sorted(...)`` used to build the error
         # message) cannot raise an unrelated ``TypeError`` for mixed-type
@@ -236,7 +260,7 @@ def fit(
         y: Optional[Union[pd.Series, np.ndarray]] = None,
         column_descriptions: Optional[dict[str, str]] = None,
         task_description: str = "prediction task",
-        target_name: Optional[str] = None,
+        target_name: Optional[Any] = None,
         **fit_params,
     ) -> "AutoFeatureEngineer":
         """
@@ -252,8 +276,11 @@ def fit(
             Human-readable descriptions of columns (for LLM)
         task_description : str
             Description of the ML task (for LLM)
-        target_name : str, optional
-            Target column name used by leakage checks to identify related feature columns
+        target_name : hashable, optional
+            Target column label used by leakage checks to identify related
+            feature columns. Accepts any column-label type DataFrames support
+            (typically ``str``, but also ``int`` or other hashables); the
+            leakage helper normalizes labels via ``str(...)`` before matching.
         **fit_params : dict
             Additional parameters
 
@@ -390,7 +417,7 @@ def fit_transform(
         y: Optional[Union[pd.Series, np.ndarray]] = None,
         column_descriptions: Optional[dict[str, str]] = None,
         task_description: str = "prediction task",
-        target_name: Optional[str] = None,
+        target_name: Optional[Any] = None,
         apply_selection: bool = True,
         **fit_params,
     ) -> pd.DataFrame:
@@ -407,8 +434,11 @@ def fit_transform(
             Human-readable column descriptions
         task_description : str
             ML task description
-        target_name : str, optional
-            Target column name used by leakage checks to identify related feature columns
+        target_name : hashable, optional
+            Target column label used by leakage checks to identify related
+            feature columns. Accepts any column-label type DataFrames support
+            (typically ``str``, but also ``int`` or other hashables); the
+            leakage helper normalizes labels via ``str(...)`` before matching.
         apply_selection : bool, default=True
             Whether to apply feature selection
         **fit_params : dict
diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py
index ef25090..6920b9a 100644
--- a/tests/test_sklearn_compat.py
+++ b/tests/test_sklearn_compat.py
@@ -403,6 +403,58 @@ def test_init_engines_none_still_defaults_to_tabular(self):
         afe = AutoFeatureEngineer(engines=None)
         assert afe.engines == ["tabular"]
 
+    def test_init_rejects_string_engines_argument(self):
+        """A bare ``str`` for ``engines`` must raise instead of iterating char-by-char."""
+        # Without the container-type guard, ``engines="tabular"`` would expand
+        # into individual characters and produce a confusing "Unknown engines"
+        # error such as ``Unknown engines: ['a', 'b', 'l', 'r', 't', 'u']``.
+        with pytest.raises(ValueError, match="engines must be a list or tuple of strings"):
+            AutoFeatureEngineer(engines="tabular")
+
+    def test_init_rejects_non_sequence_engines_argument(self):
+        """Non-sequence ``engines`` (e.g. ``int``) must raise a clear ValueError."""
+        # Without the guard, ``set(self.engines)`` would raise a bare
+        # ``TypeError: 'int' object is not iterable``.
+        with pytest.raises(ValueError, match="engines must be a list or tuple of strings"):
+            AutoFeatureEngineer(engines=5)
+        with pytest.raises(ValueError, match="engines must be a list or tuple of strings"):
+            AutoFeatureEngineer(engines={"tabular": True})
+
+    def test_init_rejects_string_selection_methods_argument(self):
+        """A bare ``str`` for ``selection_methods`` must raise."""
+        with pytest.raises(ValueError, match="selection_methods must be a list or tuple of strings"):
+            AutoFeatureEngineer(selection_methods="mutual_info")
+
+    def test_init_rejects_non_sequence_selection_methods_argument(self):
+        """Non-sequence ``selection_methods`` must raise a clear ValueError."""
+        with pytest.raises(ValueError, match="selection_methods must be a list or tuple of strings"):
+            AutoFeatureEngineer(selection_methods=42)
+
+    def test_init_accepts_tuple_engines(self):
+        """Tuples of strings are an acceptable container for ``engines``."""
+        afe = AutoFeatureEngineer(engines=("tabular",))
+        assert afe.engines == ("tabular",)
+
+    def test_set_params_rejects_string_engines_and_rolls_back(self):
+        """``set_params`` inherits the container-type check and rolls back on failure."""
+        afe = AutoFeatureEngineer(engines=["tabular"], max_features=5)
+        with pytest.raises(ValueError, match="engines must be a list or tuple of strings"):
+            afe.set_params(engines="tabular")
+        assert afe.engines == ["tabular"]
+        assert afe.max_features == 5
+
+    def test_fit_accepts_non_string_target_name(self, sample_df, sample_target):
+        """``target_name`` is typed Optional[Any]; integer column labels must work."""
+        # Build a DataFrame with an integer column name that overlaps the target.
+        df = sample_df.copy()
+        df.columns = [0, 1, 2, 3]
+        afe = AutoFeatureEngineer(engines=["tabular"], leakage_guard="raise")
+        # Integer target_name=0 must be honored (and would raise here because
+        # leakage_guard="raise" + a column named 0). This pins the type-hint
+        # contract: non-string target labels are accepted at runtime.
+        with pytest.raises(ValueError, match="leakage-prone"):
+            afe.fit(df, sample_target, target_name=0)
+
     def test_fit_resets_engine_instances_when_engines_change(self, sample_df, sample_target):
         """Refitting after removing an engine must drop the previously fitted engine."""
         afe = AutoFeatureEngineer(engines=["tabular", "timeseries"], verbose=False)

From 0e329f16886bb52f5c18b7242309e7316bd3fb4d Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Thu, 30 Apr 2026 23:12:56 +0800
Subject: [PATCH 22/22] fix(validation): skip leakage check for columns whose
 label normalizes to empty string

Address PR #2 review finding: ``find_potential_leakage_columns`` already
guarded the *target* side of the empty-substring trap (treating
``target_name=""`` / ``"---"`` as absent), but the *column* side could
still trigger it. If a column label normalized to an empty string, the
``normalized_column in normalized_target`` substring check evaluated
``True`` (because ``"" in "label"`` is ``True``), so a column literally
named e.g. ``"---"`` or ``"!!!"`` would be flagged as leakage-prone
whenever any ``target_name`` was provided.

Fixed by skipping any column whose label normalizes to an empty string
entirely (``if not normalized_column: continue`` before both the
keyword and target match blocks). An empty normalized column has no
meaningful content to compare against, so neither side of the match
should run for it.

Tests added in ``tests/test_utils.py``:
- ``test_leakage_detection_columns_normalizing_to_empty_string_are_skipped``
  covers:
  - ``["---", "!!!"]`` with ``target_name="label"`` returns ``[]``.
  - ``["---", "label_x"]`` returns only ``["label_x"]``.
  - Mixing meaningful and empty-normalizing labels still reports only
    the meaningful ones.

Also expanded the docstring ``Notes`` block to document the symmetric
column-side guard alongside the existing target-side description.

Full suite: 682 passed, 2 skipped (was 681, +1).
Pre-commit clean (black, ruff, trailing-whitespace, EOF, etc.).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/utils/validation.py | 15 ++++++++++++++-
 tests/test_utils.py             | 16 ++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/featcopilot/utils/validation.py b/featcopilot/utils/validation.py
index 300cd79..bce5ec2 100644
--- a/featcopilot/utils/validation.py
+++ b/featcopilot/utils/validation.py
@@ -51,7 +51,10 @@ def find_potential_leakage_columns(
     absent only when it is ``None`` *or* normalizes to an empty string after
     stripping non-alphanumerics; this lets falsy-but-meaningful values such
     as ``0`` participate in matching while preventing ``target_name=""``
-    from matching every column via the empty-substring trap.
+    from matching every column via the empty-substring trap. Symmetrically,
+    columns whose labels normalize to an empty string (e.g. ``"---"``,
+    ``"!!!"``) are skipped entirely so the empty ``normalized_column`` cannot
+    be reported as a substring of every ``normalized_target``.
     """
     # Use ``is None`` defaulting (rather than ``or``) so callers can pass an
     # empty list to explicitly disable keyword matching.
@@ -74,6 +77,16 @@ def find_potential_leakage_columns(
     for column in columns:
         normalized_column = _normalize_column_name(column)
 
+        # A column whose label normalizes to an empty string (e.g. ``"---"`` or
+        # ``"!!!"``) has no meaningful content to compare against. Skipping it
+        # closes the column-side counterpart of the ``"" in normalized_target``
+        # trap (the empty ``normalized_column`` would otherwise be a substring
+        # of every non-empty ``normalized_target`` and be flagged whenever a
+        # target was provided), and similarly avoids any keyword false-positive
+        # via the same empty-substring path.
+        if not normalized_column:
+            continue
+
         keyword_hit = any(keyword and keyword in normalized_column for keyword in normalized_keywords)
         target_hit = normalized_target is not None and (
             normalized_column == normalized_target
diff --git a/tests/test_utils.py b/tests/test_utils.py
index a26e655..75731d3 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -70,6 +70,22 @@ def test_leakage_detection_empty_string_target_does_not_match_everything():
     assert find_potential_leakage_columns(["a", "b", "c"], target_name="---", keywords=[]) == []
 
 
+def test_leakage_detection_columns_normalizing_to_empty_string_are_skipped():
+    """Columns whose labels normalize to an empty string must not be flagged."""
+    # Without the column-side guard, the empty ``normalized_column`` would be
+    # a substring of every non-empty ``normalized_target`` (``"" in "label"``
+    # is True), so any column literally labeled ``"---"`` / ``"!!!"`` would be
+    # flagged whenever a target was provided. The guard skips such columns.
+    assert find_potential_leakage_columns(["---", "!!!"], target_name="label") == []
+    assert find_potential_leakage_columns(["---", "label_x"], target_name="label", keywords=[]) == ["label_x"]
+    # Mixing meaningful and empty-normalizing column labels still reports only
+    # the meaningful ones.
+    assert find_potential_leakage_columns(["target", "---", "future_x"], target_name="label") == [
+        "target",
+        "future_x",
+    ]
+
+
 # ---------------------------------------------------------------------------
 # FeatureCache tests
 # ---------------------------------------------------------------------------