thinkall · thinkall · Apr 30, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/benchmarks/compare_tools/run_fe_tools_comparison.py b/benchmarks/compare_tools/run_fe_tools_comparison.py
@@ -43,7 +43,6 @@
 import pandas as pd
 from packaging.version import Version
 from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, roc_auc_score
-from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 
 from benchmarks.datasets import (
@@ -61,12 +60,14 @@
     sanitize_feature_frames,
     save_feature_cache,
 )
+from benchmarks.splits import split_benchmark_data
 from featcopilot.utils.logger import get_logger  # noqa: E402
 
 logger = get_logger(__name__)
 
 warnings.filterwarnings("ignore")
 
+
 # Default configuration
 QUICK_DATASETS = [
     # Interaction-heavy synthetic (FeatCopilot creates valuable polynomial features)
@@ -843,8 +844,7 @@ def run_single_benchmark(
         X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
 
     # Split data (keep raw and encoded in sync)
-    indices = np.arange(len(X_encoded))
-    train_idx, test_idx, y_train, y_test = train_test_split(indices, y, test_size=0.2, random_state=random_state)
+    train_idx, test_idx, y_train, y_test = split_benchmark_data(X_encoded, y, task, random_state)
     X_train_encoded = X_encoded.iloc[train_idx]
     X_test_encoded = X_encoded.iloc[test_idx]
     X_train_raw = X.iloc[train_idx]

diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py
@@ -5,6 +5,8 @@
 time series datasets, and text/semantic datasets for comprehensive benchmarking.
 """
 
+from __future__ import annotations
+
 import numpy as np
 import pandas as pd
 

diff --git a/benchmarks/splits.py b/benchmarks/splits.py
@@ -0,0 +1,102 @@
+"""Shared split utilities for FeatCopilot benchmarks.
+
+Centralizes the split policy so individual benchmark scripts share the same
+realistic defaults: chronological splits for forecasting/timeseries tasks and
+stratified random splits for classification tasks (when class counts allow).
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+def split_benchmark_data(
+    X: pd.DataFrame,
+    y: pd.Series,
+    task: str,
+    random_state: int,
+    test_size: float = 0.2,
+) -> tuple[np.ndarray, np.ndarray, pd.Series, pd.Series]:
+    """
+    Split benchmark data with task-aware defaults.
+
+    Parameters
+    ----------
+    X : pandas.DataFrame
+        Feature matrix. Only its length is used; the function returns positional
+        indices (use ``X.iloc[train_idx]`` / ``X.iloc[test_idx]`` to materialize
+        the splits).
+    y : pandas.Series
+        Target values aligned with ``X``.
+    task : str
+        Task identifier. Substrings ``"forecast"`` / ``"timeseries"`` trigger a
+        chronological split; ``"classification"`` triggers a stratified split
+        when class counts allow it. Anything else falls back to a random split.
+    random_state : int
+        Random state for reproducible random splits.
+    test_size : float, default=0.2
+        Fraction of rows held out for the test split.
+
+    Returns
+    -------
+    train_idx : numpy.ndarray
+        Positional indices for the training rows.
+    test_idx : numpy.ndarray
+        Positional indices for the test rows.
+    y_train : pandas.Series
+        Target values for the training rows.
+    y_test : pandas.Series
+        Target values for the test rows.
+
+    Raises
+    ------
+    ValueError
+        If ``test_size`` is not strictly between 0 and 1, or if the resulting
+        chronological split would leave either side empty (for example, a
+        very small dataset combined with an extreme ``test_size``).
+    """
+    # Validate ``test_size`` up front so the chronological branch matches the
+    # behavior of ``sklearn.model_selection.train_test_split`` (which rejects
+    # ``test_size <= 0`` / ``>= 1``) instead of silently producing an empty
+    # or overlapping split.
+    if not (0 < test_size < 1):
+        raise ValueError(f"test_size must be a float strictly between 0 and 1; got {test_size!r}")
+
+    indices = np.arange(len(X))
+
+    if "forecast" in task or "timeseries" in task:
+        split_idx = int(len(indices) * (1 - test_size))
+        if split_idx <= 0 or split_idx >= len(indices):
+            raise ValueError(
+                "Chronological split would leave one side empty: "
+                f"len(X)={len(indices)}, test_size={test_size} -> split_idx={split_idx}. "
+                "Provide more rows or pick a different ``test_size``."
+            )
+        train_idx = indices[:split_idx]
+        test_idx = indices[split_idx:]
+        y_train = y.iloc[train_idx]
+        y_test = y.iloc[test_idx]
+        return train_idx, test_idx, y_train, y_test
+
+    stratify = None
+    if "classification" in task:
+        try:
+            class_counts = pd.Series(y).value_counts(dropna=False)
+            if len(class_counts) > 1 and class_counts.min() >= 2:
+                stratify = y
+        except Exception:
+            stratify = None
+
+    train_idx, test_idx, y_train, y_test = train_test_split(
+        indices,
+        y,
+        test_size=test_size,
+        random_state=random_state,
+        stratify=stratify,
+    )
+    return train_idx, test_idx, y_train, y_test
+
+
+__all__ = ["split_benchmark_data"]
diff --git a/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md b/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md
@@ -0,0 +1,10 @@
+# Auto Feature Engineering Use-Case Benchmark
+
+Compares a plain baseline with FeatCopilot and common automatic feature engineering tools on an interaction-heavy tabular classification task.
+
+| Tool | Status | ROC-AUC | Feature Count |
+|------|--------|---------|---------------|
+| baseline | ok | 0.6330 | 9 |
+| featcopilot | ok | 0.6328 | 11 |
+| featuretools | ok | 0.6362 | 60 |
+| autofeat | failed: check_array() got an unexpected keyword argument 'force_all_finite' | - | - |
diff --git a/benchmarks/use_cases/README.md b/benchmarks/use_cases/README.md
@@ -0,0 +1,20 @@
+# Use-Case Benchmarks
+
+Targeted benchmarks for realistic feature-engineering scenarios.
+
+## Auto Feature Engineering
+
+This benchmark compares:
+- plain baseline
+- FeatCopilot
+- Featuretools (if installed)
+- autofeat (if installed)
+
+on an interaction-heavy tabular classification task where automatic feature engineering should matter.
+
+```bash
+python -m benchmarks.use_cases.run_auto_feature_engineering_benchmark
+```
+
+Outputs:
+- `AUTO_FEATURE_ENGINEERING_USE_CASE.md`