diff --git a/benchmarks/compare_tools/run_fe_tools_comparison.py b/benchmarks/compare_tools/run_fe_tools_comparison.py index f678fe4..dd7b10f 100644 --- a/benchmarks/compare_tools/run_fe_tools_comparison.py +++ b/benchmarks/compare_tools/run_fe_tools_comparison.py @@ -43,7 +43,6 @@ import pandas as pd from packaging.version import Version from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, roc_auc_score -from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from benchmarks.datasets import ( @@ -61,12 +60,14 @@ sanitize_feature_frames, save_feature_cache, ) +from benchmarks.splits import split_benchmark_data from featcopilot.utils.logger import get_logger # noqa: E402 logger = get_logger(__name__) warnings.filterwarnings("ignore") + # Default configuration QUICK_DATASETS = [ # Interaction-heavy synthetic (FeatCopilot creates valuable polynomial features) @@ -843,8 +844,7 @@ def run_single_benchmark( X_encoded[col] = le.fit_transform(X_encoded[col].astype(str)) # Split data (keep raw and encoded in sync) - indices = np.arange(len(X_encoded)) - train_idx, test_idx, y_train, y_test = train_test_split(indices, y, test_size=0.2, random_state=random_state) + train_idx, test_idx, y_train, y_test = split_benchmark_data(X_encoded, y, task, random_state) X_train_encoded = X_encoded.iloc[train_idx] X_test_encoded = X_encoded.iloc[test_idx] X_train_raw = X.iloc[train_idx] diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py index c611bf6..db5e238 100644 --- a/benchmarks/datasets.py +++ b/benchmarks/datasets.py @@ -5,6 +5,8 @@ time series datasets, and text/semantic datasets for comprehensive benchmarking. 
def split_benchmark_data(
    X: pd.DataFrame,
    y: pd.Series,
    task: str,
    random_state: int,
    test_size: float = 0.2,
) -> tuple[np.ndarray, np.ndarray, pd.Series, pd.Series]:
    """
    Split benchmark data with task-aware defaults.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Only its length is used; the function returns positional
        indices (use ``X.iloc[train_idx]`` / ``X.iloc[test_idx]`` to materialize
        the splits).
    y : pandas.Series
        Target values aligned with ``X``.
    task : str
        Task identifier. Substrings ``"forecast"`` / ``"timeseries"`` trigger a
        chronological split; ``"classification"`` triggers a stratified split
        when class counts allow it. Anything else falls back to a random split.
    random_state : int
        Random state for reproducible random splits.
    test_size : float, default=0.2
        Fraction of rows held out for the test split. The test side holds
        ``ceil(test_size * len(X))`` rows in every branch.

    Returns
    -------
    train_idx : numpy.ndarray
        Positional indices for the training rows.
    test_idx : numpy.ndarray
        Positional indices for the test rows.
    y_train : pandas.Series
        Target values for the training rows.
    y_test : pandas.Series
        Target values for the test rows.

    Raises
    ------
    ValueError
        If ``test_size`` is not strictly between 0 and 1, or if the resulting
        chronological split would leave either side empty (for example, a
        single-row dataset).
    """
    # Validate ``test_size`` up front so the chronological branch rejects the
    # same values ``train_test_split`` rejects instead of silently producing
    # an empty or overlapping split.
    if not (0 < test_size < 1):
        raise ValueError(f"test_size must be a float strictly between 0 and 1; got {test_size!r}")

    n_samples = len(X)
    indices = np.arange(n_samples)

    # Size the held-out side as ``ceil(test_size * n)``, the rule sklearn uses
    # for a float ``test_size``. The previous ``int(n * (1 - test_size))``
    # suffered from float error: ``1 - 0.9`` is ``0.0999...``, so 10 rows at
    # ``test_size=0.9`` truncated to an empty training side.
    n_test = int(np.ceil(test_size * n_samples))
    n_train = n_samples - n_test

    if "forecast" in task or "timeseries" in task:
        split_idx = n_train
        if split_idx <= 0 or split_idx >= n_samples:
            raise ValueError(
                "Chronological split would leave one side empty: "
                f"len(X)={len(indices)}, test_size={test_size} -> split_idx={split_idx}. "
                "Provide more rows or pick a different ``test_size``."
            )
        train_idx = indices[:split_idx]
        test_idx = indices[split_idx:]
        y_train = y.iloc[train_idx]
        y_test = y.iloc[test_idx]
        return train_idx, test_idx, y_train, y_test

    stratify = None
    if "classification" in task:
        try:
            class_counts = pd.Series(y).value_counts(dropna=False)
        except (TypeError, ValueError):
            # Best effort: targets that cannot be counted (e.g. unhashable
            # values) simply fall back to a plain random split.
            class_counts = None
        if class_counts is not None:
            n_classes = len(class_counts)
            # A stratified split needs at least two members per class AND at
            # least one slot per class on each side; otherwise sklearn raises
            # instead of splitting.
            if n_classes > 1 and class_counts.min() >= 2 and min(n_train, n_test) >= n_classes:
                stratify = y

    train_idx, test_idx, y_train, y_test = train_test_split(
        indices,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return train_idx, test_idx, y_train, y_test
+ +```bash +python -m benchmarks.use_cases.run_auto_feature_engineering_benchmark +``` + +Outputs: +- `AUTO_FEATURE_ENGINEERING_USE_CASE.md` diff --git a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py new file mode 100644 index 0000000..b4b426d --- /dev/null +++ b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py @@ -0,0 +1,257 @@ +"""Benchmark FeatCopilot against common baselines for auto feature engineering. + +Focused on the practical use case of structured/tabular data with interaction +and ratio effects where feature engineering should matter. +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_auc_score +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from benchmarks.splits import split_benchmark_data +from featcopilot import AutoFeatureEngineer + +REPORT_DIR = Path(__file__).resolve().parent + + +def create_dataset(n_samples: int = 5000, random_state: int = 42) -> pd.DataFrame: + """Create a synthetic classification dataset with explicit ratio/interaction signal.""" + rng = np.random.default_rng(random_state) + df = pd.DataFrame( + { + "age": rng.integers(18, 80, n_samples), + "income": rng.gamma(3.5, 18000, n_samples), + "tenure_months": rng.integers(1, 120, n_samples), + "monthly_charges": rng.uniform(20, 180, n_samples), + "num_products": rng.integers(1, 6, n_samples), + "support_tickets": rng.poisson(2.2, n_samples), + "plan_tier": rng.choice(["free", "pro", "team"], n_samples, p=[0.45, 0.4, 0.15]), + } + ) + + charge_ratio = df["monthly_charges"] / (df["income"] / 12 + 1) + complaint_rate = df["support_tickets"] / 
def align_and_fill(X_train: pd.DataFrame, X_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Give train/test the same columns and zero out NaN/±inf values.

    The training frame decides the column set: the test frame is aligned to
    it (``join="left"``), and columns missing from the test frame are created
    filled with 0. Afterwards every ``±inf`` and every ``NaN`` in both frames
    becomes 0, because feature-engineering tools can emit such values (for
    example a division primitive with a zero denominator) and sklearn
    estimators refuse to fit on them.
    """

    def _scrub(frame: pd.DataFrame) -> pd.DataFrame:
        # Infinities first, then whatever NaN is left.
        without_inf = frame.replace([np.inf, -np.inf], 0)
        return without_inf.fillna(0)

    aligned_pair = X_train.align(X_test, join="left", axis=1, fill_value=0)
    train_clean, test_clean = (_scrub(part) for part in aligned_pair)
    return train_clean, test_clean
The encoding is idempotent for + already-numeric inputs (``get_dummies`` is a no-op when there are no + object/categorical columns), so per-tool runners that produce numeric + matrices can pass them in directly without first one-hot encoding again. + """ + X_train = pd.get_dummies(X_train, drop_first=False) + X_test = pd.get_dummies(X_test, drop_first=False) + X_train, X_test = align_and_fill(X_train, X_test) + + numeric_features = X_train.columns.tolist() + preprocessor = ColumnTransformer( + transformers=[ + ( + "num", + Pipeline( + [ + ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)), + ("scaler", StandardScaler()), + ] + ), + numeric_features, + ) + ] + ) + model = Pipeline( + steps=[ + ("preprocessor", preprocessor), + ("classifier", LogisticRegression(max_iter=2000, C=1.0)), + ] + ) + model.fit(X_train, y_train) + return roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) + + +def run_baseline(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]: + """Run one-hot baseline. + + Encoding and alignment are owned by :func:`evaluate_auc`, so we pass the + raw frames straight through. ``n_features`` is reported as the post-encoding + column count to match what the model actually trains on. 
def run_featcopilot_case(
    X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series
) -> dict[str, Any]:
    """Run the FeatCopilot benchmark case.

    Fits an ``AutoFeatureEngineer`` on the training split only, transforms the
    test split with the fitted engineer, and scores the result with
    :func:`evaluate_auc`.

    Returns
    -------
    dict
        ``{"tool": "featcopilot", "auc": <float>, "n_features": <int>}``.
        ``n_features`` is the post-encoding column count, i.e. the width of
        the matrix the classifier is trained on, which is the same convention
        ``run_baseline`` uses, so the report rows are comparable.
    """
    engineer = AutoFeatureEngineer(
        engines=["tabular"],
        max_features=60,
        selection_methods=["mutual_info", "importance"],
        correlation_threshold=0.95,
        leakage_guard="warn",
        verbose=False,
    )
    X_train_fe = engineer.fit_transform(X_train, y_train, target_name="target", apply_selection=True)
    X_test_fe = engineer.transform(X_test)
    X_train_fe, X_test_fe = align_and_fill(X_train_fe, X_test_fe)
    auc = evaluate_auc(X_train_fe, X_test_fe, y_train, y_test)
    # ``evaluate_auc`` one-hot encodes its input, so count columns after the
    # same encoding; for an all-numeric frame this equals ``shape[1]``.
    n_features = pd.get_dummies(X_train_fe, drop_first=False).shape[1]
    return {"tool": "featcopilot", "auc": auc, "n_features": n_features}
def write_report(results: list[dict[str, Any]], output_path: Path) -> None:
    """Write the benchmark results to ``output_path`` as a markdown table.

    Parameters
    ----------
    results : list of dict
        One dict per tool. ``"tool"`` is required; ``"status"`` defaults to
        ``"ok"``, and ``"auc"`` / ``"n_features"`` render as ``-`` when absent
        (which is how failed or unavailable tools are reported).
    output_path : pathlib.Path
        Destination file; it is overwritten, encoded as UTF-8.
    """

    def _cell(value: Any) -> str:
        # Status strings embed raw exception messages. A table row must stay
        # on one line and cannot contain a bare pipe, so collapse all
        # whitespace (including newlines) and escape ``|``.
        return " ".join(str(value).split()).replace("|", "\\|")

    lines = [
        "# Auto Feature Engineering Use-Case Benchmark",
        "",
        "Compares a plain baseline with FeatCopilot and common automatic feature engineering tools on an interaction-heavy tabular classification task.",
        "",
        "| Tool | Status | ROC-AUC | Feature Count |",
        "|------|--------|---------|---------------|",
    ]
    for row in results:
        status = _cell(row.get("status", "ok"))
        auc = f"{row['auc']:.4f}" if "auc" in row else "-"
        n_features = _cell(row.get("n_features", "-"))
        lines.append(f"| {_cell(row['tool'])} | {status} | {auc} | {n_features} |")
    output_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+ +## When to use this + +Use the relational engine when: +- you have a primary table and one or more related tables +- useful signal lives in counts / sums / means / maxima over related entities +- you want a lighter-weight path than a full Featuretools setup + +## Example + +```python +import pandas as pd + +from featcopilot.engines.relational import RelationalEngine + +orders = pd.DataFrame( + { + "order_id": [1, 2, 3, 4, 5], + "customer_id": [1, 1, 2, 2, 3], + "amount": [100, 200, 150, 300, 50], + "category": ["A", "B", "A", "A", "B"], + } +) + +customers = pd.DataFrame( + { + "customer_id": [1, 2, 3], + "age": [25, 35, 45], + "income": [50000, 70000, 60000], + } +) + +engine = RelationalEngine( + aggregation_functions=["mean", "sum", "count", "max", "min"], + verbose=True, +) +engine.add_relationship("orders", "customers", "customer_id") + +features = engine.fit_transform( + orders, + related_tables={"customers": customers}, +) + +print(features.columns.tolist()) +``` + +## Typical generated features + +- `customers_age_mean` +- `customers_income_mean` +- `amount_by_category_mean` +- `amount_by_category_count` + +## Guardrails + +`RelationalEngine` now validates configured relationship keys: +- missing child key in the primary table -> raises early +- missing parent key in a related table -> raises early +- duplicate relationship definitions are ignored + +That is boring, but it is the kind of boring that saves time. 
+ +## When Featuretools is still the better choice + +Use Featuretools when you need: +- deeper DFS-style synthesis +- richer primitive libraries +- more complex multi-table workflows + +Use FeatCopilot relational mode when you want: +- a smaller API surface +- straightforward aggregation features +- sklearn-friendly workflows diff --git a/docs/examples/time-aware-tabular.md b/docs/examples/time-aware-tabular.md new file mode 100644 index 0000000..7b938e1 --- /dev/null +++ b/docs/examples/time-aware-tabular.md @@ -0,0 +1,66 @@ +# Time-Aware Tabular Prototype + +A practical prototype for **leakage-safe auto feature engineering** on time-aware tabular data. + +## Why this example matters + +Most real feature engineering failures are not caused by weak transformations. They come from: + +- random train/test splits on temporal data +- future information leaking into features +- offline features that cannot be reproduced later + +This example shows a safer baseline: + +1. sort by time +2. split by time +3. fit features on the training slice only +4. transform the holdout slice separately +5. compare against a plain model baseline + +## Script + +See: + +```text +examples/time_aware_tabular_prototype.py +``` + +## Core pattern + +```python +engineer = AutoFeatureEngineer( + engines=["tabular"], + max_features=30, + selection_methods=["mutual_info", "importance"], + correlation_threshold=0.9, + leakage_guard="warn", +) + +X_train_fe = engineer.fit_transform( + X_train, + y_train, + target_name="churned", + apply_selection=True, +) +X_test_fe = engineer.transform(X_test) +``` + +## Leakage guard + +`AutoFeatureEngineer` now supports a lightweight `leakage_guard` option: + +- `"warn"` — default, warns if suspicious columns are present +- `"raise"` — fail fast when likely leakage columns are detected +- `"off"` — disable the check + +This is intentionally conservative. It does **not** prove your pipeline is safe. 
It just catches obvious foot-guns such as columns named like: + +- `target` +- `label` +- `outcome` +- `future_*` + +## Recommendation + +For a real project, start with this workflow before trying more advanced LLM or agent-based feature generation. If the time-aware baseline is not trustworthy, more automation only makes the mistake faster. diff --git a/examples/time_aware_tabular_prototype.py b/examples/time_aware_tabular_prototype.py new file mode 100644 index 0000000..f8d8781 --- /dev/null +++ b/examples/time_aware_tabular_prototype.py @@ -0,0 +1,112 @@ +"""Time-aware tabular prototype with leakage-safe evaluation. + +This example shows a practical starting point for auto feature engineering on +behavioral / event / tabular data where time-based splitting matters. +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.metrics import roc_auc_score + +from featcopilot import AutoFeatureEngineer + + +def create_time_aware_dataset(n_samples: int = 2000) -> pd.DataFrame: + """Create a synthetic time-aware churn-like dataset.""" + rng = np.random.default_rng(42) + timestamps = pd.date_range("2024-01-01", periods=n_samples, freq="h") + + df = pd.DataFrame( + { + "event_time": timestamps, + "account_age_days": rng.integers(10, 1200, n_samples), + "sessions_7d": rng.poisson(8, n_samples), + "tickets_30d": rng.poisson(2, n_samples), + "spend_30d": rng.gamma(2.5, 40, n_samples), + "plan_tier": rng.choice(["free", "pro", "team"], n_samples, p=[0.45, 0.4, 0.15]), + } + ) + + spend_ratio = df["spend_30d"] / (df["account_age_days"] + 10) + support_pressure = df["tickets_30d"] / (df["sessions_7d"] + 1) + pro_flag = (df["plan_tier"] == "pro").astype(int) + team_flag = (df["plan_tier"] == "team").astype(int) + + churn_logit = ( + -1.2 + - 0.015 * df["sessions_7d"] + + 0.25 * df["tickets_30d"] + + 3.2 * support_pressure + + 1.7 * spend_ratio + - 0.35 * pro_flag + - 0.55 * 
def temporal_split(df: pd.DataFrame, time_col: str, valid_fraction: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split a dataset by time instead of random shuffling.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; not modified.
    time_col : str
        Column to order by. Rows with equal timestamps keep their input order.
    valid_fraction : float, default=0.2
        Fraction of the most recent rows to hold out. The holdout contains
        ``ceil(valid_fraction * len(df))`` rows.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, holdout)`` copies, taken from the time-sorted frame whose
        index was reset to ``0..n-1``.

    Raises
    ------
    ValueError
        If ``valid_fraction`` is not strictly between 0 and 1.
    """
    if not (0 < valid_fraction < 1):
        raise ValueError(f"valid_fraction must be strictly between 0 and 1; got {valid_fraction!r}")

    # A stable sort keeps tied timestamps in input order, so the split is
    # reproducible; the default quicksort gives no such guarantee.
    ordered = df.sort_values(time_col, kind="mergesort").reset_index(drop=True)

    # Size the holdout directly. ``int(n * (1 - valid_fraction))`` is prone to
    # float error (``1 - 0.9`` is ``0.0999...``), which could truncate the
    # training side to zero rows.
    n_valid = int(np.ceil(valid_fraction * len(ordered)))
    split_idx = len(ordered) - n_valid
    return ordered.iloc[:split_idx].copy(), ordered.iloc[split_idx:].copy()
HistGradientBoostingClassifier(max_depth=4, learning_rate=0.05, random_state=42) + model.fit(X_train_fe, y_train) + engineered_auc = roc_auc_score(y_test, model.predict_proba(X_test_fe)[:, 1]) + + print(f"Temporal baseline ROC-AUC: {baseline_auc:.4f}") + print(f"Engineered ROC-AUC: {engineered_auc:.4f}") + print(f"Delta: {engineered_auc - baseline_auc:+.4f}") + print(f"Selected features: {len(common_cols)}") + + +if __name__ == "__main__": + main() diff --git a/featcopilot/__init__.py b/featcopilot/__init__.py index 377aefb..7178867 100644 --- a/featcopilot/__init__.py +++ b/featcopilot/__init__.py @@ -5,9 +5,14 @@ with novel LLM-powered capabilities via GitHub Copilot SDK. """ -from importlib.metadata import version +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("featcopilot") +except PackageNotFoundError: + # Allow importing directly from a source checkout before editable/install step. + __version__ = "0+unknown" -__version__ = version("featcopilot") __author__ = "FeatCopilot Contributors" from featcopilot.core.base import BaseEngine, BaseSelector diff --git a/featcopilot/engines/relational.py b/featcopilot/engines/relational.py index 19ad435..fdc9963 100644 --- a/featcopilot/engines/relational.py +++ b/featcopilot/engines/relational.py @@ -106,14 +106,14 @@ def add_relationship( ------- self : RelationalEngine """ - self._relationships.append( - { - "child": child_table, - "parent": parent_table, - "child_key": key_column, - "parent_key": parent_key or key_column, - } - ) + relationship = { + "child": child_table, + "parent": parent_table, + "child_key": key_column, + "parent_key": parent_key or key_column, + } + if relationship not in self._relationships: + self._relationships.append(relationship) return self def fit( @@ -143,6 +143,8 @@ def fit( self._related_tables = related_tables or {} self._primary_columns = X.columns.tolist() + self._validate_relationships(X, self._related_tables) + if self.config.verbose: 
logger.info(f"RelationalEngine: {len(self._relationships)} relationships defined") @@ -175,6 +177,7 @@ def transform( X = self._validate_input(X) related_tables = related_tables or self._related_tables + self._validate_relationships(X, related_tables) result = X.copy() # Generate features from relationships @@ -198,6 +201,23 @@ def transform( return result + def _validate_relationships(self, primary_df: pd.DataFrame, related_tables: dict[str, pd.DataFrame]) -> None: + """Validate configured relationships against available tables and keys.""" + for relationship in self._relationships: + child_key = relationship["child_key"] + parent_table = relationship["parent"] + parent_key = relationship["parent_key"] + + if child_key not in primary_df.columns: + raise ValueError(f"child_key '{child_key}' not found in primary table") + + if parent_table not in related_tables: + continue + + parent_df = related_tables[parent_table] + if parent_key not in parent_df.columns: + raise ValueError(f"parent_key '{parent_key}' not found in related table '{parent_table}'") + def _aggregate_from_relationship( self, child_df: pd.DataFrame, diff --git a/featcopilot/engines/timeseries.py b/featcopilot/engines/timeseries.py index 8702b75..d361f20 100644 --- a/featcopilot/engines/timeseries.py +++ b/featcopilot/engines/timeseries.py @@ -21,6 +21,7 @@ class TimeSeriesEngineConfig(EngineConfig): """Configuration for time series feature engine.""" name: str = "TimeSeriesEngine" + series_in_rows: bool = Field(default=False, description="Treat each row as an independent time series") features: list[str] = Field( default_factory=lambda: [ "basic_stats", @@ -92,6 +93,7 @@ def __init__( window_sizes: Optional[list[int]] = None, max_features: Optional[int] = None, verbose: bool = False, + series_in_rows: bool = False, **kwargs, ): config = TimeSeriesEngineConfig( @@ -99,6 +101,7 @@ def __init__( window_sizes=window_sizes or [5, 10, 20], max_features=max_features, verbose=verbose, + 
series_in_rows=series_in_rows, **kwargs, ) super().__init__(config=config) @@ -131,11 +134,24 @@ def fit( """ X = self._validate_input(X) + if time_column is not None and time_column not in X.columns: + raise ValueError(f"time_column '{time_column}' not found in input data") + + self._time_index_column = time_column + # Identify numeric columns for time series analysis - self._time_columns = X.select_dtypes(include=[np.number]).columns.tolist() + if self.config.series_in_rows: + self._time_columns = [col for col in X.columns if col != time_column] + else: + self._time_columns = X.select_dtypes(include=[np.number]).columns.tolist() + if time_column is not None: + self._time_columns = [col for col in self._time_columns if col != time_column] if self.config.verbose: - logger.info(f"TimeSeriesEngine: Found {len(self._time_columns)} numeric columns") + logger.info( + f"TimeSeriesEngine: Found {len(self._time_columns)} columns " + f"(series_in_rows={self.config.series_in_rows})" + ) self._is_fitted = True return self @@ -158,21 +174,10 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFram raise RuntimeError("Engine must be fitted before transform") X = self._validate_input(X) - features_dict = {} - for col in self._time_columns: - series = X[col].values - - for feature_group in self.config.features: - if feature_group in self.FEATURE_EXTRACTORS: - method_name = self.FEATURE_EXTRACTORS[feature_group] - method = getattr(self, method_name) - extracted = method(series, col) - features_dict.update(extracted) - - # For DataFrames with multiple rows, extract features across the entire column - if len(X) > 1: - # Each column is treated as a single time series + if self.config.series_in_rows: + result = self._extract_per_row(X) + else: features_dict = {} for col in self._time_columns: series = X[col].values @@ -184,7 +189,7 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFram extracted = method(series, col) 
features_dict.update(extracted) - result = pd.DataFrame([features_dict]) + result = pd.DataFrame([features_dict]) self._feature_names = list(result.columns) diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py index c93f8fa..07b5566 100644 --- a/featcopilot/transformers/sklearn_compat.py +++ b/featcopilot/transformers/sklearn_compat.py @@ -3,6 +3,7 @@ Provides drop-in sklearn transformers for feature engineering pipelines. """ +import warnings from typing import Any, Optional, Union import numpy as np @@ -16,6 +17,7 @@ from featcopilot.engines.timeseries import TimeSeriesEngine from featcopilot.selection.unified import FeatureSelector from featcopilot.utils.logger import get_logger +from featcopilot.utils.validation import find_potential_leakage_columns logger = get_logger(__name__) @@ -95,10 +97,23 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin): Maximum features to generate/select selection_methods : list, default=['mutual_info', 'importance'] Feature selection methods + correlation_threshold : float, default=0.85 + Maximum pairwise correlation allowed during correlation-based selection llm_config : dict, optional Configuration for LLM engine verbose : bool, default=False Verbose output + leakage_guard : {'off', 'warn', 'raise'}, default='warn' + How to handle columns whose names suggest target, label, or future-information leakage + + Other Parameters + ---------------- + target_name : hashable, optional + Fit-time parameter accepted by :meth:`fit` and :meth:`fit_transform`. + When provided, the leakage guard cross-references column labels + against the target so derived variants (e.g. ``target_encoded``) are + flagged. Accepts any column-label type DataFrames support + (typically ``str``, but also ``int`` or other hashables). Examples -------- @@ -107,9 +122,13 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin): ... max_features=100, ... 
llm_config={'model': 'gpt-5.2', 'enable_semantic': True} ... ) - >>> X_transformed = engineer.fit_transform(X, y) + >>> X_transformed = engineer.fit_transform(X, y, target_name='label') """ + SUPPORTED_ENGINES = {"tabular", "timeseries", "relational", "text", "llm"} + SUPPORTED_SELECTION_METHODS = {"mutual_info", "importance", "f_test", "chi2", "correlation", "xgboost"} + SUPPORTED_LEAKAGE_GUARDS = {"off", "warn", "raise"} + def __init__( self, engines: Optional[list[str]] = None, @@ -118,13 +137,21 @@ def __init__( correlation_threshold: float = 0.85, llm_config: Optional[dict[str, Any]] = None, verbose: bool = False, + leakage_guard: str = "warn", ): - self.engines = engines or ["tabular"] + # Use ``is not None`` defaulting (rather than ``or``) so that explicit + # empty containers and identity-bearing arguments are preserved. This + # also keeps ``self.<param> is param`` for any non-None argument, which + # is required for sklearn's ``clone`` round-trip identity check. + self.engines = engines if engines is not None else ["tabular"] self.max_features = max_features - self.selection_methods = selection_methods or ["mutual_info", "importance"] + self.selection_methods = selection_methods if selection_methods is not None else ["mutual_info", "importance"] self.correlation_threshold = correlation_threshold - self.llm_config = llm_config or {} + self.llm_config = llm_config if llm_config is not None else {} self.verbose = verbose + self.leakage_guard = leakage_guard + + self._validate_configuration() self._engine_instances: dict[str, Any] = {} self._selector: Optional[FeatureSelector] = None @@ -133,12 +160,107 @@ def __init__( self._column_descriptions: dict[str, str] = {} self._task_description: str = "" + def _validate_configuration(self) -> None: + """Validate user-facing configuration early.""" + # Reject non-sequence containers (and ``str``/``bytes``, which are + # technically iterable but would be iterated character-by-character) + # before any iteration so that the 
downstream non-string-entry, + # empty, and set-diff checks all run on a real list/tuple. Without + # this guard, ``engines="tabular"`` would silently expand into + # individual characters and produce a confusing "Unknown engines" + # error, and ``engines=5`` would raise an unrelated ``TypeError`` + # from ``set(self.engines)``. + if not isinstance(self.engines, (list, tuple)): + raise ValueError( + "engines must be a list or tuple of strings; got " + f"{type(self.engines).__name__}={self.engines!r}. " + f"Supported engines: {sorted(self.SUPPORTED_ENGINES)}" + ) + + if not isinstance(self.selection_methods, (list, tuple)): + raise ValueError( + "selection_methods must be a list or tuple of strings; got " + f"{type(self.selection_methods).__name__}={self.selection_methods!r}. " + f"Supported methods: {sorted(self.SUPPORTED_SELECTION_METHODS)}" + ) + + # Reject non-string entries up front so that the diff against the + # supported-name sets (and the ``sorted(...)`` used to build the error + # message) cannot raise an unrelated ``TypeError`` for mixed-type + # inputs (e.g. ``engines=[None, "spaceship"]``). + non_string_engines = [e for e in self.engines if not isinstance(e, str)] + if non_string_engines: + raise ValueError( + "engines must contain only strings; got non-string entries: " + f"{non_string_engines!r}. Supported engines: {sorted(self.SUPPORTED_ENGINES)}" + ) + + # Reject empty collections explicitly. ``engines=None`` is normalized to + # the default in ``__init__`` / ``set_params``; an explicit empty list + # would otherwise leave ``fit()`` running zero engines and ``transform()`` + # silently returning the input (modulo NaN/inf cleanup), which is a + # surprising silent no-op rather than a misconfiguration. + if not self.engines: + raise ValueError( + "engines must contain at least one engine; got an empty sequence. " + f"Pass ``engines=None`` for the default ['tabular'] or pick from {sorted(self.SUPPORTED_ENGINES)}." 
+ ) + + non_string_methods = [m for m in self.selection_methods if not isinstance(m, str)] + if non_string_methods: + raise ValueError( + "selection_methods must contain only strings; got non-string entries: " + f"{non_string_methods!r}. Supported methods: {sorted(self.SUPPORTED_SELECTION_METHODS)}" + ) + + if not self.selection_methods: + raise ValueError( + "selection_methods must contain at least one method; got an empty sequence. " + "Pass ``selection_methods=None`` for the default " + f"['mutual_info', 'importance'] or pick from {sorted(self.SUPPORTED_SELECTION_METHODS)}." + ) + + unknown_engines = sorted(set(self.engines) - self.SUPPORTED_ENGINES) + if unknown_engines: + raise ValueError(f"Unknown engines: {unknown_engines}. Supported engines: {sorted(self.SUPPORTED_ENGINES)}") + + unknown_methods = sorted(set(self.selection_methods) - self.SUPPORTED_SELECTION_METHODS) + if unknown_methods: + raise ValueError( + "Unknown selection methods: " + f"{unknown_methods}. Supported methods: {sorted(self.SUPPORTED_SELECTION_METHODS)}" + ) + + if self.leakage_guard not in self.SUPPORTED_LEAKAGE_GUARDS: + raise ValueError( + f"leakage_guard must be one of {sorted(self.SUPPORTED_LEAKAGE_GUARDS)}, got {self.leakage_guard!r}" + ) + + if self.max_features is not None and self.max_features <= 0: + raise ValueError("max_features must be positive when provided") + + def _reset_fit_state(self) -> None: + """Reset all attributes populated during ``fit``/``fit_transform``. + + Called at the start of ``fit`` so that re-fitting (e.g. after changing + ``engines`` via ``set_params``) cannot leave stale fitted engines, a + stale selector, or fit-time metadata in place. Mirrors the fit-derived + attribute initialization in ``__init__``. 
+ """ + self._engine_instances = {} + self._selector = None + self._feature_set = FeatureSet() + self._is_fitted = False + self._column_descriptions = {} + self._task_description = "" + def fit( self, X: Union[pd.DataFrame, np.ndarray], y: Optional[Union[pd.Series, np.ndarray]] = None, column_descriptions: Optional[dict[str, str]] = None, task_description: str = "prediction task", + target_name: Optional[Any] = None, **fit_params, ) -> "AutoFeatureEngineer": """ @@ -154,6 +276,11 @@ def fit( Human-readable descriptions of columns (for LLM) task_description : str Description of the ML task (for LLM) + target_name : hashable, optional + Target column label used by leakage checks to identify related + feature columns. Accepts any column-label type DataFrames support + (typically ``str``, but also ``int`` or other hashables); the + leakage helper normalizes labels via ``str(...)`` before matching. **fit_params : dict Additional parameters @@ -161,6 +288,14 @@ def fit( ------- self : AutoFeatureEngineer """ + # Reset all fit-derived state so a refit (e.g. after changing ``engines`` + # or after a previous ``fit_transform`` that built a selector) cannot leak + # stale engines, a stale selector, or the previous ``_is_fitted`` flag + # into a subsequent ``transform`` call. Any early exit below (validation + # error, leakage_guard='raise', engine fit failure) leaves the estimator + # in a clean, unfitted state rather than a partially-fitted one. + self._reset_fit_state() + # Convert to DataFrame if needed if isinstance(X, np.ndarray): X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])]) @@ -168,6 +303,17 @@ def fit( self._column_descriptions = column_descriptions or {} self._task_description = task_description + suspicious_columns = find_potential_leakage_columns(X.columns.tolist(), target_name=target_name) + if suspicious_columns and self.leakage_guard != "off": + message = ( + "Potential leakage-prone columns detected: " + f"{suspicious_columns}. 
Review time/label leakage before fitting, " + "or set leakage_guard='off' to disable this check." + ) + if self.leakage_guard == "raise": + raise ValueError(message) + warnings.warn(message, UserWarning, stacklevel=2) + # Fit each engine for engine_name in self.engines: engine = self._create_engine(engine_name) @@ -271,6 +417,7 @@ def fit_transform( y: Optional[Union[pd.Series, np.ndarray]] = None, column_descriptions: Optional[dict[str, str]] = None, task_description: str = "prediction task", + target_name: Optional[Any] = None, apply_selection: bool = True, **fit_params, ) -> pd.DataFrame: @@ -287,6 +434,11 @@ def fit_transform( Human-readable column descriptions task_description : str ML task description + target_name : hashable, optional + Target column label used by leakage checks to identify related + feature columns. Accepts any column-label type DataFrames support + (typically ``str``, but also ``int`` or other hashables); the + leakage helper normalizes labels via ``str(...)`` before matching. apply_selection : bool, default=True Whether to apply feature selection **fit_params : dict @@ -297,8 +449,9 @@ def fit_transform( X_transformed : DataFrame Transformed data with generated features """ - self.fit(X, y, column_descriptions, task_description, **fit_params) - result = self.transform(X) + self.fit(X, y, column_descriptions, task_description, target_name=target_name, **fit_params) + # Reuse transform-relevant kwargs (e.g. text_columns, related_tables) during fit_transform. 
+ result = self.transform(X, **fit_params) # Track original features (input columns) vs derived features if isinstance(X, np.ndarray): @@ -408,10 +561,73 @@ def get_params(self, deep=True): "correlation_threshold": self.correlation_threshold, "llm_config": self.llm_config, "verbose": self.verbose, + "leakage_guard": self.leakage_guard, } def set_params(self, **params): - """Set parameters for sklearn compatibility.""" - for key, value in params.items(): - setattr(self, key, value) + """ + Set parameters for sklearn compatibility. + + Validates parameter keys against the estimator's known parameters + (raising :class:`ValueError` on unknown keys, matching scikit-learn + ``BaseEstimator.set_params`` behavior) and then mirrors the defaulting + performed in ``__init__`` so callers (e.g. sklearn cloning, + ``GridSearchCV`` parameter grids) can pass ``None`` for + collection-valued parameters and have it normalized back to the default + rather than raising during validation. + + The update is atomic: if any provided value fails configuration + validation, all in-flight mutations are rolled back so the estimator + is left in its pre-call state rather than a partially-mutated invalid + one. + + Parameters + ---------- + **params + Estimator parameters to update. Each key must already be a + top-level parameter accepted by ``__init__``. + + Returns + ------- + AutoFeatureEngineer + ``self``, to support fluent chaining. + + Raises + ------ + ValueError + If ``params`` contains a key that is not a known estimator + parameter, or if any provided value fails configuration + validation (see :meth:`_validate_configuration`). On validation + failure the estimator's parameters are restored to the values + they held before the call. + """ + valid_params = self.get_params(deep=True) + invalid_keys = sorted(set(params) - set(valid_params)) + if invalid_keys: + raise ValueError( + f"Invalid parameter(s) {invalid_keys} for estimator {type(self).__name__}. 
" + f"Valid parameters are: {sorted(valid_params)}." + ) + + # Snapshot the current values for every parameter we are about to + # change (including any whose final value will come from None + # normalization below) so that a validation failure can roll back to + # a fully consistent pre-call state. + snapshot = {key: getattr(self, key) for key in params} + + try: + for key, value in params.items(): + setattr(self, key, value) + if self.engines is None: + self.engines = ["tabular"] + if self.selection_methods is None: + self.selection_methods = ["mutual_info", "importance"] + if self.llm_config is None: + self.llm_config = {} + self._validate_configuration() + except Exception: + for key, value in snapshot.items(): + setattr(self, key, value) + raise + return self diff --git a/featcopilot/utils/__init__.py b/featcopilot/utils/__init__.py index ff6a969..8920ef4 100644 --- a/featcopilot/utils/__init__.py +++ b/featcopilot/utils/__init__.py @@ -10,6 +10,7 @@ list_models, ) from featcopilot.utils.parallel import parallel_apply +from featcopilot.utils.validation import find_potential_leakage_columns __all__ = [ "parallel_apply", @@ -20,4 +21,5 @@ "get_default_model", "get_model_names", "is_valid_model", + "find_potential_leakage_columns", ] diff --git a/featcopilot/utils/validation.py b/featcopilot/utils/validation.py new file mode 100644 index 0000000..bce5ec2 --- /dev/null +++ b/featcopilot/utils/validation.py @@ -0,0 +1,100 @@ +"""Validation helpers for safer feature engineering workflows.""" + +import re +from typing import Any, Optional + +DEFAULT_LEAKAGE_KEYWORDS = [ + "target", + "label", + "outcome", + "ground_truth", + "y_true", + "future", + "leak", +] + + +def _normalize_column_name(name: Any) -> str: + """Normalize a column name for fuzzy matching.""" + return re.sub(r"[^a-z0-9]+", "", str(name).lower()) + + +def find_potential_leakage_columns( + columns: list[Any], + target_name: Optional[Any] = None, + keywords: Optional[list[str]] = None, +) -> list[Any]: 
+ """ + Find suspicious columns that may leak label or future information. + + Parameters + ---------- + columns : list + Input column names or labels to inspect. + target_name : optional + Expected target/label column name or label. Related variants will be flagged. + keywords : list[str], optional + Additional suspicious keywords to match against normalized column names. + + Returns + ------- + list + Column names or labels that deserve manual review for leakage. + + Notes + ----- + Target-name matching is intentionally fuzzy: labels are normalized and substring + variants are flagged so derived names such as ``target_encoded`` are reviewed. + + Pass ``keywords=[]`` to opt out of keyword-based matching entirely (only + the explicit ``target_name`` will be used). ``target_name`` is treated as + absent only when it is ``None`` *or* normalizes to an empty string after + stripping non-alphanumerics; this lets falsy-but-meaningful values such + as ``0`` participate in matching while preventing ``target_name=""`` + from matching every column via the empty-substring trap. Symmetrically, + columns whose labels normalize to an empty string (e.g. ``"---"``, + ``"!!!"``) are skipped entirely so the empty ``normalized_column`` cannot + be reported as a substring of every ``normalized_target``. + """ + # Use ``is None`` defaulting (rather than ``or``) so callers can pass an + # empty list to explicitly disable keyword matching. + if keywords is None: + keywords = DEFAULT_LEAKAGE_KEYWORDS + normalized_keywords = [_normalize_column_name(keyword) for keyword in keywords] + + # Use explicit ``is None`` so falsy but meaningful target labels (e.g. ``0``) + # still participate in matching. After normalization, an empty string is + # treated as "no target" so that ``target_name=""`` (or values like ``"---"`` + # that strip to nothing) cannot match every column via the + # ``normalized_target in normalized_column`` substring check. 
+ if target_name is None: + normalized_target: Optional[str] = None + else: + normalized = _normalize_column_name(target_name) + normalized_target = normalized if normalized else None + + suspicious: list[Any] = [] + for column in columns: + normalized_column = _normalize_column_name(column) + + # A column whose label normalizes to an empty string (e.g. ``"---"`` or + # ``"!!!"``) has no meaningful content to compare against. Skipping it + # closes the column-side counterpart of the ``"" in normalized_target`` + # trap (the empty ``normalized_column`` would otherwise be a substring + # of every non-empty ``normalized_target`` and be flagged whenever a + # target was provided), and similarly avoids any keyword false-positive + # via the same empty-substring path. + if not normalized_column: + continue + + keyword_hit = any(keyword and keyword in normalized_column for keyword in normalized_keywords) + target_hit = normalized_target is not None and ( + normalized_column == normalized_target + or normalized_target in normalized_column + or normalized_column in normalized_target + ) + + if keyword_hit or target_hit: + suspicious.append(column) + + return suspicious diff --git a/mkdocs.yml b/mkdocs.yml index a091b3f..8136adf 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -85,6 +85,8 @@ nav: - Examples: - End-to-End Demo: examples/e2e-demo.md - Basic Usage: examples/basic.md + - Time-Aware Tabular: examples/time-aware-tabular.md + - Relational Feature Engineering: examples/relational-feature-engineering.md - LLM-Powered: examples/llm-powered.md - Sklearn Pipeline: examples/sklearn-pipeline.md - Domain-Specific: examples/domain-specific.md diff --git a/tests/test_autofeat.py b/tests/test_autofeat.py index afe6eb5..600b11a 100644 --- a/tests/test_autofeat.py +++ b/tests/test_autofeat.py @@ -75,6 +75,39 @@ def test_multiple_engines(self, sample_data): assert result is not None assert len(result.columns) > 0 + def test_leakage_guard_warns_on_suspicious_columns(self, 
sample_data): + """Test leakage guard warns when suspicious columns are present.""" + X, y = sample_data + X = X.rename(columns={"balance": "future_target_signal"}) + engineer = AutoFeatureEngineer(engines=["tabular"], leakage_guard="warn") + + with pytest.warns(UserWarning, match="Potential leakage-prone columns detected"): + engineer.fit(X, y, target_name="target") + + def test_leakage_guard_raises_on_suspicious_columns(self, sample_data): + """Test leakage guard can hard-fail when suspicious columns are present.""" + X, y = sample_data + X = X.rename(columns={"balance": "churn_label_proxy"}) + engineer = AutoFeatureEngineer(engines=["tabular"], leakage_guard="raise") + + with pytest.raises(ValueError, match="Potential leakage-prone columns detected"): + engineer.fit(X, y, target_name="churn") + + def test_invalid_engine_configuration_raises(self): + """Test invalid engine names fail early.""" + with pytest.raises(ValueError, match="Unknown engines"): + AutoFeatureEngineer(engines=["tabular", "spaceship"]) + + def test_invalid_selection_method_raises(self): + """Test invalid selection methods fail early.""" + with pytest.raises(ValueError, match="Unknown selection methods"): + AutoFeatureEngineer(selection_methods=["mutual_info", "magic"]) + + def test_invalid_leakage_guard_raises(self): + """Test invalid leakage guard setting fails early.""" + with pytest.raises(ValueError, match="leakage_guard must be one of"): + AutoFeatureEngineer(leakage_guard="maybe") + class TestAutoFeatureEngineerParams: """Test parameter handling.""" diff --git a/tests/test_benchmark_splits.py b/tests/test_benchmark_splits.py new file mode 100644 index 0000000..570ea39 --- /dev/null +++ b/tests/test_benchmark_splits.py @@ -0,0 +1,176 @@ +"""Tests for the shared benchmarks split helper and its wiring.""" + +from __future__ import annotations + +import importlib +import inspect +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from benchmarks.splits 
import split_benchmark_data + +# --------------------------------------------------------------------------- +# Behavioral tests for split_benchmark_data +# --------------------------------------------------------------------------- + + +def test_classification_uses_stratified_split(): + """Classification tasks should produce a stratified split when class counts allow.""" + rng = np.random.default_rng(0) + X = pd.DataFrame({"f": rng.normal(size=200)}) + y = pd.Series(([0] * 160) + ([1] * 40)) + + train_idx, test_idx, y_train, y_test = split_benchmark_data(X, y, "classification", random_state=42) + + assert len(train_idx) + len(test_idx) == len(X) + assert set(train_idx).isdisjoint(set(test_idx)) + train_pos_ratio = (y_train == 1).mean() + test_pos_ratio = (y_test == 1).mean() + expected_ratio = (y == 1).mean() + assert abs(train_pos_ratio - expected_ratio) < 0.02 + assert abs(test_pos_ratio - expected_ratio) < 0.02 + + +def test_classification_falls_back_when_class_too_small(): + """Singleton classes should not raise; the split falls back to non-stratified.""" + X = pd.DataFrame({"f": np.arange(10.0)}) + y = pd.Series([0] * 9 + [1]) + + train_idx, test_idx, _, _ = split_benchmark_data(X, y, "classification", random_state=0) + + assert len(train_idx) + len(test_idx) == len(X) + + +def test_forecasting_uses_chronological_split(): + """Forecasting/timeseries tasks should preserve temporal order.""" + X = pd.DataFrame({"t": np.arange(100)}) + y = pd.Series(np.arange(100, dtype=float)) + + train_idx, test_idx, y_train, y_test = split_benchmark_data(X, y, "forecasting", random_state=123) + + assert list(train_idx) == list(range(80)) + assert list(test_idx) == list(range(80, 100)) + assert y_train.iloc[-1] < y_test.iloc[0] + + +def test_timeseries_keyword_also_chronological(): + """The 'timeseries' substring should also trigger chronological split.""" + X = pd.DataFrame({"t": np.arange(50)}) + y = pd.Series(np.arange(50, dtype=float)) + + train_idx, test_idx, _, _ = 
split_benchmark_data(X, y, "timeseries_regression", random_state=0) + + assert list(train_idx) == list(range(40)) + assert list(test_idx) == list(range(40, 50)) + + +def test_regression_uses_random_split_no_stratify(): + """Non-classification, non-forecasting tasks should use a plain random split.""" + rng = np.random.default_rng(1) + X = pd.DataFrame({"f": rng.normal(size=100)}) + y = pd.Series(rng.normal(size=100)) + + train_idx, test_idx, _, _ = split_benchmark_data(X, y, "regression", random_state=42) + + assert len(train_idx) == 80 + assert len(test_idx) == 20 + assert set(train_idx).isdisjoint(set(test_idx)) + + +def test_custom_test_size_respected(): + """``test_size`` should override the default 0.2.""" + X = pd.DataFrame({"f": np.arange(100.0)}) + y = pd.Series(np.arange(100, dtype=float)) + + train_idx, test_idx, _, _ = split_benchmark_data(X, y, "regression", random_state=0, test_size=0.4) + + assert len(test_idx) == 40 + assert len(train_idx) == 60 + + +@pytest.mark.parametrize("bad_test_size", [0.0, 1.0, -0.1, 1.5, 2]) +def test_split_benchmark_data_rejects_out_of_range_test_size(bad_test_size): + """``test_size`` must be strictly between 0 and 1 for both branches.""" + X = pd.DataFrame({"f": np.arange(100.0)}) + y = pd.Series(np.arange(100, dtype=float)) + + # Random branch. + with pytest.raises(ValueError, match="test_size must be a float strictly between 0 and 1"): + split_benchmark_data(X, y, "regression", random_state=0, test_size=bad_test_size) + + # Chronological branch -- previously silently produced empty/overlapping splits. 
+ with pytest.raises(ValueError, match="test_size must be a float strictly between 0 and 1"): + split_benchmark_data(X, y, "forecasting", random_state=0, test_size=bad_test_size) + + +def test_split_benchmark_data_chronological_rejects_empty_train_split(): + """Tiny datasets with extreme ``test_size`` must raise instead of producing an empty train set.""" + X = pd.DataFrame({"t": np.arange(2)}) + y = pd.Series(np.arange(2, dtype=float)) + + # len=2, test_size=0.9 -> split_idx = int(2 * 0.1) = 0 -> empty train. + with pytest.raises(ValueError, match="Chronological split would leave one side empty"): + split_benchmark_data(X, y, "forecasting", random_state=0, test_size=0.9) + + +def test_split_benchmark_data_chronological_single_row_dataset_raises(): + """A single-row dataset cannot be chronologically split for any valid ``test_size``.""" + X = pd.DataFrame({"t": [0]}) + y = pd.Series([0.0]) + + with pytest.raises(ValueError, match="Chronological split would leave one side empty"): + split_benchmark_data(X, y, "forecasting", random_state=0, test_size=0.5) + + +# --------------------------------------------------------------------------- +# Wiring tests: ensure benchmark scripts actually use the shared helper. +# These guard against the regression flagged on PR #2 where the helper was +# introduced but never wired into the call sites. 
+# --------------------------------------------------------------------------- + + +_REPO_ROOT = Path(__file__).resolve().parents[1] + + +@pytest.mark.parametrize( + "module_path", + [ + "benchmarks.compare_tools.run_fe_tools_comparison", + "benchmarks.use_cases.run_auto_feature_engineering_benchmark", + ], +) +def test_benchmark_scripts_import_split_helper(module_path): + """Both in-scope benchmark scripts must import ``split_benchmark_data``.""" + module = importlib.import_module(module_path) + assert ( + getattr(module, "split_benchmark_data", None) is split_benchmark_data + ), f"{module_path} should import split_benchmark_data from benchmarks.splits" + + +@pytest.mark.parametrize( + "relative_path", + [ + "benchmarks/compare_tools/run_fe_tools_comparison.py", + "benchmarks/use_cases/run_auto_feature_engineering_benchmark.py", + ], +) +def test_benchmark_scripts_do_not_call_train_test_split_directly(relative_path): + """Benchmark scripts should route through ``split_benchmark_data`` instead of ``train_test_split``.""" + source = (_REPO_ROOT / relative_path).read_text(encoding="utf-8") + assert ( + "train_test_split(" not in source + ), f"{relative_path} should not call train_test_split directly; use split_benchmark_data." + assert ( + "split_benchmark_data(" in source + ), f"{relative_path} should call split_benchmark_data from benchmarks.splits." 
+ + +def test_split_benchmark_data_signature_stable(): + """Pin the public signature so downstream benchmark scripts keep working.""" + sig = inspect.signature(split_benchmark_data) + params = list(sig.parameters) + assert params == ["X", "y", "task", "random_state", "test_size"] + assert sig.parameters["test_size"].default == 0.2 diff --git a/tests/test_engines.py b/tests/test_engines.py index ee49cfa..3005ae9 100644 --- a/tests/test_engines.py +++ b/tests/test_engines.py @@ -785,6 +785,27 @@ def test_ndarray_input(self): assert "feature_0_mean" in result.columns assert "feature_1_mean" in result.columns + def test_series_in_rows_mode(self): + """Test per-row mode for sequence-like cells.""" + df = pd.DataFrame( + { + "series": [np.array([1.0, 2.0, 3.0]), np.array([3.0, 2.0, 1.0])], + "event_time": [pd.Timestamp("2024-01-01"), pd.Timestamp("2024-01-02")], + } + ) + engine = TimeSeriesEngine(features=["basic_stats"], series_in_rows=True) + result = engine.fit_transform(df, time_column="event_time") + + assert isinstance(result, pd.DataFrame) + assert len(result) == 2 + assert "series_mean" in result.columns + + def test_time_column_missing_raises(self, ts_data): + """Test invalid time column raises a helpful error.""" + engine = TimeSeriesEngine(features=["basic_stats"]) + with pytest.raises(ValueError, match="time_column 'missing_time'"): + engine.fit(ts_data, time_column="missing_time") + # --------------------------------------------------------------------------- # RelationalEngine tests @@ -950,6 +971,20 @@ def test_transform_with_different_related_tables(self, orders_data, customers_da assert isinstance(result, pd.DataFrame) assert "customers_age_mean" in result.columns + def test_fit_missing_child_key_raises(self, orders_data, customers_data): + """Test fit raises when child key is missing from primary table.""" + engine = RelationalEngine() + engine.add_relationship("orders", "customers", "missing_customer_id") + with pytest.raises(ValueError, 
match="child_key 'missing_customer_id'"): + engine.fit(orders_data, related_tables={"customers": customers_data}) + + def test_fit_missing_parent_key_raises(self, orders_data, customers_data): + """Test fit raises when parent key is missing from related table.""" + engine = RelationalEngine() + engine.add_relationship("orders", "customers", "customer_id", parent_key="missing_parent_key") + with pytest.raises(ValueError, match="parent_key 'missing_parent_key'"): + engine.fit(orders_data, related_tables={"customers": customers_data}) + def test_no_relationships(self, orders_data): """Test engine with no relationships defined.""" engine = RelationalEngine() diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py index ad1ccd1..6920b9a 100644 --- a/tests/test_sklearn_compat.py +++ b/tests/test_sklearn_compat.py @@ -1,5 +1,6 @@ """Tests for scikit-learn compatible feature engineering transformers.""" +import importlib from unittest.mock import MagicMock, patch import numpy as np @@ -262,6 +263,273 @@ def test_set_params(self): assert afe.max_features == 20 assert afe.verbose is True + def test_set_params_none_normalizes_to_defaults(self): + """set_params should accept None for collection-valued params (sklearn compat). + + Sklearn's ``clone`` and ``GridSearchCV`` may pass ``None`` for parameters + whose default in ``__init__`` is also ``None``. Validation should not raise + in that case; ``None`` should be normalized to the same defaults + ``__init__`` applies. 
+ """ + afe = AutoFeatureEngineer(engines=["tabular", "timeseries"]) + afe.set_params(engines=None, selection_methods=None, llm_config=None) + + assert afe.engines == ["tabular"] + assert afe.selection_methods == ["mutual_info", "importance"] + assert afe.llm_config == {} + + def test_set_params_invalid_engine_still_raises(self): + """set_params should still validate non-None values.""" + afe = AutoFeatureEngineer() + with pytest.raises(ValueError, match="Unknown engines"): + afe.set_params(engines=["not_a_real_engine"]) + + def test_set_params_unknown_key_raises(self): + """set_params should reject unknown parameter names (sklearn convention).""" + afe = AutoFeatureEngineer() + with pytest.raises(ValueError, match="Invalid parameter"): + afe.set_params(not_a_real_param=42) + + def test_set_params_unknown_key_does_not_mutate_state(self): + """A failing set_params call must leave the estimator unchanged.""" + afe = AutoFeatureEngineer(engines=["tabular"], max_features=5) + with pytest.raises(ValueError): + afe.set_params(typo_param=99) + + assert afe.engines == ["tabular"] + assert afe.max_features == 5 + assert not hasattr(afe, "typo_param") + + def test_sklearn_clone_round_trip(self): + """A cloned estimator must be configurable identically to the original.""" + from sklearn.base import clone + + afe = AutoFeatureEngineer(engines=["tabular"], max_features=7) + cloned = clone(afe) + + assert cloned.engines == ["tabular"] + assert cloned.max_features == 7 + assert cloned.selection_methods == ["mutual_info", "importance"] + + def test_set_params_invalid_value_rolls_back_state(self, sample_df): + """A failing set_params call must leave every parameter at its pre-call value.""" + afe = AutoFeatureEngineer( + engines=["tabular"], + max_features=5, + selection_methods=["mutual_info"], + correlation_threshold=0.9, + llm_config={"model": "gpt-5.2"}, + verbose=False, + leakage_guard="warn", + ) + + with pytest.raises(ValueError): + afe.set_params( + max_features=10, + 
engines=["bogus_engine"], + leakage_guard="not_a_mode", + ) + + # Every parameter that was part of the failing call must be restored. + assert afe.engines == ["tabular"] + assert afe.max_features == 5 + assert afe.leakage_guard == "warn" + # Untouched parameters are obviously unchanged but assert anyway to + # guard against unrelated mutations. + assert afe.selection_methods == ["mutual_info"] + assert afe.correlation_threshold == 0.9 + assert afe.llm_config == {"model": "gpt-5.2"} + assert afe.verbose is False + + def test_set_params_invalid_value_after_none_normalization_rolls_back(self): + """Rollback must capture the pre-call value, not the None-normalized one.""" + afe = AutoFeatureEngineer(engines=["tabular", "timeseries"]) + + with pytest.raises(ValueError): + afe.set_params(engines=None, max_features=-1) + + # engines was None-normalized to ["tabular"] mid-call; rollback must + # restore the original ["tabular", "timeseries"], not ["tabular"]. + assert afe.engines == ["tabular", "timeseries"] + assert afe.max_features is None + + def test_validate_engines_rejects_non_string_entries(self): + """Mixed-type engine lists must raise ValueError, not TypeError from sorted().""" + # A bare ``sorted(set(...))`` over a mix of None/str would raise + # ``TypeError: '<' not supported between instances of 'str' and 'NoneType'``. + # The validator must surface a clear ValueError instead. 
+ with pytest.raises(ValueError, match="engines must contain only strings"): + AutoFeatureEngineer(engines=[None, "tabular"]) + with pytest.raises(ValueError, match="engines must contain only strings"): + AutoFeatureEngineer(engines=["tabular", 42]) + + def test_validate_selection_methods_rejects_non_string_entries(self): + """Mixed-type selection_methods lists must raise ValueError, not TypeError from sorted().""" + with pytest.raises(ValueError, match="selection_methods must contain only strings"): + AutoFeatureEngineer(selection_methods=[None, "mutual_info"]) + with pytest.raises(ValueError, match="selection_methods must contain only strings"): + AutoFeatureEngineer(selection_methods=["mutual_info", 0]) + + def test_set_params_rejects_non_string_engine_entries_and_rolls_back(self): + """set_params must surface the same ValueError and roll back state.""" + afe = AutoFeatureEngineer(engines=["tabular"]) + with pytest.raises(ValueError, match="engines must contain only strings"): + afe.set_params(engines=[None, "spaceship"]) + assert afe.engines == ["tabular"] + + def test_init_rejects_empty_engines_list(self): + """An explicitly empty ``engines=[]`` must raise rather than silently no-op.""" + # ``engines=None`` defaults to ['tabular']; an explicit empty list is a + # different intent and would otherwise let ``fit()`` mark the estimator + # fitted with zero engines so that ``transform()`` becomes a silent no-op. 
+ with pytest.raises(ValueError, match="engines must contain at least one engine"): + AutoFeatureEngineer(engines=[]) + + def test_init_rejects_empty_selection_methods_list(self): + """An explicitly empty ``selection_methods=[]`` must raise.""" + with pytest.raises(ValueError, match="selection_methods must contain at least one method"): + AutoFeatureEngineer(selection_methods=[]) + + def test_set_params_rejects_empty_engines_and_rolls_back(self): + """set_params must reject empty ``engines=[]`` and leave state untouched.""" + afe = AutoFeatureEngineer(engines=["tabular"], max_features=5) + with pytest.raises(ValueError, match="engines must contain at least one engine"): + afe.set_params(engines=[]) + assert afe.engines == ["tabular"] + assert afe.max_features == 5 + + def test_init_engines_none_still_defaults_to_tabular(self): + """``engines=None`` continues to normalize to the default ['tabular'].""" + afe = AutoFeatureEngineer(engines=None) + assert afe.engines == ["tabular"] + + def test_init_rejects_string_engines_argument(self): + """A bare ``str`` for ``engines`` must raise instead of iterating char-by-char.""" + # Without the container-type guard, ``engines="tabular"`` would expand + # into individual characters and produce a confusing "Unknown engines" + # error such as ``Unknown engines: ['a', 'b', 'l', 'r', 't', 'u']``. + with pytest.raises(ValueError, match="engines must be a list or tuple of strings"): + AutoFeatureEngineer(engines="tabular") + + def test_init_rejects_non_sequence_engines_argument(self): + """Non-sequence ``engines`` (e.g. ``int``) must raise a clear ValueError.""" + # Without the guard, ``set(self.engines)`` would raise a bare + # ``TypeError: 'int' object is not iterable``. 
+ with pytest.raises(ValueError, match="engines must be a list or tuple of strings"): + AutoFeatureEngineer(engines=5) + with pytest.raises(ValueError, match="engines must be a list or tuple of strings"): + AutoFeatureEngineer(engines={"tabular": True}) + + def test_init_rejects_string_selection_methods_argument(self): + """A bare ``str`` for ``selection_methods`` must raise.""" + with pytest.raises(ValueError, match="selection_methods must be a list or tuple of strings"): + AutoFeatureEngineer(selection_methods="mutual_info") + + def test_init_rejects_non_sequence_selection_methods_argument(self): + """Non-sequence ``selection_methods`` must raise a clear ValueError.""" + with pytest.raises(ValueError, match="selection_methods must be a list or tuple of strings"): + AutoFeatureEngineer(selection_methods=42) + + def test_init_accepts_tuple_engines(self): + """Tuples of strings are an acceptable container for ``engines``.""" + afe = AutoFeatureEngineer(engines=("tabular",)) + assert afe.engines == ("tabular",) + + def test_set_params_rejects_string_engines_and_rolls_back(self): + """``set_params`` inherits the container-type check and rolls back on failure.""" + afe = AutoFeatureEngineer(engines=["tabular"], max_features=5) + with pytest.raises(ValueError, match="engines must be a list or tuple of strings"): + afe.set_params(engines="tabular") + assert afe.engines == ["tabular"] + assert afe.max_features == 5 + + def test_fit_accepts_non_string_target_name(self, sample_df, sample_target): + """``target_name`` is typed Optional[Any]; integer column labels must work.""" + # Build a DataFrame with an integer column name that overlaps the target. + df = sample_df.copy() + df.columns = [0, 1, 2, 3] + afe = AutoFeatureEngineer(engines=["tabular"], leakage_guard="raise") + # Integer target_name=0 must be honored (and would raise here because + # leakage_guard="raise" + a column named 0). 
This pins the type-hint + # contract: non-string target labels are accepted at runtime. + with pytest.raises(ValueError, match="leakage-prone"): + afe.fit(df, sample_target, target_name=0) + + def test_fit_resets_engine_instances_when_engines_change(self, sample_df, sample_target): + """Refitting after removing an engine must drop the previously fitted engine.""" + afe = AutoFeatureEngineer(engines=["tabular", "timeseries"], verbose=False) + afe.fit(sample_df, sample_target) + assert set(afe._engine_instances) == {"tabular", "timeseries"} + + afe.set_params(engines=["tabular"]) + afe.fit(sample_df, sample_target) + + # The previously fitted "timeseries" engine must not survive into the + # new fit, otherwise transform() would invoke a stale engine. + assert set(afe._engine_instances) == {"tabular"} + + def test_fit_resets_selector_after_prior_fit_transform(self, sample_df, sample_target): + """A plain fit() following fit_transform() must clear the selector.""" + afe = AutoFeatureEngineer(engines=["tabular"], max_features=3, verbose=False) + afe.fit_transform(sample_df, sample_target) + assert afe._selector is not None + + afe.fit(sample_df, sample_target) + + # Without a selector reset, transform() would still apply the stale + # selection from the previous fit_transform call. + assert afe._selector is None + result = afe.transform(sample_df) + # Every input column must survive transform when no selector is active. + for col in sample_df.columns: + assert col in result.columns + + def test_fit_resets_state_when_called_after_failed_fit(self, sample_df, sample_target, monkeypatch): + """If fit raises mid-flight, _is_fitted must be False so transform errors out.""" + afe = AutoFeatureEngineer(engines=["tabular"], verbose=False) + afe.fit(sample_df, sample_target) + assert afe._is_fitted is True + + # Force the next fit to fail partway through engine fitting. 
+ from featcopilot.engines.tabular import TabularEngine + + def _boom(self, X, y=None, **kwargs): + raise RuntimeError("simulated engine failure") + + monkeypatch.setattr(TabularEngine, "fit", _boom) + with pytest.raises(RuntimeError, match="simulated engine failure"): + afe.fit(sample_df, sample_target) + + # The failed fit must not leave the estimator in a "fitted" state + # that points at stale engines from the previous successful fit. + assert afe._is_fitted is False + assert afe._engine_instances == {} + with pytest.raises(RuntimeError, match="Must call fit"): + afe.transform(sample_df) + + +class TestPackageImport: + """Tests for top-level package import behavior.""" + + def test_import_without_installed_metadata_falls_back(self): + """Test source import works even when distribution metadata is unavailable.""" + import importlib.metadata as importlib_metadata + + import featcopilot + + original_version = importlib_metadata.version + + def fake_version(name): + if name == "featcopilot": + raise importlib_metadata.PackageNotFoundError + return original_version(name) + + with patch("importlib.metadata.version", side_effect=fake_version): + reloaded = importlib.reload(featcopilot) + assert reloaded.__version__ == "0+unknown" + + importlib.reload(featcopilot) + def test_verbose_logging(self, sample_df, sample_target): """Test that verbose=True does not error.""" afe = AutoFeatureEngineer(engines=["tabular"], verbose=True) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9d2c5e8..75731d3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -21,6 +21,70 @@ list_models, ) from featcopilot.utils.parallel import parallel_apply, parallel_transform +from featcopilot.utils.validation import find_potential_leakage_columns + +# --------------------------------------------------------------------------- +# Validation tests +# --------------------------------------------------------------------------- + + +def test_leakage_detection_non_string_columns(): 
+ """Test leakage detection accepts non-string column labels.""" + assert find_potential_leakage_columns([], target_name="label") == [] + assert find_potential_leakage_columns([123, "future_label"], target_name="label") == ["future_label"] + assert find_potential_leakage_columns([123, "target"], target_name="other") == ["target"] + assert find_potential_leakage_columns([123, 456], target_name="label") == [] + assert find_potential_leakage_columns([123, "feature"], target_name=123) == [123] + assert find_potential_leakage_columns(["Churn Label"], target_name="churn_label") == ["Churn Label"] + assert find_potential_leakage_columns(["future-target!"], target_name="target") == ["future-target!"] + + +def test_leakage_detection_empty_keywords_disables_keyword_matching(): + """Passing ``keywords=[]`` must opt out of keyword-based matching entirely.""" + # Without an explicit empty keywords list, "target" / "future_score" would + # be flagged via DEFAULT_LEAKAGE_KEYWORDS. With ``keywords=[]`` and no + # ``target_name``, no column should be flagged. + assert find_potential_leakage_columns(["target", "future_score", "x"], keywords=[]) == [] + + # Explicit ``target_name`` still works alongside ``keywords=[]``. + assert find_potential_leakage_columns( + ["target", "future_score", "label_x"], + target_name="label", + keywords=[], + ) == ["label_x"] + + +def test_leakage_detection_falsy_target_name_zero_still_matches(): + """Falsy but meaningful targets (e.g. ``0``) must still drive matching.""" + # Previously ``if target_name`` would treat ``0`` as absent and skip target + # matching entirely. The integer column ``0`` should now be flagged. 
+ assert find_potential_leakage_columns([0, "feature"], target_name=0) == [0] + + +def test_leakage_detection_empty_string_target_does_not_match_everything(): + """``target_name=''`` must not flag every column via the empty-substring trap.""" + # An empty (or whitespace-only) target normalizes to "" which would otherwise + # be treated as a substring of every column. Treating empty normalized + # results as absent prevents that. + assert find_potential_leakage_columns(["a", "b", "c"], target_name="", keywords=[]) == [] + assert find_potential_leakage_columns(["a", "b", "c"], target_name="---", keywords=[]) == [] + + +def test_leakage_detection_columns_normalizing_to_empty_string_are_skipped(): + """Columns whose labels normalize to an empty string must not be flagged.""" + # Without the column-side guard, the empty ``normalized_column`` would be + # a substring of every non-empty ``normalized_target`` (``"" in "label"`` + # is True), so any column literally labeled ``"---"`` / ``"!!!"`` would be + # flagged whenever a target was provided. The guard skips such columns. + assert find_potential_leakage_columns(["---", "!!!"], target_name="label") == [] + assert find_potential_leakage_columns(["---", "label_x"], target_name="label", keywords=[]) == ["label_x"] + # Mixing meaningful and empty-normalizing column labels still reports only + # the meaningful ones. + assert find_potential_leakage_columns(["target", "---", "future_x"], target_name="label") == [ + "target", + "future_x", + ] + # --------------------------------------------------------------------------- # FeatureCache tests