diff --git a/benchmarks/compare_tools/run_fe_tools_comparison.py b/benchmarks/compare_tools/run_fe_tools_comparison.py
index f678fe4..dd7b10f 100644
--- a/benchmarks/compare_tools/run_fe_tools_comparison.py
+++ b/benchmarks/compare_tools/run_fe_tools_comparison.py
@@ -43,7 +43,6 @@
 import pandas as pd
 from packaging.version import Version
 from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, roc_auc_score
-from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 
 from benchmarks.datasets import (
@@ -61,12 +60,14 @@
     sanitize_feature_frames,
     save_feature_cache,
 )
+from benchmarks.splits import split_benchmark_data
 from featcopilot.utils.logger import get_logger  # noqa: E402
 
 logger = get_logger(__name__)
 
 warnings.filterwarnings("ignore")
 
+
 # Default configuration
 QUICK_DATASETS = [
     # Interaction-heavy synthetic (FeatCopilot creates valuable polynomial features)
@@ -843,8 +844,7 @@ def run_single_benchmark(
         X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
 
     # Split data (keep raw and encoded in sync)
-    indices = np.arange(len(X_encoded))
-    train_idx, test_idx, y_train, y_test = train_test_split(indices, y, test_size=0.2, random_state=random_state)
+    train_idx, test_idx, y_train, y_test = split_benchmark_data(X_encoded, y, task, random_state)
     X_train_encoded = X_encoded.iloc[train_idx]
     X_test_encoded = X_encoded.iloc[test_idx]
     X_train_raw = X.iloc[train_idx]
diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py
index c611bf6..db5e238 100644
--- a/benchmarks/datasets.py
+++ b/benchmarks/datasets.py
@@ -5,6 +5,8 @@
 time series datasets, and text/semantic datasets for comprehensive benchmarking.
 """
 
+from __future__ import annotations
+
 import numpy as np
 import pandas as pd
 
diff --git a/benchmarks/splits.py b/benchmarks/splits.py
new file mode 100644
index 0000000..4b97c34
--- /dev/null
+++ b/benchmarks/splits.py
@@ -0,0 +1,102 @@
+"""Shared split utilities for FeatCopilot benchmarks.
+
+Centralizes the split policy so individual benchmark scripts share the same
+realistic defaults: chronological splits for forecasting/timeseries tasks and
+stratified random splits for classification tasks (when class counts allow).
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+def split_benchmark_data(
+    X: pd.DataFrame,
+    y: pd.Series,
+    task: str,
+    random_state: int,
+    test_size: float = 0.2,
+) -> tuple[np.ndarray, np.ndarray, pd.Series, pd.Series]:
+    """
+    Split benchmark data with task-aware defaults.
+
+    Parameters
+    ----------
+    X : pandas.DataFrame
+        Feature matrix. Only its length is used; the function returns positional
+        indices (use ``X.iloc[train_idx]`` / ``X.iloc[test_idx]`` to materialize
+        the splits).
+    y : pandas.Series
+        Target values aligned with ``X``.
+    task : str
+        Task identifier. Substrings ``"forecast"`` / ``"timeseries"`` trigger a
+        chronological split; ``"classification"`` triggers a stratified split
+        when class counts allow it. Anything else falls back to a random split.
+    random_state : int
+        Random state for reproducible random splits.
+    test_size : float, default=0.2
+        Fraction of rows held out for the test split.
+
+    Returns
+    -------
+    train_idx : numpy.ndarray
+        Positional indices for the training rows.
+    test_idx : numpy.ndarray
+        Positional indices for the test rows.
+    y_train : pandas.Series
+        Target values for the training rows.
+    y_test : pandas.Series
+        Target values for the test rows.
+
+    Raises
+    ------
+    ValueError
+        If ``test_size`` is not strictly between 0 and 1, or if the resulting
+        chronological split would leave either side empty (for example, a
+        very small dataset combined with an extreme ``test_size``).
+    """
+    # Validate ``test_size`` up front so the chronological branch matches the
+    # behavior of ``sklearn.model_selection.train_test_split`` (which rejects
+    # ``test_size <= 0`` / ``>= 1``) instead of silently producing an empty
+    # or overlapping split.
+    if not (0 < test_size < 1):
+        raise ValueError(f"test_size must be a float strictly between 0 and 1; got {test_size!r}")
+
+    indices = np.arange(len(X))
+
+    if "forecast" in task or "timeseries" in task:
+        split_idx = int(len(indices) * (1 - test_size))
+        if split_idx <= 0 or split_idx >= len(indices):
+            raise ValueError(
+                "Chronological split would leave one side empty: "
+                f"len(X)={len(indices)}, test_size={test_size} -> split_idx={split_idx}. "
+                "Provide more rows or pick a different ``test_size``."
+            )
+        train_idx = indices[:split_idx]
+        test_idx = indices[split_idx:]
+        y_train = y.iloc[train_idx]
+        y_test = y.iloc[test_idx]
+        return train_idx, test_idx, y_train, y_test
+
+    stratify = None
+    if "classification" in task:
+        try:
+            class_counts = pd.Series(y).value_counts(dropna=False)
+            if len(class_counts) > 1 and class_counts.min() >= 2:
+                stratify = y
+        except Exception:
+            stratify = None
+
+    train_idx, test_idx, y_train, y_test = train_test_split(
+        indices,
+        y,
+        test_size=test_size,
+        random_state=random_state,
+        stratify=stratify,
+    )
+    return train_idx, test_idx, y_train, y_test
+
+
+__all__ = ["split_benchmark_data"]
diff --git a/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md b/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md
new file mode 100644
index 0000000..927e33f
--- /dev/null
+++ b/benchmarks/use_cases/AUTO_FEATURE_ENGINEERING_USE_CASE.md
@@ -0,0 +1,10 @@
+# Auto Feature Engineering Use-Case Benchmark
+
+Compares a plain baseline with FeatCopilot and common automatic feature engineering tools on an interaction-heavy tabular classification task.
+
+| Tool | Status | ROC-AUC | Feature Count |
+|------|--------|---------|---------------|
+| baseline | ok | 0.6330 | 9 |
+| featcopilot | ok | 0.6328 | 11 |
+| featuretools | ok | 0.6362 | 60 |
+| autofeat | failed: check_array() got an unexpected keyword argument 'force_all_finite' | - | - |
diff --git a/benchmarks/use_cases/README.md b/benchmarks/use_cases/README.md
new file mode 100644
index 0000000..b08d50f
--- /dev/null
+++ b/benchmarks/use_cases/README.md
@@ -0,0 +1,20 @@
+# Use-Case Benchmarks
+
+Targeted benchmarks for realistic feature-engineering scenarios.
+
+## Auto Feature Engineering
+
+This benchmark compares:
+- plain baseline
+- FeatCopilot
+- Featuretools (if installed)
+- autofeat (if installed)
+
+on an interaction-heavy tabular classification task where automatic feature engineering should matter.
+
+```bash
+python -m benchmarks.use_cases.run_auto_feature_engineering_benchmark
+```
+
+Outputs:
+- `AUTO_FEATURE_ENGINEERING_USE_CASE.md`
diff --git a/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
new file mode 100644
index 0000000..b4b426d
--- /dev/null
+++ b/benchmarks/use_cases/run_auto_feature_engineering_benchmark.py
@@ -0,0 +1,257 @@
+"""Benchmark FeatCopilot against common baselines for auto feature engineering.
+
+Focused on the practical use case of structured/tabular data with interaction
+and ratio effects where feature engineering should matter.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pandas as pd
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import roc_auc_score
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+from benchmarks.splits import split_benchmark_data
+from featcopilot import AutoFeatureEngineer
+
+REPORT_DIR = Path(__file__).resolve().parent
+
+
+def create_dataset(n_samples: int = 5000, random_state: int = 42) -> pd.DataFrame:
+    """Create a synthetic classification dataset with explicit ratio/interaction signal."""
+    rng = np.random.default_rng(random_state)
+    df = pd.DataFrame(
+        {
+            "age": rng.integers(18, 80, n_samples),
+            "income": rng.gamma(3.5, 18000, n_samples),
+            "tenure_months": rng.integers(1, 120, n_samples),
+            "monthly_charges": rng.uniform(20, 180, n_samples),
+            "num_products": rng.integers(1, 6, n_samples),
+            "support_tickets": rng.poisson(2.2, n_samples),
+            "plan_tier": rng.choice(["free", "pro", "team"], n_samples, p=[0.45, 0.4, 0.15]),
+        }
+    )
+
+    charge_ratio = df["monthly_charges"] / (df["income"] / 12 + 1)
+    complaint_rate = df["support_tickets"] / (df["tenure_months"] + 1)
+    product_density = df["monthly_charges"] / (df["num_products"] + 0.5)
+    loyalty = ((df["age"] - 18) / 62) * (df["tenure_months"] / 120)
+    free_flag = (df["plan_tier"] == "free").astype(int)
+    team_flag = (df["plan_tier"] == "team").astype(int)
+
+    interaction_signal = charge_ratio * complaint_rate * 8.0
+    threshold_bonus = ((charge_ratio > 0.020) & (complaint_rate > 0.045)).astype(float) * 1.2
+    plan_interaction = free_flag * charge_ratio * 6.0 - team_flag * loyalty * 1.5
+
+    logit = (
+        -2.2 + 0.0035 * product_density + 1.8 * interaction_signal + threshold_bonus + plan_interaction - 1.1 * loyalty
+    )
+    prob = 1 / (1 + np.exp(-logit))
+    df["target"] = (rng.random(n_samples) < prob).astype(int)
+    return df
+
+
+def align_and_fill(X_train: pd.DataFrame, X_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Align train/test columns and replace NaN/±inf with 0 for downstream models.
+
+    Several feature-engineering tools can emit ``±inf`` (e.g. Featuretools'
+    ``divide_numeric`` primitive on rows where the denominator is 0). Scaling
+    or fitting any sklearn estimator on those values raises, so we sanitize
+    them here as part of column alignment.
+    """
+    X_train_aligned, X_test_aligned = X_train.align(X_test, join="left", axis=1, fill_value=0)
+    X_train_clean = X_train_aligned.replace([np.inf, -np.inf], 0).fillna(0)
+    X_test_clean = X_test_aligned.replace([np.inf, -np.inf], 0).fillna(0)
+    return X_train_clean, X_test_clean
+
+
+def evaluate_auc(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> float:
+    """Train a simple classifier and return ROC-AUC.
+
+    Owns the categorical encoding + column alignment so individual benchmark
+    cases don't have to repeat that step. The encoding is idempotent for
+    already-numeric inputs (``get_dummies`` is a no-op when there are no
+    object/categorical columns), so per-tool runners that produce numeric
+    matrices can pass them in directly without first one-hot encoding again.
+    """
+    X_train = pd.get_dummies(X_train, drop_first=False)
+    X_test = pd.get_dummies(X_test, drop_first=False)
+    X_train, X_test = align_and_fill(X_train, X_test)
+
+    numeric_features = X_train.columns.tolist()
+    preprocessor = ColumnTransformer(
+        transformers=[
+            (
+                "num",
+                Pipeline(
+                    [
+                        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
+                        ("scaler", StandardScaler()),
+                    ]
+                ),
+                numeric_features,
+            )
+        ]
+    )
+    model = Pipeline(
+        steps=[
+            ("preprocessor", preprocessor),
+            ("classifier", LogisticRegression(max_iter=2000, C=1.0)),
+        ]
+    )
+    model.fit(X_train, y_train)
+    return roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
+
+
+def run_baseline(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict[str, Any]:
+    """Run one-hot baseline.
+
+    Encoding and alignment are owned by :func:`evaluate_auc`, so we pass the
+    raw frames straight through. ``n_features`` is reported as the post-encoding
+    column count to match what the model actually trains on.
+    """
+    auc = evaluate_auc(X_train, X_test, y_train, y_test)
+    n_features = pd.get_dummies(X_train, drop_first=False).shape[1]
+    return {"tool": "baseline", "auc": auc, "n_features": n_features}
+
+
+def run_featcopilot_case(
+    X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series
+) -> dict[str, Any]:
+    """Run FeatCopilot benchmark case."""
+    engineer = AutoFeatureEngineer(
+        engines=["tabular"],
+        max_features=60,
+        selection_methods=["mutual_info", "importance"],
+        correlation_threshold=0.95,
+        leakage_guard="warn",
+        verbose=False,
+    )
+    X_train_fe = engineer.fit_transform(X_train, y_train, target_name="target", apply_selection=True)
+    X_test_fe = engineer.transform(X_test)
+    X_train_fe, X_test_fe = align_and_fill(X_train_fe, X_test_fe)
+    auc = evaluate_auc(X_train_fe, X_test_fe, y_train, y_test)
+    return {"tool": "featcopilot", "auc": auc, "n_features": X_train_fe.shape[1]}
+
+
+def run_featuretools_case(
+    X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series
+) -> dict[str, Any]:
+    """Run Featuretools if available."""
+    try:
+        import featuretools as ft
+        import woodwork  # noqa: F401
+    except Exception as exc:
+        return {"tool": "featuretools", "status": f"unavailable: {exc}"}
+
+    train_copy = pd.get_dummies(X_train, drop_first=False).reset_index(drop=True)
+    test_copy = pd.get_dummies(X_test, drop_first=False).reset_index(drop=True)
+    train_copy, test_copy = align_and_fill(train_copy, test_copy)
+    train_copy["row_id"] = np.arange(len(train_copy))
+    test_copy["row_id"] = np.arange(len(test_copy))
+
+    try:
+        train_copy.ww.init(name="data", index="row_id")
+        es_train = ft.EntitySet(id="afe_train").add_dataframe(
+            dataframe_name="data", dataframe=train_copy, index="row_id"
+        )
+        train_fm, feature_defs = ft.dfs(
+            entityset=es_train,
+            target_dataframe_name="data",
+            trans_primitives=["add_numeric", "multiply_numeric", "divide_numeric"],
+            agg_primitives=[],
+            max_depth=2,
+            max_features=60,
+        )
+
+        test_copy.ww.init(name="data", index="row_id")
+        es_test = ft.EntitySet(id="afe_test").add_dataframe(dataframe_name="data", dataframe=test_copy, index="row_id")
+        test_fm = ft.calculate_feature_matrix(entityset=es_test, features=feature_defs)
+        train_fm, test_fm = align_and_fill(train_fm, test_fm)
+        auc = evaluate_auc(train_fm, test_fm, y_train, y_test)
+        return {"tool": "featuretools", "auc": auc, "n_features": train_fm.shape[1]}
+    except Exception as exc:
+        return {"tool": "featuretools", "status": f"failed: {exc}"}
+
+
+def run_autofeat_case(
+    X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series
+) -> dict[str, Any]:
+    """Run autofeat if available."""
+    try:
+        from autofeat import AutoFeatClassifier
+    except Exception as exc:
+        return {"tool": "autofeat", "status": f"unavailable: {exc}"}
+
+    X_train_num = pd.get_dummies(X_train, drop_first=False)
+    X_test_num = pd.get_dummies(X_test, drop_first=False)
+    X_train_num, X_test_num = align_and_fill(X_train_num, X_test_num)
+
+    try:
+        model = AutoFeatClassifier(verbose=0, feateng_steps=2, featsel_runs=2)
+        y_train_arr = np.asarray(y_train).ravel()
+        X_train_fe = model.fit_transform(X_train_num, y_train_arr)
+        X_test_fe = model.transform(X_test_num)
+        X_train_fe, X_test_fe = align_and_fill(X_train_fe, X_test_fe)
+        auc = evaluate_auc(X_train_fe, X_test_fe, y_train, y_test)
+        return {"tool": "autofeat", "auc": auc, "n_features": X_train_fe.shape[1]}
+    except Exception as exc:
+        return {"tool": "autofeat", "status": f"failed: {exc}"}
+
+
+def write_report(results: list[dict[str, Any]], output_path: Path) -> None:
+    """Write a markdown report."""
+    lines = [
+        "# Auto Feature Engineering Use-Case Benchmark",
+        "",
+        "Compares a plain baseline with FeatCopilot and common automatic feature engineering tools on an interaction-heavy tabular classification task.",
+        "",
+        "| Tool | Status | ROC-AUC | Feature Count |",
+        "|------|--------|---------|---------------|",
+    ]
+    for row in results:
+        status = row.get("status", "ok")
+        auc = f"{row['auc']:.4f}" if "auc" in row else "-"
+        n_features = str(row.get("n_features", "-"))
+        lines.append(f"| {row['tool']} | {status} | {auc} | {n_features} |")
+    output_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+def main() -> None:
+    """Run the use-case benchmark."""
+    parser = argparse.ArgumentParser(description="Run an auto feature engineering use-case benchmark")
+    parser.add_argument("--samples", type=int, default=5000, help="Number of synthetic samples")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    args = parser.parse_args()
+
+    data = create_dataset(n_samples=args.samples, random_state=args.seed)
+    X = data.drop(columns=["target"])
+    y = data["target"]
+    train_idx, test_idx, y_train, y_test = split_benchmark_data(X, y, "classification", random_state=args.seed)
+    X_train = X.iloc[train_idx]
+    X_test = X.iloc[test_idx]
+
+    results = [
+        run_baseline(X_train, X_test, y_train, y_test),
+        run_featcopilot_case(X_train, X_test, y_train, y_test),
+        run_featuretools_case(X_train, X_test, y_train, y_test),
+        run_autofeat_case(X_train, X_test, y_train, y_test),
+    ]
+
+    output_path = REPORT_DIR / "AUTO_FEATURE_ENGINEERING_USE_CASE.md"
+    write_report(results, output_path)
+    print(json.dumps(results, indent=2))
+    print(f"\nWrote report to {output_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/relational-feature-engineering.md b/docs/examples/relational-feature-engineering.md
new file mode 100644
index 0000000..f3e0fac
--- /dev/null
+++ b/docs/examples/relational-feature-engineering.md
@@ -0,0 +1,76 @@
+# Relational Feature Engineering
+
+A practical example for **customer / orders** style data where aggregation features matter.
+
+## When to use this
+
+Use the relational engine when:
+- you have a primary table and one or more related tables
+- useful signal lives in counts / sums / means / maxima over related entities
+- you want a lighter-weight path than a full Featuretools setup
+
+## Example
+
+```python
+import pandas as pd
+
+from featcopilot.engines.relational import RelationalEngine
+
+orders = pd.DataFrame(
+    {
+        "order_id": [1, 2, 3, 4, 5],
+        "customer_id": [1, 1, 2, 2, 3],
+        "amount": [100, 200, 150, 300, 50],
+        "category": ["A", "B", "A", "A", "B"],
+    }
+)
+
+customers = pd.DataFrame(
+    {
+        "customer_id": [1, 2, 3],
+        "age": [25, 35, 45],
+        "income": [50000, 70000, 60000],
+    }
+)
+
+engine = RelationalEngine(
+    aggregation_functions=["mean", "sum", "count", "max", "min"],
+    verbose=True,
+)
+engine.add_relationship("orders", "customers", "customer_id")
+
+features = engine.fit_transform(
+    orders,
+    related_tables={"customers": customers},
+)
+
+print(features.columns.tolist())
+```
+
+## Typical generated features
+
+- `customers_age_mean`
+- `customers_income_mean`
+- `amount_by_category_mean`
+- `amount_by_category_count`
+
+## Guardrails
+
+`RelationalEngine` now validates configured relationship keys:
+- missing child key in the primary table -> raises early
+- missing parent key in a related table -> raises early
+- duplicate relationship definitions are ignored
+
+That is boring, but it is the kind of boring that saves time.
+
+## When Featuretools is still the better choice
+
+Use Featuretools when you need:
+- deeper DFS-style synthesis
+- richer primitive libraries
+- more complex multi-table workflows
+
+Use FeatCopilot relational mode when you want:
+- a smaller API surface
+- straightforward aggregation features
+- sklearn-friendly workflows
diff --git a/docs/examples/time-aware-tabular.md b/docs/examples/time-aware-tabular.md
new file mode 100644
index 0000000..7b938e1
--- /dev/null
+++ b/docs/examples/time-aware-tabular.md
@@ -0,0 +1,66 @@
+# Time-Aware Tabular Prototype
+
+A practical prototype for **leakage-safe auto feature engineering** on time-aware tabular data.
+
+## Why this example matters
+
+Most real feature engineering failures are not caused by weak transformations. They come from:
+
+- random train/test splits on temporal data
+- future information leaking into features
+- offline features that cannot be reproduced later
+
+This example shows a safer baseline:
+
+1. sort by time
+2. split by time
+3. fit features on the training slice only
+4. transform the holdout slice separately
+5. compare against a plain model baseline
+
+## Script
+
+See:
+
+```text
+examples/time_aware_tabular_prototype.py
+```
+
+## Core pattern
+
+```python
+engineer = AutoFeatureEngineer(
+    engines=["tabular"],
+    max_features=30,
+    selection_methods=["mutual_info", "importance"],
+    correlation_threshold=0.9,
+    leakage_guard="warn",
+)
+
+X_train_fe = engineer.fit_transform(
+    X_train,
+    y_train,
+    target_name="churned",
+    apply_selection=True,
+)
+X_test_fe = engineer.transform(X_test)
+```
+
+## Leakage guard
+
+`AutoFeatureEngineer` now supports a lightweight `leakage_guard` option:
+
+- `"warn"` — default, warns if suspicious columns are present
+- `"raise"` — fail fast when likely leakage columns are detected
+- `"off"` — disable the check
+
+This is intentionally conservative. It does **not** prove your pipeline is safe. It just catches obvious foot-guns such as columns named like:
+
+- `target`
+- `label`
+- `outcome`
+- `future_*`
+
+## Recommendation
+
+For a real project, start with this workflow before trying more advanced LLM or agent-based feature generation. If the time-aware baseline is not trustworthy, more automation only makes the mistake faster.
diff --git a/examples/time_aware_tabular_prototype.py b/examples/time_aware_tabular_prototype.py
new file mode 100644
index 0000000..f8d8781
--- /dev/null
+++ b/examples/time_aware_tabular_prototype.py
@@ -0,0 +1,112 @@
+"""Time-aware tabular prototype with leakage-safe evaluation.
+
+This example shows a practical starting point for auto feature engineering on
+behavioral / event / tabular data where time-based splitting matters.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import HistGradientBoostingClassifier
+from sklearn.metrics import roc_auc_score
+
+from featcopilot import AutoFeatureEngineer
+
+
+def create_time_aware_dataset(n_samples: int = 2000) -> pd.DataFrame:
+    """Create a synthetic time-aware churn-like dataset."""
+    rng = np.random.default_rng(42)
+    timestamps = pd.date_range("2024-01-01", periods=n_samples, freq="h")
+
+    df = pd.DataFrame(
+        {
+            "event_time": timestamps,
+            "account_age_days": rng.integers(10, 1200, n_samples),
+            "sessions_7d": rng.poisson(8, n_samples),
+            "tickets_30d": rng.poisson(2, n_samples),
+            "spend_30d": rng.gamma(2.5, 40, n_samples),
+            "plan_tier": rng.choice(["free", "pro", "team"], n_samples, p=[0.45, 0.4, 0.15]),
+        }
+    )
+
+    spend_ratio = df["spend_30d"] / (df["account_age_days"] + 10)
+    support_pressure = df["tickets_30d"] / (df["sessions_7d"] + 1)
+    pro_flag = (df["plan_tier"] == "pro").astype(int)
+    team_flag = (df["plan_tier"] == "team").astype(int)
+
+    churn_logit = (
+        -1.2
+        - 0.015 * df["sessions_7d"]
+        + 0.25 * df["tickets_30d"]
+        + 3.2 * support_pressure
+        + 1.7 * spend_ratio
+        - 0.35 * pro_flag
+        - 0.55 * team_flag
+    )
+    churn_prob = 1 / (1 + np.exp(-churn_logit))
+    df["churned"] = (rng.random(n_samples) < churn_prob).astype(int)
+    return df
+
+
+def temporal_split(df: pd.DataFrame, time_col: str, valid_fraction: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Split a dataset by time instead of random shuffling."""
+    df = df.sort_values(time_col).reset_index(drop=True)
+    split_idx = int(len(df) * (1 - valid_fraction))
+    return df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()
+
+
+def main() -> None:
+    """Run a leakage-safe auto feature engineering prototype."""
+    data = create_time_aware_dataset()
+    train_df, test_df = temporal_split(data, time_col="event_time", valid_fraction=0.2)
+
+    feature_cols = [
+        "account_age_days",
+        "sessions_7d",
+        "tickets_30d",
+        "spend_30d",
+        "plan_tier",
+    ]
+    target_col = "churned"
+
+    X_train = train_df[feature_cols]
+    y_train = train_df[target_col]
+    X_test = test_df[feature_cols]
+    y_test = test_df[target_col]
+
+    X_train_baseline = pd.get_dummies(X_train, drop_first=False)
+    X_test_baseline = pd.get_dummies(X_test, drop_first=False)
+    X_train_baseline, X_test_baseline = X_train_baseline.align(X_test_baseline, join="left", axis=1, fill_value=0)
+
+    baseline = HistGradientBoostingClassifier(max_depth=4, learning_rate=0.05, random_state=42)
+    baseline.fit(X_train_baseline, y_train)
+    baseline_auc = roc_auc_score(y_test, baseline.predict_proba(X_test_baseline)[:, 1])
+
+    engineer = AutoFeatureEngineer(
+        engines=["tabular"],
+        max_features=30,
+        selection_methods=["mutual_info", "importance"],
+        correlation_threshold=0.9,
+        leakage_guard="warn",
+        verbose=True,
+    )
+    X_train_fe = engineer.fit_transform(X_train, y_train, target_name=target_col, apply_selection=True).fillna(0)
+    X_test_fe = engineer.transform(X_test).fillna(0)
+
+    common_cols = [col for col in X_train_fe.columns if col in X_test_fe.columns]
+    X_train_fe = X_train_fe[common_cols]
+    X_test_fe = X_test_fe[common_cols]
+
+    model = HistGradientBoostingClassifier(max_depth=4, learning_rate=0.05, random_state=42)
+    model.fit(X_train_fe, y_train)
+    engineered_auc = roc_auc_score(y_test, model.predict_proba(X_test_fe)[:, 1])
+
+    print(f"Temporal baseline ROC-AUC: {baseline_auc:.4f}")
+    print(f"Engineered ROC-AUC:       {engineered_auc:.4f}")
+    print(f"Delta:                    {engineered_auc - baseline_auc:+.4f}")
+    print(f"Selected features:        {len(common_cols)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/featcopilot/__init__.py b/featcopilot/__init__.py
index 377aefb..7178867 100644
--- a/featcopilot/__init__.py
+++ b/featcopilot/__init__.py
@@ -5,9 +5,14 @@
 with novel LLM-powered capabilities via GitHub Copilot SDK.
 """
 
-from importlib.metadata import version
+from importlib.metadata import PackageNotFoundError, version
+
+try:
+    __version__ = version("featcopilot")
+except PackageNotFoundError:
+    # Allow importing directly from a source checkout before editable/install step.
+    __version__ = "0+unknown"
 
-__version__ = version("featcopilot")
 __author__ = "FeatCopilot Contributors"
 
 from featcopilot.core.base import BaseEngine, BaseSelector
diff --git a/featcopilot/engines/relational.py b/featcopilot/engines/relational.py
index 19ad435..fdc9963 100644
--- a/featcopilot/engines/relational.py
+++ b/featcopilot/engines/relational.py
@@ -106,14 +106,14 @@ def add_relationship(
         -------
         self : RelationalEngine
         """
-        self._relationships.append(
-            {
-                "child": child_table,
-                "parent": parent_table,
-                "child_key": key_column,
-                "parent_key": parent_key or key_column,
-            }
-        )
+        relationship = {
+            "child": child_table,
+            "parent": parent_table,
+            "child_key": key_column,
+            "parent_key": parent_key or key_column,
+        }
+        if relationship not in self._relationships:
+            self._relationships.append(relationship)
         return self
 
     def fit(
@@ -143,6 +143,8 @@ def fit(
         self._related_tables = related_tables or {}
         self._primary_columns = X.columns.tolist()
 
+        self._validate_relationships(X, self._related_tables)
+
         if self.config.verbose:
             logger.info(f"RelationalEngine: {len(self._relationships)} relationships defined")
 
@@ -175,6 +177,7 @@ def transform(
 
         X = self._validate_input(X)
         related_tables = related_tables or self._related_tables
+        self._validate_relationships(X, related_tables)
         result = X.copy()
 
         # Generate features from relationships
@@ -198,6 +201,23 @@ def transform(
 
         return result
 
+    def _validate_relationships(self, primary_df: pd.DataFrame, related_tables: dict[str, pd.DataFrame]) -> None:
+        """Validate configured relationships against available tables and keys."""
+        for relationship in self._relationships:
+            child_key = relationship["child_key"]
+            parent_table = relationship["parent"]
+            parent_key = relationship["parent_key"]
+
+            if child_key not in primary_df.columns:
+                raise ValueError(f"child_key '{child_key}' not found in primary table")
+
+            if parent_table not in related_tables:
+                continue
+
+            parent_df = related_tables[parent_table]
+            if parent_key not in parent_df.columns:
+                raise ValueError(f"parent_key '{parent_key}' not found in related table '{parent_table}'")
+
     def _aggregate_from_relationship(
         self,
         child_df: pd.DataFrame,
diff --git a/featcopilot/engines/timeseries.py b/featcopilot/engines/timeseries.py
index 8702b75..d361f20 100644
--- a/featcopilot/engines/timeseries.py
+++ b/featcopilot/engines/timeseries.py
@@ -21,6 +21,7 @@ class TimeSeriesEngineConfig(EngineConfig):
     """Configuration for time series feature engine."""
 
     name: str = "TimeSeriesEngine"
+    series_in_rows: bool = Field(default=False, description="Treat each row as an independent time series")
     features: list[str] = Field(
         default_factory=lambda: [
             "basic_stats",
@@ -92,6 +93,7 @@ def __init__(
         window_sizes: Optional[list[int]] = None,
         max_features: Optional[int] = None,
         verbose: bool = False,
+        series_in_rows: bool = False,
         **kwargs,
     ):
         config = TimeSeriesEngineConfig(
@@ -99,6 +101,7 @@ def __init__(
             window_sizes=window_sizes or [5, 10, 20],
             max_features=max_features,
             verbose=verbose,
+            series_in_rows=series_in_rows,
             **kwargs,
         )
         super().__init__(config=config)
@@ -131,11 +134,24 @@ def fit(
         """
         X = self._validate_input(X)
 
+        if time_column is not None and time_column not in X.columns:
+            raise ValueError(f"time_column '{time_column}' not found in input data")
+
+        self._time_index_column = time_column
+
         # Identify numeric columns for time series analysis
-        self._time_columns = X.select_dtypes(include=[np.number]).columns.tolist()
+        if self.config.series_in_rows:
+            self._time_columns = [col for col in X.columns if col != time_column]
+        else:
+            self._time_columns = X.select_dtypes(include=[np.number]).columns.tolist()
+            if time_column is not None:
+                self._time_columns = [col for col in self._time_columns if col != time_column]
 
         if self.config.verbose:
-            logger.info(f"TimeSeriesEngine: Found {len(self._time_columns)} numeric columns")
+            logger.info(
+                f"TimeSeriesEngine: Found {len(self._time_columns)} columns "
+                f"(series_in_rows={self.config.series_in_rows})"
+            )
 
         self._is_fitted = True
         return self
@@ -158,21 +174,10 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFram
             raise RuntimeError("Engine must be fitted before transform")
 
         X = self._validate_input(X)
-        features_dict = {}
 
-        for col in self._time_columns:
-            series = X[col].values
-
-            for feature_group in self.config.features:
-                if feature_group in self.FEATURE_EXTRACTORS:
-                    method_name = self.FEATURE_EXTRACTORS[feature_group]
-                    method = getattr(self, method_name)
-                    extracted = method(series, col)
-                    features_dict.update(extracted)
-
-        # For DataFrames with multiple rows, extract features across the entire column
-        if len(X) > 1:
-            # Each column is treated as a single time series
+        if self.config.series_in_rows:
+            result = self._extract_per_row(X)
+        else:
             features_dict = {}
             for col in self._time_columns:
                 series = X[col].values
@@ -184,7 +189,7 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFram
                         extracted = method(series, col)
                         features_dict.update(extracted)
 
-        result = pd.DataFrame([features_dict])
+            result = pd.DataFrame([features_dict])
 
         self._feature_names = list(result.columns)
 
diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index c93f8fa..07b5566 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -3,6 +3,7 @@
 Provides drop-in sklearn transformers for feature engineering pipelines.
 """
 
+import warnings
 from typing import Any, Optional, Union
 
 import numpy as np
@@ -16,6 +17,7 @@
 from featcopilot.engines.timeseries import TimeSeriesEngine
 from featcopilot.selection.unified import FeatureSelector
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.validation import find_potential_leakage_columns
 
 logger = get_logger(__name__)
 
@@ -95,10 +97,23 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
         Maximum features to generate/select
     selection_methods : list, default=['mutual_info', 'importance']
         Feature selection methods
+    correlation_threshold : float, default=0.85
+        Maximum pairwise correlation allowed during correlation-based selection
     llm_config : dict, optional
         Configuration for LLM engine
     verbose : bool, default=False
         Verbose output
+    leakage_guard : {'off', 'warn', 'raise'}, default='warn'
+        How to handle columns whose names suggest target, label, or future-information leakage
+
+    Other Parameters
+    ----------------
+    target_name : hashable, optional
+        Fit-time parameter accepted by :meth:`fit` and :meth:`fit_transform`.
+        When provided, the leakage guard cross-references column labels
+        against the target so derived variants (e.g. ``target_encoded``) are
+        flagged. Accepts any column-label type DataFrames support
+        (typically ``str``, but also ``int`` or other hashables).
 
     Examples
     --------
@@ -107,9 +122,13 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
     ...     max_features=100,
     ...     llm_config={'model': 'gpt-5.2', 'enable_semantic': True}
     ... )
-    >>> X_transformed = engineer.fit_transform(X, y)
+    >>> X_transformed = engineer.fit_transform(X, y, target_name='label')
     """
 
+    SUPPORTED_ENGINES = {"tabular", "timeseries", "relational", "text", "llm"}
+    SUPPORTED_SELECTION_METHODS = {"mutual_info", "importance", "f_test", "chi2", "correlation", "xgboost"}
+    SUPPORTED_LEAKAGE_GUARDS = {"off", "warn", "raise"}
+
     def __init__(
         self,
         engines: Optional[list[str]] = None,
@@ -118,13 +137,21 @@ def __init__(
         correlation_threshold: float = 0.85,
         llm_config: Optional[dict[str, Any]] = None,
         verbose: bool = False,
+        leakage_guard: str = "warn",
     ):
-        self.engines = engines or ["tabular"]
+        # Use ``is not None`` defaulting (rather than ``or``) so that explicit
+        # empty containers and identity-bearing arguments are preserved. This
+        # also keeps ``self.<param> is param`` for any non-None argument, which
+        # is required for sklearn's ``clone`` round-trip identity check.
+        self.engines = engines if engines is not None else ["tabular"]
         self.max_features = max_features
-        self.selection_methods = selection_methods or ["mutual_info", "importance"]
+        self.selection_methods = selection_methods if selection_methods is not None else ["mutual_info", "importance"]
         self.correlation_threshold = correlation_threshold
-        self.llm_config = llm_config or {}
+        self.llm_config = llm_config if llm_config is not None else {}
         self.verbose = verbose
+        self.leakage_guard = leakage_guard
+
+        self._validate_configuration()
 
         self._engine_instances: dict[str, Any] = {}
         self._selector: Optional[FeatureSelector] = None
@@ -133,12 +160,107 @@ def __init__(
         self._column_descriptions: dict[str, str] = {}
         self._task_description: str = ""
 
+    def _validate_configuration(self) -> None:
+        """Validate user-facing configuration early."""
+        # Reject non-sequence containers (and ``str``/``bytes``, which are
+        # technically iterable but would be iterated character-by-character)
+        # before any iteration so that the downstream non-string-entry,
+        # empty, and set-diff checks all run on a real list/tuple. Without
+        # this guard, ``engines="tabular"`` would silently expand into
+        # individual characters and produce a confusing "Unknown engines"
+        # error, and ``engines=5`` would raise an unrelated ``TypeError``
+        # from ``set(self.engines)``.
+        if not isinstance(self.engines, (list, tuple)):
+            raise ValueError(
+                "engines must be a list or tuple of strings; got "
+                f"{type(self.engines).__name__}={self.engines!r}. "
+                f"Supported engines: {sorted(self.SUPPORTED_ENGINES)}"
+            )
+
+        if not isinstance(self.selection_methods, (list, tuple)):
+            raise ValueError(
+                "selection_methods must be a list or tuple of strings; got "
+                f"{type(self.selection_methods).__name__}={self.selection_methods!r}. "
+                f"Supported methods: {sorted(self.SUPPORTED_SELECTION_METHODS)}"
+            )
+
+        # Reject non-string entries up front so that the diff against the
+        # supported-name sets (and the ``sorted(...)`` used to build the error
+        # message) cannot raise an unrelated ``TypeError`` for mixed-type
+        # inputs (e.g. ``engines=[None, "spaceship"]``).
+        non_string_engines = [e for e in self.engines if not isinstance(e, str)]
+        if non_string_engines:
+            raise ValueError(
+                "engines must contain only strings; got non-string entries: "
+                f"{non_string_engines!r}. Supported engines: {sorted(self.SUPPORTED_ENGINES)}"
+            )
+
+        # Reject empty collections explicitly. ``engines=None`` is normalized to
+        # the default in ``__init__`` / ``set_params``; an explicit empty list
+        # would otherwise leave ``fit()`` running zero engines and ``transform()``
+        # silently returning the input (modulo NaN/inf cleanup), which is a
+        # surprising silent no-op rather than a misconfiguration.
+        if not self.engines:
+            raise ValueError(
+                "engines must contain at least one engine; got an empty sequence. "
+                f"Pass ``engines=None`` for the default ['tabular'] or pick from {sorted(self.SUPPORTED_ENGINES)}."
+            )
+
+        non_string_methods = [m for m in self.selection_methods if not isinstance(m, str)]
+        if non_string_methods:
+            raise ValueError(
+                "selection_methods must contain only strings; got non-string entries: "
+                f"{non_string_methods!r}. Supported methods: {sorted(self.SUPPORTED_SELECTION_METHODS)}"
+            )
+
+        if not self.selection_methods:
+            raise ValueError(
+                "selection_methods must contain at least one method; got an empty sequence. "
+                "Pass ``selection_methods=None`` for the default "
+                f"['mutual_info', 'importance'] or pick from {sorted(self.SUPPORTED_SELECTION_METHODS)}."
+            )
+
+        unknown_engines = sorted(set(self.engines) - self.SUPPORTED_ENGINES)
+        if unknown_engines:
+            raise ValueError(f"Unknown engines: {unknown_engines}. Supported engines: {sorted(self.SUPPORTED_ENGINES)}")
+
+        unknown_methods = sorted(set(self.selection_methods) - self.SUPPORTED_SELECTION_METHODS)
+        if unknown_methods:
+            raise ValueError(
+                "Unknown selection methods: "
+                f"{unknown_methods}. Supported methods: {sorted(self.SUPPORTED_SELECTION_METHODS)}"
+            )
+
+        if self.leakage_guard not in self.SUPPORTED_LEAKAGE_GUARDS:
+            raise ValueError(
+                f"leakage_guard must be one of {sorted(self.SUPPORTED_LEAKAGE_GUARDS)}, got {self.leakage_guard!r}"
+            )
+
+        if self.max_features is not None and self.max_features <= 0:
+            raise ValueError("max_features must be positive when provided")
+
+    def _reset_fit_state(self) -> None:
+        """Reset all attributes populated during ``fit``/``fit_transform``.
+
+        Called at the start of ``fit`` so that re-fitting (e.g. after changing
+        ``engines`` via ``set_params``) cannot leave stale fitted engines, a
+        stale selector, or fit-time metadata in place. Mirrors the fit-derived
+        attribute initialization in ``__init__``.
+        """
+        self._engine_instances = {}
+        self._selector = None
+        self._feature_set = FeatureSet()
+        self._is_fitted = False
+        self._column_descriptions = {}
+        self._task_description = ""
+
     def fit(
         self,
         X: Union[pd.DataFrame, np.ndarray],
         y: Optional[Union[pd.Series, np.ndarray]] = None,
         column_descriptions: Optional[dict[str, str]] = None,
         task_description: str = "prediction task",
+        target_name: Optional[Any] = None,
         **fit_params,
     ) -> "AutoFeatureEngineer":
         """
@@ -154,6 +276,11 @@ def fit(
             Human-readable descriptions of columns (for LLM)
         task_description : str
             Description of the ML task (for LLM)
+        target_name : hashable, optional
+            Target column label used by leakage checks to identify related
+            feature columns. Accepts any column-label type DataFrames support
+            (typically ``str``, but also ``int`` or other hashables); the
+            leakage helper normalizes labels via ``str(...)`` before matching.
         **fit_params : dict
             Additional parameters
 
@@ -161,6 +288,14 @@ def fit(
         -------
         self : AutoFeatureEngineer
         """
+        # Reset all fit-derived state so a refit (e.g. after changing ``engines``
+        # or after a previous ``fit_transform`` that built a selector) cannot leak
+        # stale engines, a stale selector, or the previous ``_is_fitted`` flag
+        # into a subsequent ``transform`` call. Any early exit below (validation
+        # error, leakage_guard='raise', engine fit failure) leaves the estimator
+        # in a clean, unfitted state rather than a partially-fitted one.
+        self._reset_fit_state()
+
         # Convert to DataFrame if needed
         if isinstance(X, np.ndarray):
             X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
@@ -168,6 +303,17 @@ def fit(
         self._column_descriptions = column_descriptions or {}
         self._task_description = task_description
 
+        suspicious_columns = find_potential_leakage_columns(X.columns.tolist(), target_name=target_name)
+        if suspicious_columns and self.leakage_guard != "off":
+            message = (
+                "Potential leakage-prone columns detected: "
+                f"{suspicious_columns}. Review time/label leakage before fitting, "
+                "or set leakage_guard='off' to disable this check."
+            )
+            if self.leakage_guard == "raise":
+                raise ValueError(message)
+            warnings.warn(message, UserWarning, stacklevel=2)
+
         # Fit each engine
         for engine_name in self.engines:
             engine = self._create_engine(engine_name)
@@ -271,6 +417,7 @@ def fit_transform(
         y: Optional[Union[pd.Series, np.ndarray]] = None,
         column_descriptions: Optional[dict[str, str]] = None,
         task_description: str = "prediction task",
+        target_name: Optional[Any] = None,
         apply_selection: bool = True,
         **fit_params,
     ) -> pd.DataFrame:
@@ -287,6 +434,11 @@ def fit_transform(
             Human-readable column descriptions
         task_description : str
             ML task description
+        target_name : hashable, optional
+            Target column label used by leakage checks to identify related
+            feature columns. Accepts any column-label type DataFrames support
+            (typically ``str``, but also ``int`` or other hashables); the
+            leakage helper normalizes labels via ``str(...)`` before matching.
         apply_selection : bool, default=True
             Whether to apply feature selection
         **fit_params : dict
@@ -297,8 +449,9 @@ def fit_transform(
         X_transformed : DataFrame
             Transformed data with generated features
         """
-        self.fit(X, y, column_descriptions, task_description, **fit_params)
-        result = self.transform(X)
+        self.fit(X, y, column_descriptions, task_description, target_name=target_name, **fit_params)
+        # Reuse transform-relevant kwargs (e.g. text_columns, related_tables) during fit_transform.
+        result = self.transform(X, **fit_params)
 
         # Track original features (input columns) vs derived features
         if isinstance(X, np.ndarray):
@@ -408,10 +561,73 @@ def get_params(self, deep=True):
             "correlation_threshold": self.correlation_threshold,
             "llm_config": self.llm_config,
             "verbose": self.verbose,
+            "leakage_guard": self.leakage_guard,
         }
 
     def set_params(self, **params):
-        """Set parameters for sklearn compatibility."""
-        for key, value in params.items():
-            setattr(self, key, value)
+        """
+        Set parameters for sklearn compatibility.
+
+        Validates parameter keys against the estimator's known parameters
+        (raising :class:`ValueError` on unknown keys, matching scikit-learn
+        ``BaseEstimator.set_params`` behavior) and then mirrors the defaulting
+        performed in ``__init__`` so callers (e.g. sklearn cloning,
+        ``GridSearchCV`` parameter grids) can pass ``None`` for
+        collection-valued parameters and have it normalized back to the default
+        rather than raising during validation.
+
+        The update is atomic: if any provided value fails configuration
+        validation, all in-flight mutations are rolled back so the estimator
+        is left in its pre-call state rather than a partially-mutated invalid
+        one.
+
+        Parameters
+        ----------
+        **params
+            Estimator parameters to update. Each key must already be a
+            top-level parameter accepted by ``__init__``.
+
+        Returns
+        -------
+        AutoFeatureEngineer
+            ``self``, to support fluent chaining.
+
+        Raises
+        ------
+        ValueError
+            If ``params`` contains a key that is not a known estimator
+            parameter, or if any provided value fails configuration
+            validation (see :meth:`_validate_configuration`). On validation
+            failure the estimator's parameters are restored to the values
+            they held before the call.
+        """
+        valid_params = self.get_params(deep=True)
+        invalid_keys = sorted(set(params) - set(valid_params))
+        if invalid_keys:
+            raise ValueError(
+                f"Invalid parameter(s) {invalid_keys} for estimator {type(self).__name__}. "
+                f"Valid parameters are: {sorted(valid_params)}."
+            )
+
+        # Snapshot the current values for every parameter we are about to
+        # change (including any whose final value will come from None
+        # normalization below) so that a validation failure can roll back to
+        # a fully consistent pre-call state.
+        snapshot = {key: getattr(self, key) for key in params}
+
+        try:
+            for key, value in params.items():
+                setattr(self, key, value)
+            if self.engines is None:
+                self.engines = ["tabular"]
+            if self.selection_methods is None:
+                self.selection_methods = ["mutual_info", "importance"]
+            if self.llm_config is None:
+                self.llm_config = {}
+            self._validate_configuration()
+        except Exception:
+            for key, value in snapshot.items():
+                setattr(self, key, value)
+            raise
+
         return self
diff --git a/featcopilot/utils/__init__.py b/featcopilot/utils/__init__.py
index ff6a969..8920ef4 100644
--- a/featcopilot/utils/__init__.py
+++ b/featcopilot/utils/__init__.py
@@ -10,6 +10,7 @@
     list_models,
 )
 from featcopilot.utils.parallel import parallel_apply
+from featcopilot.utils.validation import find_potential_leakage_columns
 
 __all__ = [
     "parallel_apply",
@@ -20,4 +21,5 @@
     "get_default_model",
     "get_model_names",
     "is_valid_model",
+    "find_potential_leakage_columns",
 ]
diff --git a/featcopilot/utils/validation.py b/featcopilot/utils/validation.py
new file mode 100644
index 0000000..bce5ec2
--- /dev/null
+++ b/featcopilot/utils/validation.py
@@ -0,0 +1,100 @@
+"""Validation helpers for safer feature engineering workflows."""
+
+import re
+from typing import Any, Optional
+
+DEFAULT_LEAKAGE_KEYWORDS = [
+    "target",
+    "label",
+    "outcome",
+    "ground_truth",
+    "y_true",
+    "future",
+    "leak",
+]
+
+
+def _normalize_column_name(name: Any) -> str:
+    """Normalize a column name for fuzzy matching."""
+    return re.sub(r"[^a-z0-9]+", "", str(name).lower())
+
+
+def find_potential_leakage_columns(
+    columns: list[Any],
+    target_name: Optional[Any] = None,
+    keywords: Optional[list[str]] = None,
+) -> list[Any]:
+    """
+    Find suspicious columns that may leak label or future information.
+
+    Parameters
+    ----------
+    columns : list
+        Input column names or labels to inspect.
+    target_name : optional
+        Expected target/label column name or label. Related variants will be flagged.
+    keywords : list[str], optional
+        Additional suspicious keywords to match against normalized column names.
+
+    Returns
+    -------
+    list
+        Column names or labels that deserve manual review for leakage.
+
+    Notes
+    -----
+    Target-name matching is intentionally fuzzy: labels are normalized and substring
+    variants are flagged so derived names such as ``target_encoded`` are reviewed.
+
+    Pass ``keywords=[]`` to opt out of keyword-based matching entirely (only
+    the explicit ``target_name`` will be used). ``target_name`` is treated as
+    absent only when it is ``None`` *or* normalizes to an empty string after
+    stripping non-alphanumerics; this lets falsy-but-meaningful values such
+    as ``0`` participate in matching while preventing ``target_name=""``
+    from matching every column via the empty-substring trap. Symmetrically,
+    columns whose labels normalize to an empty string (e.g. ``"---"``,
+    ``"!!!"``) are skipped entirely so the empty ``normalized_column`` cannot
+    be reported as a substring of every ``normalized_target``.
+    """
+    # Use ``is None`` defaulting (rather than ``or``) so callers can pass an
+    # empty list to explicitly disable keyword matching.
+    if keywords is None:
+        keywords = DEFAULT_LEAKAGE_KEYWORDS
+    normalized_keywords = [_normalize_column_name(keyword) for keyword in keywords]
+
+    # Use explicit ``is None`` so falsy but meaningful target labels (e.g. ``0``)
+    # still participate in matching. After normalization, an empty string is
+    # treated as "no target" so that ``target_name=""`` (or values like ``"---"``
+    # that strip to nothing) cannot match every column via the
+    # ``normalized_target in normalized_column`` substring check.
+    if target_name is None:
+        normalized_target: Optional[str] = None
+    else:
+        normalized = _normalize_column_name(target_name)
+        normalized_target = normalized if normalized else None
+
+    suspicious: list[Any] = []
+    for column in columns:
+        normalized_column = _normalize_column_name(column)
+
+        # A column whose label normalizes to an empty string (e.g. ``"---"`` or
+        # ``"!!!"``) has no meaningful content to compare against. Skipping it
+        # closes the column-side counterpart of the ``"" in normalized_target``
+        # trap (the empty ``normalized_column`` would otherwise be a substring
+        # of every non-empty ``normalized_target`` and be flagged whenever a
+        # target was provided), and similarly avoids any keyword false-positive
+        # via the same empty-substring path.
+        if not normalized_column:
+            continue
+
+        keyword_hit = any(keyword and keyword in normalized_column for keyword in normalized_keywords)
+        target_hit = normalized_target is not None and (
+            normalized_column == normalized_target
+            or normalized_target in normalized_column
+            or normalized_column in normalized_target
+        )
+
+        if keyword_hit or target_hit:
+            suspicious.append(column)
+
+    return suspicious
diff --git a/mkdocs.yml b/mkdocs.yml
index a091b3f..8136adf 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -85,6 +85,8 @@ nav:
   - Examples:
     - End-to-End Demo: examples/e2e-demo.md
     - Basic Usage: examples/basic.md
+    - Time-Aware Tabular: examples/time-aware-tabular.md
+    - Relational Feature Engineering: examples/relational-feature-engineering.md
     - LLM-Powered: examples/llm-powered.md
     - Sklearn Pipeline: examples/sklearn-pipeline.md
     - Domain-Specific: examples/domain-specific.md
diff --git a/tests/test_autofeat.py b/tests/test_autofeat.py
index afe6eb5..600b11a 100644
--- a/tests/test_autofeat.py
+++ b/tests/test_autofeat.py
@@ -75,6 +75,39 @@ def test_multiple_engines(self, sample_data):
         assert result is not None
         assert len(result.columns) > 0
 
+    def test_leakage_guard_warns_on_suspicious_columns(self, sample_data):
+        """Test leakage guard warns when suspicious columns are present."""
+        X, y = sample_data
+        X = X.rename(columns={"balance": "future_target_signal"})
+        engineer = AutoFeatureEngineer(engines=["tabular"], leakage_guard="warn")
+
+        with pytest.warns(UserWarning, match="Potential leakage-prone columns detected"):
+            engineer.fit(X, y, target_name="target")
+
+    def test_leakage_guard_raises_on_suspicious_columns(self, sample_data):
+        """Test leakage guard can hard-fail when suspicious columns are present."""
+        X, y = sample_data
+        X = X.rename(columns={"balance": "churn_label_proxy"})
+        engineer = AutoFeatureEngineer(engines=["tabular"], leakage_guard="raise")
+
+        with pytest.raises(ValueError, match="Potential leakage-prone columns detected"):
+            engineer.fit(X, y, target_name="churn")
+
+    def test_invalid_engine_configuration_raises(self):
+        """Test invalid engine names fail early."""
+        with pytest.raises(ValueError, match="Unknown engines"):
+            AutoFeatureEngineer(engines=["tabular", "spaceship"])
+
+    def test_invalid_selection_method_raises(self):
+        """Test invalid selection methods fail early."""
+        with pytest.raises(ValueError, match="Unknown selection methods"):
+            AutoFeatureEngineer(selection_methods=["mutual_info", "magic"])
+
+    def test_invalid_leakage_guard_raises(self):
+        """Test invalid leakage guard setting fails early."""
+        with pytest.raises(ValueError, match="leakage_guard must be one of"):
+            AutoFeatureEngineer(leakage_guard="maybe")
+
 
 class TestAutoFeatureEngineerParams:
     """Test parameter handling."""
diff --git a/tests/test_benchmark_splits.py b/tests/test_benchmark_splits.py
new file mode 100644
index 0000000..570ea39
--- /dev/null
+++ b/tests/test_benchmark_splits.py
@@ -0,0 +1,176 @@
+"""Tests for the shared benchmarks split helper and its wiring."""
+
+from __future__ import annotations
+
+import importlib
+import inspect
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from benchmarks.splits import split_benchmark_data
+
+# ---------------------------------------------------------------------------
+# Behavioral tests for split_benchmark_data
+# ---------------------------------------------------------------------------
+
+
+def test_classification_uses_stratified_split():
+    """Classification tasks should produce a stratified split when class counts allow."""
+    rng = np.random.default_rng(0)
+    X = pd.DataFrame({"f": rng.normal(size=200)})
+    y = pd.Series(([0] * 160) + ([1] * 40))
+
+    train_idx, test_idx, y_train, y_test = split_benchmark_data(X, y, "classification", random_state=42)
+
+    assert len(train_idx) + len(test_idx) == len(X)
+    assert set(train_idx).isdisjoint(set(test_idx))
+    train_pos_ratio = (y_train == 1).mean()
+    test_pos_ratio = (y_test == 1).mean()
+    expected_ratio = (y == 1).mean()
+    assert abs(train_pos_ratio - expected_ratio) < 0.02
+    assert abs(test_pos_ratio - expected_ratio) < 0.02
+
+
+def test_classification_falls_back_when_class_too_small():
+    """Singleton classes should not raise; the split falls back to non-stratified."""
+    X = pd.DataFrame({"f": np.arange(10.0)})
+    y = pd.Series([0] * 9 + [1])
+
+    train_idx, test_idx, _, _ = split_benchmark_data(X, y, "classification", random_state=0)
+
+    assert len(train_idx) + len(test_idx) == len(X)
+
+
+def test_forecasting_uses_chronological_split():
+    """Forecasting/timeseries tasks should preserve temporal order."""
+    X = pd.DataFrame({"t": np.arange(100)})
+    y = pd.Series(np.arange(100, dtype=float))
+
+    train_idx, test_idx, y_train, y_test = split_benchmark_data(X, y, "forecasting", random_state=123)
+
+    assert list(train_idx) == list(range(80))
+    assert list(test_idx) == list(range(80, 100))
+    assert y_train.iloc[-1] < y_test.iloc[0]
+
+
+def test_timeseries_keyword_also_chronological():
+    """The 'timeseries' substring should also trigger chronological split."""
+    X = pd.DataFrame({"t": np.arange(50)})
+    y = pd.Series(np.arange(50, dtype=float))
+
+    train_idx, test_idx, _, _ = split_benchmark_data(X, y, "timeseries_regression", random_state=0)
+
+    assert list(train_idx) == list(range(40))
+    assert list(test_idx) == list(range(40, 50))
+
+
+def test_regression_uses_random_split_no_stratify():
+    """Non-classification, non-forecasting tasks should use a plain random split."""
+    rng = np.random.default_rng(1)
+    X = pd.DataFrame({"f": rng.normal(size=100)})
+    y = pd.Series(rng.normal(size=100))
+
+    train_idx, test_idx, _, _ = split_benchmark_data(X, y, "regression", random_state=42)
+
+    assert len(train_idx) == 80
+    assert len(test_idx) == 20
+    assert set(train_idx).isdisjoint(set(test_idx))
+
+
+def test_custom_test_size_respected():
+    """``test_size`` should override the default 0.2."""
+    X = pd.DataFrame({"f": np.arange(100.0)})
+    y = pd.Series(np.arange(100, dtype=float))
+
+    train_idx, test_idx, _, _ = split_benchmark_data(X, y, "regression", random_state=0, test_size=0.4)
+
+    assert len(test_idx) == 40
+    assert len(train_idx) == 60
+
+
+@pytest.mark.parametrize("bad_test_size", [0.0, 1.0, -0.1, 1.5, 2])
+def test_split_benchmark_data_rejects_out_of_range_test_size(bad_test_size):
+    """``test_size`` must be strictly between 0 and 1 for both branches."""
+    X = pd.DataFrame({"f": np.arange(100.0)})
+    y = pd.Series(np.arange(100, dtype=float))
+
+    # Random branch.
+    with pytest.raises(ValueError, match="test_size must be a float strictly between 0 and 1"):
+        split_benchmark_data(X, y, "regression", random_state=0, test_size=bad_test_size)
+
+    # Chronological branch -- previously silently produced empty/overlapping splits.
+    with pytest.raises(ValueError, match="test_size must be a float strictly between 0 and 1"):
+        split_benchmark_data(X, y, "forecasting", random_state=0, test_size=bad_test_size)
+
+
+def test_split_benchmark_data_chronological_rejects_empty_train_split():
+    """Tiny datasets with extreme ``test_size`` must raise instead of producing an empty train set."""
+    X = pd.DataFrame({"t": np.arange(2)})
+    y = pd.Series(np.arange(2, dtype=float))
+
+    # len=2, test_size=0.9 -> split_idx = int(2 * 0.1) = 0 -> empty train.
+    with pytest.raises(ValueError, match="Chronological split would leave one side empty"):
+        split_benchmark_data(X, y, "forecasting", random_state=0, test_size=0.9)
+
+
+def test_split_benchmark_data_chronological_single_row_dataset_raises():
+    """A single-row dataset cannot be chronologically split for any valid ``test_size``."""
+    X = pd.DataFrame({"t": [0]})
+    y = pd.Series([0.0])
+
+    with pytest.raises(ValueError, match="Chronological split would leave one side empty"):
+        split_benchmark_data(X, y, "forecasting", random_state=0, test_size=0.5)
+
+
+# ---------------------------------------------------------------------------
+# Wiring tests: ensure benchmark scripts actually use the shared helper.
+# These guard against the regression flagged on PR #2 where the helper was
+# introduced but never wired into the call sites.
+# ---------------------------------------------------------------------------
+
+
+_REPO_ROOT = Path(__file__).resolve().parents[1]
+
+
+@pytest.mark.parametrize(
+    "module_path",
+    [
+        "benchmarks.compare_tools.run_fe_tools_comparison",
+        "benchmarks.use_cases.run_auto_feature_engineering_benchmark",
+    ],
+)
+def test_benchmark_scripts_import_split_helper(module_path):
+    """Both in-scope benchmark scripts must import ``split_benchmark_data``."""
+    module = importlib.import_module(module_path)
+    assert (
+        getattr(module, "split_benchmark_data", None) is split_benchmark_data
+    ), f"{module_path} should import split_benchmark_data from benchmarks.splits"
+
+
+@pytest.mark.parametrize(
+    "relative_path",
+    [
+        "benchmarks/compare_tools/run_fe_tools_comparison.py",
+        "benchmarks/use_cases/run_auto_feature_engineering_benchmark.py",
+    ],
+)
+def test_benchmark_scripts_do_not_call_train_test_split_directly(relative_path):
+    """Benchmark scripts should route through ``split_benchmark_data`` instead of ``train_test_split``."""
+    source = (_REPO_ROOT / relative_path).read_text(encoding="utf-8")
+    assert (
+        "train_test_split(" not in source
+    ), f"{relative_path} should not call train_test_split directly; use split_benchmark_data."
+    assert (
+        "split_benchmark_data(" in source
+    ), f"{relative_path} should call split_benchmark_data from benchmarks.splits."
+
+
+def test_split_benchmark_data_signature_stable():
+    """Pin the public signature so downstream benchmark scripts keep working."""
+    sig = inspect.signature(split_benchmark_data)
+    params = list(sig.parameters)
+    assert params == ["X", "y", "task", "random_state", "test_size"]
+    assert sig.parameters["test_size"].default == 0.2
diff --git a/tests/test_engines.py b/tests/test_engines.py
index ee49cfa..3005ae9 100644
--- a/tests/test_engines.py
+++ b/tests/test_engines.py
@@ -785,6 +785,27 @@ def test_ndarray_input(self):
         assert "feature_0_mean" in result.columns
         assert "feature_1_mean" in result.columns
 
+    def test_series_in_rows_mode(self):
+        """Test per-row mode for sequence-like cells."""
+        df = pd.DataFrame(
+            {
+                "series": [np.array([1.0, 2.0, 3.0]), np.array([3.0, 2.0, 1.0])],
+                "event_time": [pd.Timestamp("2024-01-01"), pd.Timestamp("2024-01-02")],
+            }
+        )
+        engine = TimeSeriesEngine(features=["basic_stats"], series_in_rows=True)
+        result = engine.fit_transform(df, time_column="event_time")
+
+        assert isinstance(result, pd.DataFrame)
+        assert len(result) == 2
+        assert "series_mean" in result.columns
+
+    def test_time_column_missing_raises(self, ts_data):
+        """Test invalid time column raises a helpful error."""
+        engine = TimeSeriesEngine(features=["basic_stats"])
+        with pytest.raises(ValueError, match="time_column 'missing_time'"):
+            engine.fit(ts_data, time_column="missing_time")
+
 
 # ---------------------------------------------------------------------------
 # RelationalEngine tests
@@ -950,6 +971,20 @@ def test_transform_with_different_related_tables(self, orders_data, customers_da
         assert isinstance(result, pd.DataFrame)
         assert "customers_age_mean" in result.columns
 
+    def test_fit_missing_child_key_raises(self, orders_data, customers_data):
+        """Test fit raises when child key is missing from primary table."""
+        engine = RelationalEngine()
+        engine.add_relationship("orders", "customers", "missing_customer_id")
+        with pytest.raises(ValueError, match="child_key 'missing_customer_id'"):
+            engine.fit(orders_data, related_tables={"customers": customers_data})
+
+    def test_fit_missing_parent_key_raises(self, orders_data, customers_data):
+        """Test fit raises when parent key is missing from related table."""
+        engine = RelationalEngine()
+        engine.add_relationship("orders", "customers", "customer_id", parent_key="missing_parent_key")
+        with pytest.raises(ValueError, match="parent_key 'missing_parent_key'"):
+            engine.fit(orders_data, related_tables={"customers": customers_data})
+
     def test_no_relationships(self, orders_data):
         """Test engine with no relationships defined."""
         engine = RelationalEngine()
diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py
index ad1ccd1..6920b9a 100644
--- a/tests/test_sklearn_compat.py
+++ b/tests/test_sklearn_compat.py
@@ -1,5 +1,6 @@
 """Tests for scikit-learn compatible feature engineering transformers."""
 
+import importlib
 from unittest.mock import MagicMock, patch
 
 import numpy as np
@@ -262,6 +263,273 @@ def test_set_params(self):
         assert afe.max_features == 20
         assert afe.verbose is True
 
+    def test_set_params_none_normalizes_to_defaults(self):
+        """set_params should accept None for collection-valued params (sklearn compat).
+
+        Sklearn's ``clone`` and ``GridSearchCV`` may pass ``None`` for parameters
+        whose default in ``__init__`` is also ``None``. Validation should not raise
+        in that case; ``None`` should be normalized to the same defaults
+        ``__init__`` applies.
+        """
+        afe = AutoFeatureEngineer(engines=["tabular", "timeseries"])
+        afe.set_params(engines=None, selection_methods=None, llm_config=None)
+
+        assert afe.engines == ["tabular"]
+        assert afe.selection_methods == ["mutual_info", "importance"]
+        assert afe.llm_config == {}
+
+    def test_set_params_invalid_engine_still_raises(self):
+        """set_params should still validate non-None values."""
+        afe = AutoFeatureEngineer()
+        with pytest.raises(ValueError, match="Unknown engines"):
+            afe.set_params(engines=["not_a_real_engine"])
+
+    def test_set_params_unknown_key_raises(self):
+        """set_params should reject unknown parameter names (sklearn convention)."""
+        afe = AutoFeatureEngineer()
+        with pytest.raises(ValueError, match="Invalid parameter"):
+            afe.set_params(not_a_real_param=42)
+
+    def test_set_params_unknown_key_does_not_mutate_state(self):
+        """A failing set_params call must leave the estimator unchanged."""
+        afe = AutoFeatureEngineer(engines=["tabular"], max_features=5)
+        with pytest.raises(ValueError):
+            afe.set_params(typo_param=99)
+
+        assert afe.engines == ["tabular"]
+        assert afe.max_features == 5
+        assert not hasattr(afe, "typo_param")
+
+    def test_sklearn_clone_round_trip(self):
+        """A cloned estimator must be configurable identically to the original."""
+        from sklearn.base import clone
+
+        afe = AutoFeatureEngineer(engines=["tabular"], max_features=7)
+        cloned = clone(afe)
+
+        assert cloned.engines == ["tabular"]
+        assert cloned.max_features == 7
+        assert cloned.selection_methods == ["mutual_info", "importance"]
+
+    def test_set_params_invalid_value_rolls_back_state(self, sample_df):
+        """A failing set_params call must leave every parameter at its pre-call value."""
+        afe = AutoFeatureEngineer(
+            engines=["tabular"],
+            max_features=5,
+            selection_methods=["mutual_info"],
+            correlation_threshold=0.9,
+            llm_config={"model": "gpt-5.2"},
+            verbose=False,
+            leakage_guard="warn",
+        )
+
+        with pytest.raises(ValueError):
+            afe.set_params(
+                max_features=10,
+                engines=["bogus_engine"],
+                leakage_guard="not_a_mode",
+            )
+
+        # Every parameter that was part of the failing call must be restored.
+        assert afe.engines == ["tabular"]
+        assert afe.max_features == 5
+        assert afe.leakage_guard == "warn"
+        # Untouched parameters are obviously unchanged but assert anyway to
+        # guard against unrelated mutations.
+        assert afe.selection_methods == ["mutual_info"]
+        assert afe.correlation_threshold == 0.9
+        assert afe.llm_config == {"model": "gpt-5.2"}
+        assert afe.verbose is False
+
+    def test_set_params_invalid_value_after_none_normalization_rolls_back(self):
+        """Rollback must capture the pre-call value, not the None-normalized one."""
+        afe = AutoFeatureEngineer(engines=["tabular", "timeseries"])
+
+        with pytest.raises(ValueError):
+            afe.set_params(engines=None, max_features=-1)
+
+        # engines was None-normalized to ["tabular"] mid-call; rollback must
+        # restore the original ["tabular", "timeseries"], not ["tabular"].
+        assert afe.engines == ["tabular", "timeseries"]
+        assert afe.max_features is None
+
+    def test_validate_engines_rejects_non_string_entries(self):
+        """Mixed-type engine lists must raise ValueError, not TypeError from sorted()."""
+        # A bare ``sorted(set(...))`` over a mix of None/str would raise
+        # ``TypeError: '<' not supported between instances of 'str' and 'NoneType'``.
+        # The validator must surface a clear ValueError instead.
+        with pytest.raises(ValueError, match="engines must contain only strings"):
+            AutoFeatureEngineer(engines=[None, "tabular"])
+        with pytest.raises(ValueError, match="engines must contain only strings"):
+            AutoFeatureEngineer(engines=["tabular", 42])
+
+    def test_validate_selection_methods_rejects_non_string_entries(self):
+        """Mixed-type selection_methods lists must raise ValueError, not TypeError from sorted()."""
+        with pytest.raises(ValueError, match="selection_methods must contain only strings"):
+            AutoFeatureEngineer(selection_methods=[None, "mutual_info"])
+        with pytest.raises(ValueError, match="selection_methods must contain only strings"):
+            AutoFeatureEngineer(selection_methods=["mutual_info", 0])
+
+    def test_set_params_rejects_non_string_engine_entries_and_rolls_back(self):
+        """set_params must surface the same ValueError and roll back state."""
+        afe = AutoFeatureEngineer(engines=["tabular"])
+        with pytest.raises(ValueError, match="engines must contain only strings"):
+            afe.set_params(engines=[None, "spaceship"])
+        assert afe.engines == ["tabular"]
+
+    def test_init_rejects_empty_engines_list(self):
+        """An explicitly empty ``engines=[]`` must raise rather than silently no-op."""
+        # ``engines=None`` defaults to ['tabular']; an explicit empty list is a
+        # different intent and would otherwise let ``fit()`` mark the estimator
+        # fitted with zero engines so that ``transform()`` becomes a silent no-op.
+        with pytest.raises(ValueError, match="engines must contain at least one engine"):
+            AutoFeatureEngineer(engines=[])
+
+    def test_init_rejects_empty_selection_methods_list(self):
+        """An explicitly empty ``selection_methods=[]`` must raise."""
+        with pytest.raises(ValueError, match="selection_methods must contain at least one method"):
+            AutoFeatureEngineer(selection_methods=[])
+
+    def test_set_params_rejects_empty_engines_and_rolls_back(self):
+        """set_params must reject empty ``engines=[]`` and leave state untouched."""
+        afe = AutoFeatureEngineer(engines=["tabular"], max_features=5)
+        with pytest.raises(ValueError, match="engines must contain at least one engine"):
+            afe.set_params(engines=[])
+        assert afe.engines == ["tabular"]
+        assert afe.max_features == 5
+
+    def test_init_engines_none_still_defaults_to_tabular(self):
+        """``engines=None`` continues to normalize to the default ['tabular']."""
+        afe = AutoFeatureEngineer(engines=None)
+        assert afe.engines == ["tabular"]
+
+    def test_init_rejects_string_engines_argument(self):
+        """A bare ``str`` for ``engines`` must raise instead of iterating char-by-char."""
+        # Without the container-type guard, ``engines="tabular"`` would expand
+        # into individual characters and produce a confusing "Unknown engines"
+        # error such as ``Unknown engines: ['a', 'b', 'l', 'r', 't', 'u']``.
+        with pytest.raises(ValueError, match="engines must be a list or tuple of strings"):
+            AutoFeatureEngineer(engines="tabular")
+
+    def test_init_rejects_non_sequence_engines_argument(self):
+        """Non-sequence ``engines`` (e.g. ``int``) must raise a clear ValueError."""
+        # Without the guard, ``set(self.engines)`` would raise a bare
+        # ``TypeError: 'int' object is not iterable``.
+        with pytest.raises(ValueError, match="engines must be a list or tuple of strings"):
+            AutoFeatureEngineer(engines=5)
+        with pytest.raises(ValueError, match="engines must be a list or tuple of strings"):
+            AutoFeatureEngineer(engines={"tabular": True})
+
+    def test_init_rejects_string_selection_methods_argument(self):
+        """A bare ``str`` for ``selection_methods`` must raise."""
+        with pytest.raises(ValueError, match="selection_methods must be a list or tuple of strings"):
+            AutoFeatureEngineer(selection_methods="mutual_info")
+
+    def test_init_rejects_non_sequence_selection_methods_argument(self):
+        """Non-sequence ``selection_methods`` must raise a clear ValueError."""
+        with pytest.raises(ValueError, match="selection_methods must be a list or tuple of strings"):
+            AutoFeatureEngineer(selection_methods=42)
+
+    def test_init_accepts_tuple_engines(self):
+        """Tuples of strings are an acceptable container for ``engines``."""
+        afe = AutoFeatureEngineer(engines=("tabular",))
+        assert afe.engines == ("tabular",)
+
+    def test_set_params_rejects_string_engines_and_rolls_back(self):
+        """``set_params`` inherits the container-type check and rolls back on failure."""
+        afe = AutoFeatureEngineer(engines=["tabular"], max_features=5)
+        with pytest.raises(ValueError, match="engines must be a list or tuple of strings"):
+            afe.set_params(engines="tabular")
+        assert afe.engines == ["tabular"]
+        assert afe.max_features == 5
+
+    def test_fit_accepts_non_string_target_name(self, sample_df, sample_target):
+        """``target_name`` is typed Optional[Any]; integer column labels must work."""
+        # Build a DataFrame with an integer column name that overlaps the target.
+        df = sample_df.copy()
+        df.columns = [0, 1, 2, 3]
+        afe = AutoFeatureEngineer(engines=["tabular"], leakage_guard="raise")
+        # Integer target_name=0 must be honored (and would raise here because
+        # leakage_guard="raise" + a column named 0). This pins the type-hint
+        # contract: non-string target labels are accepted at runtime.
+        with pytest.raises(ValueError, match="leakage-prone"):
+            afe.fit(df, sample_target, target_name=0)
+
+    def test_fit_resets_engine_instances_when_engines_change(self, sample_df, sample_target):
+        """Refitting after removing an engine must drop the previously fitted engine."""
+        afe = AutoFeatureEngineer(engines=["tabular", "timeseries"], verbose=False)
+        afe.fit(sample_df, sample_target)
+        assert set(afe._engine_instances) == {"tabular", "timeseries"}
+
+        afe.set_params(engines=["tabular"])
+        afe.fit(sample_df, sample_target)
+
+        # The previously fitted "timeseries" engine must not survive into the
+        # new fit, otherwise transform() would invoke a stale engine.
+        assert set(afe._engine_instances) == {"tabular"}
+
+    def test_fit_resets_selector_after_prior_fit_transform(self, sample_df, sample_target):
+        """A plain fit() following fit_transform() must clear the selector."""
+        afe = AutoFeatureEngineer(engines=["tabular"], max_features=3, verbose=False)
+        afe.fit_transform(sample_df, sample_target)
+        assert afe._selector is not None
+
+        afe.fit(sample_df, sample_target)
+
+        # Without a selector reset, transform() would still apply the stale
+        # selection from the previous fit_transform call.
+        assert afe._selector is None
+        result = afe.transform(sample_df)
+        # Every input column must survive transform when no selector is active.
+        for col in sample_df.columns:
+            assert col in result.columns
+
+    def test_fit_resets_state_when_called_after_failed_fit(self, sample_df, sample_target, monkeypatch):
+        """If fit raises mid-flight, _is_fitted must be False so transform errors out."""
+        afe = AutoFeatureEngineer(engines=["tabular"], verbose=False)
+        afe.fit(sample_df, sample_target)
+        assert afe._is_fitted is True
+
+        # Force the next fit to fail partway through engine fitting.
+        from featcopilot.engines.tabular import TabularEngine
+
+        def _boom(self, X, y=None, **kwargs):
+            raise RuntimeError("simulated engine failure")
+
+        monkeypatch.setattr(TabularEngine, "fit", _boom)
+        with pytest.raises(RuntimeError, match="simulated engine failure"):
+            afe.fit(sample_df, sample_target)
+
+        # The failed fit must not leave the estimator in a "fitted" state
+        # that points at stale engines from the previous successful fit.
+        assert afe._is_fitted is False
+        assert afe._engine_instances == {}
+        with pytest.raises(RuntimeError, match="Must call fit"):
+            afe.transform(sample_df)
+
+
+class TestPackageImport:
+    """Tests for top-level package import behavior."""
+
+    def test_import_without_installed_metadata_falls_back(self):
+        """Test source import works even when distribution metadata is unavailable."""
+        import importlib.metadata as importlib_metadata
+
+        import featcopilot
+
+        original_version = importlib_metadata.version
+
+        def fake_version(name):
+            if name == "featcopilot":
+                raise importlib_metadata.PackageNotFoundError
+            return original_version(name)
+
+        with patch("importlib.metadata.version", side_effect=fake_version):
+            reloaded = importlib.reload(featcopilot)
+            assert reloaded.__version__ == "0+unknown"
+
+        importlib.reload(featcopilot)
+
     def test_verbose_logging(self, sample_df, sample_target):
         """Test that verbose=True does not error."""
         afe = AutoFeatureEngineer(engines=["tabular"], verbose=True)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 9d2c5e8..75731d3 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -21,6 +21,70 @@
     list_models,
 )
 from featcopilot.utils.parallel import parallel_apply, parallel_transform
+from featcopilot.utils.validation import find_potential_leakage_columns
+
+# ---------------------------------------------------------------------------
+# Validation tests
+# ---------------------------------------------------------------------------
+
+
+def test_leakage_detection_non_string_columns():
+    """Test leakage detection accepts non-string column labels."""
+    assert find_potential_leakage_columns([], target_name="label") == []
+    assert find_potential_leakage_columns([123, "future_label"], target_name="label") == ["future_label"]
+    assert find_potential_leakage_columns([123, "target"], target_name="other") == ["target"]
+    assert find_potential_leakage_columns([123, 456], target_name="label") == []
+    assert find_potential_leakage_columns([123, "feature"], target_name=123) == [123]
+    assert find_potential_leakage_columns(["Churn Label"], target_name="churn_label") == ["Churn Label"]
+    assert find_potential_leakage_columns(["future-target!"], target_name="target") == ["future-target!"]
+
+
+def test_leakage_detection_empty_keywords_disables_keyword_matching():
+    """Passing ``keywords=[]`` must opt out of keyword-based matching entirely."""
+    # Without an explicit empty keywords list, "target" / "future_score" would
+    # be flagged via DEFAULT_LEAKAGE_KEYWORDS. With ``keywords=[]`` and no
+    # ``target_name``, no column should be flagged.
+    assert find_potential_leakage_columns(["target", "future_score", "x"], keywords=[]) == []
+
+    # Explicit ``target_name`` still works alongside ``keywords=[]``.
+    assert find_potential_leakage_columns(
+        ["target", "future_score", "label_x"],
+        target_name="label",
+        keywords=[],
+    ) == ["label_x"]
+
+
+def test_leakage_detection_falsy_target_name_zero_still_matches():
+    """Falsy but meaningful targets (e.g. ``0``) must still drive matching."""
+    # Previously ``if target_name`` would treat ``0`` as absent and skip target
+    # matching entirely. The integer column ``0`` should now be flagged.
+    assert find_potential_leakage_columns([0, "feature"], target_name=0) == [0]
+
+
+def test_leakage_detection_empty_string_target_does_not_match_everything():
+    """``target_name=''`` must not flag every column via the empty-substring trap."""
+    # An empty (or whitespace-only) target normalizes to "" which would otherwise
+    # be treated as a substring of every column. Treating empty normalized
+    # results as absent prevents that.
+    assert find_potential_leakage_columns(["a", "b", "c"], target_name="", keywords=[]) == []
+    assert find_potential_leakage_columns(["a", "b", "c"], target_name="---", keywords=[]) == []
+
+
+def test_leakage_detection_columns_normalizing_to_empty_string_are_skipped():
+    """Columns whose labels normalize to an empty string must not be flagged."""
+    # Without the column-side guard, the empty ``normalized_column`` would be
+    # a substring of every non-empty ``normalized_target`` (``"" in "label"``
+    # is True), so any column literally labeled ``"---"`` / ``"!!!"`` would be
+    # flagged whenever a target was provided. The guard skips such columns.
+    assert find_potential_leakage_columns(["---", "!!!"], target_name="label") == []
+    assert find_potential_leakage_columns(["---", "label_x"], target_name="label", keywords=[]) == ["label_x"]
+    # Mixing meaningful and empty-normalizing column labels still reports only
+    # the meaningful ones.
+    assert find_potential_leakage_columns(["target", "---", "future_x"], target_name="label") == [
+        "target",
+        "future_x",
+    ]
+
 
 # ---------------------------------------------------------------------------
 # FeatureCache tests