# Merged Feature Engineering

## Task Checklist

1. **Aggregate depth=1 and depth=2 tables (max, min)** - DONE
   - MUST_AGG tables aggregated with Polars, incremental AUC evaluation
2. **Split data into Applicant vs others (num_group1)** - NOT DONE
   - Person tables tested but dropped (reduced AUC); no applicant/others split implemented
3. **Active and closed contracts** - DONE
   - Status-based + date-based logic in `build_contract_features()`
4. **Time-windowed aggregations** - DONE
   - Default windows: <=1y, 1-3y, >3y; custom windows for ap1w: <=0.5y, 0.5-2y, >2y
5. **DPD-conditional aggregations** - DONE
   - DPD>=30 and DPD>=90 counts crossed with active/closed status
6. **Aggregate redundancy from multiple tables** - NOT DONE
   - No shard deduplication across table shards (e.g. credit_bureau_a_1_0 through _3)
7. **StratifiedGroupKFold with WEEK_NUM as groups** - PARTIAL
   - Rolling time splits used instead (cuts at WEEK_NUM 50, 60, 70); arguably better for stability metric
8. **Remove features that fluctuate a lot, rank feature importance** - DONE
   - Incremental block evaluation: add block, check AUC delta, keep or drop; rolling stability validation

---

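## Current state: 73 engineered features across 5 retained blocks (the count reaches 74 only when the base table's `WEEK_NUM` column is also passed to the model)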

- cb1 (credit_bureau_b_1) - active/closed contracts, DPD, time windows
- ap1w (applprev_1_1) - custom time windows (<=0.5y, 0.5-2y, >2y)
- dep (deposit_1) - active/closed deposits, time windows
- taxa (tax_registry_a_1) - record presence, time windows
- ap2 (applprev_2) - application history counts

## Still missing (to be built)

- Static tables (static_0_0, static_0_1, static_cb_0) - 1:1 joins, easiest features
- DANGEROUS tables (credit_bureau_a shards) - largest data source, untouched
- Applicant vs others split from person tables
- Shard deduplication
- other_1, tax_registry_b_1, tax_registry_c_1, credit_bureau_b_2

## 1) Setup

In [None]:
from pathlib import Path
import polars as pl
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# UPDATE THIS PATH to your local data directory
DATA_DIR = Path("home_credit_data/csv_files/train")

# Logical table name -> CSV file name inside DATA_DIR.
_TABLE_FILES = {
    "base": "train_base.csv",
    "applprev_1_0": "train_applprev_1_0.csv",
    "applprev_1_1": "train_applprev_1_1.csv",
    "applprev_2": "train_applprev_2.csv",
    "credit_bureau_b_1": "train_credit_bureau_b_1.csv",
    "deposit_1": "train_deposit_1.csv",
    "tax_registry_a_1": "train_tax_registry_a_1.csv",
}

# Logical table name -> full path of the CSV to read.
TABLES = {name: DATA_DIR / file_name for name, file_name in _TABLE_FILES.items()}

## 2) Load base table

In [None]:
base = pl.read_csv(TABLES["base"])

# case_id -> decision date (parsed to pl.Date; unparseable values become null).
# This small frame is what the feature builders join onto each detail table.
base_dates = base.select([
    pl.col("case_id"),
    pl.col("date_decision").str.strptime(pl.Date, strict=False),
])

print(f"Base shape: {base.shape}")
print(f"Target rate: {base['target'].mean():.4f}")
base.head()

## 3) Feature building functions

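Leak-safe aggregation: the active/closed, DPD-conditional and time-windowed counts only use records known at decision time (age_years >= 0); `row_count_all` counts every row of the table regardless of its date.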
Produces active/closed counts, DPD conditional counts, and time-windowed counts.

In [None]:
def build_contract_features(
    table_path: Path,
    prefix: str,
    base_dates: pl.DataFrame,
    event_date_cols: list[str],
    dpd_col: str | None = None,
    active_flag_expr: pl.Expr | None = None,
    closed_flag_expr: pl.Expr | None = None,
) -> pl.DataFrame:
    """Aggregate one detail table into per-``case_id`` contract counts.

    The CSV at ``table_path`` is left-joined with ``base_dates`` on
    ``case_id`` so every row carries ``date_decision``. Each row gets an
    ``event_date`` (first non-null of ``event_date_cols``) and an
    ``age_years`` (decision date minus event date, in years). A row is
    "known" when ``age_years >= 0``; rows with a future or missing event date
    are excluded from every count except ``row_count_all``.

    Args:
        table_path: CSV file to aggregate.
        prefix: Prefix for every output column name.
        base_dates: Frame with ``case_id`` and a ``date_decision`` Date column.
        event_date_cols: Candidate date columns, in priority order. Columns
            missing from the table are skipped. If none are present, every
            row is treated as known and ``age_years`` is null.
        dpd_col: Optional days-past-due column; enables the DPD>=30 and
            DPD>=90 counts. When absent those counts are all zero.
        active_flag_expr: Boolean expression marking active rows. It is
            evaluated after the date columns have been parsed. Defaults to
            "never active".
        closed_flag_expr: Boolean expression marking closed rows, evaluated
            at the same point. Defaults to "never closed".

    Returns:
        One row per ``case_id`` with ``{prefix}_row_count_all``,
        ``{prefix}_known_count``, active/closed totals, active/closed counts
        crossed with DPD>=30 / DPD>=90, active/closed counts for the age
        windows <=1y, 1-3y and >3y, and the two DPD>=30 rates.
    """
    df = pl.read_csv(table_path).join(base_dates, on="case_id", how="left")

    present_date_cols = [c for c in event_date_cols if c in df.columns]
    has_dpd = dpd_col is not None and dpd_col in df.columns

    # Parse dates / cast DPD in place; strict=False turns bad values into nulls.
    parse_exprs = [pl.col(c).str.strptime(pl.Date, strict=False).alias(c) for c in present_date_cols]
    cast_exprs = [pl.col(dpd_col).cast(pl.Float64, strict=False).alias(dpd_col)] if has_dpd else []
    df = df.with_columns(parse_exprs + cast_exprs)

    if present_date_cols:
        df = df.with_columns(pl.coalesce([pl.col(c) for c in present_date_cols]).alias("event_date"))
        df = df.with_columns(
            ((pl.col("date_decision") - pl.col("event_date")).dt.total_days() / 365.25).alias("age_years")
        )
        known_expr = pl.col("age_years") >= 0
    else:
        df = df.with_columns([
            pl.lit(None).cast(pl.Date).alias("event_date"),
            pl.lit(None).cast(pl.Float64).alias("age_years"),
        ])
        known_expr = pl.lit(True)

    if active_flag_expr is None:
        active_flag_expr = pl.lit(False)
    if closed_flag_expr is None:
        closed_flag_expr = pl.lit(False)

    df = df.with_columns([
        active_flag_expr.fill_null(False).alias("is_active"),
        closed_flag_expr.fill_null(False).alias("is_closed"),
    ])

    if has_dpd:
        df = df.with_columns([
            (pl.col(dpd_col) >= 30).fill_null(False).alias("dpd30"),
            (pl.col(dpd_col) >= 90).fill_null(False).alias("dpd90"),
        ])
    else:
        df = df.with_columns([
            pl.lit(False).alias("dpd30"),
            pl.lit(False).alias("dpd90"),
        ])

    # Materialise the mask as a real per-row column. In the no-date fallback
    # the mask is a bare literal, and summing a bare literal inside `agg` is
    # not a per-row count; summing a column always is. Mapping null -> False
    # leaves every sum unchanged because null rows never contributed.
    df = df.with_columns(known_expr.fill_null(False).alias("is_known"))

    known = pl.col("is_known")
    active = known & pl.col("is_active")
    closed = known & pl.col("is_closed")
    dpd30 = pl.col("dpd30")
    dpd90 = pl.col("dpd90")
    age = pl.col("age_years")
    within_1y = age <= 1
    from_1_to_3y = (age > 1) & (age <= 3)
    beyond_3y = age > 3

    def count(mask: pl.Expr, name: str) -> pl.Expr:
        # Number of rows in the group where `mask` is true.
        return mask.sum().alias(f"{prefix}_{name}")

    agg = df.group_by("case_id").agg([
        pl.len().alias(f"{prefix}_row_count_all"),
        count(known, "known_count"),

        count(active, "active_count_all"),
        count(closed, "closed_count_all"),

        count(active & dpd30, "active_dpd30_count_all"),
        count(active & dpd90, "active_dpd90_count_all"),
        count(closed & dpd30, "closed_dpd30_count_all"),
        count(closed & dpd90, "closed_dpd90_count_all"),

        count(active & within_1y, "active_count_le1y"),
        count(closed & within_1y, "closed_count_le1y"),

        count(active & from_1_to_3y, "active_count_1to3y"),
        count(closed & from_1_to_3y, "closed_count_1to3y"),

        count(active & beyond_3y, "active_count_gt3y"),
        count(closed & beyond_3y, "closed_count_gt3y"),
    ])

    def dpd30_rate(status: str) -> pl.Expr:
        # Share of known `status` contracts with DPD>=30; 0.0 when there are none.
        total = pl.col(f"{prefix}_{status}_count_all")
        return (
            pl.when(total > 0)
            .then(pl.col(f"{prefix}_{status}_dpd30_count_all") / total)
            .otherwise(0.0)
            .alias(f"{prefix}_{status}_dpd30_rate")
        )

    return agg.with_columns([dpd30_rate("active"), dpd30_rate("closed")])

In [None]:
def build_contract_features_custom_windows(
    table_path: Path,
    prefix: str,
    base_dates: pl.DataFrame,
    event_date_cols: list[str],
    windows=((0,1),(1,3),(3,999)),
    dpd_col: str | None = None,
    active_flag_expr: pl.Expr | None = None,
    closed_flag_expr: pl.Expr | None = None,
) -> pl.DataFrame:
    """Aggregate one detail table into active/closed counts per age window.

    The CSV at ``table_path`` is left-joined with ``base_dates`` on
    ``case_id``. Each row gets an ``event_date`` (first non-null of
    ``event_date_cols``) and an ``age_years`` (decision date minus event date,
    in years). Only rows with ``age_years >= 0`` are counted, except in
    ``row_count_all``.

    Args:
        table_path: CSV file to aggregate.
        prefix: Prefix for every output column name.
        base_dates: Frame with ``case_id`` and a ``date_decision`` Date column.
        event_date_cols: Candidate date columns, in priority order. Columns
            missing from the table are skipped. If none are present, every
            row is treated as known and ``age_years`` is null, so all window
            counts are zero.
        windows: ``(lo, hi)`` age bounds in years. A window covers
            ``lo < age <= hi``, except that a window starting at ``lo <= 0``
            includes its lower bound. A ``hi`` of ``None`` or ``>= 999``
            means open-ended. Output columns are tagged ``{lo}to{hi}`` or
            ``gt{lo}``.
        dpd_col: Cast to Float64 when present, but no DPD features are
            produced by this function.
        active_flag_expr: Boolean expression marking active rows, evaluated
            after date parsing. Defaults to "never active".
        closed_flag_expr: Boolean expression marking closed rows, evaluated
            at the same point. Defaults to "never closed".

    Returns:
        One row per ``case_id`` with ``{prefix}_row_count_all``, the
        active/closed totals and one active/closed count pair per window.
    """
    df = pl.read_csv(table_path).join(base_dates, on="case_id", how="left")

    present_date_cols = [c for c in event_date_cols if c in df.columns]
    has_dpd = dpd_col is not None and dpd_col in df.columns

    # Parse dates / cast DPD in place; strict=False turns bad values into nulls.
    parse_exprs = [pl.col(c).str.strptime(pl.Date, strict=False).alias(c) for c in present_date_cols]
    cast_exprs = [pl.col(dpd_col).cast(pl.Float64, strict=False).alias(dpd_col)] if has_dpd else []
    df = df.with_columns(parse_exprs + cast_exprs)

    if present_date_cols:
        event_date = pl.coalesce([pl.col(c) for c in present_date_cols])
        df = df.with_columns([
            event_date.alias("event_date"),
            ((pl.col("date_decision") - event_date).dt.total_days() / 365.25).alias("age_years"),
        ])
        known = pl.col("age_years") >= 0
    else:
        df = df.with_columns([pl.lit(None).cast(pl.Float64).alias("age_years")])
        known = pl.lit(True)

    if active_flag_expr is None:
        active_flag_expr = pl.lit(False)
    if closed_flag_expr is None:
        closed_flag_expr = pl.lit(False)

    df = df.with_columns([
        active_flag_expr.fill_null(False).alias("is_active"),
        closed_flag_expr.fill_null(False).alias("is_closed"),
    ])

    active = known & pl.col("is_active")
    closed = known & pl.col("is_closed")
    age = pl.col("age_years")

    aggs = [
        pl.len().alias(f"{prefix}_row_count_all"),
        active.sum().alias(f"{prefix}_active_count_all"),
        closed.sum().alias(f"{prefix}_closed_count_all"),
    ]

    for lo, hi in windows:
        open_ended = hi is None or hi >= 999
        tag = f"gt{lo}" if open_ended else f"{lo}to{hi}"
        # A window starting at zero must include age == 0 (event on the
        # decision date). Such rows pass the known filter, so a strict
        # lower bound would leave them out of every window.
        above_lo = (age >= lo) if lo <= 0 else (age > lo)
        in_window = above_lo if open_ended else above_lo & (age <= hi)
        aggs += [
            (active & in_window).sum().alias(f"{prefix}_active_count_{tag}"),
            (closed & in_window).sum().alias(f"{prefix}_closed_count_{tag}"),
        ]

    return df.group_by("case_id").agg(aggs)

## 4) Build retained feature blocks

5 blocks retained after incremental AUC evaluation:
- cb1: credit_bureau_b_1
- ap1w: applprev_1_1 (custom windows)
- dep: deposit_1
- taxa: tax_registry_a_1
- ap2: applprev_2

In [None]:
# cb1: credit_bureau_b_1
# A contract is active when its maturity date is on or after the decision
# date and closed when it is before it. The maturity column is not one of the
# event date columns, so it is still a string here and is parsed explicitly.
_cb1_maturity = pl.col("contractmaturitydate_151D").str.strptime(pl.Date, strict=False)
_cb1_decision = pl.col("date_decision")

cb1_agg = build_contract_features(
    table_path=TABLES["credit_bureau_b_1"],
    prefix="cb1",
    base_dates=base_dates,
    event_date_cols=["lastupdate_260D", "contractdate_551D"],
    dpd_col="dpd_550P",
    active_flag_expr=(_cb1_maturity >= _cb1_decision),
    closed_flag_expr=(_cb1_maturity < _cb1_decision),
)
print(f"cb1: {cb1_agg.shape}")

In [None]:
# ap1w: applprev_1_1 with best window scheme (<=0.5y, 0.5-2y, >2y)
# Status code "A" marks a row as active; codes "D", "K" and "T" mark it closed.
_ap1w_status = pl.col("status_219L")

ap1w_agg = build_contract_features_custom_windows(
    table_path=TABLES["applprev_1_1"],
    prefix="ap1w",
    base_dates=base_dates,
    event_date_cols=["dateactivated_425D", "approvaldate_319D", "creationdate_885D"],
    windows=((0, 0.5), (0.5, 2), (2, 999)),
    dpd_col=None,
    active_flag_expr=_ap1w_status.is_in(["A"]),
    closed_flag_expr=_ap1w_status.is_in(["D", "K", "T"]),
)
print(f"ap1w: {ap1w_agg.shape}")

In [None]:
# dep: deposit_1
# The end date is listed in event_date_cols, so the builder has already parsed
# it to a Date by the time these flags are evaluated.
# NOTE(review): rows with a null end date are flagged neither active nor
# closed (null comparison -> False) - confirm that is the intended handling.
_dep_end = pl.col("contractenddate_991D")
_dep_decision = pl.col("date_decision")

dep_agg = build_contract_features(
    table_path=TABLES["deposit_1"],
    prefix="dep",
    base_dates=base_dates,
    event_date_cols=["openingdate_313D", "contractenddate_991D"],
    dpd_col=None,
    active_flag_expr=(_dep_end >= _dep_decision),
    closed_flag_expr=(_dep_end < _dep_decision),
)
print(f"dep: {dep_agg.shape}")

In [None]:
# taxa: tax_registry_a_1
# Every row is flagged active and none closed, so the "active" counts act as
# record-presence counts (overall and per time window); no DPD column is used.
taxa_agg = build_contract_features(
    prefix="taxa",
    table_path=TABLES["tax_registry_a_1"],
    base_dates=base_dates,
    event_date_cols=["recorddate_4527225D"],
    active_flag_expr=pl.lit(True),
    closed_flag_expr=pl.lit(False),
    dpd_col=None,
)
print(f"taxa: {taxa_agg.shape}")

In [None]:
# ap2: applprev_2 (no event dates or DPD, just row counts)
ap2 = pl.read_csv(TABLES["applprev_2"]).join(base_dates, on="case_id", how="left")

# Columns that all hold the per-case row count.
_AP2_ROW_COUNT_COLS = [
    "ap2_row_count_all",
    "ap2_known_count",
    "ap2_active_count_all",
]
# Placeholder count columns that are constant zero (Int64); they mirror the
# column layout produced by build_contract_features.
_AP2_ZERO_COUNT_COLS = [
    "ap2_closed_count_all",
    "ap2_active_dpd30_count_all",
    "ap2_active_dpd90_count_all",
    "ap2_closed_dpd30_count_all",
    "ap2_closed_dpd90_count_all",
    "ap2_active_count_le1y",
    "ap2_closed_count_le1y",
    "ap2_active_count_1to3y",
    "ap2_closed_count_1to3y",
    "ap2_active_count_gt3y",
    "ap2_closed_count_gt3y",
]
# Placeholder rate columns that are constant 0.0.
_AP2_ZERO_RATE_COLS = [
    "ap2_active_dpd30_rate",
    "ap2_closed_dpd30_rate",
]

_ap2_counts = ap2.group_by("case_id").agg(
    [pl.len().alias(col_name) for col_name in _AP2_ROW_COUNT_COLS]
    + [pl.lit(0).sum().cast(pl.Int64).alias(col_name) for col_name in _AP2_ZERO_COUNT_COLS]
)
ap2_agg = _ap2_counts.with_columns(
    [pl.lit(0.0).alias(col_name) for col_name in _AP2_ZERO_RATE_COLS]
)
print(f"ap2: {ap2_agg.shape}")

## 5) Merge all blocks into base

In [None]:
# Left-join every retained block onto the base table. Cases with no rows in a
# detail table get nulls from the join, which are replaced by 0.
model_df = (
    base
    .join(cb1_agg, on="case_id", how="left")
    .join(ap1w_agg, on="case_id", how="left")
    .join(dep_agg, on="case_id", how="left")
    .join(taxa_agg, on="case_id", how="left")
    .join(ap2_agg, on="case_id", how="left")
    .with_columns([
        pl.col("^cb1_.*$").fill_null(0),
        pl.col("^ap1w_.*$").fill_null(0),
        pl.col("^dep_.*$").fill_null(0),
        pl.col("^taxa_.*$").fill_null(0),
        pl.col("^ap2_.*$").fill_null(0),
    ])
)

# Identifier, date/time-index and target columns are not model inputs.
# WEEK_NUM is excluded because the evaluation splits train/validation on it:
# used as a feature, every validation row would carry a value outside the
# range seen in training.
non_feature_cols = {"case_id", "date_decision", "MONTH", "WEEK_NUM", "target"}
feature_cols = [c for c in model_df.columns if c not in non_feature_cols]
print(f"Final shape: {model_df.shape}")
print(f"Feature count: {len(feature_cols)}")
print(f"Features: {feature_cols}")

## 6) Evaluation helpers

In [None]:
def run_auc_for_cut(df, feature_cols, cut_week):
    """Train on weeks up to ``cut_week`` and return the AUC on later weeks.

    Rows with ``WEEK_NUM <= cut_week`` form the training set and rows with
    ``WEEK_NUM > cut_week`` the validation set. A
    ``HistGradientBoostingClassifier`` with fixed hyper-parameters and seed is
    fitted on ``feature_cols`` against ``target``, and the ROC AUC of its
    positive-class probabilities on the validation set is returned.
    """
    week = pl.col("WEEK_NUM")

    def to_xy(part):
        # Feature matrix and target vector of one split, as pandas objects.
        return part.select(feature_cols).to_pandas(), part["target"].to_pandas()

    X_train, y_train = to_xy(df.filter(week <= cut_week))
    X_valid, y_valid = to_xy(df.filter(week > cut_week))

    model = HistGradientBoostingClassifier(
        max_depth=6,
        learning_rate=0.05,
        max_iter=200,
        random_state=42,
    )
    model.fit(X_train, y_train)

    positive_proba = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, positive_proba)


def eval_stability(df, feature_cols, cuts=(50, 60, 70), label="model"):
    """Evaluate AUC at several time cuts and summarise its spread.

    Calls ``run_auc_for_cut`` once per entry of ``cuts``.

    Returns:
        A pair of pandas DataFrames. The first has one row per cut with the
        columns ``model``, ``cut_week`` and ``auc``. The second has a single
        row with ``model`` and the mean, population standard deviation
        (``ddof=0``), minimum and maximum of the per-cut AUCs.
    """
    per_cut = pd.DataFrame(
        [
            {"model": label, "cut_week": week, "auc": run_auc_for_cut(df, feature_cols, week)}
            for week in cuts
        ]
    )

    auc_values = per_cut["auc"]
    overall = pd.DataFrame(
        [
            {
                "model": label,
                "mean_auc": auc_values.mean(),
                "std_auc": auc_values.std(ddof=0),
                "min_auc": auc_values.min(),
                "max_auc": auc_values.max(),
            }
        ]
    )
    return per_cut, overall

## 7) Validate

In [None]:
# The label is derived from the actual feature count so it cannot drift from
# the feature list being evaluated.
detail, summary = eval_stability(
    model_df,
    feature_cols,
    cuts=(50, 60, 70),
    label=f"darren_{len(feature_cols)}_features",
)
print(detail)
print()
print(summary)