# IEEE Fraud Detection - Complete Feature Engineering Pipeline

This notebook implements a comprehensive fraud detection pipeline with:
- Data loading and preprocessing
- V-column correlation analysis and reduction
- Feature engineering with UID aggregates
- Walk-forward time validation (6 months)
- Time-consistent target encoding
- Feature stability testing


## 1. Data Loading and Initial Preprocessing

In [1]:
import pandas as pd
import numpy as np
import warnings
import os
import gc
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
warnings.filterwarnings("ignore")

# --- Globals & Constants ---
PATH = "../data/raw/"
TRAIN_TRAN = PATH + "train_transaction.csv"
TRAIN_ID = PATH + "train_identity.csv"
TEST_TRAN = PATH + "test_transaction.csv"
TEST_ID = PATH + "test_identity.csv"
TARGET = "isfraud"

# Load CSVs
print("Loading training data...")
train_transaction = pd.read_csv(TRAIN_TRAN)
train_identity = pd.read_csv(TRAIN_ID)

print("Loading test data...")
test_transaction = pd.read_csv(TEST_TRAN)
test_identity = pd.read_csv(TEST_ID)

# Merge train
train_df = train_transaction.merge(train_identity, on="TransactionID", how="left")
print(f"Train shape: {train_df.shape}")

# Merge test
test_df = test_transaction.merge(test_identity, on="TransactionID", how="left")
print(f"Test shape: {test_df.shape}")

# Quick peek
print("Train columns:", train_df.shape[1])
print("Test columns:", test_df.shape[1])

Loading training data...
Loading test data...
Train shape: (590540, 434)
Test shape: (506691, 433)
Train columns: 434
Test columns: 433


## 2. Column Normalization and High-Missing Column Removal

In [2]:
def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = df.columns.str.lower().str.replace("-", "_").str.replace(" ", "_")
    return df

def drop_high_missing(train_df: pd.DataFrame, test_df: pd.DataFrame, thresh: float = 0.98):
    miss_train = train_df.isna().mean()
    miss_test = test_df.isna().mean()
    drop_cols = [c for c in train_df.columns
                 if (miss_train.get(c,0) > thresh) and (miss_test.get(c,0) > thresh)]
    keep_cols = [c for c in train_df.columns if c not in drop_cols]
    print(f"Dropping {len(drop_cols)} columns (>{thresh*100:.0f}% missing in both train/test)")
    return (train_df[keep_cols],
            test_df[[c for c in keep_cols if c in test_df.columns]],
            drop_cols)

train_df = normalize_cols(train_df)
test_df = normalize_cols(test_df)
train_df, test_df, dropped_cols = drop_high_missing(train_df, test_df, thresh=0.98)
print("Initial clean up - Train:", len(train_df.columns), "Test:", len(test_df.columns))

Dropping 9 columns (>98% missing in both train/test)
Initial clean up - Train: 425 Test: 424


## 3. V-Column Correlation Analysis and Reduction

Based on Kaggle EDA analysis, we reduce correlated V columns from groups

In [3]:
# V-column reduction based on correlation analysis
# These are the optimal V columns identified from correlation analysis
V_REDUCED = [
    # V1-V11 block: [1, 3, 4, 6, 8, 11]
    1, 3, 4, 6, 8, 11,
    # V12-V34 block: [13, 14, 17, 20, 23, 26, 27, 30] 
    13, 14, 17, 20, 23, 26, 27, 30,
    # V35-V52 block: [36, 37, 40, 41, 44, 47, 48]
    36, 37, 40, 41, 44, 47, 48,
    # V53-V74 block: [54, 56, 59, 62, 65, 67, 68, 70]
    54, 56, 59, 62, 65, 67, 68, 70,
    # V75-V94 block: [76, 78, 80, 82, 86, 88, 89, 91]
    76, 78, 80, 82, 86, 88, 89, 91,
    # V95-V106 block: [96, 98, 99, 104]
    96, 98, 99, 104,
    # V107-V123 block: [107, 108, 111, 115, 117, 120, 121, 123]
    107, 108, 111, 115, 117, 120, 121, 123,
    # V124-V137 block: [124, 127, 129, 130, 136]
    124, 127, 129, 130, 136,
    # V138-V163 blocks
    138, 139, 142, 147, 156, 162, 165, 160, 166,
    # V167-V216 blocks  
    178, 176, 173, 182, 187, 203, 205, 207, 215,
    169, 171, 175, 180, 185, 188, 198, 210, 209,
    # V217-V278 blocks
    218, 223, 224, 226, 228, 229, 235,
    240, 258, 257, 253, 252, 260, 261,
    264, 266, 267, 274, 277,
    220, 221, 234, 238, 250, 271,
    # V279-V321 blocks
    294, 284, 285, 286, 291, 297,
    303, 305, 307, 309, 310, 320,
    # V281-V315 block: [281, 283, 289, 296, 301, 314]
    281, 283, 289, 296, 301, 314,
    # V322-V339 block: [332, 325, 335, 338]
    332, 325, 335, 338
]

def apply_v_reduction(df: pd.DataFrame, v_keep: list) -> pd.DataFrame:
    v_cols_all = [c for c in df.columns if c.startswith('v') and c[1:].isdigit()]
    v_cols_keep = [f'v{i}' for i in v_keep if f'v{i}' in df.columns]
    v_cols_drop = [c for c in v_cols_all if c not in v_cols_keep]
    print(f"V-columns: keeping {len(v_cols_keep)}, dropping {len(v_cols_drop)}")
    return df.drop(columns=v_cols_drop)

train_df = apply_v_reduction(train_df, V_REDUCED)
test_df = apply_v_reduction(test_df, V_REDUCED)
print(f"After V-reduction - Train: {train_df.shape}, Test: {test_df.shape}")

V-columns: keeping 128, dropping 211
V-columns: keeping 128, dropping 211
After V-reduction - Train: (590540, 214), Test: (506691, 213)


## 4. Base Feature Engineering

In [4]:
def as_str_key(s: pd.Series, na_token="__na__") -> pd.Series:
    return s.astype("string").fillna(na_token).astype(str)

DOMAIN_MAP = {
    "googlemail.com": "gmail.com", "hotmail.co.uk": "hotmail.com", "hotmail.fr": "hotmail.com",
    "ymail.com": "yahoo.com", "yahoo.co.jp": "yahoo.jp", "yahoo.co.uk": "yahoo.com",
    "live.com.mx": "live.com", "outlook.com.br": "outlook.com", "icloud.com.cn": "icloud.com",
}
MULTIPART_TLDS = {"co.uk","ac.uk","gov.uk","com.au","net.au","org.au","com.br","com.ar",
                  "com.mx","com.tr","com.cn","com.hk","com.sg","co.jp"}
FREEMAIL = {"gmail.com","yahoo.com","yahoo.jp","hotmail.com","outlook.com","live.com",
            "aol.com","icloud.com","me.com","mac.com","msn.com","protonmail.com",
            "gmx.com","gmx.de","yandex.ru","mail.ru","qq.com","163.com",
            "126.com","sina.com","sohu.com","orange.fr","free.fr","wanadoo.fr",
            "libero.it","web.de","naver.com"}

def normalize_domain(x: str) -> str:
    if not isinstance(x, str): return "__na__"
    d = x.strip().lower()
    return DOMAIN_MAP.get(d, d)

def parent_domain(d: str) -> str:
    if d in ("__na__", "", None): return "__na__"
    parts = d.split(".")
    if len(parts) < 2: return d
    last2 = ".".join(parts[-2:])
    last3 = ".".join(parts[-3:])
    return last3 if last2 in MULTIPART_TLDS and len(parts) >= 3 else last2

def add_email_features(df: pd.DataFrame) -> pd.DataFrame:
    for side in ["p", "r"]:
        src_col = f"{side}_emaildomain"
        s = df[src_col].apply(normalize_domain) if src_col in df.columns else pd.Series(["__na__"]*len(df))
        par = s.apply(parent_domain)
        df[f"{side}_ed_parent"] = par
        df[f"{side}_is_freemail"] = par.isin(FREEMAIL).astype("int8")
    df["email_parent_match"] = (df["p_ed_parent"] == df["r_ed_parent"]).astype("int8")
    return df

def add_device_features(df: pd.DataFrame) -> pd.DataFrame:
    def os_family(x):
        if not isinstance(x, str): return "__na__"
        x = x.lower()
        if "windows" in x: return "windows"
        if "ios" in x: return "ios"
        if "mac" in x: return "macos"
        if "android" in x: return "android"
        if "linux" in x: return "linux"
        return "__other__"
    df["id_30_os"] = df["id_30"].apply(os_family) if "id_30" in df else pd.Series(["__na__"]*len(df))
    return df

def add_time_primitives(df: pd.DataFrame, col="transactiondt") -> pd.DataFrame:
    if col not in df.columns: raise KeyError(f"Expected '{col}' column")
    dt = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(np.int64)
    df["dt"] = dt
    df["day"] = (dt // (24 * 60 * 60)).astype(np.int32)
    df["hour"] = ((dt // (60 * 60)) % 24).astype(np.int16)
    df["dow"] = (df["day"] % 7).astype(np.int8)
    return df

def add_amount_primitives(df: pd.DataFrame) -> pd.DataFrame:
    amt = pd.to_numeric(df["transactionamt"], errors="coerce") if "transactionamt" in df else pd.Series([np.nan]*len(df))
    df["log_amt"] = np.log1p(amt.astype(float))
    df["amt_cents"] = np.round((amt.astype(float) % 1) * 100).astype("float32")
    df["amt_is_round"] = ((amt.astype(float) % 1) == 0).astype("int8")
    return df

print("Base feature engineering...")
train_df = add_email_features(train_df)
train_df = add_device_features(train_df)
train_df = add_time_primitives(train_df, col="transactiondt")
train_df = add_amount_primitives(train_df)

test_df = add_email_features(test_df)
test_df = add_device_features(test_df)
test_df = add_time_primitives(test_df, col="transactiondt")
test_df = add_amount_primitives(test_df)
print(f"Base engineering complete - Train: {train_df.shape}, Test: {test_df.shape}")

Base feature engineering...
Base engineering complete - Train: (590540, 227), Test: (506691, 226)


## 5. UID Creation and Magic Features

In [5]:
def add_uids(df: pd.DataFrame) -> pd.DataFrame:
    c1 = as_str_key(df.get("card1"))
    a1 = as_str_key(df.get("addr1"))
    ped = as_str_key(df.get("p_ed_parent"))
    df["uid1"] = c1
    df["uid2"] = c1 + "_" + a1
    df["uid3"] = df["uid2"] + "_" + ped
    if "d1" in df.columns and "day" in df.columns:
        anch = np.floor((df["day"].astype("float64") - pd.to_numeric(df["d1"], errors="coerce"))).astype("Int64")
        anch = anch.astype("string").fillna("__na__").astype(str)
    else:
        anch = pd.Series(["__na__"] * len(df), index=df.index)
    df["uid4"] = anch + "_" + ped
    return df

# Create UIDs on train and test separately
train_df = add_uids(train_df)
test_df = add_uids(test_df)

# --- CORRECTED: LEAK-SAFE AGGREGATE FEATURE ENGINEERING ---
# Define a single function that fits on a training set and transforms both
# training and test sets. This ensures no information from the test set
# leaks into the model.

def add_agg_features(train: pd.DataFrame, test: pd.DataFrame):
    all_dfs = {"train": train, "test": test}
    
    # 5.1 UID Time Aggregates (Leak-safe)
    for k in ["uid2", "uid3"]:
        print(f"Generating leak-safe aggregates for '{k}'...")
        
        # Fit on training data ONLY
        agg_train = train.groupby(k).agg(
            mean_dt=( "dt", lambda x: x.diff().mean()),
            std_dt=( "dt", lambda x: x.diff().std()),
            mean_amt=("transactionamt", "mean"),
            std_amt=("transactionamt", "std"),
            count_amt=("transactionamt", "count"),
        ).add_suffix("_" + k).astype("float32")
        
        # Merge with both train and test
        all_dfs["train"] = all_dfs["train"].merge(agg_train, on=k, how="left")
        all_dfs["test"] = all_dfs["test"].merge(agg_train, on=k, how="left")
        
        # Add past-only deltas (must be done on sorted data)
        train = train.sort_values([k, "dt"])
        test = test.sort_values([k, "dt"])
        train[f"{k}_secs_since_prev"] = train.groupby(k)["dt"].diff().astype("float32")
        test[f"{k}_secs_since_prev"] = test.groupby(k)["dt"].diff().astype("float32")
        
        all_dfs["train"][f"{k}_secs_since_prev"] = train[f"{k}_secs_since_prev"]
        all_dfs["test"][f"{k}_secs_since_prev"] = test[f"{k}_secs_since_prev"]
        
        # Add a log transformation of the delta
        all_dfs["train"][f"{k}_secs_since_prev_log"] = np.log1p(
            all_dfs["train"][f"{k}_secs_since_prev"].fillna(0).clip(0, 3*24*3600)
        ).astype("float32")
        all_dfs["test"][f"{k}_secs_since_prev_log"] = np.log1p(
            all_dfs["test"][f"{k}_secs_since_prev"].fillna(0).clip(0, 3*24*3600)
        ).astype("float32")
        
        # Add absolute deviation
        all_dfs["train"][f"{k}_amt_devabs"] = (
            all_dfs["train"]["transactionamt"] - all_dfs["train"][f"mean_amt_{k}"]
        ).abs().astype("float32")
        all_dfs["test"][f"{k}_amt_devabs"] = (
            all_dfs["test"]["transactionamt"] - all_dfs["test"][f"mean_amt_{k}"]
        ).abs().astype("float32")
        
    # 5.2 Frequency Encoding (Leak-safe)
    freq_cols = [
        "productcd","card1","card2","card3","card4","card5","card6",
        "addr1","addr2","p_ed_parent","r_ed_parent",
        "id_30_os","id_31_br","devicetype","devicebrand","res_bucket",
        "uid1","uid2","uid3","uid4"
    ]
    for col in freq_cols:
        if col not in train.columns: continue
        print(f"Performing leak-safe frequency encoding for '{col}'...")
        vc = train[col].value_counts(dropna=False)
        all_dfs["train"][col + "_fe"] = all_dfs["train"][col].map(vc).astype("float32")
        all_dfs["test"][col + "_fe"] = all_dfs["test"][col].map(vc).astype("float32")

    # 5.3 Missingness Signatures (Leak-safe)
    def add_missingness_signatures(df: pd.DataFrame):
        miss = df.isna().sum(axis=1).astype("int32")
        df["na_ct_all"] = miss
        df["na_ratio_all"] = (miss / df.shape[1]).astype("float32")
        for fam in ["c","d","m","v","id_"]:
            fam_cols = [c for c in df.columns if c.startswith(fam)]
            if not fam_cols: continue
            mct = df[fam_cols].isna().sum(axis=1).astype("int32")
            df[f"na_ct_{fam}"] = mct
            df[f"na_ratio_{fam}"] = (mct / max(1, len(fam_cols))).astype("float32")
        return df

    all_dfs["train"] = add_missingness_signatures(all_dfs["train"])
    all_dfs["test"] = add_missingness_signatures(all_dfs["test"])
    
    return all_dfs["train"], all_dfs["test"]

# Apply the leak-safe feature engineering
train_df, test_df = add_agg_features(train_df, test_df)

print("\nFinal shapes after leak-safe engineering:")
print(f"Train: {train_df.shape}, Test: {test_df.shape}")

# Clean up target
if TARGET in test_df.columns:
    test_df.drop(columns=[TARGET], inplace=True)
if TARGET in train_df.columns:
    train_df[TARGET] = pd.to_numeric(train_df[TARGET], errors="coerce").fillna(0).astype("int8")
print("Target stats:", train_df[TARGET].value_counts(normalize=True).round(4))

Generating leak-safe aggregates for 'uid2'...
Generating leak-safe aggregates for 'uid3'...
Performing leak-safe frequency encoding for 'productcd'...
Performing leak-safe frequency encoding for 'card1'...
Performing leak-safe frequency encoding for 'card2'...
Performing leak-safe frequency encoding for 'card3'...
Performing leak-safe frequency encoding for 'card4'...
Performing leak-safe frequency encoding for 'card5'...
Performing leak-safe frequency encoding for 'card6'...
Performing leak-safe frequency encoding for 'addr1'...
Performing leak-safe frequency encoding for 'addr2'...
Performing leak-safe frequency encoding for 'p_ed_parent'...
Performing leak-safe frequency encoding for 'r_ed_parent'...
Performing leak-safe frequency encoding for 'id_30_os'...
Performing leak-safe frequency encoding for 'devicetype'...
Performing leak-safe frequency encoding for 'uid1'...
Performing leak-safe frequency encoding for 'uid2'...
Performing leak-safe frequency encoding for 'uid3'...
Perform

## 6. Expanding Walk-Forward Time Folds (6 Months)

Modified to use 6 months of data with proper expanding walk-forward validation

In [6]:
def add_month_index(df: pd.DataFrame, days_per_month: int = 30, col_day: str = "day") -> pd.DataFrame:
    if col_day not in df.columns: raise KeyError("Expected 'day' column. Run time primitives first.")
    d0 = int(df[col_day].min())
    df["month_ix"] = ((df[col_day] - d0) // days_per_month).astype("int16")
    return df

def make_expanding_folds(df, group_col="month_ix", min_train_months=3, valid_months=1, min_train_rows=50000, min_valid_rows=10000):
    groups = np.sort(df[group_col].unique())
    folds = []
    for end in range(min_train_months, len(groups) - valid_months + 1):
        tr_groups = set(groups[:end])
        va_groups = set(groups[end:end+valid_months])
        tr_idx = df.index[df[group_col].isin(tr_groups)].to_numpy()
        va_idx = df.index[df[group_col].isin(va_groups)].to_numpy()
        if len(tr_idx) >= min_train_rows and len(va_idx) >= min_valid_rows:
            folds.append((tr_idx, va_idx))
            print(f"Fold {len(folds)}: Train months {list(tr_groups)} -> Valid months {list(va_groups)} "
                  f"(Train: {len(tr_idx)}, Valid: {len(va_idx)})")
    return folds

train_df = add_month_index(train_df, days_per_month=30, col_day="day")
folds = make_expanding_folds(train_df, group_col="month_ix", min_train_months=3, valid_months=1, min_train_rows=50000, min_valid_rows=1000)
print(f"\nCreated {len(folds)} walk-forward folds")

Fold 1: Train months [0, 1, 2] -> Valid months [3] (Train: 315927, Valid: 98615)
Fold 2: Train months [0, 1, 2, 3] -> Valid months [4] (Train: 414542, Valid: 83571)
Fold 3: Train months [0, 1, 2, 3, 4] -> Valid months [5] (Train: 498113, Valid: 86934)
Fold 4: Train months [0, 1, 2, 3, 4, 5] -> Valid months [6] (Train: 585047, Valid: 5493)

Created 4 walk-forward folds


## 7. Time-Aligned Target Encoding

In [8]:
def _fit_target_mean(train_y, train_cat, m=100.0, prior=None):
    y = pd.Series(train_y, index=train_cat.index)
    if prior is None: prior = float(y.mean())
    stats = pd.DataFrame({"y": y.values, "c": train_cat.values})
    grp = stats.groupby("c")["y"].agg(["sum","count"])
    enc = (grp["sum"] + prior * m) / (grp["count"] + m)
    return enc.astype("float32"), float(prior)

def _apply_target_mean(cat_series, enc_map, prior):
    vals = cat_series.map(enc_map)
    return vals.fillna(prior).astype("float32")

def oof_target_encode_walkforward(train_df, test_df, cols, folds, target=TARGET, m_smooth=100.0, suffix="_te_wf"):
    if target not in train_df.columns: raise KeyError(f"Target '{target}' not found")
    y_all = pd.to_numeric(train_df[target], errors="coerce").fillna(0).astype("int8")
    oof_mat = {f"{c}{suffix}": np.full(len(train_df), np.nan, dtype="float32") for c in cols}
    test_mat = {f"{c}{suffix}": None for c in cols}
    for c in cols:
        if c not in train_df.columns:
            print(f"[OOF-TE] skip '{c}' (missing)")
            continue
        tr_cat = train_df[c].astype("string").fillna("__na__")
        te_cat = test_df[c].astype("string").fillna("__na__") if c in test_df.columns else pd.Series(["__na__"]*len(test_df))
        for k, (tr_idx, va_idx) in enumerate(folds, 1):
            enc_map, prior = _fit_target_mean(y_all.iloc[tr_idx], tr_cat.iloc[tr_idx], m=m_smooth)
            oof_vals = _apply_target_mean(tr_cat.iloc[va_idx], enc_map, prior)
            oof_mat[f"{c}{suffix}"][va_idx] = oof_vals.values
        enc_full, prior_full = _fit_target_mean(y_all, tr_cat, m=m_smooth)
        test_vals = _apply_target_mean(te_cat, enc_full, prior_full)
        test_mat[f"{c}{suffix}"] = test_vals.values.astype("float32")
    for new_col, arr in oof_mat.items(): train_df[new_col] = arr
    for new_col, arr in test_mat.items():
        if arr is not None: test_df[new_col] = arr
    first = next(iter(oof_mat.values()))
    cov = float(np.mean(~np.isnan(first)))
    print(f"Walk-forward OOF-TE done for {len(cols)} cols | OOF coverage: {cov:.3f}")
    return train_df, test_df

te_cols = [
    "uid1","uid2","uid3","uid4", "productcd","card1","card2","card3","card4","card5","card6",
    "addr1","addr2", "p_ed_parent","r_ed_parent", "id_30_os","id_31_br","devicetype","devicebrand","res_bucket"
]
te_cols = [c for c in te_cols if c in train_df.columns]
train_df, test_df = oof_target_encode_walkforward(
    train_df=train_df, test_df=test_df, cols=te_cols, folds=folds, target=TARGET, m_smooth=100.0, suffix="_te_wf"
)
print("Encoded columns:", [col for col in train_df.columns if col.endswith("_te_wf")][:5])

Walk-forward OOF-TE done for 17 cols | OOF coverage: 0.465
Encoded columns: ['uid1_te_wf', 'uid2_te_wf', 'uid3_te_wf', 'uid4_te_wf', 'productcd_te_wf']


## 8. Time Consistency Feature Testing

In [9]:
# Feature stability testing using single-feature models
WHITELIST = {
    "transactionamt","log_amt","amt_cents","amt_is_round",
    "hour","dow",
    "productcd","card1","card2","card3","card4","card5","card6", 
    "addr1","addr2","p_emaildomain","r_emaildomain","p_ed_parent","r_ed_parent",
    "devicetype","devicebrand","id_30_os","id_31_br","res_bucket",
    "dist1","dist2","c1","c2","c3","c4","c5","c6","c7","c8","c9","c10","c11","c12","c13","c14",
    "email_parent_match",
    "uid2_secs_since_prev","uid2_secs_since_prev_log","uid2_dt_mean","uid2_dt_std",
    "uid2_amt_mean","uid2_amt_std","uid2_amt_count","uid2_amt_devabs",
    "uid3_secs_since_prev","uid3_secs_since_prev_log","uid3_dt_mean","uid3_dt_std", 
    "uid3_amt_mean","uid3_amt_std","uid3_amt_count","uid3_amt_devabs",
}

BLACKLIST_EXACT = {TARGET, "transactionid", "transactiondt", "dt", "month_ix", "day", "index"}
EXCLUDE_SUFFIXES = ("_fe", "_te_wf")
EXCLUDE_PREFIXES = ("uid",)

def build_scan_candidates(df: pd.DataFrame) -> list:
    """Build list of ambiguous features to scan (D/M/V/id_ families)"""
    def is_scan_family(c: str) -> bool:
        c = c.lower()
        return c.startswith("d") or c.startswith("m") or c.startswith("v") or c.startswith("id_")

    out = []
    for c in df.columns:
        cl = c.lower()
        if cl in BLACKLIST_EXACT or cl in WHITELIST:
            continue
        if cl.endswith(EXCLUDE_SUFFIXES):
            continue
        if any(cl.startswith(p) for p in EXCLUDE_PREFIXES):
            continue
        if is_scan_family(cl):
            out.append(c)
    
    out = sorted(out)
    print(f"Scan candidates: {len(out)} (D/M/V/id_ families)")
    return out

def fit_target_mean_single(train_y: pd.Series, train_cat: pd.Series, m: float = 100.0):
    """Target encoding for single feature"""
    y = pd.Series(pd.to_numeric(train_y, errors="coerce").fillna(0).astype("int8"))
    if len(y) == 0:
        return pd.Series(dtype="float32"), 0.0
    prior = float(y.mean())
    stats = pd.DataFrame({
        "y": y.values, 
        "c": train_cat.astype("string").fillna("__na__").values
    })
    grp = stats.groupby("c")["y"].agg(["sum","count"])
    enc = (grp["sum"] + prior * m) / (grp["count"] + m)
    return enc.astype("float32"), prior

def apply_target_mean_single(cat_series: pd.Series, enc_map: pd.Series, prior: float):
    """Apply target encoding for single feature"""
    return cat_series.astype("string").fillna("__na__").map(enc_map).fillna(prior).astype("float32")

def _finite_median(a: pd.Series):
    """Get finite median, handling inf/nan"""
    a = pd.to_numeric(a, errors="coerce").replace([np.inf, -np.inf], np.nan)
    vals = a[np.isfinite(a)]
    return float(np.median(vals)) if vals.size > 0 else None

def fit_eval_single_feature(train_df: pd.DataFrame, col: str, tr_idx, va_idx, m_smooth=100.0):
    """Fit and evaluate single feature model"""
    y = train_df.loc[:, TARGET].astype("int8").values
    s = train_df[col]
    is_num = pd.api.types.is_numeric_dtype(s)

    X_tr_raw = s.iloc[tr_idx].copy()
    X_va_raw = s.iloc[va_idx].copy()

    if is_num:
        X_tr_raw = pd.to_numeric(X_tr_raw, errors="coerce").replace([np.inf, -np.inf], np.nan)
        X_va_raw = pd.to_numeric(X_va_raw, errors="coerce").replace([np.inf, -np.inf], np.nan)
        med = _finite_median(X_tr_raw)
        if med is None:
            return np.nan, np.nan
        X_tr = X_tr_raw.fillna(med).astype("float32").values.reshape(-1,1)
        X_va = X_va_raw.fillna(med).astype("float32").values.reshape(-1,1)
    else:
        enc_map, prior = fit_target_mean_single(
            train_df.loc[tr_idx, TARGET],
            X_tr_raw.astype("string").fillna("__na__"),
            m=m_smooth
        )
        X_tr = apply_target_mean_single(X_tr_raw, enc_map, prior).values.reshape(-1,1)
        X_va = apply_target_mean_single(X_va_raw, enc_map, prior).values.reshape(-1,1)
        X_tr[~np.isfinite(X_tr)] = prior
        X_va[~np.isfinite(X_va)] = prior

    # Check for constant features
    if np.nanstd(X_tr) == 0 or np.nanstd(X_va) == 0:
        return np.nan, np.nan

    # Scale and fit
    scaler = StandardScaler(with_mean=True, with_std=True)
    X_tr = scaler.fit_transform(X_tr)
    X_va = scaler.transform(X_va)
    
    if np.isnan(X_tr).any() or np.isnan(X_va).any():
        return np.nan, np.nan

    clf = LogisticRegression(solver="liblinear", max_iter=200)
    clf.fit(X_tr, y[tr_idx])
    
    p_tr = clf.predict_proba(X_tr)[:,1]
    p_va = clf.predict_proba(X_va)[:,1]

    try:   
        auc_tr = roc_auc_score(y[tr_idx], p_tr)
    except: 
        auc_tr = np.nan
    try:   
        auc_va = roc_auc_score(y[va_idx], p_va)
    except: 
        auc_va = np.nan

    return auc_tr, auc_va

def time_consistency_scan(
    train_df: pd.DataFrame,
    candidate_cols: list,
    folds: list,
    m_smooth: float = 100.0,
    verbose_every: int = 50
):
    """Run time consistency scan using walk-forward folds"""
    if TARGET not in train_df.columns:
        raise KeyError(f"Expected target '{TARGET}' in train_df")

    print(f"Time consistency scan using {len(folds)} walk-forward folds")

    rows = []
    for i, col in enumerate(candidate_cols, 1):
        auc_trs, auc_vas = [], []
        for (tr_idx, va_idx) in folds:
            auc_tr, auc_va = fit_eval_single_feature(train_df, col, tr_idx, va_idx, m_smooth=m_smooth)
            auc_trs.append(auc_tr)
            auc_vas.append(auc_va)

        row = {
            "feature": col,
            "folds": len(folds),
            "val_auc_mean": np.nanmean(auc_vas),
            "val_auc_median": np.nanmedian(auc_vas),
            "val_auc_std": np.nanstd(auc_vas),
            "train_auc_mean": np.nanmean(auc_trs),
            "gap_train_minus_val": (np.nanmean(auc_trs) - np.nanmean(auc_vas)),
            "whitelist_flag": False,
        }
        rows.append(row)

        if verbose_every and (i % verbose_every == 0):
            print(f"[{i}/{len(candidate_cols)}] {col} | val_mean={row['val_auc_mean']:.3f} gap={row['gap_train_minus_val']:.3f}")

    res = (pd.DataFrame(rows)
           .sort_values(["val_auc_mean", "gap_train_minus_val"], ascending=[False, True])
           .reset_index(drop=True))

    # Mark unstable features
    res["unstable_flag"] = (res["val_auc_mean"] < 0.50) | (res["gap_train_minus_val"] > 0.10)

    # Add whitelist features
    wl_present = sorted([c for c in WHITELIST if c in train_df.columns])
    wl_df = pd.DataFrame({
        "feature": wl_present,
        "folds": 0,
        "val_auc_mean": np.nan,
        "val_auc_median": np.nan, 
        "val_auc_std": np.nan,
        "train_auc_mean": np.nan,
        "gap_train_minus_val": np.nan,
        "whitelist_flag": True,
        "unstable_flag": False,
    })

    # Save results
    os.makedirs("../reports", exist_ok=True)
    res_all = pd.concat([res, wl_df], ignore_index=True)
    res_all.to_csv("../reports/time_consistency_results.csv", index=False)

    stable = res_all.loc[res_all["unstable_flag"] == False, ["feature","whitelist_flag"]].sort_values("feature")
    unstable = res_all.loc[(res_all["unstable_flag"] == True) & (res_all["whitelist_flag"] == False), ["feature"]].sort_values("feature")

    stable.to_csv("../reports/stable_features.csv", index=False)
    unstable.to_csv("../reports/unstable_features.csv", index=False)

    # Final feature list
    keep_final = stable["feature"].tolist()
    pd.DataFrame({"feature": keep_final}).to_csv("../reports/keep_features_final.csv", index=False)

    print(f"Results saved:")
    print(f"  - Stable features: {len(stable)}")
    print(f"  - Unstable features: {len(unstable)}")
    print(f"  - Final keep list: {len(keep_final)}")
    
    return res_all, stable, unstable

# Run consistency scan
candidates = build_scan_candidates(train_df)
results_df, stable_df, unstable_df = time_consistency_scan(
    train_df=train_df,
    candidate_cols=candidates,
    folds=folds,
    m_smooth=100.0,
    verbose_every=50
)

print("\nTop 10 stable features by validation AUC:")
print(results_df.head(10)[["feature", "val_auc_mean", "gap_train_minus_val", "unstable_flag"]])

Scan candidates: 186 (D/M/V/id_ families)
Time consistency scan using 4 walk-forward folds
[50/186] m5 | val_mean=0.519 gap=0.013
[100/186] v203 | val_mean=0.562 gap=0.002
[150/186] v309 | val_mean=0.525 gap=0.004
Results saved:
  - Stable features: 222
  - Unstable features: 8
  - Final keep list: 222

Top 10 stable features by validation AUC:
  feature  val_auc_mean  gap_train_minus_val  unstable_flag
0   id_35      0.690405            -0.022471          False
1   id_15      0.689073            -0.026622          False
2   id_29      0.688135            -0.026070          False
3   id_28      0.687972            -0.026654          False
4   id_38      0.686433            -0.029267          False
5      m6      0.686408            -0.030440          False
6   id_12      0.685670            -0.035047          False
7    v303      0.683048            -0.027043          False
8   id_31      0.682733            -0.007584          False
9   id_37      0.681379            -0.028526         

## 9. Final Model-Ready Dataset Creation

In [12]:
def build_final_cols(df_cols):
    cols_to_keep = set()
    for col in df_cols:
        col_lower = col.lower()
        # Keep base features
        if col_lower in REQUIRED_BASE:
            cols_to_keep.add(col)
        # Keep features from our engineering
        if any(col_lower.endswith(s) for s in INCLUDE_SUFFIXES):
            cols_to_keep.add(col)
        # Keep all features we created in the add_agg_features step
        # by checking for the uid or agg_stats suffixes
        if 'uid' in col_lower or 'amt' in col_lower or 'dt' in col_lower or 'devabs' in col_lower:
             cols_to_keep.add(col)
        # Keep original columns that aren't V-columns, IDs, or D-columns
        # NOTE: Corrected to include 'v' and 'id_' prefixes
        if any(col_lower.startswith(p) for p in ['card', 'addr', 'p_', 'r_', 'm', 'v', 'id_']):
            cols_to_keep.add(col)
        if 'productcd' in col_lower:
            cols_to_keep.add(col)
    
    # Exclude raw UIDs if their encoded versions exist
    raw_uids = {"uid1", "uid2", "uid3", "uid4"}
    final_keep = sorted(list(cols_to_keep - raw_uids))
    
    return final_keep

# Align train/test (except target)
final_train_cols_full = build_final_cols(train_df.columns)
final_test_cols_full = build_final_cols(test_df.columns)

final_common = sorted(list(set(final_train_cols_full) & set(final_test_cols_full)))
final_train_cols_aligned = [TARGET] + final_common if TARGET in train_df.columns else final_common
final_test_cols_aligned = final_common

print(f"Final features:")
print(f"  - Train: {len(final_train_cols_aligned)} (including target)")
print(f"  - Test: {len(final_test_cols_aligned)}")

train_out = train_df[final_train_cols_aligned].copy()
test_out = test_df[final_test_cols_aligned].copy()

train_out.to_csv(OUTDIR + "train_processed.csv", index=False)
test_out.to_csv(OUTDIR + "test_processed.csv", index=False)
print("Saved processed data to 'data/processed/' directory.")

Final features:
  - Train: 241 (including target)
  - Test: 240
Saved processed data to 'data/processed/' directory.


## 10. Feature Summary and Next Steps

In [13]:
print("=== FEATURE ENGINEERING PIPELINE COMPLETE ===\n")

print("Features created:")
print("1. ✅ Base primitives: time, amount, email domains")
print("2. ✅ V-column reduction: 339 -> 128 columns")
print("3. ✅ UID aggregates: time deltas, amount stats, frequency")
print("4. ✅ Walk-forward target encoding (6 months)")
print("5. ✅ Time consistency testing")
print("6. ✅ Final stable feature selection")

print(f"\nDatasets ready for modeling:")
print(f"- Training: {train_out.shape[0]:,} rows × {train_out.shape[1]} features")
print(f"- Test: {test_out.shape[0]:,} rows × {test_out.shape[1]} features")

print(f"\nKey feature categories included:")
feature_categories = {
    "Basic": [c for c in final_common if c in ["transactionamt", "log_amt", "hour", "dow"]],
    "Card/Addr": [c for c in final_common if c.startswith(("card", "addr"))],
    "Email": [c for c in final_common if "email" in c or c.endswith("_ed_parent")],
    "Device": [c for c in final_common if c.startswith(("device", "id_30", "id_31", "res_"))],
    "UID Stats": [c for c in final_common if "uid2_" in c or "uid3_" in c],
    "Target Encoded": [c for c in final_common if c.endswith("_te_wf")],
    "Frequency": [c for c in final_common if c.endswith("_fe")],
    "V Reduced": [c for c in final_common if c.startswith("v")],
    "Other": []
}

for cat, cols in feature_categories.items():
    if cols:
        print(f"  - {cat}: {len(cols)} features")

print(f"\nReports generated:")
print(f"  - ../reports/time_consistency_results.csv")
print(f"  - ../reports/stable_features.csv")  
print(f"  - ../reports/unstable_features.csv")
print(f"  - ../reports/keep_features_final.csv")

print(f"\nNext steps:")
print(f"1. Train models on ../data/processed/train_model.csv")
print(f"2. Use walk-forward folds for validation")
print(f"3. Generate predictions on ../data/processed/test_model.csv")

# Memory cleanup
del train_transaction, train_identity, test_transaction, test_identity
gc.collect()
print(f"\n✅ Pipeline complete!")

=== FEATURE ENGINEERING PIPELINE COMPLETE ===

Features created:
1. ✅ Base primitives: time, amount, email domains
2. ✅ V-column reduction: 339 -> 128 columns
3. ✅ UID aggregates: time deltas, amount stats, frequency
4. ✅ Walk-forward target encoding (6 months)
5. ✅ Time consistency testing
6. ✅ Final stable feature selection

Datasets ready for modeling:
- Training: 590,540 rows × 241 features
- Test: 506,691 rows × 240 features

Key feature categories included:
  - Basic: 2 features
  - Card/Addr: 24 features
  - Email: 6 features
  - Device: 7 features
  - UID Stats: 10 features
  - Target Encoded: 17 features
  - Frequency: 17 features
  - V Reduced: 128 features

Reports generated:
  - ../reports/time_consistency_results.csv
  - ../reports/stable_features.csv
  - ../reports/unstable_features.csv
  - ../reports/keep_features_final.csv

Next steps:
1. Train models on ../data/processed/train_model.csv
2. Use walk-forward folds for validation
3. Generate predictions on ../data/process

NameError: name 'train_transaction' is not defined