# LGBM

In [1]:
import os, gc
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [2]:
DATA_DIR = "../data/processed"
trainX = pd.read_csv(f"{DATA_DIR}/IEEE_Train.csv")
y      = pd.read_csv(f"{DATA_DIR}/IEEE_Target.csv")   # TransactionID, isFraud
testX  = pd.read_csv(f"{DATA_DIR}/IEEE_Test.csv")

train = trainX.merge(y, on="TransactionID", how="left")
assert "isFraud" in train.columns
print("Shapes -> train:", train.shape, "test:", testX.shape)

Shapes -> train: (590540, 293) test: (506691, 292)


In [3]:
# Expanding-window folds
def add_month_ix_from_day(df, day_col="day", days_per_month=30):
    if day_col not in df.columns:
        raise KeyError("Expected a 'day' column (present in this dataset).")
    d0 = int(df[day_col].min())
    return ((df[day_col] - d0) // days_per_month).astype("int16")

train["month_ix"] = train["DT_M"].astype("int16")
train["month_ix"] = add_month_ix_from_day(train, day_col="day")

def make_expanding_folds(df, group_col="month_ix",
                         min_train_months=3, valid_months=1,
                         min_train_rows=50_000, min_valid_rows=20_000):
    groups = np.sort(df[group_col].unique())
    folds = []
    for end in range(min_train_months, len(groups) - valid_months + 1):
        tr_groups = set(groups[:end])
        va_groups = set(groups[end:end+valid_months])
        tr_idx = df.index[df[group_col].isin(tr_groups)].to_numpy()
        va_idx = df.index[df[group_col].isin(va_groups)].to_numpy()
        if len(tr_idx) >= min_train_rows and len(va_idx) >= min_valid_rows:
            folds.append((tr_idx, va_idx))
            print(f"Fold {len(folds)}: train months {sorted(tr_groups)} -> valid months {sorted(va_groups)} "
                  f"(tr {len(tr_idx):,}, va {len(va_idx):,})")
    return folds

folds = make_expanding_folds(train, min_train_months=3, valid_months=1,
                             min_train_rows=50_000, min_valid_rows=1_000)
print("Num folds:", len(folds))

Fold 1: train months [0, 1, 2] -> valid months [3] (tr 315,927, va 98,615)
Fold 2: train months [0, 1, 2, 3] -> valid months [4] (tr 414,542, va 83,571)
Fold 3: train months [0, 1, 2, 3, 4] -> valid months [5] (tr 498,113, va 86,934)
Fold 4: train months [0, 1, 2, 3, 4, 5] -> valid months [6] (tr 585,047, va 5,493)
Num folds: 4


In [4]:
# Overall fraud rate
overall_fraud_rate = train['isFraud'].mean()
print(f"Overall fraud rate: {overall_fraud_rate:.4f} ({overall_fraud_rate*100:.2f}%)")

# Fraud rate by month
fraud_by_month = train.groupby('month_ix')['isFraud'].agg(['count', 'sum', 'mean'])
fraud_by_month.columns = ['total_transactions', 'fraud_count', 'fraud_rate']
print("\nFraud rate by month:")
print(fraud_by_month)

# Fraud rate for each fold
print("\nFraud rate per fold:")
for i, (tr_idx, va_idx) in enumerate(folds):
    tr_fraud_rate = train.loc[tr_idx, 'isFraud'].mean()
    va_fraud_rate = train.loc[va_idx, 'isFraud'].mean()
    print(f"Fold {i+1}: Train fraud rate: {tr_fraud_rate:.4f} ({tr_fraud_rate*100:.2f}%), "
          f"Valid fraud rate: {va_fraud_rate:.4f} ({va_fraud_rate*100:.2f}%)")

Overall fraud rate: 0.0350 (3.50%)

Fraud rate by month:
          total_transactions  fraud_count  fraud_rate
month_ix                                             
0                     134339         3401    0.025317
1                      89399         3577    0.040012
2                      92189         3724    0.040395
3                      98615         3898    0.039527
4                      83571         2850    0.034103
5                      86934         2972    0.034187
6                       5493          241    0.043874

Fraud rate per fold:
Fold 1: Train fraud rate: 0.0339 (3.39%), Valid fraud rate: 0.0395 (3.95%)
Fold 2: Train fraud rate: 0.0352 (3.52%), Valid fraud rate: 0.0341 (3.41%)
Fold 3: Train fraud rate: 0.0350 (3.50%), Valid fraud rate: 0.0342 (3.42%)
Fold 4: Train fraud rate: 0.0349 (3.49%), Valid fraud rate: 0.0439 (4.39%)


In [5]:
drop_time = ['TransactionDT']                                # time index
drop_leaky = ['D6','D7','D8','D9','D12','D13','D14']         # leaky/time-variant set
drop_unstable = ['C3','M5','id_08','id_33',                  # failed time consistency
                 'card4','id_07','id_14','id_21','id_30','id_32','id_34'] \
                + [f'id_{x}' for x in range(22,28)]

DROP_COLS = set(drop_time + drop_leaky + drop_unstable)

EXCLUDE_COLS = {
    'TransactionID',  # id
    'uid',            # raw identifier -> exclude
    'isFraud',        # target
    # exclude time indices
    'DT_M', 'day'
}

# Build FEATURES from train, enforce presence in test as well
base = [c for c in train.columns if c not in EXCLUDE_COLS]
FEATURES = [c for c in base if c not in DROP_COLS]
# keep only columns that exist in BOTH train and test (prevents surprises)
FEATURES = [c for c in FEATURES if c in testX.columns]

# Categorical lists
CATEGORICAL_FEATURES = [
    "ProductCD",
    "card6",
    "addr1","addr2",
    "P_emaildomain","R_emaildomain",
    "DeviceType","DeviceInfo",
    "id_31","id_33","id_35","id_36","id_37","id_38"
    "card1_addr1","card1_addr1_P_emaildomain",
]
BOOLEAN_FEATURES = [f"M{i}" for i in range(1,10) if f"M{i}" in train.columns]

CAT_FEATS = [c for c in set(CATEGORICAL_FEATURES + BOOLEAN_FEATURES) if c in FEATURES]

# Cast categoricals; replace -1 with NaN (if present)
for col in CAT_FEATS:
    train[col] = train[col].replace(-1, np.nan).astype('category')
    testX[col]  = testX[col].replace(-1, np.nan).astype('category')

print(f"Total FEATURES: {len(FEATURES)} | Categorical passed to LGBM: {len(CAT_FEATS)}")

Total FEATURES: 263 | Categorical passed to LGBM: 21


In [6]:
print('NOW USING THE FOLLOWING',len(FEATURES),'FEATURES.')
np.array(FEATURES)

NOW USING THE FOLLOWING 263 FEATURES.


array(['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card5',
       'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain',
       'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9',
       'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5',
       'D10', 'D11', 'D15', 'M1', 'M2', 'M3', 'M4', 'M6', 'M7', 'M8',
       'M9', 'V1', 'V3', 'V4', 'V6', 'V8', 'V11', 'V13', 'V14', 'V17',
       'V20', 'V23', 'V26', 'V27', 'V30', 'V36', 'V37', 'V40', 'V41',
       'V44', 'V47', 'V48', 'V54', 'V56', 'V59', 'V62', 'V65', 'V67',
       'V68', 'V70', 'V76', 'V78', 'V80', 'V82', 'V86', 'V88', 'V89',
       'V91', 'V107', 'V108', 'V111', 'V115', 'V117', 'V120', 'V121',
       'V123', 'V124', 'V127', 'V129', 'V130', 'V136', 'V138', 'V139',
       'V142', 'V147', 'V156', 'V160', 'V162', 'V165', 'V166', 'V169',
       'V171', 'V173', 'V175', 'V176', 'V178', 'V180', 'V182', 'V185',
       'V187', 'V188', 'V198', 'V203', 'V205', 'V207', 'V209', 'V210',
       '

In [9]:
# LightGBM (Expanding Window)
import json, os, gc
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# Optuna setup (persistent storage so runs can resume)
import optuna
from optuna.pruners import MedianPruner
from optuna.integration import LightGBMPruningCallback

STUDY_NAME = "ieee_lgbm_expanding"
STORAGE_URL = "sqlite:///optuna_ieee.db"   # creates a local SQLite DB for resume
N_TRIALS = 50                              
TIMEOUT = None

study = optuna.create_study(
    study_name=STUDY_NAME,
    storage=STORAGE_URL,
    load_if_exists=True,   # <-- resume if the DB already exists
    direction="maximize",
    pruner=MedianPruner(n_warmup_steps=5)   # prunes bad trials early based on AUC
)

[I 2025-09-01 09:47:52,444] A new study created in RDB with name: ieee_lgbm_expanding


In [10]:
# Objective: weighted-CV AUC across expanding folds with better regularization
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="optuna")

def objective(trial: optuna.Trial) -> float:
    # ---- Parameter distributions (with stronger regularization) ----
    # Learning rate (log-scale): smaller values allow more trees
    learning_rate = trial.suggest_float("learning_rate", 0.005, 0.2, log=True)
    
    # Tree shape - more conservative to reduce overfitting
    num_leaves     = trial.suggest_int("num_leaves", 31, 256)   # reduced max from 512
    max_depth      = trial.suggest_categorical("max_depth", [-1, 6, 8, 12])  # reduced options
    min_data_leaf  = trial.suggest_int("min_data_in_leaf", 100, 2000)  # increased min from 50
    
    # Column & row subsampling - more aggressive to reduce overfitting
    feature_fraction = trial.suggest_float("feature_fraction", 0.5, 0.9)  # reduced max from 1.0
    bagging_fraction = trial.suggest_float("bagging_fraction", 0.6, 0.9)  # reduced max from 1.0
    bagging_freq     = trial.suggest_int("bagging_freq", 1, 10)
    
    # Regularization - much stronger
    lambda_l1 = trial.suggest_float("lambda_l1", 0.1, 50.0, log=True)  # increased min from 1e-8
    lambda_l2 = trial.suggest_float("lambda_l2", 0.1, 50.0, log=True)  # increased min from 1e-8
    min_gain_to_split = trial.suggest_float("min_gain_to_split", 0.0, 10.0)  # increased max from 5.0
    
    # Class imbalance (optional; can help on IEEE)
    scale_pos_weight = trial.suggest_float("scale_pos_weight", 0.5, 20.0, log=True)

    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "n_jobs": -1,
        "force_col_wise": True,
        "random_state": 42,

        "learning_rate": learning_rate,
        "num_leaves": num_leaves,
        "max_depth": max_depth,
        "min_data_in_leaf": min_data_leaf,

        "feature_fraction": feature_fraction,  # == colsample_bytree
        "bagging_fraction": bagging_fraction,  # == subsample
        "bagging_freq": bagging_freq,

        "lambda_l1": lambda_l1,
        "lambda_l2": lambda_l2,
        "min_gain_to_split": min_gain_to_split,

        "scale_pos_weight": scale_pos_weight,
    }

    # Train across expanding folds
    fold_scores = []
    fold_sizes  = []
    
    for k, (tr_idx, va_idx) in enumerate(folds, 1):
        X_tr, y_tr = train.iloc[tr_idx][FEATURES], train.iloc[tr_idx]["isFraud"]
        X_va, y_va = train.iloc[va_idx][FEATURES], train.iloc[va_idx]["isFraud"]

        dtr = lgb.Dataset(X_tr, label=y_tr, categorical_feature=CAT_FEATS, free_raw_data=False)
        dva = lgb.Dataset(X_va, label=y_va, categorical_feature=CAT_FEATS, reference=dtr, free_raw_data=False)

        model = lgb.train(
            params,
            dtr,
            num_boost_round=5_000,  # reduced from 10_000 for faster training
            valid_sets=[dtr, dva],
            valid_names=["train","valid"],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),  # reduced from 200
                # Removed LightGBMPruningCallback to avoid warning spam
                lgb.log_evaluation(period=500),  # less frequent logging
            ],
        )
        
        preds = model.predict(X_va, num_iteration=model.best_iteration)
        auc = roc_auc_score(y_va, preds)
        fold_scores.append(auc)
        fold_sizes.append(len(va_idx))
        
        print(f"  Fold {k}: AUC = {auc:.4f}, Trees = {model.best_iteration}")

        # free memory
        del X_tr, y_tr, X_va, y_va, dtr, dva, model
        gc.collect()

    # Weighted CV AUC by validation sizes
    fold_scores = np.array(fold_scores, dtype=float)
    weights = np.array(fold_sizes, dtype=float) / np.sum(fold_sizes)
    weighted_cv = float(np.sum(fold_scores * weights))

    # Track nice info on the trial
    trial.set_user_attr("fold_scores", fold_scores.tolist())
    trial.set_user_attr("weights", weights.tolist())
    trial.set_user_attr("weighted_cv", weighted_cv)
    
    print(f"  Weighted CV AUC: {weighted_cv:.4f}")
    print(f"  Fold scores: {[f'{score:.4f}' for score in fold_scores]}")

    return weighted_cv

# Run the optimization
print("Starting Optuna optimize… (resumable)")
study.optimize(objective, n_trials=N_TRIALS, timeout=TIMEOUT, gc_after_trial=True, show_progress_bar=True)

# Save study artifacts for resume/analyze
os.makedirs("optuna_artifacts", exist_ok=True)
study_df = study.trials_dataframe()
study_df.to_csv("optuna_artifacts/ieee_lgbm_trials.csv", index=False)
with open("optuna_artifacts/ieee_lgbm_best_params.json", "w") as f:
    json.dump(study.best_params, f, indent=2)

print("Best value (weighted CV AUC):", study.best_value)
print("Best params:", study.best_params)

# Optional: Print top 5 trials
print("\nTop 5 trials:")
top_trials = sorted(study.trials, key=lambda t: t.value if t.value is not None else 0, reverse=True)[:5]
for i, trial in enumerate(top_trials, 1):
    if trial.value is not None:
        print(f"{i}. Trial {trial.number}: {trial.value:.4f}")
        fold_scores = trial.user_attrs.get('fold_scores', [])
        if fold_scores:
            print(f"   Fold scores: {[f'{score:.4f}' for score in fold_scores]}")

Starting Optuna optimize… (resumable)


  0%|          | 0/50 [00:00<?, ?it/s]

  Fold 1: AUC = 0.9152, Trees = 131
[500]	train's auc: 0.986299	valid's auc: 0.934738
  Fold 2: AUC = 0.9348, Trees = 491
[500]	train's auc: 0.985641	valid's auc: 0.929802
  Fold 3: AUC = 0.9301, Trees = 450
[500]	train's auc: 0.984512	valid's auc: 0.946899
  Fold 4: AUC = 0.9471, Trees = 507
  Weighted CV AUC: 0.9265
  Fold scores: ['0.9152', '0.9348', '0.9301', '0.9471']
[I 2025-09-01 09:55:22,335] Trial 0 finished with value: 0.9265284804005063 and parameters: {'learning_rate': 0.05579982928494453, 'num_leaves': 114, 'max_depth': 8, 'min_data_in_leaf': 283, 'feature_fraction': 0.7197308745526103, 'bagging_fraction': 0.6560184056233574, 'bagging_freq': 4, 'lambda_l1': 41.99740556658881, 'lambda_l2': 4.041477597446202, 'min_gain_to_split': 6.560189328334421, 'scale_pos_weight': 3.841831312803122}. Best is trial 0 with value: 0.9265284804005063.
  Fold 1: AUC = 0.9023, Trees = 11
  Fold 2: AUC = 0.9264, Trees = 82
  Fold 3: AUC = 0.9223, Trees = 108
  Fold 4: AUC = 0.9608, Trees = 248


AssertionError: Should not reach.

In [None]:
# Retrain with best params across folds; save models; compute OOF
best_params = study.best_params.copy()
best_params.update({
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "n_jobs": -1,
    "force_col_wise": True,
    "random_state": 42,
})
models = []
oof = np.zeros(len(train), dtype=float)
fold_scores, fold_sizes = [], []
for k, (tr_idx, va_idx) in enumerate(folds, 1):
    print(f"\n=== Refit fold {k} with best params ===")
    X_tr, y_tr = train.iloc[tr_idx][FEATURES], train.iloc[tr_idx]["isFraud"]
    X_va, y_va = train.iloc[va_idx][FEATURES], train.iloc[va_idx]["isFraud"]

    dtr = lgb.Dataset(X_tr, label=y_tr, categorical_feature=CAT_FEATS, free_raw_data=False)
    dva = lgb.Dataset(X_va, label=y_va, categorical_feature=CAT_FEATS, reference=dtr, free_raw_data=False)

    m = lgb.train(
        best_params, dtr, num_boost_round=10_000,
        valid_sets=[dtr, dva], valid_names=["train","valid"],
        callbacks=[lgb.early_stopping(200, verbose=False), lgb.log_evaluation(200)],
    )
    pv = m.predict(X_va, num_iteration=m.best_iteration)
    oof[va_idx] = pv
    auc = roc_auc_score(y_va, pv)
    fold_scores.append(auc); fold_sizes.append(len(va_idx))
    models.append(m)

    # save a copy of the fold model
    m.save_model(f"optuna_artifacts/lgbm_fold{k}.txt", num_iteration=m.best_iteration)

    del X_tr, y_tr, X_va, y_va, dtr, dva
    gc.collect()

fold_scores = np.array(fold_scores, dtype=float)
weights = np.array(fold_sizes, dtype=float) / np.sum(fold_sizes)
weighted_cv = float(np.sum(fold_scores * weights))
oof_mask = oof != 0
oof_auc = roc_auc_score(train["isFraud"][oof_mask], oof[oof_mask]) if oof_mask.any() else float("nan")

print("\n=== Refit CV Summary ===")
print("Fold AUCs:", np.round(fold_scores, 4))
print("Weighted mean AUC:", f"{weighted_cv:.4f}")
print("OOF AUC (predicted samples):", f"{oof_auc:.4f}")

In [None]:
# Predict test + save submission aligned by TransactionID
def coerce_transaction_id(s: pd.Series) -> pd.Series:
    if pd.api.types.is_integer_dtype(s): return s.astype("int64")
    if pd.api.types.is_float_dtype(s):   return s.round().astype("int64")
    if pd.api.types.is_string_dtype(s):
        s2 = s.str.replace(r"\.0$", "", regex=True)
        return pd.to_numeric(s2, errors="raise").astype("int64")
    return pd.to_numeric(s, errors="raise").astype("int64")

if "TransactionID" in testX.columns:
    test_id = coerce_transaction_id(testX["TransactionID"])
    test_preds = np.mean([m.predict(testX[FEATURES], num_iteration=m.best_iteration) for m in models], axis=0)
    sub = pd.DataFrame({"TransactionID": test_id, "isFraud": test_preds})
    os.makedirs("submissions", exist_ok=True)
    out_path = f"submissions/submission_optuna_cv{weighted_cv:.4f}.csv"
    sub.to_csv(out_path, index=False)
    print("\nSaved submission:", out_path)
else:
    print("\n(No TransactionID in testX — skipping submission build.)")