# XGB

In [1]:
# XGB
import os, gc, json
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold

In [2]:
# Load the processed feature tables and the label table from DATA_DIR.
DATA_DIR = "../data/processed"
trainX = pd.read_csv(f"{DATA_DIR}/IEEE_Train.csv")
y      = pd.read_csv(f"{DATA_DIR}/IEEE_Target.csv")   # TransactionID, isFraud
testX  = pd.read_csv(f"{DATA_DIR}/IEEE_Test.csv")

# Attach the label to every training row. `validate="many_to_one"` makes pandas
# raise a MergeError if a TransactionID is repeated in the label table, which
# would otherwise silently duplicate training rows.
train = trainX.merge(y, on="TransactionID", how="left", validate="many_to_one")
assert "isFraud" in train.columns
# A left join leaves NaN wherever a training row has no matching label; fail
# here instead of handing NaN targets to the model later.
assert train["isFraud"].notna().all(), "some training rows have no isFraud label"
print("Shapes -> train:", train.shape, "test:", testX.shape)

Shapes -> train: (590540, 293) test: (506691, 292)


In [3]:
# Build cross-validation folds grouped by the month index DT_M, so that all
# rows sharing a DT_M value end up in the same fold.
print("\n" + "="*50)
print("USING GROUPKFOLD WITH DT_M")
print("="*50)


def add_month_ix_from_day(df, day_col="day", days_per_month=30):
    """Derive a zero-based month index from a day-number column.

    Parameters
    ----------
    df : DataFrame containing ``day_col``.
    day_col : name of the column holding the day number.
    days_per_month : width, in days, of one month bucket.

    Returns
    -------
    Series of dtype int16: ``(day - min(day)) // days_per_month``.

    Raises
    ------
    KeyError if ``day_col`` is not a column of ``df``.
    """
    if day_col not in df.columns:
        raise KeyError("Expected a 'day' column (present in this dataset).")
    d0 = int(df[day_col].min())
    return ((df[day_col] - d0) // days_per_month).astype("int16")


# Only derive DT_M when the processed data does not already provide it; the
# guard also keeps this cell safe to re-run.
if 'DT_M' not in train.columns:
    print("WARNING: DT_M column not found. Creating from day column...")
    train["DT_M"] = add_month_ix_from_day(train, day_col="day")

# Use GroupKFold
skf = GroupKFold(n_splits=6)
folds = list(skf.split(train, train["isFraud"], groups=train['DT_M']))

print(f"Created {len(folds)} folds using GroupKFold")

# Report what each fold holds out. A fold can contain more than one month when
# DT_M has more distinct values than there are splits, so list every month
# instead of only the first one found.
for i, (tr_idx, va_idx) in enumerate(folds):
    held_out = sorted(train['DT_M'].iloc[va_idx].unique())
    if len(held_out) == 1:
        print(f'Fold {i} withholding month {held_out[0]}')
    else:
        print(f"Fold {i} withholding months {', '.join(map(str, held_out))}")
    print(f' rows of train = {len(tr_idx)}, rows of holdout = {len(va_idx)}')

# Overall fraud rate
overall_fraud_rate = train['isFraud'].mean()
print(f"\nOverall fraud rate: {overall_fraud_rate:.4f} ({overall_fraud_rate*100:.2f}%)")


USING GROUPKFOLD WITH DT_M
Created 6 folds using GroupKFold
Fold 0 withholding month 12
 rows of train = 453219, rows of holdout = 137321
Fold 1 withholding month 15
 rows of train = 488908, rows of holdout = 101632
Fold 2 withholding month 13
 rows of train = 497955, rows of holdout = 92585
Fold 3 withholding month 17
 rows of train = 501214, rows of holdout = 89326
Fold 4 withholding month 14
 rows of train = 504519, rows of holdout = 86021
Fold 5 withholding month 16
 rows of train = 506885, rows of holdout = 83655

Overall fraud rate: 0.0350 (3.50%)


In [4]:
# Feature selection.
# Columns are removed for two separate reasons: DROP_COLS are features
# rejected for modelling, EXCLUDE_COLS are identifiers, the target and the
# time indices. Whatever remains must also be present in the test table.
drop_time = ['TransactionDT']                                   # time index
drop_leaky = [f'D{n}' for n in (6, 7, 8, 9, 12, 13, 14)]        # leaky/time-variant set
drop_unstable = [                                               # failed time consistency
    'C3', 'M5', 'id_08', 'id_33',
    'card4', 'id_07', 'id_14', 'id_21', 'id_30', 'id_32', 'id_34',
]
drop_unstable += [f'id_{n}' for n in range(22, 28)]

DROP_COLS = {*drop_time, *drop_leaky, *drop_unstable}

EXCLUDE_COLS = {
    'TransactionID',   # id
    'uid',             # raw identifier -> exclude
    'isFraud',         # target
    'DT_M', 'day',     # time indices
}

# Candidate columns in train's own column order, minus the bookkeeping columns.
base = list(filter(lambda col: col not in EXCLUDE_COLS, train.columns))
# Final feature list: not rejected, and available in BOTH train and test.
FEATURES = [col for col in base if col not in DROP_COLS and col in testX.columns]

print(f'NOW USING THE FOLLOWING {len(FEATURES)} FEATURES.')
print(FEATURES)

NOW USING THE FOLLOWING 263 FEATURES.
['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D10', 'D11', 'D15', 'M1', 'M2', 'M3', 'M4', 'M6', 'M7', 'M8', 'M9', 'V1', 'V3', 'V4', 'V6', 'V8', 'V11', 'V13', 'V14', 'V17', 'V20', 'V23', 'V26', 'V27', 'V30', 'V36', 'V37', 'V40', 'V41', 'V44', 'V47', 'V48', 'V54', 'V56', 'V59', 'V62', 'V65', 'V67', 'V68', 'V70', 'V76', 'V78', 'V80', 'V82', 'V86', 'V88', 'V89', 'V91', 'V107', 'V108', 'V111', 'V115', 'V117', 'V120', 'V121', 'V123', 'V124', 'V127', 'V129', 'V130', 'V136', 'V138', 'V139', 'V142', 'V147', 'V156', 'V160', 'V162', 'V165', 'V166', 'V169', 'V171', 'V173', 'V175', 'V176', 'V178', 'V180', 'V182', 'V185', 'V187', 'V188', 'V198', 'V203', 'V205', 'V207', 'V209', 'V210', 'V215', 'V218', 'V220', 'V221', 'V223', 'V224', 'V226', 'V228', 'V2

In [None]:
# XGBoost parameters
xgb_params = {
    'n_estimators': 5000,
    'max_depth': 12,
    'learning_rate': 0.02,
    'subsample': 0.8,
    'colsample_bytree': 0.4,
    'missing': -1,
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'random_state': 42,
    'n_jobs': -1,
    'early_stopping_rounds' : 200,
    'verbosity': 1
}

print("XGBoost parameters (Winner's approach):")
print(json.dumps(xgb_params, indent=2))

# Initialize storage: one fitted model per fold, out-of-fold predictions
# aligned with the rows of `train`, and per-fold AUC / holdout size.
models = []
oof = np.zeros(len(train), dtype=float)
fold_scores, fold_sizes = [], []

# The artifact directory is the same for every fold, so create it once.
os.makedirs("xgboost_artifacts", exist_ok=True)

print("\n" + "="*50)
print("TRAINING XGBOOST WITH GROUPKFOLD")
print("="*50)

for k, (tr_idx, va_idx) in enumerate(folds):
    month = train.iloc[va_idx]['DT_M'].iloc[0]
    print(f'\nFold {k} withholding month {month}')
    print(f' rows of train = {len(tr_idx)}, rows of holdout = {len(va_idx)}')

    # Prepare data
    X_tr, y_tr = train.iloc[tr_idx][FEATURES], train.iloc[tr_idx]["isFraud"]
    X_va, y_va = train.iloc[va_idx][FEATURES], train.iloc[va_idx]["isFraud"]

    # Create XGBoost classifier
    clf = xgb.XGBClassifier(**xgb_params)

    # Fit with early stopping monitored on the held-out fold only.
    # NOTE(review): the same fold drives early stopping and is then scored
    # below, so the reported fold AUC may be slightly optimistic - confirm
    # this is acceptable for model comparison.
    clf.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        verbose=100
    )

    # Predict validation set (probability of class 1) and store it in the
    # out-of-fold vector at the held-out row positions.
    preds = clf.predict_proba(X_va)[:, 1]
    oof[va_idx] = preds

    # Calculate AUC
    auc = roc_auc_score(y_va, preds)
    fold_scores.append(auc)
    fold_sizes.append(len(va_idx))

    print(f"Fold {k} AUC: {auc:.4f}, Best iteration: {clf.best_iteration}")

    # Store model
    models.append(clf)

    # Save individual fold model
    clf.save_model(f"xgboost_artifacts/xgb_fold{k}.json")

    # Memory cleanup: drop the per-fold frames. `del clf` only removes the
    # loop name; the fitted model stays referenced from `models`.
    del clf, X_tr, y_tr, X_va, y_va
    gc.collect()

# Calculate overall performance: holdout-size-weighted mean of the fold AUCs,
# plus a single AUC over all out-of-fold predictions.
fold_scores = np.array(fold_scores, dtype=float)
weights = np.array(fold_sizes, dtype=float) / np.sum(fold_sizes)
weighted_cv = float(np.sum(fold_scores * weights))
oof_auc = roc_auc_score(train["isFraud"], oof)

print('#'*20)
print(f'XGB OOF CV = {oof_auc:.4f}')

print("\n" + "="*50)
print("XGBOOST CV RESULTS")
print("="*50)
print("Fold AUCs:", np.round(fold_scores, 4))
print("Weighted mean AUC:", f"{weighted_cv:.4f}")
print("OOF AUC:", f"{oof_auc:.4f}")

XGBoost parameters (Winner's approach):
{
  "n_estimators": 5000,
  "max_depth": 12,
  "learning_rate": 0.02,
  "subsample": 0.8,
  "colsample_bytree": 0.4,
  "missing": -1,
  "eval_metric": "auc",
  "tree_method": "hist",
  "random_state": 42,
  "n_jobs": -1,
  "early_stopping_rounds": 200,
  "verbosity": 1
}

TRAINING XGBOOST WITH GROUPKFOLD

Fold 0 withholding month 12
 rows of train = 453219, rows of holdout = 137321
[0]	validation_0-auc:0.80381
[100]	validation_0-auc:0.91492
[200]	validation_0-auc:0.92219
[300]	validation_0-auc:0.92619
[400]	validation_0-auc:0.92738
[500]	validation_0-auc:0.92766
[600]	validation_0-auc:0.92738
[646]	validation_0-auc:0.92736
Fold 0 AUC: 0.9277, Best iteration: 447

Fold 1 withholding month 15
 rows of train = 488908, rows of holdout = 101632
[0]	validation_0-auc:0.85161
[100]	validation_0-auc:0.94673
[200]	validation_0-auc:0.95586
[300]	validation_0-auc:0.95882
[400]	validation_0-auc:0.95977
[500]	validation_0-auc:0.96019
[600]	validation_0-auc:0.9

In [None]:
# ============================================
# GENERATE TEST PREDICTIONS
# ============================================

def coerce_transaction_id(s: pd.Series) -> pd.Series:
    """Convert a TransactionID column to int64 without silently changing IDs.

    Integer input is cast directly. Float input must hold whole numbers.
    String input has one trailing ".0" stripped and is then parsed as numeric;
    any other dtype is parsed as numeric directly. Whatever the input dtype,
    a value that is not a whole number (including NaN) is rejected, so the
    same ID can never map to different integers depending on how the column
    was read.

    Parameters
    ----------
    s : Series holding the transaction identifiers.

    Returns
    -------
    Series of dtype int64 with the same index as ``s``.

    Raises
    ------
    ValueError
        If a value cannot be parsed as a number, or is not a whole number.
    """
    if pd.api.types.is_integer_dtype(s):
        return s.astype("int64")

    if pd.api.types.is_float_dtype(s):
        numeric = s
    else:
        if pd.api.types.is_string_dtype(s):
            # IDs written as "123.0" (a float column saved as text) become "123".
            s = s.str.replace(r"\.0$", "", regex=True)
        numeric = pd.to_numeric(s, errors="raise")
        if not pd.api.types.is_float_dtype(numeric):
            return numeric.astype("int64")

    # Float values: accept only exact whole numbers. NaN compares unequal to
    # itself, so missing IDs are reported here as well.
    rounded = numeric.round()
    not_integral = numeric.ne(rounded)
    if not_integral.any():
        raise ValueError(
            f"TransactionID contains {int(not_integral.sum())} value(s) "
            "that are not whole numbers"
        )
    return rounded.astype("int64")

# Build the submission: average the fold models' fraud probabilities on the
# test table, write the CSV, and save the arrays and metadata to
# xgboost_artifacts for later ensembling.
if "TransactionID" in testX.columns:
    print("\n" + "="*30)
    print("GENERATING SUBMISSION")
    print("="*30)

    # With no fitted models the averaging loop would not run and an all-zero
    # submission would be written without any warning.
    if not models:
        raise RuntimeError(
            "`models` is empty; run the training cell before building a submission."
        )

    test_id = coerce_transaction_id(testX["TransactionID"])

    # Slice the feature columns once instead of once per model.
    X_test = testX[FEATURES]

    # Average predictions like winner (they used preds += clf.predict_proba(X_test[cols])[:,1]/skf.n_splits)
    test_preds = np.zeros(len(testX))

    for i, model in enumerate(models):
        fold_preds = model.predict_proba(X_test)[:, 1]
        test_preds += fold_preds / len(models)  # Average like winner
        print(f"Fold {i} test predictions: mean={fold_preds.mean():.4f}, std={fold_preds.std():.4f}")

    print(f"Final averaged predictions: mean={test_preds.mean():.4f}, std={test_preds.std():.4f}")

    # Create submission
    sub = pd.DataFrame({"TransactionID": test_id, "isFraud": test_preds})
    os.makedirs("submissions", exist_ok=True)
    out_path = f"submissions/submission_xgb_groupkfold_cv{oof_auc:.4f}.csv"
    sub.to_csv(out_path, index=False)
    print(f"Saved submission: {out_path}")

    # Save artifacts for ensemble. Create the directory here too, so this cell
    # does not depend on another cell having created it.
    os.makedirs("xgboost_artifacts", exist_ok=True)
    np.save("xgboost_artifacts/xgb_oof_predictions.npy", oof)
    np.save("xgboost_artifacts/xgb_test_predictions.npy", test_preds)

    # Save model metadata
    model_metadata = {
        "model_type": "xgboost_groupkfold",
        "oof_auc": oof_auc,
        "weighted_cv_auc": weighted_cv,
        "fold_scores": fold_scores.tolist(),
        "n_folds": len(folds),
        "params": xgb_params,
        "features": FEATURES,
        "validation_method": "GroupKFold with DT_M",
        "best_iterations": [getattr(m, 'best_iteration', None) for m in models]
    }

    with open("xgboost_artifacts/xgb_metadata.json", "w") as f:
        json.dump(model_metadata, f, indent=2)

    print(f"\n" + "="*50)
    print("KEY CHANGES MADE TO MATCH WINNER:")
    print("="*50)
    print("1. ✅ Used GroupKFold(n_splits=6) instead of expanding folds")
    print("2. ✅ Increased n_estimators from 2000 to 5000") 
    print("3. ✅ Used groups=train['DT_M'] for temporal validation")
    print("4. ✅ Only used validation set in eval_set (not train set)")
    print("5. ✅ Calculated OOF AUC on all samples")
    print("6. ✅ Added proper memory cleanup")

else:
    print("\n(No TransactionID in testX — skipping submission build.)")


GENERATING SUBMISSION
Fold 0 test predictions: mean=0.0277, std=0.1120
Fold 1 test predictions: mean=0.0243, std=0.1119
Fold 2 test predictions: mean=0.0268, std=0.1129
Fold 3 test predictions: mean=0.0257, std=0.1140
Fold 4 test predictions: mean=0.0266, std=0.1120
Fold 5 test predictions: mean=0.0234, std=0.1121
Final averaged predictions: mean=0.0258, std=0.1116
Saved submission: submissions/submission_xgb_groupkfold_cv0.9551.csv

KEY CHANGES MADE TO MATCH WINNER:
1. ✅ Used GroupKFold(n_splits=6) instead of expanding folds
2. ✅ Increased n_estimators from 2000 to 5000
3. ✅ Used groups=train['DT_M'] for temporal validation
4. ✅ Only used validation set in eval_set (not train set)
5. ✅ Calculated OOF AUC on all samples
6. ✅ Added proper memory cleanup
