# LGBM

In [1]:
# LGBM with GroupKFold
import os, gc, json, pickle, warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold

warnings.filterwarnings('ignore')

In [2]:
# LOAD DATA
DATA_DIR = "../data/processed"

# Load preprocessed features from kyakovlev's kernel
print("Loading preprocessed features...")
train_df = pd.read_pickle(f"{DATA_DIR}/train_df.pkl")
test_df = pd.read_pickle(f"{DATA_DIR}/test_df.pkl")
remove_features_df = pd.read_pickle(f"{DATA_DIR}/remove_features.pkl")

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Features to remove: {len(remove_features_df)}")

# Get features to remove
remove_features = list(remove_features_df['features_to_remove'].values)
print(f"Removing {len(remove_features)} features")

# Build final feature list (exclude removed features)
all_features = [col for col in train_df.columns if col not in remove_features]

# Exclude target and ID columns
EXCLUDE_COLS = {'TransactionID', 'isFraud', 'DT_M'}
FEATURES = [col for col in all_features if col not in EXCLUDE_COLS]

print(f"Final feature count: {len(FEATURES)}")
print(FEATURES)

Loading preprocessed features...
Train shape: (590540, 791)
Test shape: (506691, 791)
Features to remove: 19
Removing 19 features
Final feature count: 772
['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66

In [3]:
# SETUP GROUPKFOLD

print("\n" + "=" * 60)
print("USING GROUPKFOLD WITH DT_M")
print("=" * 60)

# Use GroupKFold
skf = GroupKFold(n_splits=6)
folds = list(skf.split(train_df, train_df["isFraud"], groups=train_df['DT_M']))

print(f"Created {len(folds)} folds using GroupKFold")

# Print fold information like the winner
for i, (tr_idx, va_idx) in enumerate(folds):
    month = train_df.iloc[va_idx]['DT_M'].iloc[0]
    print(f'Fold {i} withholding month {month}')
    print(f' rows of train = {len(tr_idx)}, rows of holdout = {len(va_idx)}')

# Overall fraud rate
overall_fraud_rate = train_df['isFraud'].mean()
print(f"\nOverall fraud rate: {overall_fraud_rate:.4f} ({overall_fraud_rate*100:.2f}%)")


USING GROUPKFOLD WITH DT_M
Created 6 folds using GroupKFold
Fold 0 withholding month 12
 rows of train = 453219, rows of holdout = 137321
Fold 1 withholding month 15
 rows of train = 488908, rows of holdout = 101632
Fold 2 withholding month 13
 rows of train = 497955, rows of holdout = 92585
Fold 3 withholding month 17
 rows of train = 501214, rows of holdout = 89326
Fold 4 withholding month 14
 rows of train = 504519, rows of holdout = 86021
Fold 5 withholding month 16
 rows of train = 506885, rows of holdout = 83655

Overall fraud rate: 0.0350 (3.50%)


In [4]:
# ============================================================================
# LIGHTGBM PARAMETERS
# ============================================================================

# Winner's approach - parameters similar to their XGBoost but adapted for LGBM
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.007,
    'num_leaves': 256,       # 2^8
    'max_depth': -1,
    'tree_learner': 'serial',
    'colsample_bytree': 0.5,
    'subsample_freq': 1,
    'subsample': 0.7,
    'n_estimators': 10000,
    'max_bin': 255,
    'verbose': -1,
    'seed': 42,
    'early_stopping_rounds': 100,
    'force_col_wise': True,
}

print("LightGBM parameters:")
print(json.dumps(lgb_params, indent=2))

# ============================================================================
# TRAINING WITH GROUPKFOLD
# ============================================================================

print("\n" + "=" * 60)
print("TRAINING LIGHTGBM WITH GROUPKFOLD")
print("=" * 60)

models = []
oof = np.zeros(len(train_df), dtype=float)
fold_scores, fold_sizes = [], []

for fold_, (tr_idx, va_idx) in enumerate(folds):
    month = train_df.iloc[va_idx]['DT_M'].iloc[0]
    print(f'\nFold {fold_} withholding month {month}')
    print(f' rows of train = {len(tr_idx)}, rows of holdout = {len(va_idx)}')
    
    # Prepare data
    X_tr, y_tr = train_df.iloc[tr_idx][FEATURES], train_df.iloc[tr_idx]["isFraud"]
    X_va, y_va = train_df.iloc[va_idx][FEATURES], train_df.iloc[va_idx]["isFraud"]
    
    # Create datasets
    tr_data = lgb.Dataset(X_tr, label=y_tr)
    vl_data = lgb.Dataset(X_va, label=y_va)
    
    # Train model
    estimator = lgb.train(
        lgb_params,
        tr_data,
        valid_sets=[tr_data, vl_data],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=200),  # Print every 200 rounds like winner
        ]
    )
    
    # Predict validation set
    oof_preds = estimator.predict(X_va, num_iteration=estimator.best_iteration)
    oof[va_idx] = oof_preds
    
    # Calculate AUC
    auc = roc_auc_score(y_va, oof_preds)
    fold_scores.append(auc)
    fold_sizes.append(len(va_idx))
    
    print(f"Fold {fold_} AUC: {auc:.4f}, Best iteration: {estimator.best_iteration}")
    
    # Store model
    models.append(estimator)
    
    # Save individual fold model
    os.makedirs("lgbm_artifacts", exist_ok=True)
    estimator.save_model(f"lgbm_artifacts/lgbm_fold{fold_}.txt", num_iteration=estimator.best_iteration)
    
    # Memory cleanup like winner
    del X_tr, y_tr, X_va, y_va, tr_data, vl_data
    gc.collect()

# Calculate overall performance
fold_scores = np.array(fold_scores, dtype=float)
weights = np.array(fold_sizes, dtype=float) / np.sum(fold_sizes)
weighted_cv = float(np.sum(fold_scores * weights))

# OOF AUC
oof_auc = roc_auc_score(train_df["isFraud"], oof)

print('#' * 20)
print(f'LGBM OOF CV = {oof_auc:.4f}')

print("\n" + "=" * 50)
print("LIGHTGBM CV RESULTS")
print("=" * 50)
print("Fold AUCs:", np.round(fold_scores, 4))
print("Weighted mean AUC:", f"{weighted_cv:.4f}")
print("OOF AUC:", f"{oof_auc:.4f}")

LightGBM parameters:
{
  "objective": "binary",
  "boosting_type": "gbdt",
  "metric": "auc",
  "n_jobs": -1,
  "learning_rate": 0.007,
  "num_leaves": 256,
  "max_depth": -1,
  "tree_learner": "serial",
  "colsample_bytree": 0.5,
  "subsample_freq": 1,
  "subsample": 0.7,
  "n_estimators": 10000,
  "max_bin": 255,
  "verbose": -1,
  "seed": 42,
  "early_stopping_rounds": 100,
  "force_col_wise": true
}

TRAINING LIGHTGBM WITH GROUPKFOLD

Fold 0 withholding month 12
 rows of train = 453219, rows of holdout = 137321
[200]	train's auc: 0.963837	valid's auc: 0.883393
[400]	train's auc: 0.987693	valid's auc: 0.899017
[600]	train's auc: 0.995788	valid's auc: 0.905981
[800]	train's auc: 0.99856	valid's auc: 0.910027
[1000]	train's auc: 0.999511	valid's auc: 0.912859
[1200]	train's auc: 0.999838	valid's auc: 0.914792
[1400]	train's auc: 0.999948	valid's auc: 0.916426
[1600]	train's auc: 0.999985	valid's auc: 0.91741
[1800]	train's auc: 0.999996	valid's auc: 0.918059
[2000]	train's auc: 0.9999

In [5]:
# ============================================================================
# SAVE FOR ENSEMBLE
# ============================================================================

# 1. Save OOF predictions
np.save("lgbm_artifacts/lgbm_oof_predictions.npy", oof)
print(f"Saved OOF predictions shape: {oof.shape}")

# 2. Save model metadata
model_metadata = {
    "model_type": "lightgbm_groupkfold",
    "oof_auc": oof_auc,
    "weighted_cv_auc": weighted_cv,
    "fold_scores": fold_scores.tolist(),
    "fold_weights": weights.tolist(),
    "n_folds": len(folds),
    "params": lgb_params,
    "features": FEATURES,
    "n_features": len(FEATURES),
    "validation_method": "GroupKFold with DT_M",
    "best_iterations": [m.best_iteration for m in models],
    "data_source": "kyakovlev/ieee-fe-with-some-eda"
}

with open("lgbm_artifacts/lgbm_metadata.json", "w") as f:
    json.dump(model_metadata, f, indent=2)
print("Saved model metadata")

# 3. Save fold indices for reproducibility (same as XGBoost)
fold_info = {
    "folds": [(tr_idx.tolist(), va_idx.tolist()) for tr_idx, va_idx in folds],
    "validation_method": "GroupKFold",
    "n_splits": 6,
    "group_column": "DT_M"
}

with open("lgbm_artifacts/lgbm_fold_indices.json", "w") as f:
    json.dump(fold_info, f, indent=2)
print("Saved fold indices")

# 4. Save feature importance
feature_importance = pd.DataFrame({
    'feature': FEATURES,
    'importance': np.mean([m.feature_importance(importance_type='gain') for m in models], axis=0)
}).sort_values('importance', ascending=False)

feature_importance.to_csv("lgbm_artifacts/lgbm_feature_importance.csv", index=False)
print("Saved feature importance")

# ============================================================================
# GENERATE TEST PREDICTIONS AND SUBMISSION
# ============================================================================

def coerce_transaction_id(s: pd.Series) -> pd.Series:
    if pd.api.types.is_integer_dtype(s): 
        return s.astype("int64")
    if pd.api.types.is_float_dtype(s):   
        return s.round().astype("int64")
    if pd.api.types.is_string_dtype(s):
        s2 = s.str.replace(r"\.0$", "", regex=True)
        return pd.to_numeric(s2, errors="raise").astype("int64")
    return pd.to_numeric(s, errors="raise").astype("int64")

if "TransactionID" in test_df.columns:
    print("\n" + "=" * 30)
    print("GENERATING SUBMISSION")
    print("=" * 30)
    
    test_id = coerce_transaction_id(test_df["TransactionID"])
    
    # Average predictions across all fold models (like winner)
    predictions = np.zeros(len(test_df))
    
    for i, model in enumerate(models):
        fold_preds = model.predict(test_df[FEATURES], num_iteration=model.best_iteration)
        predictions += fold_preds / len(models)  # Average like winner
        print(f"Fold {i} test predictions: mean={fold_preds.mean():.4f}, std={fold_preds.std():.4f}")
    
    print(f"Final averaged predictions: mean={predictions.mean():.4f}, std={predictions.std():.4f}")
    
    # Create submission like winner's notebook
    test_predictions_df = pd.DataFrame({
        "TransactionID": test_id, 
        "isFraud": predictions
    })
    
    # Save submission
    os.makedirs("submissions", exist_ok=True)
    out_path = f"submissions/submission_lgbm_groupkfold_cv{oof_auc:.4f}.csv"
    test_predictions_df.to_csv(out_path, index=False)
    print(f"Saved submission: {out_path}")
    
    # Save test predictions for ensemble
    np.save("lgbm_artifacts/lgbm_test_predictions.npy", predictions)
    print("Saved test predictions for ensemble")
    
    # Save summary
    summary = {
        "model_name": "lgbm_groupkfold_winner_approach",
        "performance": {
            "oof_auc": oof_auc,
            "weighted_cv_auc": weighted_cv,
            "fold_aucs": fold_scores.tolist()
        },
        "data_source": "kyakovlev/ieee-fe-with-some-eda",
        "validation_method": "GroupKFold with DT_M (Winner's approach)",
        "files_saved": [
            "lgbm_oof_predictions.npy",
            "lgbm_test_predictions.npy", 
            "lgbm_metadata.json",
            "lgbm_fold_indices.json",
            "lgbm_feature_importance.csv"
        ] + [f"lgbm_fold{i}.txt" for i in range(len(folds))],
        "usage_notes": {
            "oof_predictions": "Use for stacking/blending with XGBoost and other models",
            "test_predictions": "Ready for ensemble averaging",
            "fold_models": "Load individual models for prediction",
            "same_folds_as_xgb": "Use same fold indices for proper ensemble"
        }
    }
    
    with open("lgbm_artifacts/lgbm_ensemble_summary.json", "w") as f:
        json.dump(summary, f, indent=2)
    
    print(f"\n" + "=" * 60)
    print("KEY IMPROVEMENTS MADE:")
    print("=" * 60)
    print("1. ✅ Used GroupKFold(n_splits=6)")
    print("2. ✅ Used kyakovlev's preprocessed features")
    print("3. ✅ Applied feature removal from winner's analysis")
    print("4. ✅ Used winner's parameter settings")
    print("5. ✅ Matched winner's training approach")
    print("6. ✅ Saved files for proper ensemble with XGBoost")

else:
    print("\n(No TransactionID in test_df — skipping submission build.)")

print(f"\n" + "=" * 60)
print("LGBM GROUPKFOLD COMPLETE")
print("=" * 60)
print(f"OOF AUC: {oof_auc:.4f}")

Saved OOF predictions shape: (590540,)
Saved model metadata
Saved fold indices
Saved feature importance

GENERATING SUBMISSION
Fold 0 test predictions: mean=0.0214, std=0.1162
Fold 1 test predictions: mean=0.0239, std=0.1132
Fold 2 test predictions: mean=0.0230, std=0.1147
Fold 3 test predictions: mean=0.0254, std=0.1138
Fold 4 test predictions: mean=0.0233, std=0.1151
Fold 5 test predictions: mean=0.0229, std=0.1145
Final averaged predictions: mean=0.0233, std=0.1139
Saved submission: submissions/submission_lgbm_groupkfold_cv0.9427.csv
Saved test predictions for ensemble

KEY IMPROVEMENTS MADE:
1. ✅ Used GroupKFold(n_splits=6) like winner
2. ✅ Used kyakovlev's preprocessed features
3. ✅ Applied feature removal from winner's analysis
4. ✅ Used winner's parameter settings
5. ✅ Matched winner's training approach
6. ✅ Saved files for proper ensemble with XGBoost

LGBM GROUPKFOLD COMPLETE
OOF AUC: 0.9427
This should match your XGBoost validation approach!
