# 07_oof_predictions.ipynb

This notebook generates out-of-fold (OOF) predictions for the three optimized base models:

- LightGBM
- XGBoost
- CatBoost

The goal is to create unbiased validation predictions by ensuring that each fold’s predictions are made by a model that has not seen the corresponding data during training. These OOF predictions are later used as input features for a stacked ensemble model.

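In addition to OOF generation, this notebook saves each base model's test-set predictions (averaged over the five fold models). It then fits LightGBM, CatBoost, and XGBoost meta-models on the OOF predictions and writes one stacked submission file per meta-model.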

### Data Preparation and CV Setup

In [None]:
# --- Shared setup cell for all models ---
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

from creditutils.path_utils import get_project_root

# Shared data preparation: load the engineered feature tables, split them into
# features/target, encode categoricals consistently, and define the CV splitter
# that every model cell below reuses.

# Define project paths
proj_root = get_project_root(levels_up=1)
output_dir = proj_root / "outputs"
train_path = output_dir / "03_train_features_autosearch_baseline.parquet"
test_path = output_dir / "03_test_features_autosearch_baseline.parquet"

# Load training and test data
df_train = pd.read_parquet(train_path)
df_test = pd.read_parquet(test_path)

# Separate target and features (SK_ID_CURR is an identifier, not a feature)
y = df_train["TARGET"]
X = df_train.drop(columns=["SK_ID_CURR", "TARGET"])
X_test = df_test.drop(columns=["SK_ID_CURR"])

# Convert object columns to category
for df in [X, X_test]:
    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].astype("category")

# Align category levels between train and test.
# The union is taken on the categorical levels themselves (missing values are
# never a level) and Index.union returns it sorted, so the level order -- and
# therefore the integer code behind each level -- is identical on every run.
# Building the list from a Python set of strings would make the order depend on
# string hashing and would add a spurious "nan" level.
for col in X.select_dtypes(include="category").columns:
    if col in X_test:
        test_col = X_test[col].astype("category")
        cats = X[col].cat.categories.union(test_col.cat.categories)
        X[col] = X[col].cat.set_categories(cats)
        X_test[col] = test_col.cat.set_categories(cats)

# Set up stratified 5-fold CV (fixed seed so every model sees the same folds)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### LGBM OOF

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from creditutils.path_utils import get_project_root

# LightGBM out-of-fold (OOF) predictions.
# For each CV fold a fresh model is fitted on the training part; its
# predictions fill the held-out rows of `oof_preds`, and its test-set
# predictions are averaged into `test_preds`.

# Create arrays for out-of-fold (OOF) and test predictions
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    print(f"Training fold {fold}...")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # LightGBM model with tuned parameters
    model = lgb.LGBMClassifier(
        boosting_type="goss",  # Use Gradient-based One-Side Sampling
        learning_rate=0.0031335727235880005,
        max_depth=13,
        num_leaves=100,
        min_child_samples=191,
        reg_alpha=0.037036865048284115,
        reg_lambda=0.0021849671286405664,
        subsample=0.8476213315586094,
        colsample_bytree=0.444071532621635,
        n_estimators=5339,
        random_state=fold,  # a different seed per fold
        n_jobs=3
    )

    # Train and validate. The eval_set is for monitoring only: no early
    # stopping is configured here, so all n_estimators trees are built.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="auc")

    # Save predictions: held-out rows get this fold's prediction, and the test
    # prediction is the mean over the fold models.
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    test_preds += model.predict_proba(X_test)[:, 1] / cv.n_splits

# Calculate overall OOF AUC
oof_auc = roc_auc_score(y, oof_preds)
print(f"\nOOF AUC: {oof_auc:.5f}")

# Save predictions to disk.
# The output frames are built with `assign`, which returns a new DataFrame.
# Writing the prediction columns into df_train / df_test would make them part
# of every feature matrix derived from those frames in later cells (and under
# different names for train and test).
lgb_oof_out = df_train[["SK_ID_CURR", "TARGET"]].assign(OOF_LGB=oof_preds)
lgb_test_out = df_test[["SK_ID_CURR"]].assign(PRED_LGB=test_preds)

lgb_oof_out.to_csv(output_dir / "oof_lgb.csv", index=False)
lgb_test_out.to_csv(output_dir / "pred_test_lgb.csv", index=False)

print("Saved to:")
print("- oof_lgb.csv")
print("- pred_test_lgb.csv")

Training fold 0...
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014930 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12076
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 62
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training fold 1...
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014939 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12078
[LightGBM] [Info] Number of data points

### CatBoost OOF

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score

# CatBoost out-of-fold (OOF) predictions.
# Same fold scheme as the other base models: fit on the training part of each
# fold, predict the held-out part, and average the test predictions.

# Select the feature columns explicitly (the columns of the shared feature
# matrix X) instead of "everything except ID and target". That way a
# prediction column that an earlier cell added to df_train / df_test can never
# become a model input. `.copy()` keeps the original frames untouched.
feature_cols = list(X.columns)
cat_X = df_train[feature_cols].copy()
cat_y = df_train["TARGET"]
cat_X_test = df_test[feature_cols].copy()

# Slim frames that carry only the identifier/target columns needed for saving
cat_df_train = df_train[["SK_ID_CURR", "TARGET"]].copy()
cat_df_test = df_test[["SK_ID_CURR"]].copy()

# Identify categorical features
cat_features = cat_X.select_dtypes(include=["object", "category"]).columns.tolist()

# Convert categories to string (CatBoost requires this format)
for col in cat_features:
    cat_X[col] = cat_X[col].astype(str)
    cat_X_test[col] = cat_X_test[col].astype(str)

# Create arrays for OOF and test predictions
cat_oof_preds = np.zeros(len(cat_X))
cat_test_preds = np.zeros(len(cat_X_test))

# The test pool does not depend on the fold, so build it once
test_pool = Pool(cat_X_test, cat_features=cat_features)

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(cv.split(cat_X, cat_y)):
    print(f"Training CatBoost fold {fold}...")
    X_train, y_train = cat_X.iloc[train_idx], cat_y.iloc[train_idx]
    X_val, y_val = cat_X.iloc[val_idx], cat_y.iloc[val_idx]

    # Create CatBoost Pool objects
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)

    # Initialize CatBoost model with tuned hyperparameters
    cat_model = CatBoostClassifier(
        iterations=1383,
        learning_rate=0.042326510977740595,
        depth=5,
        l2_leaf_reg=5.551455325485743,
        bagging_temperature=0.6584052675829963,
        border_count=72,
        auto_class_weights='Balanced',
        eval_metric='AUC',
        random_seed=42,
        verbose=0,
        early_stopping_rounds=75,
        task_type='CPU',
        thread_count=-1
    )

    # Train and validate. NOTE(review): the held-out fold is also the
    # early-stopping eval set, so the OOF score may be slightly optimistic.
    cat_model.fit(train_pool, eval_set=val_pool)
    cat_oof_preds[val_idx] = cat_model.predict_proba(val_pool)[:, 1]
    cat_test_preds += cat_model.predict_proba(test_pool)[:, 1] / cv.n_splits

# Compute OOF AUC
cat_auc = roc_auc_score(cat_y, cat_oof_preds)
print(f"\nOOF AUC (CatBoost): {cat_auc:.5f}")

# Save predictions to disk
cat_df_train["OOF_CAT"] = cat_oof_preds
cat_df_test["PRED_CAT"] = cat_test_preds

cat_df_train[["SK_ID_CURR", "TARGET", "OOF_CAT"]].to_csv(output_dir / "oof_cat.csv", index=False)
cat_df_test[["SK_ID_CURR", "PRED_CAT"]].to_csv(output_dir / "pred_test_cat.csv", index=False)

print("Saved to:")
print("- oof_cat.csv")
print("- pred_test_cat.csv")

Training fold 0...
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...

OOF AUC (CatBoost): 0.79103
Saved to:
- oof_cat.csv
- pred_test_cat.csv


### XGBoost OOF

In [None]:
from xgboost import XGBClassifier

# XGBoost out-of-fold (OOF) predictions.
# Same fold scheme as the other base models: fit on the training part of each
# fold, predict the held-out part, and average the test predictions.

# Reuse the shared feature matrices from the setup cell. They contain only
# feature columns (no ID, target, or saved prediction columns) and their
# categorical columns already share one category list between train and test.
# Casting train and test to "category" independently would give each frame its
# own level list, so the same value could get different integer codes in the
# two frames. NOTE(review): XGBoost may consume those codes directly, depending
# on version -- confirm against the installed release.
# Nothing below mutates these frames, so no copy is needed.
xgb_X = X
xgb_y = y
xgb_X_test = X_test

# Categorical features (already category dtype; kept for reference)
xgb_cat_features = xgb_X.select_dtypes(include=["object", "category"]).columns.tolist()

# Arrays for OOF and test predictions
xgb_oof_preds = np.zeros(len(xgb_X))
xgb_test_preds = np.zeros(len(xgb_X_test))

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(cv.split(xgb_X, xgb_y)):
    print(f"XGBoost fold {fold}")
    X_train, y_train = xgb_X.iloc[train_idx], xgb_y.iloc[train_idx]
    X_val, y_val = xgb_X.iloc[val_idx], xgb_y.iloc[val_idx]

    # Initialize XGBoost model with tuned hyperparameters.
    # `use_label_encoder` is omitted: the installed XGBoost reports it as unused.
    xgb_model = XGBClassifier(
        n_estimators=1042,
        learning_rate=0.05558810799284791,
        max_depth=3,
        subsample=0.8787759145726666,
        colsample_bytree=0.9105365550107795,
        gamma=3.633742017324177,
        reg_alpha=3.5602493930649466,
        reg_lambda=2.183731116122563,
        scale_pos_weight=11.387150050352467,
        enable_categorical=True,
        eval_metric='auc',
        tree_method='hist',
        early_stopping_rounds=75,
        random_state=42,
        n_jobs=3
    )

    # Train and validate. NOTE(review): the held-out fold is also the
    # early-stopping eval set, so the OOF score may be slightly optimistic.
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
    xgb_oof_preds[val_idx] = xgb_model.predict_proba(X_val)[:, 1]
    xgb_test_preds += xgb_model.predict_proba(xgb_X_test)[:, 1] / cv.n_splits

# Compute OOF AUC
xgb_auc = roc_auc_score(xgb_y, xgb_oof_preds)
print(f"\nOOF AUC (XGBoost): {xgb_auc:.5f}")

# Save predictions to disk (new frames; df_train / df_test are not modified)
xgb_df_train = df_train[["SK_ID_CURR", "TARGET"]].assign(OOF_XGB=xgb_oof_preds)
xgb_df_test = df_test[["SK_ID_CURR"]].assign(PRED_XGB=xgb_test_preds)

xgb_df_train[["SK_ID_CURR", "TARGET", "OOF_XGB"]].to_csv(output_dir / "oof_xgb.csv", index=False)
xgb_df_test[["SK_ID_CURR", "PRED_XGB"]].to_csv(output_dir / "pred_test_xgb.csv", index=False)

print("Saved to:")
print("- oof_xgb.csv")
print("- pred_test_xgb.csv")


XGB Fold 0


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


XGB Fold 1


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


XGB Fold 2


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


XGB Fold 3


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


XGB Fold 4


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()



OOF AUC (XGBoost): 0.78907


### Stacked LGBM Model

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# Stacked ensemble with a LightGBM meta-model.
# The three base models' OOF predictions are the meta-features for training;
# their fold-averaged test predictions are the meta-features for the test set.

# Load out-of-fold (OOF) predictions
oof_lgb = pd.read_csv(output_dir / "oof_lgb.csv")
oof_cat = pd.read_csv(output_dir / "oof_cat.csv")
oof_xgb = pd.read_csv(output_dir / "oof_xgb.csv")

# Load test predictions
pred_lgb = pd.read_csv(output_dir / "pred_test_lgb.csv")
pred_cat = pd.read_csv(output_dir / "pred_test_cat.csv")
pred_xgb = pd.read_csv(output_dir / "pred_test_xgb.csv")

# The meta-features are combined by row position, so the files must list the
# same IDs (and targets) in the same order. Fail loudly if they do not.
for other_oof in (oof_cat, oof_xgb):
    assert oof_lgb["SK_ID_CURR"].equals(other_oof["SK_ID_CURR"]), "OOF files are not row-aligned"
    assert oof_lgb["TARGET"].equals(other_oof["TARGET"]), "OOF files disagree on TARGET"
for other_pred in (pred_cat, pred_xgb):
    assert pred_lgb["SK_ID_CURR"].equals(other_pred["SK_ID_CURR"]), "Test prediction files are not row-aligned"

# Construct meta training set
meta_X = pd.DataFrame({
    "lgb": oof_lgb["OOF_LGB"],
    "cat": oof_cat["OOF_CAT"],
    "xgb": oof_xgb["OOF_XGB"]
})
meta_y = oof_lgb["TARGET"]

# Construct meta test set
meta_X_test = pd.DataFrame({
    "lgb": pred_lgb["PRED_LGB"],
    "cat": pred_cat["PRED_CAT"],
    "xgb": pred_xgb["PRED_XGB"]
})

# Define LightGBM as meta-model
meta_model = lgb.LGBMClassifier(
    learning_rate=0.01,
    n_estimators=500,
    max_depth=3,
    num_leaves=8,
    random_state=42,
    n_jobs=-1
)

# Evaluate meta-model out of fold.
# cross_val_predict fits a clone of the meta-model on each training split and
# predicts only the held-out rows, so every row is scored by a model that did
# not see it. Scoring a model on the rows it was fitted on would report the
# training AUC instead.
meta_oof_pred = cross_val_predict(
    meta_model, meta_X, meta_y, cv=cv, method="predict_proba"
)[:, 1]
meta_auc = roc_auc_score(meta_y, meta_oof_pred)
print(f"Meta-Model OOF AUC (LGBM): {meta_auc:.5f}")

# Train the final meta-model on all OOF predictions
meta_model.fit(meta_X, meta_y)

# Predict on test set
meta_test_pred = meta_model.predict_proba(meta_X_test)[:, 1]

# Save final stacked submission
submission_dir = proj_root / "submissions"
submission_dir.mkdir(exist_ok=True)

submission_stacked = pd.DataFrame({
    "SK_ID_CURR": pred_lgb["SK_ID_CURR"],
    "TARGET": meta_test_pred
})
submission_stacked.to_csv(submission_dir / "submission_stacked_lgb.csv", index=False)

print("Saved to: submission_stacked_lgb.csv")

[LightGBM] [Info] Number of positive: 24825, number of negative: 282686
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 307511, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Meta-Model OOF AUC (LGBM): 0.79503
Saved to: submission_stacked_lgb.csv


### Stacked CatBoost Model

In [None]:
from catboost import CatBoostClassifier

# Stacked ensemble with a CatBoost meta-model.
# Reuses meta_X / meta_y / meta_X_test, pred_lgb, and submission_dir from the
# stacked LightGBM cell, and the shared `cv` splitter from the setup cell.

# Define CatBoost as meta-model
meta_model_cat = CatBoostClassifier(
    learning_rate=0.01,
    iterations=500,
    depth=3,
    l2_leaf_reg=3.0,
    random_seed=42,
    verbose=0,
    task_type="CPU"
)

# Evaluate CatBoost meta-model out of fold.
# cross_val_predict fits a clone on each training split and predicts only the
# held-out rows; scoring the fitted model on its own training rows would
# report the training AUC rather than an OOF AUC.
meta_oof_pred_cat = cross_val_predict(
    meta_model_cat, meta_X, meta_y, cv=cv, method="predict_proba"
)[:, 1]
meta_auc_cat = roc_auc_score(meta_y, meta_oof_pred_cat)
print(f"Meta-Model OOF AUC (CatBoost): {meta_auc_cat:.5f}")

# Train the final meta-model on all OOF predictions
meta_model_cat.fit(meta_X, meta_y)

# Predict on test set
meta_test_pred_cat = meta_model_cat.predict_proba(meta_X_test)[:, 1]

# Save CatBoost-based stacked submission
submission_cat = pd.DataFrame({
    "SK_ID_CURR": pred_lgb["SK_ID_CURR"],
    "TARGET": meta_test_pred_cat
})
submission_cat.to_csv(submission_dir / "submission_stacked_cat.csv", index=False)

print("Saved to: submission_stacked_cat.csv")

Meta-Model OOF AUC (CatBoost): 0.79361
Saved to: submission_stacked_cat.csv


### Stacked XGBoost Model

In [None]:
from xgboost import XGBClassifier

# Stacked ensemble with an XGBoost meta-model.
# Reuses meta_X / meta_y / meta_X_test, pred_lgb, and submission_dir from the
# stacked LightGBM cell, and the shared `cv` splitter from the setup cell.

# Define XGBoost as meta-model.
# `use_label_encoder` is omitted: the installed XGBoost reports it as unused.
meta_model_xgb = XGBClassifier(
    learning_rate=0.01,
    n_estimators=500,
    max_depth=3,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="auc",
    random_state=42,
    n_jobs=-1
)

# Evaluate XGBoost meta-model out of fold.
# cross_val_predict fits a clone on each training split and predicts only the
# held-out rows; scoring the fitted model on its own training rows would
# report the training AUC rather than an OOF AUC.
meta_oof_pred_xgb = cross_val_predict(
    meta_model_xgb, meta_X, meta_y, cv=cv, method="predict_proba"
)[:, 1]
meta_auc_xgb = roc_auc_score(meta_y, meta_oof_pred_xgb)
print(f"Meta-Model OOF AUC (XGBoost): {meta_auc_xgb:.5f}")

# Train the final meta-model on all OOF predictions
meta_model_xgb.fit(meta_X, meta_y)

# Predict on test set
meta_test_pred_xgb = meta_model_xgb.predict_proba(meta_X_test)[:, 1]

# Save XGBoost-based stacked submission
submission_xgb = pd.DataFrame({
    "SK_ID_CURR": pred_lgb["SK_ID_CURR"],
    "TARGET": meta_test_pred_xgb
})
submission_xgb.to_csv(submission_dir / "submission_stacked_xgb.csv", index=False)

print("Saved to: submission_stacked_xgb.csv")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Meta-Model OOF AUC (XGBoost): 0.79487
Saved to: submission_stacked_xgb.csv
