# 05_auto_feature_search.ipynb

---

## 1. Setup project paths and load base feature set

Set up the project root path and load the previously engineered baseline feature set.


In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from creditutils.path_utils import get_project_root

# Determine project root (one level above notebooks/)
proj_root = get_project_root(levels_up=1)

# Define paths using pathlib
DATA_DIR = proj_root / "data"
OUTPUT_DIR = proj_root / "outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

## 2. Load raw data and define EntitySet for Featuretools

Load all relevant tables and define relationships between them for Deep Feature Synthesis.


In [2]:
import featuretools as ft

# Load all necessary data
df_app_train = pd.read_parquet(DATA_DIR / "application_train.parquet")
df_app_test  = pd.read_parquet(DATA_DIR / "application_test.parquet")
df_app_test["TARGET"] = np.nan  # falls nötig
df_app = pd.concat([df_app_train, df_app_test], ignore_index=True)
df_prev   = pd.read_parquet(DATA_DIR / "previous_application.parquet")
df_bureau = pd.read_parquet(DATA_DIR / "bureau.parquet")
df_bbal   = pd.read_parquet(DATA_DIR / "bureau_balance.parquet")
df_inst   = pd.read_parquet(DATA_DIR / "installments_payments.parquet")
df_cc_bal = pd.read_parquet(DATA_DIR / "credit_card_balance.parquet")
df_pos    = pd.read_parquet(DATA_DIR / "POS_CASH_balance.parquet")


# Create EntitySet
es = ft.EntitySet(id="homecredit_full")

# Add dataframes with explicit named arguments
es = es.add_dataframe(
    dataframe_name="app",
    dataframe=df_app,
    index="SK_ID_CURR"
)
es = es.add_dataframe(
    dataframe_name="previous",
    dataframe=df_prev,
    make_index=True,
    index="PREV_INDEX",
    time_index="DAYS_DECISION"
)
es = es.add_relationship(
    parent_dataframe_name="app",
    parent_column_name="SK_ID_CURR",
    child_dataframe_name="previous",
    child_column_name="SK_ID_CURR"
)

es = es.add_dataframe(
    dataframe_name="bureau",
    dataframe=df_bureau,
    index="SK_ID_BUREAU"
)
es = es.add_dataframe(
    dataframe_name="bureau_balance",
    dataframe=df_bbal,
    make_index=True,
    index="BBAL_INDEX"
)
es = es.add_relationship(
    parent_dataframe_name="bureau",
    parent_column_name="SK_ID_BUREAU",
    child_dataframe_name="bureau_balance",
    child_column_name="SK_ID_BUREAU"
)
es = es.add_relationship(
    parent_dataframe_name="app",
    parent_column_name="SK_ID_CURR",
    child_dataframe_name="bureau",
    child_column_name="SK_ID_CURR"
)

es = es.add_dataframe(
    dataframe_name="inst",
    dataframe=df_inst,
    make_index=True,
    index="INST_INDEX",
    time_index="DAYS_ENTRY_PAYMENT"
)
es = es.add_relationship(
    parent_dataframe_name="app",
    parent_column_name="SK_ID_CURR",
    child_dataframe_name="inst",
    child_column_name="SK_ID_CURR"
)

es = es.add_dataframe(
    dataframe_name="ccbal",
    dataframe=df_cc_bal,
    make_index=True,
    index="CC_INDEX",
    time_index="MONTHS_BALANCE"
)
es = es.add_relationship(
    parent_dataframe_name="app",
    parent_column_name="SK_ID_CURR",
    child_dataframe_name="ccbal",
    child_column_name="SK_ID_CURR"
)

es = es.add_dataframe(
    dataframe_name="pos",
    dataframe=df_pos,
    make_index=True,
    index="POS_INDEX",
    time_index="MONTHS_BALANCE"
)
es = es.add_relationship(
    parent_dataframe_name="app",
    parent_column_name="SK_ID_CURR",
    child_dataframe_name="pos",
    child_column_name="SK_ID_CURR"
)

  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_da

## 3. Perform Deep Feature Synthesis (DFS)

Use Featuretools to automatically generate aggregated features across related tables. The `max_depth` parameter is set to 2, as 3 was way to RAM consuming and 2 in the most cases two is enough


In [3]:
# Feature synthesis with Featuretools
feature_matrix_full, feature_defs_full = ft.dfs(
    entityset=es,
    target_dataframe_name="app",
    agg_primitives=["mean", "sum", "count", "max", "min", "std"],
    trans_primitives=[],
    max_depth=2,
    n_jobs=1
)

# Reset index to keep SK_ID_CURR as a column
feature_matrix_full = feature_matrix_full.reset_index()
print("DFS feature matrix shape:", feature_matrix_full.shape)

# Reconstruct train/test IDs from original application data
train_ids = df_app_train["SK_ID_CURR"]
test_ids  = df_app_test["SK_ID_CURR"]

# Drop TARGET if DFS accidentally retained it (safety check)
if "TARGET" in feature_matrix_full.columns:
    feature_matrix_full = feature_matrix_full.drop(columns="TARGET")

# Split feature matrix into train and test
dfs_train = feature_matrix_full[feature_matrix_full["SK_ID_CURR"].isin(train_ids)].copy()
dfs_test  = feature_matrix_full[feature_matrix_full["SK_ID_CURR"].isin(test_ids)].copy()

# Merge TARGET back into train set (from original app data)
dfs_train = dfs_train.merge(df_app_train[["SK_ID_CURR", "TARGET"]], on="SK_ID_CURR", how="left")

# Save
dfs_train.to_parquet(OUTPUT_DIR / "dfs_features_train.parquet", index=False)
dfs_test.to_parquet(OUTPUT_DIR / "dfs_features_test.parquet", index=False)

print("Saved DFS train shape:", dfs_train.shape)
print("Saved DFS test shape:", dfs_test.shape)

  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


DFS feature matrix shape: (356255, 548)
Saved DFS train shape: (307511, 548)
Saved DFS test shape: (48744, 547)


## 4. Merge manually selected and auto-generated features

Clean column names and merge both feature sets into a single dataframe.

.parquet lead to problems for the baseline LGBM-model a cell below, so the `02_train_features_auto_all_and_first_baseline` will be saved as .csv


In [None]:
import re
import pandas as pd
from creditutils.path_utils import get_project_root

# Get project root and define file paths
proj_root = get_project_root(levels_up=1)
output_dir = proj_root / "outputs"

# Input paths
baseline_train_file = output_dir / "01_train_features_first_baseline.parquet"
baseline_test_file  = output_dir / "01_test_features_first_baseline.parquet"
dfs_train_file      = output_dir / "dfs_features_train.parquet"
dfs_test_file       = output_dir / "dfs_features_test.parquet"

# Output paths
out_train_file = output_dir / "02_train_features_auto_all_and_first_baseline.csv"
out_test_file  = output_dir / "02_test_features_auto_all_and_first_baseline.csv"

# Load data
df_base_train = pd.read_parquet(baseline_train_file)
df_base_test  = pd.read_parquet(baseline_test_file)
df_dfs_train  = pd.read_parquet(dfs_train_file)
df_dfs_test   = pd.read_parquet(dfs_test_file)

# Sanitize column names in DFS (except SK_ID_CURR)
def sanitize(col):
    s = re.sub(r'[^0-9A-Za-z_]', '_', col)
    return re.sub(r'__+', '_', s).strip('_')

df_dfs_train = df_dfs_train.drop(columns=[c for c in df_dfs_train.columns if c.upper() == "TARGET"])
df_dfs_train.columns = [
    c if c == "SK_ID_CURR" else sanitize(c)
    for c in df_dfs_train.columns
]

df_dfs_test.columns = [
    c if c == "SK_ID_CURR" else sanitize(c)
    for c in df_dfs_test.columns
]

# Merge baseline with DFS
df_comb_train = df_base_train.merge(df_dfs_train, on="SK_ID_CURR", how="left")
df_comb_test  = df_base_test.merge(df_dfs_test,  on="SK_ID_CURR", how="left")

# Fill NaNs only in numeric columns
for df in [df_comb_train, df_comb_test]:
    numeric_cols = df.select_dtypes(include=["number"]).columns
    df[numeric_cols] = df[numeric_cols].fillna(0)

# Save as CSV
df_comb_train.to_csv(out_train_file, index=False)
df_comb_test.to_csv(out_test_file, index=False)
print(f"Combined TRAIN saved to: {out_train_file}")
print(f"Combined TEST  saved to: {out_test_file}")

KeyboardInterrupt: 

## 5. Train LightGBM on combined features

Evaluate model performance using cross-validation.

> Attention! This step is very RAM intensive, so you can skip to the next cell where the features are reduced if your machine can't execute this cell


In [None]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from creditutils.feature_selection_utils import evaluate_auc

df = pd.read_csv(out_train_file)

# Convert object columns to category
for c in df.select_dtypes(include=['object']).columns:
    df[c] = df[c].astype('category')

y = df['TARGET']
X = df.drop(columns=['SK_ID_CURR','TARGET'])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = lgb.LGBMClassifier(random_state=42)

# 5 fold cross-validation funciton from utils
evaluate_auc(X, y, model, cv, name="Baseline Combined Features")

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.353435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109516
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 577
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.436757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109698
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 576
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
[L

(0.7848216027046654, 0.003963733071328941)

Baseline and all unfiltered auto search features: AUC: 0.78482

## 6. Select features using the Elbow Method

Train a model and use the elbow point of cumulative feature importances to retain the most relevant features.

In [17]:
from kneed import KneeLocator

# Train model on full feature set
model.fit(X, y)
importances = pd.Series(model.feature_importances_, index=X.columns)

# Sort and compute cumulative importance
imp_sorted = importances.sort_values(ascending=False)
cum_imp = imp_sorted.cumsum() / imp_sorted.sum()

# Find elbow point
kl = KneeLocator(
    x=list(range(len(cum_imp))),
    y=cum_imp.values,
    curve="concave",
    direction="increasing"
)
cutoff = kl.knee
print(f"Elbow at rank {cutoff}, cumulative importance: {cum_imp.values[cutoff]:.2%}")

# Select features up to elbow
selected_feats = imp_sorted.index[: cutoff + 1].tolist()
X_imp = X[selected_feats]
print("Shape after elbow filtering:", X_imp.shape)

# Evaluate
evaluate_auc(X_imp, y, model, cv, name="After Elbow Importance Filter")

[LightGBM] [Info] Number of positive: 24825, number of negative: 282686
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.107596 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25618
[LightGBM] [Info] Number of data points in the train set: 307511, number of used features: 121
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Elbow at rank 60, cumulative importance: 71.91%
Shape after elbow filtering: (307511, 61)
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14806
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 61
[LightGBM] [Info] Us

KeyboardInterrupt: 

Importance Filtered AUC: 0.78499

## 7. Remove highly correlated features

Drop one feature from each pair of highly correlated features based on importance.

In [None]:
# Only use numeric features
X_num = X_imp.select_dtypes(include=[np.number])

# Absolute correlation matrix
corr_matrix = X_num.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Identify pairs above threshold
drop_info = []
threshold = 0.90
for col in upper.columns:
    for other in upper.index[upper[col] > threshold]:
        corr_val = upper.at[other, col]
        if importances[col] < importances[other]:
            drop_info.append((col, other, corr_val))
        else:
            drop_info.append((other, col, corr_val))

to_drop = list({feat for feat, _, _ in drop_info})

print("Dropping the following highly correlated features:")
for feat, corr_with, corr_val in drop_info:
    print(f"- {feat} (corr={corr_val:.2f}) with {corr_with}")

X_corr = X_imp.drop(columns=to_drop)
print("Shape after correlation filtering:", X_corr.shape)

evaluate_auc(X_corr, y, model, cv, name="After Correlation Filter")

Dropping the following highly correlated features:
- AMT_CREDIT (corr=0.99) with AMT_GOODS_PRICE_x
- STD_pos_MONTHS_BALANCE (corr=0.91) with STD_inst_DAYS_INSTALMENT
- EXT_SOURCE_2_x (corr=0.99) with EXT_SOURCE_2_y
- MEAN_pos_SK_DPD_DEF (corr=0.97) with STD_pos_SK_DPD_DEF
- MAX_previous_AMT_DOWN_PAYMENT (corr=0.91) with SUM_previous_AMT_DOWN_PAYMENT
- STD_inst_DAYS_ENTRY_PAYMENT (corr=1.00) with STD_inst_DAYS_INSTALMENT
- STD_inst_DAYS_ENTRY_PAYMENT (corr=0.91) with STD_pos_MONTHS_BALANCE
- SUM_bureau_AMT_CREDIT_MAX_OVERDUE (corr=0.95) with MEAN_bureau_AMT_CREDIT_MAX_OVERDUE
- SUM_inst_AMT_INSTALMENT (corr=0.98) with SUM_inst_AMT_PAYMENT
- bur_DEBT_LIMIT_RATIO_MEAN (corr=0.96) with MEAN_bureau_AMT_CREDIT_SUM_DEBT
- SUM_previous_DAYS_LAST_DUE_1ST_VERSION (corr=1.00) with MAX_previous_DAYS_LAST_DUE_1ST_VERSION
- STD_inst_NUM_INSTALMENT_NUMBER (corr=0.97) with MAX_inst_NUM_INSTALMENT_NUMBER
- MAX_pos_CNT_INSTALMENT (corr=0.95) with STD_pos_CNT_INSTALMENT_FUTURE
- bur_CREDIT_COUNT_SUM (cor

(0.7847081958491091, 0.004211001759322028)

Correlation Filtered AUC: 0.78470

## 8. SHAP-based Recursive Feature Elimination

Drop least important features based on SHAP values and evaluate AUC after each step.


In [None]:
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Prepare data
X = X_corr.copy()
y = df['TARGET']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# SHAP helper function
def compute_shap_values(model, X_background, X_target):
    explainer = shap.TreeExplainer(model)
    shap_vals = explainer.shap_values(X_target)
    return shap_vals[1] if isinstance(shap_vals, list) else shap_vals

model = lgb.LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

shap_values = compute_shap_values(model, X_train, X_test)

# Plot summary
#shap.summary_plot(shap_values, X_test, plot_type="bar", max_display=shap_values.shape[1])

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.485620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 126735
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 729
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482




## 9. Iteratively eliminate least important SHAP features

Evaluate performance as features are dropped based on SHAP ranking.


In [None]:
from sklearn.model_selection import StratifiedKFold
from creditutils.feature_selection_utils import evaluate_auc

def iteratively_drop_least_important_features(X, y, shap_vals, max_drop):
    mean_shap = np.abs(shap_vals).mean(axis=0)
    feature_ranking = pd.Series(mean_shap, index=X.columns).sort_values()

    model = lgb.LGBMClassifier(random_state=42, verbosity=-1)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    print("Iterative SHAP Feature Elimination (CV-AUC):")
    print("n_dropped | AUC      | STD     | Dropped Features")
    print("--------------------------------------------------------------")

    for n in range(1, max_drop + 1):
        to_drop = feature_ranking.index[:n].tolist()
        X_reduced = X.drop(columns=to_drop)
        mean_auc, std_auc = evaluate_auc(X_reduced, y, model, cv, verbose=False)
        print(f"{n:<9} | {mean_auc:.5f} | {std_auc:.5f} | {', '.join(to_drop)}")

# Run for up to 100 dropped features
iteratively_drop_least_important_features(X, y, shap_values, max_drop=100)

Iterative SHAP Feature Elimination (CV-AUC):
n_dropped | AUC      | STD     | Dropped Features
--------------------------------------------------------------
1         | 0.78401 | 0.00418 | MIN_previous_NFLAG_INSURED_ON_APPROVAL
2         | 0.78401 | 0.00418 | MIN_previous_NFLAG_INSURED_ON_APPROVAL, FLAG_DOCUMENT_20_y
3         | 0.78401 | 0.00418 | MIN_previous_NFLAG_INSURED_ON_APPROVAL, FLAG_DOCUMENT_20_y, FLAG_DOCUMENT_19_y
4         | 0.78397 | 0.00414 | MIN_previous_NFLAG_INSURED_ON_APPROVAL, FLAG_DOCUMENT_20_y, FLAG_DOCUMENT_19_y, FLAG_DOCUMENT_18_y
5         | 0.78401 | 0.00418 | MIN_previous_NFLAG_INSURED_ON_APPROVAL, FLAG_DOCUMENT_20_y, FLAG_DOCUMENT_19_y, FLAG_DOCUMENT_18_y, FLAG_DOCUMENT_17_y
6         | 0.78401 | 0.00418 | MIN_previous_NFLAG_INSURED_ON_APPROVAL, FLAG_DOCUMENT_20_y, FLAG_DOCUMENT_19_y, FLAG_DOCUMENT_18_y, FLAG_DOCUMENT_17_y, FLAG_DOCUMENT_16_y
7         | 0.78422 | 0.00450 | MIN_previous_NFLAG_INSURED_ON_APPROVAL, FLAG_DOCUMENT_20_y, FLAG_DOCUMENT_19_y, FLAG

Dropping the 40 least important features (based on SHAP values) yielded the highest AUC of **0.78581**.  
However, I ultimately decided to drop **59 features**, as the benefit of further dimensionality reduction and improved interpretability was prioritized over achieving the absolute maximum AUC. This has still a very good AUC of **0.78547**.

So finally, the **total number of features** is **62**

## 10. Save final feature set after SHAP-RFE

Drop 56 least relevant features based on SHAP and save new baseline feature set.


In [12]:
# Compute mean absolute SHAP values and sort ascending
mean_shap = np.abs(shap_values).mean(axis=0)
shap_importance = pd.Series(mean_shap, index=X.columns).sort_values()

# Drop the 59 least important features
features_to_drop = shap_importance.index[:59].tolist()
X_final = X.drop(columns=features_to_drop)
df_final = df[["SK_ID_CURR", "TARGET"]].join(X_final)

# Save final train feature set
out_path_train = OUTPUT_DIR / "03_train_features_autosearch_baseline.parquet"
df_final.to_parquet(out_path_train, index=False)
print(f"Saved to {out_path_train}, shape: {df_final.shape}")

# --- Filter test features to match X_final ---
# Load full test set
test_path_full = OUTPUT_DIR / "02_test_features_auto_all_and_first_baseline.csv"
test_df = pd.read_csv(test_path_full)

# Keep only SK_ID_CURR + final feature columns
final_feature_cols = X_final.columns.tolist()
test_df_final = test_df[["SK_ID_CURR"] + final_feature_cols].copy()

# Save filtered test set
out_path_test = OUTPUT_DIR / "03_test_features_autosearch_baseline.parquet"
test_df_final.to_parquet(out_path_test, index=False)
print(f"Saved to {out_path_test}, shape: {test_df_final.shape}")

# Evaluate performance
mean_auc, std_auc = evaluate_auc(X_final, y, model, cv, name="Final SHAP-RFE Feature Baseline")

Saved to C:\Users\tgruenecker\OneDrive\Desktop\Master_Studium\3. Semester\Home_Credit_Projekt\Home_Credit_Project\outputs\03_train_features_autosearch_baseline.parquet, shape: (307511, 64)
Saved to C:\Users\tgruenecker\OneDrive\Desktop\Master_Studium\3. Semester\Home_Credit_Projekt\Home_Credit_Project\outputs\03_test_features_autosearch_baseline.parquet, shape: (48744, 63)
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12080
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing col-wise multi-threadin

## 11. Log final model and features to Weights & Biases

Track experiment metrics and selected features for reproducibility.


In [None]:
import wandb

# Initialize wandb run
wandb.init(
    project="home_credit_default",
    name="03_autosearch_baseline",
    config={
        "model": "lightgbm",
        "selection": "SHAP + Elbow + Correlation",
    }
)

# Log scalar metrics
wandb.log({
    "auc": mean_auc,
    "cv_std": std_auc,
    "n_features": X_final.shape[1],
})

# Log feature names as summary
wandb.summary["feature_names"] = X_final.columns.tolist()

wandb.finish()