In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix

# Project root is taken to be the parent of the current working directory.
# It is appended to sys.path once (presumably so the `credit_risk_xai`
# package imported below can be found - confirm against the repo layout).
PROJ_ROOT = Path.cwd().parent
_proj_root_entry = str(PROJ_ROOT)
if _proj_root_entry not in sys.path:
    sys.path.append(_proj_root_entry)

from credit_risk_xai.config import FEATURE_CACHE_PATH
from credit_risk_xai.features.engineer import prepare_modeling_data

In [None]:
# Load the cached feature table, keep rows with ser_aktiv == 1 and
# rr01_ntoms at or above MIN_REVENUE_KSEK, then build the modeling matrices.
MIN_REVENUE_KSEK = 1_000
df = pd.read_parquet(FEATURE_CACHE_PATH)
is_active = df["ser_aktiv"] == 1
meets_revenue_floor = df["rr01_ntoms"] >= MIN_REVENUE_KSEK
df = df[is_active & meets_revenue_floor]
X, y = prepare_modeling_data(df)

# Quick sanity summary: feature count, sample count, and class balance.
print(f"Features: {X.shape[1]} | Samples: {len(X):,}")
print(f"Target distribution:\n{y.value_counts()}")
imbalance_ratio = (y == 0).sum() / (y == 1).sum()
print(f"Imbalance: {imbalance_ratio:.1f}:1")

In [None]:
# Hold out 20% of the rows for validation; the split is stratified on the
# target and seeded so it is reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Train: {len(X_train):,} | Val: {len(X_val):,}")

In [None]:
# Train LightGBM
# Weight the positive class by the negative/positive count ratio of the
# training split to compensate for class imbalance.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
model = lgb.LGBMClassifier(
    n_estimators=10000,  # upper bound only; early stopping below decides the real count
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    # LightGBM only applies `subsample` (bagging_fraction) when the bagging
    # frequency is > 0; the default of 0 silently disables row subsampling.
    subsample_freq=1,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Evaluate on both the training and validation splits with AUC; stop after 50
# rounds without improvement and log the metrics every 100 iterations.
# NOTE(review): early_stopping checks every evaluated metric unless
# first_metric_only=True - confirm that AUC alone should drive stopping.
model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(50, verbose=True), lgb.log_evaluation(100)]
)

print(f"\nBest iteration: {model.best_iteration_}")

In [None]:
# Score the validation split: positive-class probabilities feed the ranking
# metrics, hard label predictions feed the report and confusion matrix.
y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

pr_auc = average_precision_score(y_val, y_pred_proba)
auc = roc_auc_score(y_val, y_pred_proba)

print(f"AUC: {auc:.4f}")
print(f"PR-AUC: {pr_auc:.4f}\n")

print("Classification Report:")
report_text = classification_report(y_val, y_pred)
print(report_text)

print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_val, y_pred)
print(conf_matrix)

In [None]:
# Pair each feature name with the fitted model's importance score and rank
# them from most to least important.
importance_df = (
    pd.DataFrame(
        {
            'feature': model.feature_name_,
            'importance': model.feature_importances_,
        }
    )
    .sort_values('importance', ascending=False)
)

top_features = importance_df.head(20)
print("Top 20 Features:")
print(top_features.to_string(index=False))

In [None]:
# SHAP analysis (optional - uncomment to run)
# import shap
# sample_size = min(5000, len(X_val))
# X_sample = X_val.sample(n=sample_size, random_state=42)
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(X_sample)
# if isinstance(shap_values, list):
#     shap_values = shap_values[1]
# shap.summary_plot(shap_values, X_sample)