In [1]:
import sys
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost.callback import EarlyStopping
import lightgbm as lgb
import time

# Make the project importable from this notebook.
# Assumes the kernel's working directory sits one level below the project root.
PROJ_ROOT = Path.cwd().parent
if not any(entry == str(PROJ_ROOT) for entry in sys.path):
    sys.path.append(str(PROJ_ROOT))

# Import from actual project structure
from credit_risk_xai.config import (
    BASE_CACHE_PATH,
    FEATURE_CACHE_PATH,
    MACRO_CACHE_PATH,
    COLS_TO_LOAD,
    FEATURES_FOR_MODEL,
    MACRO_FEATURE_NAMES,
    MACRO_FEATURE_PRIORITIES,
    SME_CATEGORIES,
    CATEGORICAL_COLS,
    MIN_REVENUE_KSEK,
    NY_COLS,
    KEPT_RAW_COLS,
    RATIO_FEATURE_NAMES,
    LIQUIDITY_EFFICIENCY_FEATURES,
    TREND_FEATURE_NAMES,
    CRISIS_FEATURE_NAMES,
    ENGINEERED_FEATURE_NAMES,
)

from credit_risk_xai.features.engineer import (
    create_engineered_features,
    apply_modeling_filters,
    create_target_variable,
    prepare_modeling_data,
)

# Echo the resolved locations so a wrong working directory is spotted immediately.
print(
    f"Project root: {PROJ_ROOT}",
    f"Base cache: {BASE_CACHE_PATH}",
    f"Feature cache: {FEATURE_CACHE_PATH}",
    f"Macro cache: {MACRO_CACHE_PATH}",
    sep="\n",
)

Project root: /Users/vilhelmkarlin/Code/HHS/BE451_Thesis/credit-risk-xai-thesis
Base cache: /Users/vilhelmkarlin/Code/HHS/BE451_Thesis/credit-risk-xai-thesis/data/interim/serrano_base.parquet
Feature cache: /Users/vilhelmkarlin/Code/HHS/BE451_Thesis/credit-risk-xai-thesis/data/processed/serrano_features.parquet
Macro cache: /Users/vilhelmkarlin/Code/HHS/BE451_Thesis/credit-risk-xai-thesis/data/interim/macro_annual.parquet


In [None]:
# COLUMN DEFINITIONS
# ============================================================================

# Column and feature lists live in credit_risk_xai.config.
# Names imported by this notebook's first cell:
# - COLS_TO_LOAD, NY_COLS, KEPT_RAW_COLS, CATEGORICAL_COLS, SME_CATEGORIES
# - RATIO_FEATURE_NAMES, LIQUIDITY_EFFICIENCY_FEATURES
# - TREND_FEATURE_NAMES, CRISIS_FEATURE_NAMES
# - MACRO_FEATURE_NAMES, MACRO_FEATURE_PRIORITIES
# - ENGINEERED_FEATURE_NAMES, FEATURES_FOR_MODEL, MIN_REVENUE_KSEK
# BASE_COLS, RR_SOURCE_COLS and BR_SOURCE_COLS are NOT imported here; add them
# to the config import in the first cell before using them in this notebook.

print(
    f"Columns to load: {len(COLS_TO_LOAD)}",
    f"Total engineered features registered: {len(ENGINEERED_FEATURE_NAMES)}",
    f"Features for modeling: {len(FEATURES_FOR_MODEL)}",
    sep="\n",
)

In [2]:
# DATA LOADING
# ============================================================================
# Read the macro table and the interim Serrano base table from their parquet
# caches. A missing cache leaves the corresponding variable set to None and
# prints the command that rebuilds it.

print("Loading macro data...")
if not MACRO_CACHE_PATH.exists():
    macro_df = None
    print(f"✗ Macro cache not found at {MACRO_CACHE_PATH}")
    print("  Run: python -m credit_risk_xai.data.make_macro")
else:
    macro_df = pd.read_parquet(MACRO_CACHE_PATH)
    print(f"✓ Macro summary loaded: {macro_df.shape}")
    print(f"  Years covered: {macro_df['ser_year'].min()}-{macro_df['ser_year'].max()}")

print("\nLoading Serrano base dataset...")
if not BASE_CACHE_PATH.exists():
    serrano_base = None
    print(f"✗ Base cache not found at {BASE_CACHE_PATH}")
    print("  Run: python -m credit_risk_xai.data.make_dataset")
else:
    serrano_base = pd.read_parquet(BASE_CACHE_PATH)
    print(f"✓ Base Serrano loaded: {serrano_base.shape}")
    # deep=True also counts the payload of object columns
    print(f"  Memory: {serrano_base.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

Loading macro data...
✓ Macro summary loaded: (45, 13)
  Years covered: 1981-2025

Loading Serrano base dataset...
✓ Base Serrano loaded: (12473668, 116)
  Memory: 11.37 GB


In [None]:
# List every column of the base dataset with its dtype ("name : dtype" per line).
# The loading cell sets serrano_base to None when the base cache is missing, so
# guard for that instead of failing with an AttributeError on `.dtypes`.
if serrano_base is not None:
    for name, dtype in serrano_base.dtypes.items():
        print(name, ":", dtype)
else:
    print("✗ Cannot list dtypes - base dataset not loaded")
    print("  Run the data loading cell above first")

ORGNR :  Int64
ser_namn :  object
ser_year :  Int32
bol_konkurs :  float64
bol_q80dat :  datetime64[ns]
ser_stklf :  float64
bslov_antanst :  float64
ser_aktiv :  float64
ser_nystartat :  float64
ser_regdat :  datetime64[ns]
bransch_sni071_konv :  float64
bransch_borsbransch_konv :  float64
ser_laen :  float64
knc_kncfall :  float64
ny_kapomsh :  float64
ny_avktokap :  float64
ny_rs :  float64
ny_skuldgrd :  float64
ny_solid :  float64
ny_avkegkap :  float64
ny_rorkapo :  float64
ny_kasslikv :  float64
ny_rormarg :  float64
ny_nettomarg :  float64
ny_vinstprc :  float64
ny_omspanst :  float64
ny_foradlvpanst :  float64
ny_omsf :  float64
ny_anstf :  float64
rr01_ntoms :  float64
br09_tillgsu :  float64
br10_eksu :  float64
br07b_kabasu :  float64
br13_ksksu :  float64
br15_lsksu :  float64
rr07_rorresul :  float64
rr15_resar :  float64
rr02_rointov :  float64
rr05_avskriv :  float64
rr04_perskos :  float64
rr03_jfrst :  float64
rr06_rorkoov :  float64
rr09_finkostn :  float64
rr09d_jfr

In [None]:
# FEATURE ENGINEERING
# ============================================================================
# Two ways to obtain the feature matrix `serrano_df`:
#   1. read the cached parquet file (fast), or
#   2. rebuild it from the base + macro tables and refresh the cache (slow).

USE_CACHED_FEATURES = True  # Set to False to recompute

if USE_CACHED_FEATURES and FEATURE_CACHE_PATH.exists():
    print("Loading pre-computed feature matrix...")
    serrano_df = pd.read_parquet(FEATURE_CACHE_PATH)
    print(f"✓ Feature matrix loaded: {serrano_df.shape}")
    print(f"  Memory: {serrano_df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

elif serrano_base is None or macro_df is None:
    # Nothing to compute from: one of the inputs failed to load.
    print("✗ Cannot compute features - missing base data or macro data")
    print("  Run the data loading cell above first")
    serrano_df = None

else:
    print("Computing features from base dataset...")
    print("⚠️  This may take 5-15 minutes for large datasets...")

    serrano_df = create_engineered_features(serrano_base, macro_df=macro_df)

    # Persist the result so the next run can take the fast path
    FEATURE_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
    serrano_df.to_parquet(FEATURE_CACHE_PATH, index=False)
    print(f"✓ Features computed and cached: {serrano_df.shape}")

if serrano_df is not None:
    # Registered engineered features that are actually present as columns
    engineered_cols = [col for col in serrano_df.columns if col in ENGINEERED_FEATURE_NAMES]
    print(f"\nEngineered features in dataset: {len(engineered_cols)}")

In [None]:
# FEATURE ENGINEERING SUMMARY
# ============================================================================
# Features are computed by credit_risk_xai.features.engineer module
# See the module for details on:
# - Profitability ratios (EBITDA, interest coverage, etc.)
# - Liquidity metrics (cash liquidity, DSO, DPO, inventory days)
# - Capital structure (debt ratios, equity composition)
# - Trends (YoY changes, CAGR, rolling averages/volatility)
# - Credit event history (streaks, event counts)
# - Macro indicators (GDP growth, interest rates, unemployment)

In [None]:
# MODELING DATA PREPARATION
# ============================================================================
# Filter the feature matrix, build the target, and derive X / y for modeling.

if serrano_df is None:
    print("✗ Cannot prepare modeling data - feature matrix not loaded")
    print("  Run the feature engineering cell above first")

else:
    print("Applying modeling filters...")
    filtered_df = apply_modeling_filters(serrano_df, min_revenue_ksek=MIN_REVENUE_KSEK)

    print("Creating target variable...")
    valid_mask = create_target_variable(filtered_df)

    print("Preparing feature matrix and target...")
    X, y = prepare_modeling_data(filtered_df, valid_mask)

    print(f"\n{'='*80}")
    print("MODELING DATASET SUMMARY")
    print(f"{'='*80}")
    print(f"Filtered dataset shape: {filtered_df.shape}")
    print(f"Memory usage: {filtered_df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")
    print(f"\nFeature matrix (X): {X.shape}")
    print(f"Target vector (y): {y.shape}")
    print(f"\nTarget distribution:")
    print(y.value_counts().to_string())
    # Ratio of rows labelled 0 to rows labelled 1
    print(f"\nClass imbalance: {(y==0).sum() / (y==1).sum():.1f}:1")
    print(f"{'='*80}")

In [None]:
# EXPLORATORY DATA ANALYSIS FUNCTIONS
# ============================================================================

def analyze_class_imbalance_by_revenue(df, valid_mask, thresholds=(1000, 5_000, 10_000, 50_000, 100_000, 1_000_000)):
    """Print the class balance of 'target_next_year' for several minimum-revenue cut-offs.

    Parameters:
    - df: DataFrame providing the 'rr01_ntoms' and 'target_next_year' columns
    - valid_mask: boolean mask aligned with df; only rows where it is True are counted
    - thresholds: iterable of minimum 'rr01_ntoms' values (one table row per value).
      The default is an immutable tuple so it cannot be modified between calls.

    Returns:
    - None (the table is written to stdout)
    """
    print(f"\n1. Class Imbalance by Revenue Threshold (kSEK = thousands SEK)")
    print("-" * 90)
    print(f"{'Min Revenue (kSEK)':<20} {'Total Rows':<15} {'Credit Events':<15} {'Event Rate %':<15} {'Imbalance':<15}")
    print("-" * 90)

    for threshold in thresholds:
        mask = (df['rr01_ntoms'] >= threshold) & valid_mask
        n_samples = mask.sum()
        targets = df.loc[mask, 'target_next_year']
        n_events = targets.sum()
        n_no_events = (targets == 0).sum()

        if n_events > 0:
            event_rate = 100 * n_events / n_samples
            # Format "<ratio>:1" as a single token so the column padding is not
            # inserted between the number and its ":1" suffix.
            imbalance = f"{n_no_events / n_events:.1f}:1"
            print(f"{threshold:<20,} {n_samples:<15,} {n_events:<15,} {event_rate:<15.3f} {imbalance:<15}")
        else:
            # No events at this cut-off: the ratio is undefined
            print(f"{threshold:<20,} {n_samples:<15,} {0:<15,} {'0.000':<15} {'N/A':<15}")


def analyze_class_imbalance_by_year(df, valid_mask):
    """Print the class balance of 'target_next_year' for every year in the valid rows.

    Parameters:
    - df: DataFrame providing the 'ser_year' and 'target_next_year' columns
    - valid_mask: boolean mask aligned with df; only rows where it is True are counted

    Returns:
    - None (the table is written to stdout)
    """
    print(f"\n2. Class Imbalance by Year")
    print("-" * 90)
    print(f"{'Year':<10} {'Total Rows':<15} {'Credit Events':<15} {'Event Rate %':<15} {'Imbalance':<15}")
    print("-" * 90)

    years = sorted(df.loc[valid_mask, 'ser_year'].dropna().unique())

    for year in years:
        mask = (df['ser_year'] == year) & valid_mask
        n_samples = mask.sum()
        targets = df.loc[mask, 'target_next_year']
        n_events = targets.sum()
        n_no_events = (targets == 0).sum()

        if n_events > 0:
            event_rate = 100 * n_events / n_samples
            # Format "<ratio>:1" as a single token so the column padding is not
            # inserted between the number and its ":1" suffix.
            imbalance = f"{n_no_events / n_events:.1f}:1"
            print(f"{int(year):<10} {n_samples:<15,} {n_events:<15,} {event_rate:<15.3f} {imbalance:<15}")
        else:
            # Report event-free years as well (same row layout as the revenue and
            # SME tables) instead of silently omitting them.
            print(f"{int(year):<10} {n_samples:<15,} {0:<15,} {'0.000':<15} {'N/A':<15}")


def analyze_class_imbalance_by_sme(df, valid_mask):
    """Print the class balance of 'target_next_year' for each entry of SME_CATEGORIES.

    Parameters:
    - df: DataFrame providing the 'sme_category' and 'target_next_year' columns
    - valid_mask: boolean mask aligned with df; only rows where it is True are counted

    Returns:
    - None (the table is written to stdout). Categories with no rows are omitted.
    """
    print(f"\n3. STRICT EU SME Classification (employees AND revenue/assets)")
    print("-" * 90)
    print(f"{'SME Category':<40} {'Total Rows':<15} {'Credit Events':<15} {'Event Rate %':<15} {'Imbalance':<15}")
    print("-" * 90)

    for category in SME_CATEGORIES:
        mask = (df['sme_category'] == category) & valid_mask
        n_samples = mask.sum()
        if n_samples == 0:
            # Nothing to report for a category that has no valid rows
            continue

        targets = df.loc[mask, 'target_next_year']
        n_events = targets.sum()
        n_no_events = (targets == 0).sum()

        if n_events > 0:
            event_rate = 100 * n_events / n_samples
            # Format "<ratio>:1" as a single token so the column padding is not
            # inserted between the number and its ":1" suffix.
            imbalance = f"{n_no_events / n_events:.1f}:1"
            print(f"{category:<40} {n_samples:<15,} {n_events:<15,} {event_rate:<15.3f} {imbalance:<15}")
        else:
            print(f"{category:<40} {n_samples:<15,} {0:<15,} {'0.000':<15} {'N/A':<15}")


def generate_eda_report(df, valid_mask):
    """Print the full EDA report: class balance by revenue, by year and by SME category.

    Parameters:
    - df: DataFrame handed unchanged to each of the three analysis functions
    - valid_mask: boolean row mask handed unchanged to each analysis function
    """
    banner = "=" * 90
    print("DATA EXPLORATION: Revenue, Years, and SME Classification")
    print(banner)

    # The sections are printed in this fixed order (they are numbered 1-3)
    for section in (
        analyze_class_imbalance_by_revenue,
        analyze_class_imbalance_by_year,
        analyze_class_imbalance_by_sme,
    ):
        section(df, valid_mask)

    print("\n" + banner)

In [None]:
# Run the EDA report once the modeling-data cell has produced its inputs
if 'filtered_df' not in locals() or 'valid_mask' not in locals():
    print("✗ Cannot run EDA - prepare the modeling data first")
else:
    generate_eda_report(filtered_df, valid_mask)

In [None]:
# TRAIN/VAL SPLIT
# ============================================================================
# 80/20 random split, stratified on the target so both parts keep the same
# event rate.
# NOTE(review): rows are assigned at random; if X holds several yearly rows per
# company, the same company can appear in both parts - confirm this is intended.

if 'X' not in locals() or 'y' not in locals():
    print("✗ Cannot split data - X and y not available")
    print("  Run the modeling data preparation cell above first")
else:
    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    print(f"Full dataset: {len(X):,} rows × {X.shape[1]} features")
    print(f"\nTrain: {X_train.shape[0]:,} rows")
    print(f"Validation: {X_val.shape[0]:,} rows")

In [None]:
# MODEL TRAINING FUNCTIONS
# ============================================================================

def train_lightgbm_model(X_train, y_train, X_val, y_val, params=None):
    """
    Train a LightGBM classifier with early stopping.

    Parameters:
    - X_train, y_train: training features and 0/1 labels
    - X_val, y_val: validation features and labels, passed as the second eval set
    - params: optional dict of LGBMClassifier keyword arguments that override
      the defaults below

    Returns:
    - model: Trained LightGBM model
    - training_time: Time in seconds
    """
    n_negative = (y_train == 0).sum()
    n_positive = (y_train == 1).sum()
    if n_positive > 0:
        # Informational only: this value is NOT passed to the model. Class
        # weighting is requested through 'is_unbalance' below.
        scale_pos_weight = n_negative / n_positive
        print(f"Calculated scale_pos_weight: {scale_pos_weight:.2f}")
    else:
        # Avoid a division by zero; the ratio is undefined without positives
        print("Warning: y_train contains no positive samples - scale_pos_weight is undefined")

    # Default parameters
    default_params = {
        'n_estimators': 10000,
        'learning_rate': 0.05,
        'num_leaves': 31,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'binary',
        'is_unbalance': True,
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1,
        'metric': 'auc',
        'reg_alpha': 0.1,
        'reg_lambda': 0.1
    }

    # Override with custom params if provided (the caller's dict is not modified)
    model_params = {**default_params, **(params or {})}

    # LightGBM documents 'is_unbalance' and 'scale_pos_weight' as mutually
    # exclusive. If the caller supplies an explicit weight, drop the default
    # flag unless the caller set 'is_unbalance' as well.
    if params and 'scale_pos_weight' in params and 'is_unbalance' not in params:
        model_params.pop('is_unbalance', None)

    model = lgb.LGBMClassifier(**model_params)

    print("\nTraining LightGBM model...")
    start_time = time.time()

    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric='auc',
        callbacks=[
            # Stop once the eval metric has not improved for 50 rounds
            lgb.early_stopping(stopping_rounds=50, verbose=True),
            # Print the eval metrics every 50 rounds
            lgb.log_evaluation(period=50)
        ]
    )

    training_time = time.time() - start_time
    print(f"\nLightGBM training completed in {training_time:.1f}s ({training_time/60:.1f} min)")

    return model, training_time


def evaluate_model(model, X_val, y_val):
    """
    Score a fitted classifier on the validation data and print a summary.

    Prints the ROC AUC, the classification report and the confusion matrix.

    Returns:
    - metrics: dict with keys 'auc', 'y_pred' (output of model.predict) and
      'y_pred_proba' (second column of model.predict_proba)
    """
    predicted_labels = model.predict(X_val)
    positive_scores = model.predict_proba(X_val)[:, 1]

    results = {
        'auc': roc_auc_score(y_val, positive_scores),
        'y_pred': predicted_labels,
        'y_pred_proba': positive_scores,
    }

    print(f"\nValidation AUC: {results['auc']:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_val, predicted_labels))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_val, predicted_labels))

    return results

In [None]:
# TRAIN LIGHTGBM MODEL
# ============================================================================
# The availability check must use globals(): a generator expression has its own
# scope, so locals() evaluated inside it never contains the notebook variables
# and the check would always fail (the model would never be trained).

if all(var in globals() for var in ['X_train', 'y_train', 'X_val', 'y_val']):
    lgb_model, lgb_time = train_lightgbm_model(X_train, y_train, X_val, y_val)
else:
    print("✗ Cannot train model - train/val split not available")
    print("  Run the train/val split cell above first")

In [None]:
# MODEL INTERPRETATION FUNCTIONS
# ============================================================================

def show_feature_importance(model, top_n=20):
    """Print and return the top_n features ranked by the model's built-in importance.

    Parameters:
    - model: fitted estimator exposing feature_name_ and feature_importances_
    - top_n: number of rows to keep

    Returns:
    - DataFrame with columns 'feature' and 'importance', highest importance first
    """
    print(f"\nTop {top_n} Feature Importances:")

    all_importances = pd.DataFrame({
        'feature': model.feature_name_,
        'importance': model.feature_importances_
    })
    ranked = all_importances.sort_values('importance', ascending=False)
    importance_df = ranked.head(top_n)

    print(importance_df.to_string(index=False))

    return importance_df

def compute_shap_importance(model, X_val, sample_size=10000):
    """
    Compute TreeSHAP values for feature importance.

    Parameters:
    - model: fitted tree model accepted by shap.TreeExplainer
    - X_val: DataFrame to draw the explanation sample from
    - sample_size: maximum number of rows to explain (capped at len(X_val))

    Returns:
    - shap_importance: DataFrame with features and mean absolute SHAP values
    - shap_values: Raw SHAP values array
    - X_sample: Sample used for SHAP computation
    """
    import shap

    # Sample for computational efficiency. Clamp BEFORE reporting so the message
    # states the number of rows that are actually explained.
    sample_size = min(sample_size, len(X_val))
    print(f"\nComputing TreeSHAP values on {sample_size:,} samples...")
    X_sample = X_val.sample(n=sample_size, random_state=42)

    # Create SHAP explainer
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_sample)

    # Binary classification output differs between SHAP versions:
    # - a list [negative_class, positive_class] of 2-D arrays, or
    # - a single 3-D array (samples, features, outputs).
    # In both cases keep the positive class (index 1).
    if isinstance(shap_values, list):
        shap_values = shap_values[1]
    elif getattr(shap_values, 'ndim', 2) == 3:
        positive_idx = min(1, shap_values.shape[2] - 1)
        shap_values = shap_values[:, :, positive_idx]

    # Calculate mean absolute SHAP values per feature
    shap_importance = pd.DataFrame({
        'feature': X_sample.columns,
        'mean_abs_shap': np.abs(shap_values).mean(axis=0)
    }).sort_values('mean_abs_shap', ascending=False)

    print(f"TreeSHAP computation completed!")

    return shap_importance, shap_values, X_sample


def compute_per_feature_auc(X_val, y_val, min_samples=100):
    """
    Train individual models on each feature and compute AUC.

    NOTE: each single-feature model is scored on the same rows it was fitted on,
    so the reported AUC is an in-sample (optimistic) figure meant for ranking
    features against each other, not for estimating out-of-sample performance.

    Parameters:
    - X_val: DataFrame of candidate features
    - y_val: labels aligned with X_val
    - min_samples: features with fewer non-null rows than this are skipped

    Returns:
    - feature_auc_df: DataFrame with columns 'feature' and 'auc', sorted by AUC
      descending (empty, but with both columns, when no feature could be scored)
    """
    print(f"\nComputing per-feature AUC on {len(X_val):,} samples...")
    print("This may take several minutes...")

    feature_aucs = []

    for i, feature in enumerate(X_val.columns, 1):
        if i % 20 == 0:
            print(f"  Progress: {i}/{len(X_val.columns)} features processed")

        # Get non-null values
        mask = X_val[feature].notna()

        if mask.sum() < min_samples:  # Skip if too few samples
            continue

        try:
            # Keep the column as a one-column DataFrame: unlike
            # `.values.reshape(-1, 1)` this also works for categorical and other
            # extension dtypes, and it lives inside the try so one bad column
            # cannot abort the whole loop.
            X_feature = X_val.loc[mask, [feature]]
            y_feature = y_val.loc[mask]

            # Train simple model on single feature
            temp_model = lgb.LGBMClassifier(
                n_estimators=100,
                learning_rate=0.1,
                num_leaves=7,
                random_state=42,
                verbose=-1
            )
            temp_model.fit(X_feature, y_feature)
            y_pred_proba = temp_model.predict_proba(X_feature)[:, 1]

            auc = roc_auc_score(y_feature, y_pred_proba)
            feature_aucs.append({'feature': feature, 'auc': auc})
        except Exception as e:
            # Best effort: report the feature and carry on with the rest
            print(f"  Skipped {feature}: {str(e)}")
            continue

    # Sort by AUC. Explicit columns/dtype keep this working when nothing was
    # scored (an empty frame without an 'auc' column would raise KeyError).
    feature_auc_df = (
        pd.DataFrame(feature_aucs, columns=['feature', 'auc'])
        .astype({'auc': 'float64'})
        .sort_values('auc', ascending=False)
    )

    print(f"\nPer-feature AUC computation completed!")
    print(f"  Features with AUC > 0.60: {(feature_auc_df['auc'] > 0.60).sum()}")
    print(f"  Features with AUC > 0.65: {(feature_auc_df['auc'] > 0.65).sum()}")
    print(f"  Features with AUC > 0.70: {(feature_auc_df['auc'] > 0.70).sum()}")

    return feature_auc_df


def display_feature_analysis(shap_importance, feature_auc_df, top_n=30):
    """
    Print the SHAP ranking, the per-feature AUC ranking and a combined ranking.

    Parameters:
    - shap_importance: DataFrame with columns 'feature' and 'mean_abs_shap'
    - feature_auc_df: DataFrame with columns 'feature' and 'auc'
    - top_n: number of rows shown in each of the three tables

    Returns:
    - comparison: inner join of both inputs on 'feature' with the added columns
      'shap_rank', 'auc_rank' and 'avg_rank', sorted by 'avg_rank' ascending
    """
    heavy_rule = "=" * 90
    light_rule = "-" * 90

    print("\n" + heavy_rule)
    print("FEATURE IMPORTANCE ANALYSIS")
    print(heavy_rule)

    print(f"\n1. TreeSHAP Importance (Top {top_n})")
    print(light_rule)
    print(shap_importance.head(top_n).to_string(index=False))

    print(f"\n\n2. Per-Feature AUC (Top {top_n})")
    print(light_rule)
    print(feature_auc_df.head(top_n).to_string(index=False))

    # Only features present in both tables can be compared. Rank 1 = best on
    # each criterion, and the combined score is the mean of the two ranks.
    comparison = (
        shap_importance
        .merge(feature_auc_df, on='feature', how='inner')
        .assign(
            shap_rank=lambda frame: frame['mean_abs_shap'].rank(ascending=False),
            auc_rank=lambda frame: frame['auc'].rank(ascending=False),
            avg_rank=lambda frame: (frame['shap_rank'] + frame['auc_rank']) / 2,
        )
        .sort_values('avg_rank')
    )

    shown_columns = ['feature', 'mean_abs_shap', 'shap_rank', 'auc', 'auc_rank', 'avg_rank']
    print(f"\n\n3. Combined Ranking (SHAP + AUC, Top {top_n})")
    print(light_rule)
    print(comparison[shown_columns].head(top_n).to_string(index=False))

    return comparison

In [None]:
# EVALUATE MODEL
# ============================================================================
# Built-in feature importances first, then validation metrics.

if 'lgb_model' not in locals():
    print("✗ Cannot evaluate - model not trained")
    print("  Run the model training cell above first")
else:
    importance_df = show_feature_importance(lgb_model, top_n=20)
    metrics = evaluate_model(lgb_model, X_val, y_val)

In [None]:
# FEATURE IMPORTANCE ANALYSIS
# ============================================================================
# Combines TreeSHAP importance of the trained model with single-feature AUCs.

if 'lgb_model' not in locals() or 'X_val' not in locals():
    print("✗ Cannot analyze features - model or validation data not available")
    print("  Run the model training and evaluation cells above first")
else:
    # TreeSHAP importance on a sample of the validation rows
    shap_importance, shap_values, X_sample = compute_shap_importance(
        lgb_model, X_val, sample_size=10000
    )

    # AUC of a small model fitted on each feature alone
    feature_auc_df = compute_per_feature_auc(X_val, y_val)

    # Side-by-side tables plus the combined ranking
    comparison_df = display_feature_analysis(shap_importance, feature_auc_df, top_n=30)