In [2]:
import sys
from pathlib import Path

import pandas as pd
import numpy as np
import glob

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost.callback import EarlyStopping
import lightgbm as lgb
import time

PROJECT_ROOT = Path.cwd()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.config import (
    COLS_TO_LOAD,
    FEATURES_FOR_MODEL,
    MACRO_FEATURE_NAMES,
    MACRO_FEATURE_PRIORITIES,
    SME_CATEGORIES,
    CATEGORICAL_COLS,
    MIN_REVENUE_KSEK,
)
from src.macro_features import load_macro_data
from src.feature_engineering import (
    create_engineered_features,
    apply_modeling_filters,
    create_target_variable,
    prepare_modeling_data,
)
from src.data_loading import load_serrano_base



In [3]:
# COLUMN DEFINITIONS
# ============================================================================

from src.config import (
    BASE_COLS as base_cols,
    NY_COLS as ny_cols,
    KEPT_RAW_COLS as kept_raw_cols,
    RR_SOURCE_COLS as rr_source_cols,
    BR_SOURCE_COLS as br_source_cols,
    COLS_TO_LOAD as cols_to_load,
    RATIO_FEATURE_NAMES as ratio_feature_names,
    LIQUIDITY_EFFICIENCY_FEATURES as liquidity_efficiency_features,
    TREND_FEATURE_NAMES as trend_feature_names,
    CRISIS_FEATURE_NAMES as crisis_feature_names,
    MACRO_FEATURE_NAMES as macro_feature_names,
    MACRO_FEATURE_PRIORITIES as macro_feature_priorities,
    ENGINEERED_FEATURE_NAMES as engineered_feature_names,
    CATEGORICAL_COLS as categorical_cols,
    SME_CATEGORIES as sme_categories,
    FEATURES_FOR_MODEL as features_for_model,
)

print(f"Columns to load: {len(cols_to_load)}")
print(f"Total engineered features registered: {len(engineered_feature_names)}")



In [6]:
# DATA LOADING AND MACRO PREP
# ============================================================================

processed_serrano_path = PROJECT_ROOT / 'processed_serrano.parquet'
macro_summary_path = PROJECT_ROOT / 'macro_data' / 'macro_summary.parquet'

macro_df = load_macro_data(cache_path=macro_summary_path)
print(f"Macro summary shape: {macro_df.shape}")

serrano_base = load_serrano_base(processed_serrano_path)
print(f"Base Serrano shape: {serrano_base.shape}")



In [7]:
feature_cache_path = PROJECT_ROOT / 'processed_serrano_features.parquet'

serrano_df = create_engineered_features(serrano_base, macro_df=macro_df)
serrano_df.to_parquet(feature_cache_path, index=False)

print(f"Engineered features shape: {serrano_df.shape}")
print(f"Engineered columns added: {len([c for c in serrano_df.columns if c in engineered_feature_names])}")



Processing Stata_2025/serrano*.dta files...
Minimal filtering: ser_jurform=49 only
Preserving all revenue levels and activity statuses for company history
  Processing 1/10: serrano6.dta


  df['company_age'] = df['ser_year'] - df['ser_regdat'].dt.year
  df['credit_event'] = ((df['bol_konkurs'] == 1) | (df['bol_q80dat'].notna())).astype('int8')
  df['sme_category'] = df.apply(


  Processing 2/10: serrano7.dta


  df['company_age'] = df['ser_year'] - df['ser_regdat'].dt.year
  df['credit_event'] = ((df['bol_konkurs'] == 1) | (df['bol_q80dat'].notna())).astype('int8')
  df['sme_category'] = df.apply(


  Processing 3/10: serrano5.dta


  df['company_age'] = df['ser_year'] - df['ser_regdat'].dt.year
  df['credit_event'] = ((df['bol_konkurs'] == 1) | (df['bol_q80dat'].notna())).astype('int8')
  df['sme_category'] = df.apply(


  Processing 4/10: serrano4.dta


  df['company_age'] = df['ser_year'] - df['ser_regdat'].dt.year
  df['credit_event'] = ((df['bol_konkurs'] == 1) | (df['bol_q80dat'].notna())).astype('int8')
  df['sme_category'] = df.apply(


  Processing 5/10: serrano1.dta


  df['company_age'] = df['ser_year'] - df['ser_regdat'].dt.year
  df['credit_event'] = ((df['bol_konkurs'] == 1) | (df['bol_q80dat'].notna())).astype('int8')
  df['sme_category'] = df.apply(


  Processing 6/10: serrano3.dta


  df['company_age'] = df['ser_year'] - df['ser_regdat'].dt.year
  df['credit_event'] = ((df['bol_konkurs'] == 1) | (df['bol_q80dat'].notna())).astype('int8')
  df['sme_category'] = df.apply(


  Processing 7/10: serrano2.dta


  df['company_age'] = df['ser_year'] - df['ser_regdat'].dt.year
  df['credit_event'] = ((df['bol_konkurs'] == 1) | (df['bol_q80dat'].notna())).astype('int8')
  df['sme_category'] = df.apply(


  Processing 8/10: serrano9.dta


  df['company_age'] = df['ser_year'] - df['ser_regdat'].dt.year
  df['credit_event'] = ((df['bol_konkurs'] == 1) | (df['bol_q80dat'].notna())).astype('int8')
  df['sme_category'] = df.apply(


  Processing 9/10: serrano8.dta


  df['company_age'] = df['ser_year'] - df['ser_regdat'].dt.year
  df['credit_event'] = ((df['bol_konkurs'] == 1) | (df['bol_q80dat'].notna())).astype('int8')
  df['sme_category'] = df.apply(


  Processing 10/10: serrano10.dta


  df['company_age'] = df['ser_year'] - df['ser_regdat'].dt.year
  df['credit_event'] = ((df['bol_konkurs'] == 1) | (df['bol_q80dat'].notna())).astype('int8')
  df['sme_category'] = df.apply(


  Concatenating...


  result = pd.concat(df_list, ignore_index=True)
  result = pd.concat(df_list, ignore_index=True)
  result = pd.concat(df_list, ignore_index=True)
  result = pd.concat(df_list, ignore_index=True)
  result = pd.concat(df_list, ignore_index=True)
  result = pd.concat(df_list, ignore_index=True)
  result = pd.concat(df_list, ignore_index=True)


  Writing to Parquet...

Saved to processed_serrano.parquet!
  Rows before: 16,228,555
  Rows after: 12,473,668
  Reduction: 23.1%
Loading processed_serrano.parquet...
  Loaded! Shape: (12473668, 116)
  Memory usage: 11.13 GB

Final shape: (12473668, 116)
Categorical columns: ['sme_category']

Memory usage: 11.13 GB


In [16]:
# FEATURE ENGINEERING EXECUTION PLACEHOLDER
# (Functionality provided by src.feature_engineering module.)



In [None]:
filtered_df = apply_modeling_filters(serrano_df, min_revenue_ksek=MIN_REVENUE_KSEK)

valid_mask = create_target_variable(filtered_df)

X, y = prepare_modeling_data(filtered_df, valid_mask)

print(f"Filtered dataset shape: {filtered_df.shape}")
print(f"Memory usage: {filtered_df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")



Remapped ser_stklf: 9 → None
ser_stklf missing values: 0


  df[f'{col}_yoy_pct'] = group[col].pct_change()
  df[f'{col}_yoy_pct'] = group[col].pct_change()
  df[f'{col}_yoy_pct'] = group[col].pct_change()
  df['ratio_cash_liquidity_yoy_pct'] = group['ratio_cash_liquidity'].pct_change()
  df['ratio_ebit_interest_cov_yoy_pct'] = group['ratio_ebit_interest_cov'].pct_change()


Dropped raw source columns after engineering: 76 columns

Applying modeling filters:
  - ser_aktiv == 1 (active companies)
  - rr01_ntoms >= 1,000 kSEK
  Rows: 12,473,668 → 5,006,332 (40.1% retained)

Original data: 5,006,332 rows
Valid rows (have next year outcome): 4,413,099 rows
Rows excluded: 593,233

Data ready for modeling:
Shape of X (features): (4413099, 91)
Shape of y (target): (4413099,)

Target distribution (credit events in NEXT year):
target_next_year
0    4393627
1      19472
Name: count, dtype: Int64

Class imbalance ratio: 225.6:1

Serrano_df retained for lookups: (5006332, 100)
Memory usage: 3.75 GB


In [13]:
# EXPLORATORY DATA ANALYSIS FUNCTIONS
# ============================================================================

def analyze_class_imbalance_by_revenue(df, valid_mask, thresholds=[1000, 5_000, 10_000, 50_000, 100_000, 1_000_000]):
    """Analyze class imbalance across different revenue thresholds"""
    print(f"\n1. Class Imbalance by Revenue Threshold (kSEK = thousands SEK)")
    print("-" * 90)
    print(f"{'Min Revenue (kSEK)':<20} {'Total Rows':<15} {'Credit Events':<15} {'Event Rate %':<15} {'Imbalance':<15}")
    print("-" * 90)
    
    for threshold in thresholds:
        mask = (df['rr01_ntoms'] >= threshold) & valid_mask
        n_samples = mask.sum()
        n_events = df.loc[mask, 'target_next_year'].sum()
        n_no_events = (df.loc[mask, 'target_next_year'] == 0).sum()
        
        if n_events > 0:
            event_rate = 100 * n_events / n_samples
            imbalance = n_no_events / n_events
            print(f"{threshold:<20,} {n_samples:<15,} {n_events:<15,} {event_rate:<15.3f} {imbalance:<15.1f}:1")
        else:
            print(f"{threshold:<20,} {n_samples:<15,} {0:<15,} {'0.000':<15} {'N/A':<15}")


def analyze_class_imbalance_by_year(df, valid_mask):
    """Analyze class imbalance across different years"""
    print(f"\n2. Class Imbalance by Year")
    print("-" * 90)
    print(f"{'Year':<10} {'Total Rows':<15} {'Credit Events':<15} {'Event Rate %':<15} {'Imbalance':<15}")
    print("-" * 90)
    
    years = sorted(df.loc[valid_mask, 'ser_year'].dropna().unique())
    
    for year in years:
        mask = (df['ser_year'] == year) & valid_mask
        n_samples = mask.sum()
        n_events = df.loc[mask, 'target_next_year'].sum()
        n_no_events = (df.loc[mask, 'target_next_year'] == 0).sum()
        
        if n_events > 0:
            event_rate = 100 * n_events / n_samples
            imbalance = n_no_events / n_events
            print(f"{int(year):<10} {n_samples:<15,} {n_events:<15,} {event_rate:<15.3f} {imbalance:<15.1f}:1")


def analyze_class_imbalance_by_sme(df, valid_mask):
    """Analyze class imbalance across SME categories"""
    print(f"\n3. STRICT EU SME Classification (employees AND revenue/assets)")
    print("-" * 90)
    print(f"{'SME Category':<40} {'Total Rows':<15} {'Credit Events':<15} {'Event Rate %':<15} {'Imbalance':<15}")
    print("-" * 90)
    
    for category in sme_categories:
        mask = (df['sme_category'] == category) & valid_mask
        n_samples = mask.sum()
        n_events = df.loc[mask, 'target_next_year'].sum()
        n_no_events = (df.loc[mask, 'target_next_year'] == 0).sum()
        
        if n_samples > 0 and n_events > 0:
            event_rate = 100 * n_events / n_samples
            imbalance = n_no_events / n_events
            print(f"{category:<40} {n_samples:<15,} {n_events:<15,} {event_rate:<15.3f} {imbalance:<15.1f}:1")
        elif n_samples > 0:
            print(f"{category:<40} {n_samples:<15,} {0:<15,} {'0.000':<15} {'N/A':<15}")


def generate_eda_report(df, valid_mask):
    """Generate complete EDA report"""
    print("DATA EXPLORATION: Revenue, Years, and SME Classification")
    print("="*90)
    
    analyze_class_imbalance_by_revenue(df, valid_mask)
    analyze_class_imbalance_by_year(df, valid_mask)
    analyze_class_imbalance_by_sme(df, valid_mask)
    
    print("\n" + "="*90)

In [8]:
generate_eda_report(filtered_df, valid_mask)


DATA EXPLORATION: Revenue, Years, and SME Classification

1. Class Imbalance by Revenue Threshold (kSEK = thousands SEK)
------------------------------------------------------------------------------------------
Min Revenue (kSEK)   Total Rows      Credit Events   Event Rate %    Imbalance      
------------------------------------------------------------------------------------------
1,000                4,413,099       19,472          0.441           225.6          :1
5,000                1,990,887       12,759          0.641           155.0          :1
10,000               1,237,787       8,630           0.697           142.4          :1
50,000               334,002         2,327           0.697           142.5          :1
100,000              180,806         1,114           0.616           161.3          :1
1,000,000            19,798          61              0.308           323.6          :1

2. Class Imbalance by Year
--------------------------------------------------------------

In [18]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Full dataset: {len(X):,} rows × {X.shape[1]} features")
print(f"\nTrain: {X_train.shape[0]:,} rows")
print(f"Validation: {X_val.shape[0]:,} rows")

Full dataset: 4,413,099 rows × 91 features

Train: 3,530,479 rows
Validation: 882,620 rows


In [19]:
# MODEL TRAINING FUNCTIONS
# ============================================================================

def train_lightgbm_model(X_train, y_train, X_val, y_val, params=None):
    """
    Train LightGBM model with early stopping.
    
    Returns:
    - model: Trained LightGBM model
    - training_time: Time in seconds
    """
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    print(f"Calculated scale_pos_weight: {scale_pos_weight:.2f}")
    
    # Default parameters
    default_params = {
        'n_estimators': 10000,
        'learning_rate': 0.05,
        'num_leaves': 31,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'binary',
        'is_unbalance': True,
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1,
        'metric': 'auc',
        'reg_alpha': 0.1,
        'reg_lambda': 0.1
    }
    
    # Override with custom params if provided
    if params:
        default_params.update(params)
    
    model = lgb.LGBMClassifier(**default_params)
    
    print("\nTraining LightGBM model...")
    start_time = time.time()
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=True),
            lgb.log_evaluation(period=50)
        ]
    )
    
    training_time = time.time() - start_time
    print(f"\nLightGBM training completed in {training_time:.1f}s ({training_time/60:.1f} min)")
    
    return model, training_time


def evaluate_model(model, X_val, y_val):
    """
    Evaluate model performance.
    
    Returns:
    - metrics: Dictionary of evaluation metrics
    """
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]
    
    auc = roc_auc_score(y_val, y_pred_proba)
    
    print(f"\nValidation AUC: {auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_val, y_pred))
    
    return {
        'auc': auc,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

In [20]:
lgb_model, lgb_time = train_lightgbm_model(X_train, y_train, X_val, y_val)

Calculated scale_pos_weight: 225.63

Training LightGBM model...
Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.951089	valid_1's auc: 0.935905
[100]	training's auc: 0.958727	valid_1's auc: 0.940281
[150]	training's auc: 0.96394	valid_1's auc: 0.942819
[200]	training's auc: 0.967653	valid_1's auc: 0.94422
[250]	training's auc: 0.970632	valid_1's auc: 0.945083
[300]	training's auc: 0.973338	valid_1's auc: 0.945837
[350]	training's auc: 0.975707	valid_1's auc: 0.946332
[400]	training's auc: 0.977753	valid_1's auc: 0.946544
[450]	training's auc: 0.979581	valid_1's auc: 0.946764
[500]	training's auc: 0.981193	valid_1's auc: 0.946959
[550]	training's auc: 0.982659	valid_1's auc: 0.947119
[600]	training's auc: 0.983929	valid_1's auc: 0.947194
Early stopping, best iteration is:
[581]	training's auc: 0.98351	valid_1's auc: 0.947263

LightGBM training completed in 171.2s (2.9 min)


In [21]:
# MODEL INTERPRETATION FUNCTIONS
# ============================================================================

def show_feature_importance(model, top_n=20):
    """Display top N feature importances"""
    print(f"\nTop {top_n} Feature Importances:")
    importance_df = pd.DataFrame({
        'feature': model.feature_name_,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False).head(top_n)
    
    print(importance_df.to_string(index=False))
    
    return importance_df

def compute_shap_importance(model, X_val, sample_size=10000):
    """
    Compute TreeSHAP values for feature importance.
    
    Returns:
    - shap_importance: DataFrame with features and mean absolute SHAP values
    - shap_values: Raw SHAP values array
    - X_sample: Sample used for SHAP computation
    """
    import shap
    
    print(f"\nComputing TreeSHAP values on {sample_size:,} samples...")
    
    # Sample for computational efficiency
    sample_size = min(sample_size, len(X_val))
    X_sample = X_val.sample(n=sample_size, random_state=42)
    
    # Create SHAP explainer
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_sample)
    
    # Handle binary classification (returns list of [negative_class, positive_class])
    if isinstance(shap_values, list):
        shap_values = shap_values[1]
    
    # Calculate mean absolute SHAP values per feature
    shap_importance = pd.DataFrame({
        'feature': X_sample.columns,
        'mean_abs_shap': np.abs(shap_values).mean(axis=0)
    }).sort_values('mean_abs_shap', ascending=False)
    
    print(f"TreeSHAP computation completed!")
    
    return shap_importance, shap_values, X_sample


def compute_per_feature_auc(X_val, y_val):
    """
    Train individual models on each feature and compute AUC.
    
    Returns:
    - feature_auc_df: DataFrame with features and their individual AUC scores
    """
    print(f"\nComputing per-feature AUC on {len(X_val):,} samples...")
    print("This may take several minutes...")
    
    feature_aucs = []
    
    for i, feature in enumerate(X_val.columns, 1):
        if i % 20 == 0:
            print(f"  Progress: {i}/{len(X_val.columns)} features processed")
        
        # Get non-null values
        mask = X_val[feature].notna()
        
        if mask.sum() < 100:  # Skip if too few samples
            continue
        
        X_feature = X_val.loc[mask, feature].values.reshape(-1, 1)
        y_feature = y_val.loc[mask]
        
        try:
            # Train simple model on single feature
            temp_model = lgb.LGBMClassifier(
                n_estimators=100,
                learning_rate=0.1,
                num_leaves=7,
                random_state=42,
                verbose=-1
            )
            temp_model.fit(X_feature, y_feature)
            y_pred_proba = temp_model.predict_proba(X_feature)[:, 1]
            
            auc = roc_auc_score(y_feature, y_pred_proba)
            feature_aucs.append({'feature': feature, 'auc': auc})
        except Exception as e:
            print(f"  Skipped {feature}: {str(e)}")
            continue
    
    # Sort by AUC
    feature_auc_df = pd.DataFrame(feature_aucs).sort_values('auc', ascending=False)
    
    print(f"\nPer-feature AUC computation completed!")
    print(f"  Features with AUC > 0.60: {(feature_auc_df['auc'] > 0.60).sum()}")
    print(f"  Features with AUC > 0.65: {(feature_auc_df['auc'] > 0.65).sum()}")
    print(f"  Features with AUC > 0.70: {(feature_auc_df['auc'] > 0.70).sum()}")
    
    return feature_auc_df


def display_feature_analysis(shap_importance, feature_auc_df, top_n=30):
    """
    Display comprehensive feature importance analysis.
    """
    print("\n" + "="*90)
    print("FEATURE IMPORTANCE ANALYSIS")
    print("="*90)
    
    print(f"\n1. TreeSHAP Importance (Top {top_n})")
    print("-"*90)
    print(shap_importance.head(top_n).to_string(index=False))
    
    print(f"\n\n2. Per-Feature AUC (Top {top_n})")
    print("-"*90)
    print(feature_auc_df.head(top_n).to_string(index=False))
    
    # Merge for comparison
    comparison = shap_importance.merge(feature_auc_df, on='feature', how='inner')
    comparison['shap_rank'] = comparison['mean_abs_shap'].rank(ascending=False)
    comparison['auc_rank'] = comparison['auc'].rank(ascending=False)
    comparison['avg_rank'] = (comparison['shap_rank'] + comparison['auc_rank']) / 2
    comparison = comparison.sort_values('avg_rank')
    
    print(f"\n\n3. Combined Ranking (SHAP + AUC, Top {top_n})")
    print("-"*90)
    print(comparison[['feature', 'mean_abs_shap', 'shap_rank', 'auc', 'auc_rank', 'avg_rank']].head(top_n).to_string(index=False))
    
    return comparison

In [22]:
importance_df = show_feature_importance(lgb_model, top_n=20)

evaluate_model(lgb_model, X_val, y_val)


Top 20 Feature Importances:
                        feature  importance
            bransch_sni071_konv         494
                       dpo_days         464
                    ny_avkegkap         441
                     rr01_ntoms         388
                     ny_kapomsh         336
             ny_skuldgrd_vol_3y         335
                ny_foradlvpanst         326
ratio_ebit_interest_cov_yoy_pct         321
              ny_rormarg_vol_3y         319
        ratio_cash_interest_cov         319
           ratio_cash_liquidity         312
              dso_days_yoy_diff         298
   ratio_cash_liquidity_yoy_pct         292
                    ny_omspanst         290
                     rr15_resar         282
              dpo_days_yoy_diff         280
                    company_age         273
                 assets_cagr_3y         264
              dso_days_trend_3y         262
                          ny_rs         258

Validation AUC: 0.9473

Classification Report:

{'auc': 0.9472628131672735,
 'y_pred': array([0., 0., 1., ..., 0., 0., 0.], shape=(882620,)),
 'y_pred_proba': array([0.32859254, 0.00154698, 0.56315761, ..., 0.06410042, 0.0372332 ,
        0.00925024], shape=(882620,))}

In [23]:
# Compute TreeSHAP importance
shap_importance, shap_values, X_sample = compute_shap_importance(lgb_model, X_val, sample_size=10000)

# Compute per-feature AUC
feature_auc_df = compute_per_feature_auc(X_val, y_val)

# Display comprehensive analysis
comparison_df = display_feature_analysis(shap_importance, feature_auc_df, top_n=30)

  from .autonotebook import tqdm as notebook_tqdm



Computing TreeSHAP values on 10,000 samples...


Python(11335) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


TreeSHAP computation completed!

Computing per-feature AUC on 882,620 samples...
This may take several minutes...




  Progress: 20/91 features processed




  Progress: 40/91 features processed




  Progress: 60/91 features processed




  Progress: 80/91 features processed





Per-feature AUC computation completed!
  Features with AUC > 0.60: 71
  Features with AUC > 0.65: 57
  Features with AUC > 0.70: 22

FEATURE IMPORTANCE ANALYSIS

1. TreeSHAP Importance (Top 30)
------------------------------------------------------------------------------------------
                        feature  mean_abs_shap
                    ny_avkegkap       0.551273
           ratio_cash_liquidity       0.539814
                     ny_kapomsh       0.490971
          ratio_dividend_payout       0.463969
                     rr01_ntoms       0.301023
                       ny_solid       0.174451
        ratio_depreciation_cost       0.167878
              ny_rormarg_vol_3y       0.165339
             ny_skuldgrd_vol_3y       0.162818
                    company_age       0.161999
                    ny_skuldgrd       0.136862
            bransch_sni071_konv       0.127688
                       dpo_days       0.119116
                  bslov_antanst       0.115793
         

