In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np

PROJ_ROOT = Path.cwd().parent
if str(PROJ_ROOT) not in sys.path:
    sys.path.append(str(PROJ_ROOT))

from credit_risk_xai.config import (
    FEATURE_CACHE_PATH,
    BASE_CACHE_PATH,
    SME_CATEGORIES,
)

from credit_risk_xai.features.engineer import prepare_modeling_data

In [None]:
# Load raw data
from credit_risk_xai.config import RAW_DATA_DIR


# Only the Serrano table is read from the raw Stata files under RAW_DATA_DIR;
# the loads for the other raw tables are kept below but disabled.
# bokslut = pd.read_stata(RAW_DATA_DIR / "bokslut1.dta")
serrano = pd.read_stata(RAW_DATA_DIR / "serrano1.dta")
# bol = pd.read_stata(RAW_DATA_DIR / "bol1.dta")
# ftg = pd.read_stata(RAW_DATA_DIR / "ftg1.dta")
# knc = pd.read_stata(RAW_DATA_DIR / "knc1.dta")
# nyckeltal = pd.read_stata(RAW_DATA_DIR / "nyckeltal1.dta")

In [None]:
# List every column name of the raw Serrano table loaded from Stata
serrano.columns.to_list()

In [None]:
# Read the cached feature-engineered dataset from parquet
serrano_df = pd.read_parquet(FEATURE_CACHE_PATH)

# Keep only the rows whose SME category is 'Small' or 'Medium'
is_small_or_medium = serrano_df['sme_category'].isin(['Small', 'Medium'])
serrano_df = serrano_df[is_small_or_medium]

# Summarise size, memory footprint (bytes -> GB) and the category split
n_rows, n_cols = serrano_df.shape
memory_gb = serrano_df.memory_usage(deep=True).sum() / 1024**3
sme_counts = serrano_df['sme_category'].value_counts()

print(f"Loaded: {n_rows:,} rows × {n_cols} columns")
print(f"Memory: {memory_gb:.2f} GB")
print(f"\nSME distribution:\n{sme_counts}")

In [None]:
# Data overview: column count, year span, company count and target balance
n_columns = len(serrano_df.columns)
year_column = serrano_df['ser_year']
n_companies = serrano_df['ORGNR'].nunique()
# dropna=False keeps rows with a missing target visible in the counts
target_counts = serrano_df['target_next_year'].value_counts(dropna=False)

print(f"Columns: {n_columns}")
print(f"Years: {year_column.min()}-{year_column.max()}")
print(f"Unique companies: {n_companies:,}")
print(f"\nTarget distribution:\n{target_counts}")

In [None]:
# Preview the first 20 rows of the SME-filtered feature dataset
serrano_df.head(20)

In [None]:
# Longer preview: the first 50 rows of the SME-filtered feature dataset
serrano_df.head(50)

In [None]:
# EXPLORATORY DATA ANALYSIS
# See reports/engineered_features.md for feature catalogue

In [None]:
# Apply filtering: active companies with minimum revenue
#
# MIN_REVENUE_KSEK was referenced here without being imported or defined in any
# earlier cell, so this cell raised NameError on a fresh kernel.
# NOTE(review): assumed to be defined in credit_risk_xai.config next to the
# other project constants — confirm the name and location.
from credit_risk_xai.config import MIN_REVENUE_KSEK

is_active = serrano_df["ser_aktiv"] == 1
meets_min_revenue = serrano_df["rr01_ntoms"] >= MIN_REVENUE_KSEK
filtered_df = serrano_df[is_active & meets_min_revenue]

# Split the filtered frame into the feature matrix and the target vector
X, y = prepare_modeling_data(filtered_df)

print(f"Filtered: {filtered_df.shape[0]:,} rows")
print(f"Modeling data: {X.shape[0]:,} rows × {X.shape[1]} features")
print(f"\nTarget distribution:\n{y.value_counts()}")
# Ratio of non-events (y == 0) to events (y == 1)
print(f"Class imbalance: {(y==0).sum() / (y==1).sum():.1f}:1")

In [None]:
# EXPLORATORY DATA ANALYSIS FUNCTIONS
# ============================================================================

def analyze_class_imbalance_by_revenue(df, thresholds=(1000, 5_000, 10_000, 50_000, 100_000, 1_000_000)):
    """Print class-imbalance statistics for rows at or above each revenue threshold.

    For every threshold, the rows with a non-missing ``target_next_year`` and
    ``rr01_ntoms >= threshold`` are selected, and one table row is printed with
    the sample count, the number of credit events (sum of the target), the
    event rate in percent and the non-event:event ratio. Thresholds with no
    events get a ``0.000`` / ``N/A`` row.

    Args:
        df: DataFrame with ``rr01_ntoms`` and ``target_next_year`` columns.
        thresholds: Iterable of minimum-revenue cut-offs (printed under a kSEK
            header). The default is a tuple so it cannot be mutated between calls.

    Returns:
        None. The table is written to stdout.
    """
    print("\n1. Class Imbalance by Revenue Threshold (kSEK = thousands SEK)")
    print("-" * 90)
    print(f"{'Min Revenue (kSEK)':<20} {'Total Rows':<15} {'Credit Events':<15} {'Event Rate %':<15} {'Imbalance':<15}")
    print("-" * 90)

    # Does not depend on the threshold, so compute it once outside the loop.
    has_target = df["target_next_year"].notna()

    for threshold in thresholds:
        mask = (df['rr01_ntoms'] >= threshold) & has_target
        target = df.loc[mask, 'target_next_year']
        # Cast to int so counts print as "12", not "12.0", when the target is float.
        n_samples = int(mask.sum())
        n_events = int(target.sum())
        n_no_events = int((target == 0).sum())

        if n_events > 0:
            event_rate = 100 * n_events / n_samples
            # Build "X.X:1" first and pad the whole token; padding only the
            # number would leave a gap before ":1".
            ratio = f"{n_no_events / n_events:.1f}:1"
            print(f"{threshold:<20,} {n_samples:<15,} {n_events:<15,} {event_rate:<15.3f} {ratio:<15}")
        else:
            print(f"{threshold:<20,} {n_samples:<15,} {0:<15,} {'0.000':<15} {'N/A':<15}")


def analyze_class_imbalance_by_year(df):
    """Print class-imbalance statistics for each year that has labelled rows.

    Years are taken from ``ser_year`` on rows whose ``target_next_year`` is not
    missing, in ascending order. One table row is printed per year with the
    sample count, the number of credit events (sum of the target), the event
    rate in percent and the non-event:event ratio. Years without any event get
    a ``0.000`` / ``N/A`` row, as in the revenue and SME tables, instead of
    being left out.

    Args:
        df: DataFrame with ``ser_year`` and ``target_next_year`` columns.

    Returns:
        None. The table is written to stdout.
    """
    print("\n2. Class Imbalance by Year")
    print("-" * 90)
    print(f"{'Year':<10} {'Total Rows':<15} {'Credit Events':<15} {'Event Rate %':<15} {'Imbalance':<15}")
    print("-" * 90)

    has_target = df["target_next_year"].notna()
    years = sorted(df.loc[has_target, 'ser_year'].dropna().unique())

    for year in years:
        mask = (df['ser_year'] == year) & has_target
        target = df.loc[mask, 'target_next_year']
        # Cast to int so counts print as "12", not "12.0", when the target is float.
        n_samples = int(mask.sum())
        n_events = int(target.sum())
        n_no_events = int((target == 0).sum())

        if n_events > 0:
            event_rate = 100 * n_events / n_samples
            # Build "X.X:1" first and pad the whole token; padding only the
            # number would leave a gap before ":1".
            ratio = f"{n_no_events / n_events:.1f}:1"
            print(f"{int(year):<10} {n_samples:<15,} {n_events:<15,} {event_rate:<15.3f} {ratio:<15}")
        else:
            # Zero-event years still appear in the table.
            print(f"{int(year):<10} {n_samples:<15,} {0:<15,} {'0.000':<15} {'N/A':<15}")


def analyze_class_imbalance_by_sme(df):
    """Print class-imbalance statistics for each SME category.

    Iterates over ``SME_CATEGORIES`` and, for each category that has at least
    one row with a non-missing ``target_next_year``, prints the sample count,
    the number of credit events (sum of the target), the event rate in percent
    and the non-event:event ratio. Categories with rows but no events get a
    ``0.000`` / ``N/A`` row; categories with no rows are skipped.

    Args:
        df: DataFrame with ``sme_category`` and ``target_next_year`` columns.

    Returns:
        None. The table is written to stdout.
    """
    print("\n3. STRICT EU SME Classification (employees AND revenue/assets)")
    print("-" * 90)
    print(f"{'SME Category':<40} {'Total Rows':<15} {'Credit Events':<15} {'Event Rate %':<15} {'Imbalance':<15}")
    print("-" * 90)

    has_target = df["target_next_year"].notna()
    for category in SME_CATEGORIES:
        mask = (df['sme_category'] == category) & has_target
        target = df.loc[mask, 'target_next_year']
        # Cast to int so counts print as "12", not "12.0", when the target is float.
        n_samples = int(mask.sum())
        if n_samples == 0:
            # Nothing to report for categories absent from the data.
            continue
        n_events = int(target.sum())
        n_no_events = int((target == 0).sum())

        if n_events > 0:
            event_rate = 100 * n_events / n_samples
            # Build "X.X:1" first and pad the whole token; padding only the
            # number would leave a gap before ":1".
            ratio = f"{n_no_events / n_events:.1f}:1"
            print(f"{category:<40} {n_samples:<15,} {n_events:<15,} {event_rate:<15.3f} {ratio:<15}")
        else:
            print(f"{category:<40} {n_samples:<15,} {0:<15,} {'0.000':<15} {'N/A':<15}")


def generate_eda_report(df):
    """Print the full EDA report: the revenue, year and SME imbalance tables.

    Each section function receives the same ``df`` and writes its own table to
    stdout; the report is framed by a title and 90-character rules.
    """
    rule = "=" * 90
    print("DATA EXPLORATION: Revenue, Years, and SME Classification")
    print(rule)

    # Sections run in the order in which they are numbered in the output.
    sections = (
        analyze_class_imbalance_by_revenue,
        analyze_class_imbalance_by_year,
        analyze_class_imbalance_by_sme,
    )
    for render_section in sections:
        render_section(df)

    print("\n" + rule)

In [None]:
# Run the full EDA report on the active, revenue-filtered subset
generate_eda_report(filtered_df)