In [None]:
import sys
from pathlib import Path
import time
import shap
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix

PROJ_ROOT = Path.cwd().parent
if str(PROJ_ROOT) not in sys.path:
    sys.path.append(str(PROJ_ROOT))

from credit_risk_xai.modeling.train import DEFAULT_PARAMS

from credit_risk_xai.config import FEATURE_CACHE_PATH
from credit_risk_xai.features.engineer import prepare_modeling_data
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

def plot_calibration_curve(y_true, y_pred_proba, n_bins=100, model_name="Model"):
    fraction_of_positives, mean_predicted_value = calibration_curve(
        y_true, y_pred_proba, n_bins=n_bins, strategy='quantile'
    )
    
    plt.figure(figsize=(8, 6))
    plt.plot(mean_predicted_value, fraction_of_positives, "s-", label=model_name)
    plt.plot([0, 1], [0, 1], "k--", label="Perfect calibration")
    plt.xlabel("Mean predicted probability")
    plt.ylabel("Fraction of positives")
    plt.legend()
    plt.title(f"Calibration Curve - {model_name}")
    plt.show()
    
    # ECE (Expected Calibration Error)
    ece = np.mean(np.abs(fraction_of_positives - mean_predicted_value))
    print(f"ECE: {ece:.4f}")

In [None]:
# Load and filter data
MIN_REVENUE_KSEK = 1_000
df = pd.read_parquet(FEATURE_CACHE_PATH)
df = df[(df["ser_aktiv"] == 1) & (df["sme_category"].isin(["Small", "Medium"]))]
X, y = prepare_modeling_data(df)

print(f"Features: {X.shape[1]} | Samples: {len(X):,}")
print(f"Target distribution:\n{y.value_counts()}")
print(f"Imbalance: {(y==0).sum()/(y==1).sum():.1f}:1")

In [None]:
from credit_risk_xai.features.engineer import prepare_modeling_data
from credit_risk_xai.modeling.train import run_lightgbm_training

df = pd.read_parquet(FEATURE_CACHE_PATH)
mask = (df.ser_aktiv == 1) & (df.sme_category.isin(["Small", "Medium"]))  # add any extra filters here
X, y = prepare_modeling_data(df.loc[mask])

results = run_lightgbm_training(
    X=X,
    y=y,
    dataset_description="ser_aktiv==1 & SME∈{Small,Medium}",  # optional note for W&B
    use_wandb=False,
    wandb_project="credit-risk-xai",
    wandb_run_name="lgbm_pre_prune2",
)

In [None]:
# Evaluate
model = results["model"]
X_train = results["X_train"]
X_val = results["X_val"]
y_train = results["y_train"]
y_val = results["y_val"]
y_pred_proba = results["y_val_proba"]
y_pred = (y_pred_proba >= 0.5).astype(int)

auc = roc_auc_score(y_val, y_pred_proba)
pr_auc = average_precision_score(y_val, y_pred_proba)

print(f"AUC: {auc:.4f}")
print(f"PR-AUC: {pr_auc:.4f}\n")
print("Classification Report:")
print(classification_report(y_val, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))

plot_calibration_curve(y_val, y_pred_proba, model_name="Predicted Model")

In [None]:
# ============================================================================
# TEMPORAL FEATURE SELECTION - Setup
# Define temporal feature groups for systematic analysis
# ============================================================================

print("=" * 90)
print("TEMPORAL FEATURE SELECTION ANALYSIS")
print("=" * 90)
print("\nObjective: For each (metric, computation_type), find optimal time window")
print("Method: Systematic ablation study")
print("-" * 90)

# Define all temporal feature groups based on engineered_features.md
temporal_feature_groups = {
    'revenue': {
        'cagr': ['revenue_cagr_3y', 'revenue_cagr_5y'],
        'drawdown': ['revenue_drawdown_5y']
    },
    'assets': {
        'cagr': ['assets_cagr_3y', 'assets_cagr_5y']
    },
    'equity': {
        'cagr': ['equity_cagr_3y', 'equity_cagr_5y'],
        'drawdown': ['equity_drawdown_5y']
    },
    'profit': {
        'cagr': ['profit_cagr_3y', 'profit_cagr_5y']
    },
    'operating_margin': {
        'trend': ['ny_rormarg_trend_3y', 'ny_rormarg_trend_5y'],
        'volatility': ['ny_rormarg_vol_3y', 'ny_rormarg_vol_5y'],
        'average': ['ny_rormarg_avg_2y', 'ny_rormarg_avg_5y']
    },
    'net_margin': {
        'trend': ['ny_nettomarg_trend_3y', 'ny_nettomarg_trend_5y'],
        'volatility': ['ny_nettomarg_vol_3y', 'ny_nettomarg_vol_5y'],
        'average': ['ny_nettomarg_avg_2y', 'ny_nettomarg_avg_5y']
    },
    'leverage': {
        'trend': ['ny_skuldgrd_trend_3y', 'ny_skuldgrd_trend_5y'],
        'volatility': ['ny_skuldgrd_vol_3y', 'ny_skuldgrd_vol_5y']
    },
    'cash_liquidity': {
        'trend': ['ratio_cash_liquidity_trend_3y', 'ratio_cash_liquidity_trend_5y'],
        'volatility': ['ratio_cash_liquidity_vol_3y'],
        'average': ['ratio_cash_liquidity_avg_2y', 'ratio_cash_liquidity_avg_5y']
    },
    'working_capital': {
        'trend': ['dso_days_trend_3y', 'inventory_days_trend_3y', 'dpo_days_trend_3y']
    }
}

# Flatten all temporal features
all_temporal_features = []
for metric, computations in temporal_feature_groups.items():
    for comp_type, features in computations.items():
        all_temporal_features.extend(features)

# Get baseline (non-temporal) features from current feature set
baseline_features = [f for f in X_train.columns if f not in all_temporal_features]

print(f"\nTotal temporal features: {len(all_temporal_features)}")
print(f"Baseline (non-temporal) features: {len(baseline_features)}")
print(f"Total features in model: {len(X_train.columns)}")

# Verify all temporal features exist in dataset
missing_temporal = [f for f in all_temporal_features if f not in X_train.columns]
if missing_temporal:
    print(f"\n⚠️ Warning: {len(missing_temporal)} temporal features not found in dataset:")
    print(missing_temporal)
    # Remove missing features from groups
    for metric in temporal_feature_groups:
        for comp_type in temporal_feature_groups[metric]:
            temporal_feature_groups[metric][comp_type] = [
                f for f in temporal_feature_groups[metric][comp_type] 
                if f in X_train.columns
            ]

print("\n✓ Temporal feature groups defined")
print("=" * 90)

In [None]:
# ============================================================================
# NESTED CROSS-VALIDATION FRAMEWORK (5×3)
# Outer 5-fold: Unbiased evaluation
# Inner 3-fold: Feature selection decisions (averaged)
# ============================================================================

from sklearn.model_selection import StratifiedKFold

print("=" * 90)
print("NESTED CROSS-VALIDATION SETUP")
print("=" * 90)
print("Structure: 5 outer folds × 3 inner folds = 15 train/val splits per test")
print("Purpose: Reduce selection bias and provide unbiased performance estimates")
print("-" * 90)

# Define CV splitters
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=43)

def train_and_evaluate_cv(features, X, y, cv_splitter, verbose=False):
    """
    Train and evaluate model using cross-validation.
    Returns: mean AUC, std AUC, and list of fold AUCs
    """
    fold_aucs = []
    
    for fold_idx, (train_idx, val_idx) in enumerate(cv_splitter.split(X, y)):
        X_tr, X_v = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_v = y.iloc[train_idx], y.iloc[val_idx]
        
        X_tr_sub = X_tr[features]
        X_v_sub = X_v[features]
        
        model = lgb.LGBMClassifier(**DEFAULT_PARAMS)
        model.fit(
            X_tr_sub, y_tr,
            eval_set=[(X_v_sub, y_v)],
            eval_metric='logloss',
            callbacks=[lgb.log_evaluation(0), lgb.early_stopping(50)]
        )
        
        y_pred = model.predict_proba(X_v_sub)[:, 1]
        auc = roc_auc_score(y_v, y_pred)
        fold_aucs.append(auc)
        
        if verbose:
            print(f"    Fold {fold_idx+1}: AUC = {auc:.6f}")
    
    return np.mean(fold_aucs), np.std(fold_aucs), fold_aucs

def format_auc_with_std(mean, std):
    """Format AUC as mean ± std."""
    return f"{mean:.6f} ± {std:.4f}"

print("\n✓ Cross-validation framework configured")
print(f"  - Outer CV: {outer_cv.n_splits} folds (unbiased test)")
print(f"  - Inner CV: {inner_cv.n_splits} folds (feature selection)")
print("=" * 90)

# ============================================================================
# EXPERIMENT 1: Window Selection (5×3 NESTED CV)
# For each (metric, computation_type), find optimal time window
# Selection: Based on inner 3-fold CV (averaged to reduce selection bias)
# Evaluation: Based on outer 5-fold CV (unbiased performance estimates)
# ============================================================================

print("\n" + "=" * 90)
print("EXPERIMENT 1: TIME WINDOW SELECTION (5×3 NESTED CV)")
print("=" * 90)
print("Testing: 2y vs 3y vs 5y for each (metric, computation_type)")
print("Decision threshold: synergy > 0.0005 → keep both windows")
print("-" * 90)

# Combine train and val for nested CV
X_full = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
y_full = pd.concat([y_train, y_val], axis=0).reset_index(drop=True)

print(f"Full dataset: {len(X_full):,} samples, {y_full.sum():,} positives ({100*y_full.mean():.2f}%)")

window_selection_results_nested = []

for metric, computations in temporal_feature_groups.items():
    for comp_type, features in computations.items():
        if len(features) < 2:
            continue
        
        print(f"\n{metric} - {comp_type}")
        print("-" * 70)
        
        # Extract windows
        windows = {}
        for f in features:
            if '2y' in f:
                windows['2y'] = f
            elif '3y' in f:
                windows['3y'] = f
            elif '5y' in f:
                windows['5y'] = f
        
        if len(windows) < 2:
            print(f"  Skipped: Only one time window")
            continue
        
        # Store results across outer folds
        outer_fold_decisions = []
        outer_fold_test_aucs = []
        
        # OUTER CV LOOP (5 folds for unbiased evaluation)
        for outer_fold_idx, (outer_train_idx, outer_test_idx) in enumerate(outer_cv.split(X_full, y_full)):
            X_outer_train = X_full.iloc[outer_train_idx]
            X_outer_test = X_full.iloc[outer_test_idx]
            y_outer_train = y_full.iloc[outer_train_idx]
            y_outer_test = y_full.iloc[outer_test_idx]
            
            # INNER CV LOOP (3 folds for feature selection decision)
            # Test individual windows using inner CV
            inner_results = {}
            
            for window_name, window_feat in windows.items():
                test_features = baseline_features + [window_feat]
                mean_auc, std_auc, _ = train_and_evaluate_cv(
                    test_features, X_outer_train, y_outer_train, inner_cv
                )
                inner_results[f'{window_name}_only'] = mean_auc
            
            # Test all windows together using inner CV
            test_features = baseline_features + list(windows.values())
            mean_auc_all, std_auc_all, _ = train_and_evaluate_cv(
                test_features, X_outer_train, y_outer_train, inner_cv
            )
            inner_results['all'] = mean_auc_all
            
            # Make decision based on INNER CV results
            best_single = max([(k, v) for k, v in inner_results.items() if k != 'all'],
                             key=lambda x: x[1])
            synergy = mean_auc_all - best_single[1]
            decision = 'keep_both' if synergy > 0.0005 else best_single[0].replace('_only', '')
            
            outer_fold_decisions.append({
                'fold': outer_fold_idx,
                'decision': decision,
                'synergy': synergy,
                'inner_cv_results': inner_results
            })
            
            # Evaluate the selected configuration on OUTER TEST SET (unbiased)
            if decision == 'keep_both':
                selected_features = baseline_features + list(windows.values())
            else:
                selected_features = baseline_features + [windows[decision]]
            
            # Train on full outer train, test on outer test
            model = lgb.LGBMClassifier(**DEFAULT_PARAMS)
            model.fit(
                X_outer_train[selected_features], y_outer_train,
                eval_set=[(X_outer_test[selected_features], y_outer_test)],
                eval_metric='logloss',
                callbacks=[lgb.log_evaluation(0), lgb.early_stopping(50)]
            )
            test_auc = roc_auc_score(
                y_outer_test, 
                model.predict_proba(X_outer_test[selected_features])[:, 1]
            )
            outer_fold_test_aucs.append(test_auc)
        
        # Aggregate results across outer folds
        decisions_count = {}
        for fold_res in outer_fold_decisions:
            dec = fold_res['decision']
            decisions_count[dec] = decisions_count.get(dec, 0) + 1
        
        # Majority vote for final decision
        final_decision = max(decisions_count.items(), key=lambda x: x[1])[0]
        
        # Calculate statistics for test performance
        mean_test_auc = np.mean(outer_fold_test_aucs)
        std_test_auc = np.std(outer_fold_test_aucs)
        se_test_auc = std_test_auc / np.sqrt(len(outer_fold_test_aucs))
        
        print(f"  Decision votes: {decisions_count}")
        print(f"  → Final decision: {final_decision}")
        print(f"  Test AUC: {format_auc_with_std(mean_test_auc, std_test_auc)} (SE: {se_test_auc:.4f})")
        
        # Store results
        window_selection_results_nested.append({
            'metric': metric,
            'computation': comp_type,
            'final_decision': final_decision,
            'test_auc_mean': mean_test_auc,
            'test_auc_std': std_test_auc,
            'test_auc_se': se_test_auc,
            'decision_votes': str(decisions_count),
            'features_to_keep': list(windows.values()) if final_decision == 'keep_both'
                               else [windows[final_decision]]
        })

# Convert to DataFrame
window_df = pd.DataFrame(window_selection_results_nested)

print("\n" + "=" * 90)
print("EXPERIMENT 1 SUMMARY (NESTED CV)")
print("=" * 90)
display_cols = ['metric', 'computation', 'final_decision', 'test_auc_mean', 'test_auc_std', 'decision_votes']
print(window_df[display_cols].to_string(index=False))

# Save results
window_df.to_csv('temporal_window_selection_nested_cv.csv', index=False)
print("\n✓ Saved results to: temporal_window_selection_nested_cv.csv")

In [None]:
# ============================================================================
# EXPERIMENT 2: Computation Type Redundancy (5×3 NESTED CV)
# For each metric, determine which computation types are necessary
# Using nested CV to avoid overfitting to specific validation quirks
# ============================================================================

print("=" * 90)
print("EXPERIMENT 2: COMPUTATION TYPE REDUNDANCY (5×3 NESTED CV)")
print("=" * 90)
print("Testing: Which combinations of computation types are necessary per metric?")
print("Selection: Based on inner 3-fold CV (averaged)")
print("Evaluation: Based on outer 5-fold CV (unbiased)")
print("-" * 90)

from itertools import combinations

computation_redundancy_results = []

for metric, computations in temporal_feature_groups.items():
    if len(computations) <= 1:
        # Only one computation type, skip
        continue
    
    print(f"\n{metric.upper()}")
    print("-" * 70)
    
    # Get optimal features from Experiment 1 (window selection)
    optimal_features = {}
    for comp_type, features in computations.items():
        matching = window_df[
            (window_df['metric'] == metric) & 
            (window_df['computation'] == comp_type)
        ]
        
        if len(matching) > 0:
            optimal_features[comp_type] = matching.iloc[0]['features_to_keep']
        else:
            # No window selection (single window or skipped), use all features
            optimal_features[comp_type] = features
    
    # Generate test configurations (all subsets of computation types)
    comp_types = list(optimal_features.keys())
    
    configs = {}
    for r in range(1, len(comp_types) + 1):
        for combo in combinations(comp_types, r):
            config_name = '+'.join(combo)
            config_features = []
            for ct in combo:
                config_features.extend(optimal_features[ct])
            configs[config_name] = config_features
    
    # Store results across outer folds
    outer_fold_results = {config_name: [] for config_name in configs}
    outer_fold_decisions = []
    
    # OUTER CV LOOP (5 folds)
    for outer_fold_idx, (outer_train_idx, outer_test_idx) in enumerate(outer_cv.split(X_full, y_full)):
        X_outer_train = X_full.iloc[outer_train_idx]
        X_outer_test = X_full.iloc[outer_test_idx]
        y_outer_train = y_full.iloc[outer_train_idx]
        y_outer_test = y_full.iloc[outer_test_idx]
        
        # INNER CV LOOP - evaluate each configuration on inner 3-fold CV
        inner_cv_scores = {}
        
        for config_name, config_features in configs.items():
            test_features = baseline_features + config_features
            mean_auc, _, _ = train_and_evaluate_cv(
                test_features, X_outer_train, y_outer_train, inner_cv
            )
            inner_cv_scores[config_name] = mean_auc
        
        # Decision based on INNER CV results
        # Find best configuration (highest AUC)
        sorted_configs = sorted(inner_cv_scores.items(), key=lambda x: (-x[1], len(x[0].split('+'))))
        best_config = sorted_configs[0]
        
        # Check if simpler configs are within threshold of best
        threshold = 0.0003
        final_config = best_config
        for config_name, auc in sorted_configs[1:]:
            if best_config[1] - auc < threshold:
                if len(config_name.split('+')) < len(best_config[0].split('+')):
                    final_config = (config_name, auc)
                    break
        
        outer_fold_decisions.append({
            'fold': outer_fold_idx,
            'decision': final_config[0],
            'inner_cv_auc': final_config[1],
            'all_inner_scores': inner_cv_scores
        })
        
        # Evaluate ALL configurations on OUTER TEST SET (for comparison)
        for config_name, config_features in configs.items():
            test_features = baseline_features + config_features
            model = lgb.LGBMClassifier(**DEFAULT_PARAMS)
            model.fit(
                X_outer_train[test_features], y_outer_train,
                eval_set=[(X_outer_test[test_features], y_outer_test)],
                eval_metric='logloss',
                callbacks=[lgb.log_evaluation(0), lgb.early_stopping(50)]
            )
            test_auc = roc_auc_score(
                y_outer_test,
                model.predict_proba(X_outer_test[test_features])[:, 1]
            )
            outer_fold_results[config_name].append(test_auc)
    
    # Aggregate results
    decisions_count = {}
    for fold_res in outer_fold_decisions:
        dec = fold_res['decision']
        decisions_count[dec] = decisions_count.get(dec, 0) + 1
    
    # Final decision: majority vote
    final_recommendation = max(decisions_count.items(), key=lambda x: x[1])[0]
    
    # Calculate test performance statistics for each config
    print(f"  Test performance across {outer_cv.n_splits} folds:")
    config_stats = []
    for config_name in sorted(configs.keys(), key=lambda x: (-np.mean(outer_fold_results[x]), len(x.split('+')))):
        mean_auc = np.mean(outer_fold_results[config_name])
        std_auc = np.std(outer_fold_results[config_name])
        se_auc = std_auc / np.sqrt(len(outer_fold_results[config_name]))
        config_stats.append({
            'config': config_name,
            'mean': mean_auc,
            'std': std_auc,
            'se': se_auc
        })
        print(f"    {config_name:30s}: {format_auc_with_std(mean_auc, std_auc)} (SE: {se_auc:.4f})")
    
    print(f"  Decision votes: {decisions_count}")
    print(f"  → Final recommendation: {final_recommendation}")
    
    # Store results
    final_config_stats = [s for s in config_stats if s['config'] == final_recommendation][0]
    computation_redundancy_results.append({
        'metric': metric,
        'recommended_config': final_recommendation,
        'test_auc_mean': final_config_stats['mean'],
        'test_auc_std': final_config_stats['std'],
        'test_auc_se': final_config_stats['se'],
        'decision_votes': str(decisions_count),
        'features_to_keep': configs[final_recommendation],
        'num_features': len(configs[final_recommendation]),
        'all_config_stats': config_stats
    })

# Summary table
print("\n" + "=" * 90)
print("EXPERIMENT 2 SUMMARY (NESTED CV)")
print("=" * 90)

summary_data = []
for result in computation_redundancy_results:
    summary_data.append({
        'metric': result['metric'],
        'recommended_config': result['recommended_config'],
        'test_auc_mean': result['test_auc_mean'],
        'test_auc_std': result['test_auc_std'],
        'num_features': result['num_features'],
        'decision_votes': result['decision_votes']
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

# Save detailed results
import json
with open('temporal_computation_redundancy_nested_cv.json', 'w') as f:
    # Convert to serializable format
    export_data = []
    for result in computation_redundancy_results:
        export_data.append({
            'metric': result['metric'],
            'recommended_config': result['recommended_config'],
            'test_auc_mean': result['test_auc_mean'],
            'test_auc_std': result['test_auc_std'],
            'features_to_keep': result['features_to_keep'],
            'num_features': result['num_features']
        })
    json.dump(export_data, f, indent=2)

print("\n✓ Saved results to: temporal_computation_redundancy_nested_cv.json")

In [None]:
# ============================================================================
# EXPERIMENT 3: Metric Prioritization (5×3 NESTED CV)
# Which metrics benefit most from temporal features?
# Using nested CV for robust evaluation
# ============================================================================

print("=" * 90)
print("EXPERIMENT 3: METRIC PRIORITIZATION (5×3 NESTED CV)")
print("=" * 90)
print("Testing: Impact of removing each metric's temporal features")
print("Method: Ablation study with nested CV")
print("-" * 90)

# Get recommended features from Experiment 2
recommended_temporal_features = {}

# For metrics with computation redundancy results, use those
for result in computation_redundancy_results:
    recommended_temporal_features[result['metric']] = result['features_to_keep']

# For metrics with only one computation type, use window selection results
for metric, computations in temporal_feature_groups.items():
    if metric not in recommended_temporal_features:
        # Use results from window selection if available
        all_features = []
        for comp_type, features in computations.items():
            matching = window_df[
                (window_df['metric'] == metric) & 
                (window_df['computation'] == comp_type)
            ]
            if len(matching) > 0:
                all_features.extend(matching.iloc[0]['features_to_keep'])
            else:
                all_features.extend(features)
        recommended_temporal_features[metric] = all_features

# Flatten to get all recommended temporal features
all_recommended_temporal = []
for features in recommended_temporal_features.values():
    all_recommended_temporal.extend(features)

print(f"Total recommended temporal features: {len(all_recommended_temporal)}")
print(f"Reduction from original: {len(all_temporal_features)} → {len(all_recommended_temporal)} ({100*(len(all_temporal_features) - len(all_recommended_temporal))/len(all_temporal_features):.1f}%)")

# Baseline: all recommended temporal features
baseline_with_all_temporal = baseline_features + all_recommended_temporal

# Test impact of each metric using nested CV
metric_importance_results = []

print("\n" + "-" * 90)
print("Testing impact of removing each metric's temporal features:")
print("-" * 90)

for metric, features in recommended_temporal_features.items():
    print(f"\n{metric}")
    
    # Features WITHOUT this metric's temporal features
    features_without_metric = [f for f in baseline_with_all_temporal if f not in features]
    
    # Store results across outer folds
    outer_test_aucs_with = []
    outer_test_aucs_without = []
    
    # OUTER CV LOOP (5 folds)
    for outer_fold_idx, (outer_train_idx, outer_test_idx) in enumerate(outer_cv.split(X_full, y_full)):
        X_outer_train = X_full.iloc[outer_train_idx]
        X_outer_test = X_full.iloc[outer_test_idx]
        y_outer_train = y_full.iloc[outer_train_idx]
        y_outer_test = y_full.iloc[outer_test_idx]
        
        # Train WITH metric features
        model_with = lgb.LGBMClassifier(**DEFAULT_PARAMS)
        model_with.fit(
            X_outer_train[baseline_with_all_temporal], y_outer_train,
            eval_set=[(X_outer_test[baseline_with_all_temporal], y_outer_test)],
            eval_metric='logloss',
            callbacks=[lgb.log_evaluation(0), lgb.early_stopping(50)]
        )
        auc_with = roc_auc_score(
            y_outer_test,
            model_with.predict_proba(X_outer_test[baseline_with_all_temporal])[:, 1]
        )
        outer_test_aucs_with.append(auc_with)
        
        # Train WITHOUT metric features
        model_without = lgb.LGBMClassifier(**DEFAULT_PARAMS)
        model_without.fit(
            X_outer_train[features_without_metric], y_outer_train,
            eval_set=[(X_outer_test[features_without_metric], y_outer_test)],
            eval_metric='logloss',
            callbacks=[lgb.log_evaluation(0), lgb.early_stopping(50)]
        )
        auc_without = roc_auc_score(
            y_outer_test,
            model_without.predict_proba(X_outer_test[features_without_metric])[:, 1]
        )
        outer_test_aucs_without.append(auc_without)
    
    # Calculate statistics
    mean_auc_with = np.mean(outer_test_aucs_with)
    std_auc_with = np.std(outer_test_aucs_with)
    mean_auc_without = np.mean(outer_test_aucs_without)
    std_auc_without = np.std(outer_test_aucs_without)
    
    # AUC drop (positive = hurts performance to remove)
    auc_drops = [w - wo for w, wo in zip(outer_test_aucs_with, outer_test_aucs_without)]
    mean_drop = np.mean(auc_drops)
    std_drop = np.std(auc_drops)
    se_drop = std_drop / np.sqrt(len(auc_drops))
    
    # Decision: keep if removing causes significant drop (> 0.0005)
    keep = mean_drop > 0.0005
    
    print(f"  With metric:    {format_auc_with_std(mean_auc_with, std_auc_with)}")
    print(f"  Without metric: {format_auc_with_std(mean_auc_without, std_auc_without)}")
    print(f"  AUC drop:       {mean_drop:+.6f} ± {std_drop:.4f} (SE: {se_drop:.4f})")
    print(f"  → Decision: {'KEEP' if keep else 'DROP'}")
    
    metric_importance_results.append({
        'metric': metric,
        'auc_with_mean': mean_auc_with,
        'auc_with_std': std_auc_with,
        'auc_without_mean': mean_auc_without,
        'auc_without_std': std_auc_without,
        'auc_drop_mean': mean_drop,
        'auc_drop_std': std_drop,
        'auc_drop_se': se_drop,
        'num_features': len(features),
        'keep': keep
    })

# Sort by importance (drop magnitude)
metric_df = pd.DataFrame(metric_importance_results).sort_values('auc_drop_mean', ascending=False)

print("\n" + "=" * 90)
print("EXPERIMENT 3 SUMMARY - Ranked by Impact (NESTED CV)")
print("=" * 90)
display_cols = ['metric', 'auc_drop_mean', 'auc_drop_std', 'num_features', 'keep']
print(metric_df[display_cols].to_string(index=False))

# Save results
metric_df.to_csv('temporal_metric_importance_nested_cv.csv', index=False)
print("\n✓ Saved results to: temporal_metric_importance_nested_cv.csv")

In [None]:
# ============================================================================
# FINAL RECOMMENDATIONS: Temporal Feature Selection (NESTED CV)
# Compile final recommendations and evaluate with unbiased test performance
# ============================================================================

print("=" * 90)
print("FINAL TEMPORAL FEATURE RECOMMENDATIONS (NESTED CV)")
print("=" * 90)

# Compile final feature list based on metric importance
final_temporal_features = []

for result in metric_importance_results:
    if result['keep']:
        final_temporal_features.extend(
            recommended_temporal_features[result['metric']]
        )

print(f"\n" + "=" * 70)
print("TEMPORAL FEATURE REDUCTION SUMMARY")
print("=" * 70)
print(f"Original temporal features:      {len(all_temporal_features)}")
print(f"After window selection:          {len(all_recommended_temporal)}")
print(f"After metric prioritization:     {len(final_temporal_features)}")
print(f"Total reduction:                 {len(all_temporal_features) - len(final_temporal_features)} features ({100*(len(all_temporal_features) - len(final_temporal_features))/len(all_temporal_features):.1f}%)")

# ============================================================================
# FINAL EVALUATION WITH NESTED CV
# Compare: Original (all temporal) vs Optimized (selected temporal) vs Baseline (no temporal)
# ============================================================================

print("\n" + "=" * 70)
print("FINAL MODEL EVALUATION (5-FOLD CV)")
print("=" * 70)

# Define three configurations to test
configs = {
    'Baseline (no temporal)': baseline_features,
    'Original (all temporal)': baseline_features + all_temporal_features,
    'Optimized (selected temporal)': baseline_features + final_temporal_features
}

print(f"Testing {len(configs)} configurations across {outer_cv.n_splits} folds...")

# Evaluate each configuration using outer CV (no inner CV needed - final evaluation only)
config_results = {}

for config_name, features in configs.items():
    print(f"\n{config_name}")
    print(f"  Features: {len(features)} ({len([f for f in features if f in all_temporal_features])} temporal)")
    
    test_aucs = []
    
    for outer_fold_idx, (outer_train_idx, outer_test_idx) in enumerate(outer_cv.split(X_full, y_full)):
        X_outer_train = X_full.iloc[outer_train_idx]
        X_outer_test = X_full.iloc[outer_test_idx]
        y_outer_train = y_full.iloc[outer_train_idx]
        y_outer_test = y_full.iloc[outer_test_idx]
        
        model = lgb.LGBMClassifier(**DEFAULT_PARAMS)
        model.fit(
            X_outer_train[features], y_outer_train,
            eval_set=[(X_outer_test[features], y_outer_test)],
            eval_metric='logloss',
            callbacks=[lgb.log_evaluation(0), lgb.early_stopping(50)]
        )
        
        test_auc = roc_auc_score(
            y_outer_test,
            model.predict_proba(X_outer_test[features])[:, 1]
        )
        test_aucs.append(test_auc)
        print(f"    Fold {outer_fold_idx+1}: {test_auc:.6f}")
    
    mean_auc = np.mean(test_aucs)
    std_auc = np.std(test_aucs)
    se_auc = std_auc / np.sqrt(len(test_aucs))
    
    print(f"  → Mean: {format_auc_with_std(mean_auc, std_auc)} (SE: {se_auc:.4f})")
    
    config_results[config_name] = {
        'mean': mean_auc,
        'std': std_auc,
        'se': se_auc,
        'fold_aucs': test_aucs
    }

# Statistical comparison
print("\n" + "=" * 70)
print("PERFORMANCE COMPARISON")
print("=" * 70)

baseline_result = config_results['Baseline (no temporal)']
original_result = config_results['Original (all temporal)']
optimized_result = config_results['Optimized (selected temporal)']

improvement_vs_baseline = optimized_result['mean'] - baseline_result['mean']
improvement_vs_original = optimized_result['mean'] - original_result['mean']

print(f"\nBaseline (no temporal):         {format_auc_with_std(baseline_result['mean'], baseline_result['std'])}")
print(f"Original (all temporal):        {format_auc_with_std(original_result['mean'], original_result['std'])}")
print(f"Optimized (selected temporal):  {format_auc_with_std(optimized_result['mean'], optimized_result['std'])}")
print(f"\nImprovement over baseline:      {improvement_vs_baseline:+.6f} ({100*improvement_vs_baseline/baseline_result['mean']:+.2f}%)")
print(f"Improvement over original:      {improvement_vs_original:+.6f} ({100*improvement_vs_original/original_result['mean']:+.2f}%)")
print(f"Feature reduction:              {len(all_temporal_features)} → {len(final_temporal_features)} ({100*(len(all_temporal_features) - len(final_temporal_features))/len(all_temporal_features):.1f}%)")

# Breakdown by metric
print("\n" + "=" * 70)
print("FINAL TEMPORAL FEATURES BY METRIC")
print("=" * 70)

for metric in sorted(recommended_temporal_features.keys()):
    if metric in [r['metric'] for r in metric_importance_results if r['keep']]:
        features = recommended_temporal_features[metric]
        print(f"\n{metric.upper()} ({len(features)} features):")
        for f in sorted(features):
            print(f"  - {f}")

# ============================================================================
# EXPORT RESULTS
# ============================================================================

print("\n" + "=" * 70)
print("EXPORTING RESULTS")
print("=" * 70)

# Save final temporal feature list
with open('final_temporal_features_nested_cv.txt', 'w') as f:
    f.write(f"# Final Temporal Features After Systematic Selection (5×3 Nested CV)\n")
    f.write(f"# Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"# Original: {len(all_temporal_features)} → Final: {len(final_temporal_features)}\n")
    f.write(f"# Reduction: {100*(len(all_temporal_features) - len(final_temporal_features))/len(all_temporal_features):.1f}%\n")
    f.write(f"# Mean test AUC: {optimized_result['mean']:.6f} ± {optimized_result['std']:.4f}\n")
    f.write(f"# Improvement over baseline: {improvement_vs_baseline:+.6f}\n")
    f.write(f"# Improvement over original: {improvement_vs_original:+.6f}\n\n")
    for feat in sorted(final_temporal_features):
        f.write(feat + '\n')

print("✓ Saved: final_temporal_features_nested_cv.txt")

# Save comparison results
comparison_df = pd.DataFrame([
    {
        'Configuration': name,
        'Num_Features': len(configs[name]),
        'Temporal_Features': len([f for f in configs[name] if f in all_temporal_features]),
        'Mean_AUC': res['mean'],
        'Std_AUC': res['std'],
        'SE_AUC': res['se']
    }
    for name, res in config_results.items()
])
comparison_df.to_csv('temporal_final_comparison_nested_cv.csv', index=False)
print("✓ Saved: temporal_final_comparison_nested_cv.csv")

# Save comprehensive summary
summary_nested_cv = {
    'Methodology': '5×3 Nested Cross-Validation',
    'Original Temporal Features': len(all_temporal_features),
    'After Window Selection': len(all_recommended_temporal),
    'After Metric Prioritization': len(final_temporal_features),
    'Total Reduction': len(all_temporal_features) - len(final_temporal_features),
    'Reduction %': f"{100*(len(all_temporal_features) - len(final_temporal_features))/len(all_temporal_features):.1f}%",
    'Baseline Mean AUC': f"{baseline_result['mean']:.6f}",
    'Baseline Std AUC': f"{baseline_result['std']:.6f}",
    'Original Mean AUC': f"{original_result['mean']:.6f}",
    'Original Std AUC': f"{original_result['std']:.6f}",
    'Optimized Mean AUC': f"{optimized_result['mean']:.6f}",
    'Optimized Std AUC': f"{optimized_result['std']:.6f}",
    'Improvement vs Baseline': f"{improvement_vs_baseline:+.6f}",
    'Improvement vs Original': f"{improvement_vs_original:+.6f}"
}

summary_df = pd.DataFrame([summary_nested_cv]).T
summary_df.columns = ['Value']
summary_df.to_csv('temporal_feature_selection_summary_nested_cv.csv')
print("✓ Saved: temporal_feature_selection_summary_nested_cv.csv")

print("\n" + "=" * 90)
print("✓ TEMPORAL FEATURE SELECTION COMPLETE (5×3 NESTED CV)!")
print("=" * 90)
print("\nGenerated files:")
print("  - temporal_window_selection_nested_cv.csv")
print("  - temporal_computation_redundancy_nested_cv.json")
print("  - temporal_metric_importance_nested_cv.csv")
print("  - final_temporal_features_nested_cv.txt")
print("  - temporal_final_comparison_nested_cv.csv")
print("  - temporal_feature_selection_summary_nested_cv.csv")
print("\n" + "=" * 90)
print("METHODOLOGY IMPROVEMENTS:")
print("  ✓ Reduced selection bias via inner 3-fold CV averaging")
print("  ✓ Unbiased performance estimates via outer 5-fold CV")
print("  ✓ Uncertainty quantification (std, SE) for all decisions")
print("  ✓ Majority voting across folds for robust feature selection")
print("  ✓ Standard error reported: ~" + f"{optimized_result['se']:.4f}" + " AUC")
print("=" * 90)