# Model Comparison and Performance Visualization

This notebook compares different model outputs and visualizes their performance using various metrics including:
- Score distributions for failed vs non-failed companies
- Smoothed P(failure|Score) curves
- Model weights comparison
- Performance metrics

## Models to Compare:
- Baseline model (equal feature weights; calculated in Section 6)
- Scottv1 model
- Scottv2 model
- Any additional models

In [None]:
# Reload all modules automatically
#%autoreload
#%autoreload 2


ImportError: attempted relative import with no known parent package

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning for the rest of the session (no category or module
# filter is given, so nothing is exempt).
# NOTE(review): this also hides deprecation/numerical warnings from the
# plotting and stats libraries — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set style for better plots
# Order matters: the matplotlib style sheet is applied first, then seaborn's
# palette replaces the colour cycle that the style sheet installed.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Configure display options
# None removes the column-count cap; width=None lets pandas pick the width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("Libraries imported successfully!")

Libraries imported successfully!


## 1. Data Loading and Setup

In [9]:
# Locations, all expressed relative to the notebook's working directory.
base_path = Path('../')
model_outputs_path = base_path / 'model_outputs'
data_path = base_path / '../covenant/SampleCovenantData'

# Per-model settings: the folder holding the model's output files and the
# line colour / line style used whenever the model is drawn.
# Each model's output folder is named after the model itself.
models = {
    name: {
        'path': model_outputs_path / name,
        'color': color,
        'linestyle': linestyle,
    }
    for name, color, linestyle in (
        ('baseline', 'gray', '--'),
        ('scottv1', 'blue', '-'),
        ('scottv2', 'red', '-'),
    )
}

print("Model configurations set up!")

Model configurations set up!


In [11]:
# Read the baseline CSV from the shared common_data folder into a DataFrame.
baseline_model = pd.read_csv(filepath_or_buffer='../common_data/scott_baseline.csv')

In [None]:
def load_model_data(model_name, model_config):
    """Read the ``scores.csv`` / ``weights.csv`` pair belonging to one model.

    Args:
        model_name: Label used only in the printed status messages.
        model_config: Mapping whose ``'path'`` entry is the directory that
            holds the model's two CSV files.

    Returns:
        ``(scores, weights)`` as DataFrames, with ``scores['signal_date']``
        converted to datetimes. ``(None, None)`` is returned when either
        file is absent or when anything goes wrong while loading; in both
        cases a message is printed instead of raising.
    """
    try:
        model_dir = model_config['path']
        scores_file = model_dir / 'scores.csv'
        weights_file = model_dir / 'weights.csv'

        # Both files are required; bail out early if either is missing.
        if not (scores_file.exists() and weights_file.exists()):
            print(f"Missing files for {model_name}")
            return None, None

        scores = pd.read_csv(scores_file)
        weights = pd.read_csv(weights_file)

        # Dates arrive as text in the CSV; downstream code expects datetimes.
        scores['signal_date'] = pd.to_datetime(scores['signal_date'])

        return scores, weights
    except Exception as e:
        # Deliberately best-effort: one broken model must not stop the rest.
        print(f"Error loading {model_name}: {e}")
        return None, None

# Build the registry of models whose files could actually be read.
# Models that fail to load are left out of the registry entirely.
model_data = {}
for model_name, model_config in models.items():
    scores, weights = load_model_data(model_name, model_config)
    if scores is None:
        continue
    model_data[model_name] = dict(scores=scores, weights=weights, config=model_config)
    print(f"Loaded {model_name}: {len(scores)} records")

print(f"\nLoaded {len(model_data)} models")

## 2. Score Distribution Analysis

In [None]:
def plot_score_distributions(model_data):
    """Plot score distributions for failed vs non-failed companies.

    Draws one panel per model, side by side. Each panel overlays the kernel
    density estimate of ``score`` for rows with ``fail == 1`` and rows with
    ``fail == 0``, and annotates the mean score of each group.

    Args:
        model_data: Mapping of model name to a dict whose ``'scores'`` entry
            is a DataFrame with ``score`` and ``fail`` columns.

    Returns:
        None. The figure is shown via ``plt.show()``. When ``model_data`` is
        empty a message is printed and nothing is drawn.
    """
    # With no models loaded, plt.subplots(1, 0) would raise; skip instead.
    if not model_data:
        print("No model data available; skipping score distribution plots.")
        return

    n_models = len(model_data)
    # squeeze=False always returns a 2-D axes array, so the single-model
    # case needs no special handling.
    fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 5), squeeze=False)

    for ax, (model_name, data) in zip(axes[0], model_data.items()):
        scores_df = data['scores']
        failed = scores_df[scores_df['fail'] == 1]
        non_failed = scores_df[scores_df['fail'] == 0]

        # Plot distributions
        sns.kdeplot(data=failed, x='score',
                    label='Failed', fill=True, common_norm=False, ax=ax)
        sns.kdeplot(data=non_failed, x='score',
                    label='Non-failed', fill=True, common_norm=False, ax=ax)

        ax.set_title(f'{model_name.upper()} Score Distribution')
        ax.set_xlabel('Score')
        ax.set_ylabel('Density')
        ax.legend()

        # Add statistics (positioned in axes coordinates, top-left corner)
        failed_mean = failed['score'].mean()
        non_failed_mean = non_failed['score'].mean()
        ax.text(0.02, 0.98, f'Failed mean: {failed_mean:.3f}\nNon-failed mean: {non_failed_mean:.3f}',
                transform=ax.transAxes, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.show()

# Draw the failed vs non-failed score densities for every loaded model.
plot_score_distributions(model_data)

## 3. Smoothed P(failure|Score) Analysis

In [None]:
def calculate_smoothed_failure_probability(scores_df, window_size=100):
    """Estimate P(failure | score) with a centred moving average.

    Rows are ordered by ascending ``score`` and the ``fail`` flag is averaged
    over a centred window of ``window_size`` consecutive rows.

    Args:
        scores_df: DataFrame with ``score`` and ``fail`` columns.
        window_size: Number of neighbouring rows averaged per point.

    Returns:
        ``(scores, probabilities)``: two Series sharing a fresh 0..n-1 index.
        Positions where the full window is not available are NaN, so the
        result is all-NaN when there are fewer rows than ``window_size``.
    """
    ordered = scores_df.sort_values(by='score').reset_index(drop=True)

    failure_flags = ordered['fail']
    smoothed = failure_flags.rolling(window=window_size, center=True).mean()

    return ordered['score'], smoothed

def plot_smoothed_failure_probability(model_data, window_size=100):
    """Overlay every model's smoothed P(failure | score) curve on one figure.

    Args:
        model_data: Mapping of model name to a dict with ``'scores'`` (the
            DataFrame passed on to ``calculate_smoothed_failure_probability``)
            and ``'config'`` (supplies the line ``'color'`` and
            ``'linestyle'``).
        window_size: Rolling-window length forwarded to the smoother; it is
            also shown in the figure title.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    for model_name, data in model_data.items():
        line_style = data['config']
        score_axis, smoothed = calculate_smoothed_failure_probability(data['scores'], window_size)

        ax.plot(
            score_axis,
            smoothed,
            color=line_style['color'],
            linestyle=line_style['linestyle'],
            linewidth=2,
            label=f'{model_name.upper()}',
        )

    ax.set_xlabel('Score')
    ax.set_ylabel('P(failure)')
    ax.set_title(f'Smoothed P(failure | Score) - Window Size: {window_size}')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

# Repeat the plot at several smoothing widths to see how sensitive the
# curves are to the choice of window.
for window_size in (50, 100, 200):
    plot_smoothed_failure_probability(model_data, window_size=window_size)

## 4. Model Weights Comparison

In [None]:
def plot_weights_comparison(model_data, top_n=15):
    """Compare feature weights across models with a heatmap and a bar chart.

    The features shown are the ``top_n`` with the largest absolute weight in
    any model.

    Args:
        model_data: Mapping of model name to a dict whose ``'weights'`` entry
            is a DataFrame with ``Feature`` and ``Weight`` columns.
        top_n: Number of features to display.

    Returns:
        None. The figure is shown via ``plt.show()``. When ``model_data`` is
        empty a message is printed and nothing is drawn.
    """
    if not model_data:
        print("No model data available; skipping weights comparison.")
        return

    # Outer-join the per-model weight Series on the feature name. Assigning
    # Series one at a time into an empty DataFrame would pin the index to the
    # first model's features and silently drop features that only occur in
    # later models; concat keeps the union. A feature a model lacks is NaN.
    model_names = list(model_data)
    per_model_weights = [
        model_data[name]['weights'].set_index('Feature')['Weight']
        for name in model_names
    ]
    weights_df = pd.concat(per_model_weights, axis=1, keys=model_names)

    # Get top features by absolute weight across all models
    # (max skips NaN, so a feature missing from some models still ranks).
    max_abs_weights = weights_df.abs().max(axis=1)
    top_features = max_abs_weights.nlargest(top_n).index

    # Plot
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

    # Heatmap (diverging colour map centred on zero so sign is visible)
    weights_subset = weights_df.loc[top_features]
    sns.heatmap(weights_subset, annot=True, cmap='RdBu_r', center=0, ax=ax1)
    ax1.set_title('Feature Weights Comparison (Heatmap)')

    # Bar plot
    weights_subset.plot(kind='bar', ax=ax2)
    ax2.set_title('Feature Weights Comparison (Bar Plot)')
    ax2.set_xlabel('Features')
    ax2.set_ylabel('Weight')
    ax2.legend()
    ax2.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

# Compare the highest-magnitude feature weights across the loaded models
# (uses the function's default top_n).
plot_weights_comparison(model_data)

## 5. Performance Metrics

In [None]:
def calculate_performance_metrics(scores_df):
    """Summarise how well ``score`` separates failed from non-failed rows.

    Args:
        scores_df: DataFrame with a ``fail`` label column (1 = failed,
            0 = non-failed) and a ``score`` column.

    Returns:
        Dict with the ROC AUC, the area under the precision-recall curve,
        the mean and standard deviation of the score in each group, and
        ``separation``: the difference of the group means divided by the
        square root of the average of the two group variances.
    """
    from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

    labels = scores_df['fail']
    score_values = scores_df['score']

    # Ranking quality over all thresholds.
    roc_auc = roc_auc_score(labels, score_values)

    # Area under the precision-recall curve.
    precision, recall, _ = precision_recall_curve(labels, score_values)
    pr_auc = auc(recall, precision)

    # Per-group score statistics.
    failed_scores = score_values[labels == 1]
    non_failed_scores = score_values[labels == 0]
    failed_mean = failed_scores.mean()
    non_failed_mean = non_failed_scores.mean()
    pooled_spread = np.sqrt((failed_scores.var() + non_failed_scores.var()) / 2)

    return {
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'failed_mean': failed_mean,
        'non_failed_mean': non_failed_mean,
        'failed_std': failed_scores.std(),
        'non_failed_std': non_failed_scores.std(),
        'separation': (failed_mean - non_failed_mean) / pooled_spread,
    }

# Both sklearn helpers are called at cell level below, so they are imported
# here; the import inside calculate_performance_metrics() is local to that
# function and does not make precision_recall_curve visible in this cell.
from sklearn.metrics import precision_recall_curve, roc_curve

# Calculate and display metrics for all models.
# Built in one step: one row per model, one column per metric.
metrics_df = pd.DataFrame.from_dict(
    {
        model_name: calculate_performance_metrics(data['scores'])
        for model_name, data in model_data.items()
    },
    orient='index',
)
print("Performance Metrics:")
print(metrics_df.round(4))

# ROC curves on the left, precision-recall curves on the right.
fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(10, 5))

for model_name, data in model_data.items():
    scores_df = data['scores']
    config = data['config']

    fpr, tpr, _ = roc_curve(scores_df['fail'], scores_df['score'])
    roc_ax.plot(fpr, tpr, color=config['color'], linestyle=config['linestyle'],
                linewidth=2, label=f'{model_name.upper()} (AUC: {metrics_df.loc[model_name, "roc_auc"]:.3f})')

# Diagonal reference line: the ROC curve of a random classifier.
roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)

for model_name, data in model_data.items():
    scores_df = data['scores']
    config = data['config']

    precision, recall, _ = precision_recall_curve(scores_df['fail'], scores_df['score'])
    pr_ax.plot(recall, precision, color=config['color'], linestyle=config['linestyle'],
               linewidth=2, label=f'{model_name.upper()} (AUC: {metrics_df.loc[model_name, "pr_auc"]:.3f})')

pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curves')
pr_ax.legend()
pr_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 6. Baseline Model Calculation

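This section defines how a baseline model is calculated for comparison. The baseline uses equal weights for all features. The calculation is not run by default: uncomment the call in the cell below to generate and load the baseline, then re-run Sections 2–5 to include it in the comparisons. Note that the generated `fail` column is a placeholder (all zeros) until actual failure data is joined in.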

In [None]:
def calculate_baseline_model():
    """Build an equal-weight baseline model and write its outputs to disk.

    Reads ``binarySignalsPart00.csv`` from the notebook-level ``data_path``,
    treats every column other than ``companyid`` and ``signal_date`` as a
    feature, and scores each row as the equally weighted sum of its features
    (each weight is ``1 / n_features``). ``weights.csv`` and ``scores.csv``
    are written to ``model_outputs_path / 'baseline'``.

    The ``fail`` column written to ``scores.csv`` is a placeholder of zeros;
    real failure labels must be joined in before label-based comparisons of
    the baseline are meaningful.

    Returns:
        ``(scores, equal_weights)``: the scores DataFrame and the weights
        Series indexed by feature name. ``(None, None)`` is returned, with a
        printed message, when the signals file is missing or contains no
        feature columns.
    """
    # Load signals data
    signals_file = data_path / 'binarySignalsPart00.csv'
    if not signals_file.exists():
        print("Signals file not found. Skipping baseline calculation.")
        return None, None

    # Load signals (date_format is a pandas >= 2.0 keyword)
    signals_df = pd.read_csv(signals_file, parse_dates=['signal_date'], date_format='%m/%d/%Y')

    # Get feature columns (exclude companyid and signal_date)
    feature_cols = [col for col in signals_df.columns if col not in ['companyid', 'signal_date']]
    if not feature_cols:
        # Without this guard 1.0 / len(feature_cols) raises ZeroDivisionError.
        print("No feature columns found in signals file. Skipping baseline calculation.")
        return None, None

    # Create equal weights
    equal_weights = pd.Series(1.0 / len(feature_cols), index=feature_cols)

    # Calculate scores
    scores = signals_df[['companyid', 'signal_date']].copy()
    scores['score'] = signals_df[feature_cols].dot(equal_weights)
    scores['fail'] = 0  # Placeholder - would need actual failure data

    # Save baseline results. parents=True also creates model_outputs/ itself
    # when this is the first model output written.
    baseline_path = model_outputs_path / 'baseline'
    baseline_path.mkdir(parents=True, exist_ok=True)

    # Save weights in the Feature/Weight layout used by the weights plot
    weights_df = equal_weights.reset_index()
    weights_df.columns = ['Feature', 'Weight']
    weights_df.to_csv(baseline_path / 'weights.csv', index=False)

    # Save scores
    scores.to_csv(baseline_path / 'scores.csv', index=False)

    print(f"Baseline model saved to {baseline_path}")
    return scores, equal_weights

# Uncomment to calculate baseline
# The block below writes the baseline files, reloads them through
# load_model_data() and registers the result in model_data under 'baseline'.
# NOTE: calculate_baseline_model() fills 'fail' with a placeholder of zeros,
# so label-based plots and metrics for the baseline will not be meaningful
# until real failure labels are supplied.
# baseline_scores, baseline_weights = calculate_baseline_model()
# if baseline_scores is not None:
#     # Reload model data to include baseline
#     scores, weights = load_model_data('baseline', models['baseline'])
#     if scores is not None:
#         model_data['baseline'] = {
#             'scores': scores,
#             'weights': weights,
#             'config': models['baseline']
#         }
#         print("Baseline model loaded successfully!")

print("Baseline calculation function ready. Uncomment to run.")

## 7. Summary and Insights

In [None]:
# Text report: headline numbers for each loaded model, then reading notes.
print("=== MODEL COMPARISON SUMMARY ===\n")

for model_name, data in model_data.items():
    scores_df = data['scores']
    fail_flags = scores_df['fail']
    score_values = scores_df['score']

    print(f"{model_name.upper()}:")
    print(f"  Total records: {len(scores_df):,}")
    print(f"  Failed companies: {fail_flags.sum():,}")
    print(f"  Non-failed companies: {(fail_flags == 0).sum():,}")
    print(f"  Score range: {score_values.min():.3f} to {score_values.max():.3f}")
    print(f"  Score std: {score_values.std():.3f}")

    # Metrics exist only for models that went through the metrics cell.
    if model_name in metrics_df.index:
        print(f"  ROC AUC: {metrics_df.loc[model_name, 'roc_auc']:.3f}")
        print(f"  PR AUC: {metrics_df.loc[model_name, 'pr_auc']:.3f}")
        print(f"  Separation: {metrics_df.loc[model_name, 'separation']:.3f}")

    print()

print("\n".join([
    "=== KEY INSIGHTS ===",
    "1. The smoothed P(failure|Score) curves show how well each model separates failed from non-failed companies",
    "2. Higher separation between failed and non-failed score distributions indicates better performance",
    "3. ROC AUC and PR AUC provide quantitative measures of model performance",
    "4. Feature weights show which signals are most important for each model",
]))