# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
from scipy import stats

# Folder that receives every figure written by this script
viz_dir = 'enhanced_model_evaluation'
os.makedirs(viz_dir, exist_ok=True)

# Plot appearance: matplotlib base style first, then the seaborn theme and
# palette on top of it, and finally the default figure geometry
plt.style.use('ggplot')
sns.set_style("whitegrid")
sns.set_palette("viridis")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'figure.dpi': 100,
})

print("Loading dataset...")
df = pd.read_csv('./processed_data/common_features.csv')
print(f"Dataset shape: {df.shape}")

# --------------------------------------------------
# 1. TARGET VARIABLE PREPARATION
# --------------------------------------------------
# The models are trained on log(1 + hours); log1p stays defined when the
# target is exactly zero.
target = 'total_resolution_hours'
df[f'log_{target}'] = np.log1p(df[target])

# --------------------------------------------------
# 2. PLANNING-TIME FEATURES
# --------------------------------------------------

# Candidate predictors that are known (or can be estimated) at planning time,
# grouped by theme. project_duration_days is deliberately left out.
_planning_feature_groups = {
    # Project scope indicators
    'scope': [
        'total_issues',                # Estimated during planning
    ],
    # Project composition estimates (expected share of each priority level)
    'priority_mix': [
        'priority_critical_pct',
        'priority_high_pct',
        'priority_medium_pct',
        'priority_low_pct',
        'priority_blocker_pct',
    ],
    # Issue type distribution (estimated from similar projects)
    'issue_type_mix': [
        'type_bug_pct',
        'type_task_pct',
        'type_new_feature_pct',
        'type_improvement_pct',
        'type_documentation_pct',
    ],
    # Team composition
    'team': [
        'team_size_creators',          # Planned team size
        'team_size_assignees',         # Planned assignees
        'team_size_combined',          # Overall team size
    ],
    # Complexity indicators
    'complexity': [
        'weighted_priority_score',     # Expected priority complexity
        'issue_type_entropy',          # Expected variety of issues
    ],
    # Historical indicators that could be estimated
    'historical': [
        'high_to_low_priority_ratio',  # Expected priority distribution
        'bug_ratio',                   # Expected bug ratio
    ],
}

# Flatten the groups in declaration order, keeping only the columns that are
# actually present in the loaded dataframe
planning_features = [
    feature_name
    for group_members in _planning_feature_groups.values()
    for feature_name in group_members
    if feature_name in df.columns
]

# --------------------------------------------------
# 3. DATA PREPARATION
# --------------------------------------------------

# Prepare the data for modeling
df_planning = df[planning_features + ['log_total_resolution_hours', target]].copy()

# Check for missing values
missing_values = df_planning.isnull().sum()
print("\nMissing values per feature:")
print(missing_values)

# Infinite values have to be neutralised BEFORE imputation: SimpleImputer
# refuses +/-inf input, so a check placed after it can never run usefully.
# Converting them to NaN lets the median imputer below fill them in.
for col in planning_features:
    mask = np.isinf(df_planning[col])
    if mask.any():
        print(f"Replacing {mask.sum()} infinite values in {col}")
        df_planning.loc[mask, col] = np.nan

# Targets (log scale for training, original scale for reporting)
y_log = df_planning['log_total_resolution_hours']
y_original = df_planning[target]

# Split FIRST, so that the imputation medians and scaling statistics are
# learned from the training rows only and nothing about the held-out rows
# leaks into preprocessing. Same test_size/random_state as before, so the
# row assignment to train/test is unchanged.
X_train_raw, X_test_raw, y_log_train, y_log_test, y_orig_train, y_orig_test = train_test_split(
    df_planning[planning_features], y_log, y_original, test_size=0.2, random_state=42)

# Handle missing values with median imputation (fitted on the training rows)
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Scale features (fitted on the training rows)
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_test = scaler.transform(X_test_imputed)

# Keep the full-data artefacts available under their original names, now
# produced with the train-fitted transformers
df_planning[planning_features] = imputer.transform(df_planning[planning_features])
X = df_planning[planning_features]
X_scaled = scaler.transform(X.to_numpy())

# --------------------------------------------------
# 4. MODEL TRAINING
# --------------------------------------------------

# Both regressors learn the log1p-transformed target; fit() returns the
# estimator itself, so construction and fitting are chained.

# Ridge regression with regularisation strength alpha=1.0
ridge_model = Ridge(alpha=1.0).fit(X_train, y_log_train)

# Random forest of 100 trees with a fixed seed
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_log_train)

# --------------------------------------------------
# 5. MODEL EVALUATION WITH MULTIPLE METRICS
# --------------------------------------------------

# Function to calculate comprehensive metrics
def calculate_metrics(y_true, y_pred, y_true_orig=None, y_pred_orig=None, model_name="Model"):
    """
    Calculate and print comprehensive error metrics for model evaluation.

    Inputs are converted to NumPy float arrays first, so values are compared
    position by position even when pandas Series with different indexes are
    passed in.

    Args:
        y_true: Actual values (log-transformed)
        y_pred: Predicted values (log-transformed)
        y_true_orig: Actual values (original scale)
        y_pred_orig: Predicted values (original scale)
        model_name: Name of the model for display

    Returns:
        Dictionary of metrics. Always contains 'rmse_log', 'mae_log',
        'r2_log', 'error_mean', 'error_std', 'error_skew' and
        'error_kurtosis'. When both original-scale arrays are given it also
        contains 'rmse_original', 'mae_original', 'r2_original', 'mape',
        'mape_zero_actuals' and 'smape'. 'mape' is NaN when every actual
        value is zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    has_original = y_true_orig is not None and y_pred_orig is not None

    metrics = {}

    # Calculate metrics in log space
    metrics['rmse_log'] = np.sqrt(mean_squared_error(y_true, y_pred))
    metrics['mae_log'] = mean_absolute_error(y_true, y_pred)
    metrics['r2_log'] = r2_score(y_true, y_pred)

    # Error distribution analysis (residual = actual - predicted)
    errors = y_true - y_pred
    metrics['error_mean'] = np.mean(errors)
    metrics['error_std'] = np.std(errors)
    metrics['error_skew'] = stats.skew(errors)
    metrics['error_kurtosis'] = stats.kurtosis(errors)

    # Metrics in original scale (if provided)
    if has_original:
        y_true_orig = np.asarray(y_true_orig, dtype=float)
        y_pred_orig = np.asarray(y_pred_orig, dtype=float)

        metrics['rmse_original'] = np.sqrt(mean_squared_error(y_true_orig, y_pred_orig))
        metrics['mae_original'] = mean_absolute_error(y_true_orig, y_pred_orig)
        metrics['r2_original'] = r2_score(y_true_orig, y_pred_orig)

        # MAPE (Mean Absolute Percentage Error)
        # A percentage error is undefined when the actual value is zero.
        # Dividing by a tiny epsilon instead would add terms on the order of
        # 1e10 and swamp the mean, so those samples are left out and counted.
        nonzero = np.abs(y_true_orig) > 1e-10
        metrics['mape_zero_actuals'] = int(np.size(nonzero) - np.count_nonzero(nonzero))
        if nonzero.any():
            relative_errors = (y_true_orig[nonzero] - y_pred_orig[nonzero]) / y_true_orig[nonzero]
            metrics['mape'] = np.mean(np.abs(relative_errors)) * 100
        else:
            metrics['mape'] = np.nan

        # SMAPE (Symmetric Mean Absolute Percentage Error)
        # Less sensitive to small denominators; the epsilon only matters when
        # actual and prediction are both zero, where the term becomes 0.
        smape = 100 * np.mean(2 * np.abs(y_pred_orig - y_true_orig) /
                              (np.abs(y_true_orig) + np.abs(y_pred_orig) + 1e-10))
        metrics['smape'] = smape

    # Print metrics summary
    print(f"\n{model_name} Performance:")
    print(f"Log Space - RMSE: {metrics['rmse_log']:.4f}, MAE: {metrics['mae_log']:.4f}, R²: {metrics['r2_log']:.4f}")
    print(f"Error Distribution - Mean: {metrics['error_mean']:.4f}, Std: {metrics['error_std']:.4f}")
    print(f"Error Distribution - Skew: {metrics['error_skew']:.4f}, Kurtosis: {metrics['error_kurtosis']:.4f}")

    if has_original:
        print(f"Original Scale - RMSE: {metrics['rmse_original']:.4f}, MAE: {metrics['mae_original']:.4f}")
        print(f"Percentage Errors - MAPE: {metrics['mape']:.2f}%, SMAPE: {metrics['smape']:.2f}%")
        if metrics['mape_zero_actuals']:
            print(f"Note: MAPE excludes {metrics['mape_zero_actuals']} sample(s) whose actual value is zero")

    return metrics

# Score the held-out rows with each fitted model (outputs are in log space)
ridge_pred_log, rf_pred_log = (
    fitted_model.predict(X_test) for fitted_model in (ridge_model, rf_model)
)

# Undo the log1p transform: expm1 is its exact inverse
ridge_pred_orig, rf_pred_orig = np.expm1(ridge_pred_log), np.expm1(rf_pred_log)

# Compute and print the full metric set for each model, Ridge first
ridge_metrics = calculate_metrics(
    y_true=y_log_test,
    y_pred=ridge_pred_log,
    y_true_orig=y_orig_test,
    y_pred_orig=ridge_pred_orig,
    model_name="Ridge Regression",
)
rf_metrics = calculate_metrics(
    y_true=y_log_test,
    y_pred=rf_pred_log,
    y_true_orig=y_orig_test,
    y_pred_orig=rf_pred_orig,
    model_name="Random Forest",
)

# --------------------------------------------------
# 6. ERROR DISTRIBUTION ANALYSIS
# --------------------------------------------------

# As suggested by the Chai & Draxler paper, checking error distribution is important
# to determine if RMSE or MAE is more appropriate

# Residuals in log space (actual - predicted); reused by later sections
ridge_errors = y_log_test - ridge_pred_log
rf_errors = y_log_test - rf_pred_log

# Create error distribution plots: one histogram panel per model
plt.figure(figsize=(16, 6))

for panel, (errors, title, color) in enumerate(
        [(ridge_errors, 'Ridge Model Error Distribution', 'steelblue'),
         (rf_errors, 'Random Forest Error Distribution', 'forestgreen')],
        start=1):
    plt.subplot(1, 2, panel)
    error_values = np.asarray(errors, dtype=float)

    # Fix the bin edges explicitly so the bin width is known; it is needed
    # below to put the normal curve on the same scale as the count bars.
    bin_edges = np.histogram_bin_edges(error_values, bins='auto')
    bin_width = bin_edges[1] - bin_edges[0]

    sns.histplot(error_values, bins=bin_edges, kde=True, color=color)
    plt.axvline(x=0, color='red', linestyle='--')
    plt.title(title, fontsize=14)
    plt.xlabel('Error (Actual - Predicted)', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)

    # Add normal distribution fit. A density becomes an expected count per
    # bin when multiplied by (number of samples * bin width).
    mu, std = np.mean(error_values), np.std(error_values)
    x = np.linspace(mu - 3*std, mu + 3*std, 100)
    p = stats.norm.pdf(x, mu, std)
    plt.plot(x, p * len(error_values) * bin_width,
             'r-', linewidth=2, label=f'Normal Fit\nμ={mu:.2f}, σ={std:.2f}')
    plt.legend()

plt.tight_layout()
plt.savefig(f'{viz_dir}/error_distributions.png', dpi=300)
plt.close()

# --------------------------------------------------
# 7. QUANTILE-QUANTILE PLOTS FOR NORMALITY
# --------------------------------------------------

# As noted in the paper, normality of errors determines whether RMSE is appropriate.
# Each panel compares one model's residual quantiles with normal quantiles.
plt.figure(figsize=(16, 6))

qq_panels = (
    (ridge_errors, 'Ridge Model Q-Q Plot'),
    (rf_errors, 'Random Forest Model Q-Q Plot'),
)
for panel_number, (residuals, panel_title) in enumerate(qq_panels, start=1):
    plt.subplot(1, 2, panel_number)
    stats.probplot(residuals, dist="norm", plot=plt)
    plt.title(panel_title, fontsize=14)

plt.tight_layout()
plt.savefig(f'{viz_dir}/qq_plots.png', dpi=300)
plt.close()

# --------------------------------------------------
# 8. RMSE/MAE RATIO ANALYSIS
# --------------------------------------------------

# The paper mentions that RMSE/MAE ratio gives insights into error distribution.
# Compute it per model, in log space and on the original scale.
ridge_rmse_mae_ratio_log = ridge_metrics['rmse_log'] / ridge_metrics['mae_log']
ridge_rmse_mae_ratio_orig = ridge_metrics['rmse_original'] / ridge_metrics['mae_original']

rf_rmse_mae_ratio_log = rf_metrics['rmse_log'] / rf_metrics['mae_log']
rf_rmse_mae_ratio_orig = rf_metrics['rmse_original'] / rf_metrics['mae_original']

print("\nRMSE/MAE Ratio Analysis:")
print(f"Ridge - Log Space: {ridge_rmse_mae_ratio_log:.2f}, Original Space: {ridge_rmse_mae_ratio_orig:.2f}")
print(f"RF - Log Space: {rf_rmse_mae_ratio_log:.2f}, Original Space: {rf_rmse_mae_ratio_orig:.2f}")
print("Note: Ratio closer to 1 indicates more uniform error distribution")
print("      Higher ratio indicates presence of larger errors (RMSE more sensitive)")

# Grouped bar chart: one group per model, one bar per space
models = ['Ridge', 'Random Forest']
log_ratios = [ridge_rmse_mae_ratio_log, rf_rmse_mae_ratio_log]
orig_ratios = [ridge_rmse_mae_ratio_orig, rf_rmse_mae_ratio_orig]

width = 0.35
x = np.arange(len(models))
half_width = width / 2

plt.figure(figsize=(10, 6))
plt.bar(x - half_width, log_ratios, width, label='Log Space', color='steelblue')
plt.bar(x + half_width, orig_ratios, width, label='Original Space', color='forestgreen')

plt.ylabel('RMSE/MAE Ratio', fontsize=12)
plt.title('RMSE/MAE Ratio by Model and Space', fontsize=14)
plt.xticks(x, models, fontsize=12)
plt.axhline(y=1, color='red', linestyle='--', alpha=0.7, label='Uniform Error Reference')
plt.legend()

# Write each ratio just above its bar (log-space bars first, then original)
for bar_shift, ratio_values in ((-half_width, log_ratios), (half_width, orig_ratios)):
    for group_index, ratio in enumerate(ratio_values):
        plt.text(group_index + bar_shift, ratio + 0.1, f'{ratio:.2f}', ha='center', fontsize=10)

plt.tight_layout()
plt.savefig(f'{viz_dir}/rmse_mae_ratio.png', dpi=300)
plt.close()

# --------------------------------------------------
# 9. ACTUAL VS PREDICTED VISUALIZATIONS
# --------------------------------------------------

# Actual-vs-predicted scatter plots on a 2x2 grid:
# top row = log space, bottom row = original scale; one column per model
plt.figure(figsize=(16, 12))

scatter_panels = [
    ('Ridge', 'steelblue', ridge_pred_log, ridge_pred_orig, ridge_metrics),
    ('Random Forest', 'forestgreen', rf_pred_log, rf_pred_orig, rf_metrics),
]

# Top row: log space, with the identity line spanning the actual-value range
log_min, log_max = y_log_test.min(), y_log_test.max()
for panel, (label, color, pred_log, _pred_orig, model_metrics) in enumerate(scatter_panels, start=1):
    plt.subplot(2, 2, panel)
    plt.scatter(y_log_test, pred_log, alpha=0.6, color=color)
    plt.plot([log_min, log_max], [log_min, log_max], 'r--')
    plt.title(f'{label}: Actual vs Predicted (Log Space)', fontsize=14)
    plt.xlabel('Actual Log(Total Resolution Hours)', fontsize=12)
    plt.ylabel('Predicted Log(Total Resolution Hours)', fontsize=12)
    plt.annotate(f'RMSE = {model_metrics["rmse_log"]:.4f}\nMAE = {model_metrics["mae_log"]:.4f}\nR² = {model_metrics["r2_log"]:.4f}',
                 xy=(0.05, 0.85), xycoords='axes fraction', fontsize=12)

# Bottom row: original scale. Each panel gets its own axis limit, taken as
# the 95th percentile of the actual values together with THAT model's
# predictions, so extreme values are clipped per model rather than using the
# Ridge-derived limit for both panels.
for panel, (label, color, _pred_log, pred_orig, model_metrics) in enumerate(scatter_panels, start=3):
    plt.subplot(2, 2, panel)
    max_val = np.percentile(np.concatenate([y_orig_test, pred_orig]), 95)
    plt.scatter(y_orig_test, pred_orig, alpha=0.6, color=color)
    plt.plot([0, max_val], [0, max_val], 'r--')
    plt.title(f'{label}: Actual vs Predicted (Original Space)', fontsize=14)
    plt.xlabel('Actual Total Resolution Hours', fontsize=12)
    plt.ylabel('Predicted Total Resolution Hours', fontsize=12)
    plt.xlim(0, max_val)
    plt.ylim(0, max_val)
    plt.annotate(f'RMSE = {model_metrics["rmse_original"]:.4f}\nMAE = {model_metrics["mae_original"]:.4f}\nMAPE = {model_metrics["mape"]:.2f}%',
                 xy=(0.05, 0.85), xycoords='axes fraction', fontsize=12)

plt.tight_layout()
plt.savefig(f'{viz_dir}/actual_vs_predicted_comparison.png', dpi=300)
plt.close()

# --------------------------------------------------
# 10. ERROR MAGNITUDE ANALYSIS
# --------------------------------------------------

# Absolute residual plotted against the actual log-target, one panel per
# model, to show whether error size changes with the size of the target
plt.figure(figsize=(16, 6))

magnitude_panels = (
    ('Ridge: Error Magnitude vs Actual Value', ridge_errors, 'steelblue'),
    ('Random Forest: Error Magnitude vs Actual Value', rf_errors, 'forestgreen'),
)
for panel_number, (panel_title, residuals, point_color) in enumerate(magnitude_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.scatter(y_log_test, np.abs(residuals), alpha=0.6, color=point_color)
    plt.title(panel_title, fontsize=14)
    plt.xlabel('Actual Log(Total Resolution Hours)', fontsize=12)
    plt.ylabel('Absolute Error', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig(f'{viz_dir}/error_magnitude_analysis.png', dpi=300)
plt.close()

# --------------------------------------------------
# 11. CONCLUSION AND RECOMMENDATIONS
# --------------------------------------------------

# Closing summary: test the residuals for normality and print which error
# metric the Chai & Draxler argument favours for each model
print("\n=== METRIC SELECTION RECOMMENDATIONS ===")
print("Based on the error distribution analysis and the paper by Chai & Draxler (2014):")

# Shapiro-Wilk returns (statistic, p-value); only the p-value is used here
ridge_p_value = stats.shapiro(ridge_errors)[1]
rf_p_value = stats.shapiro(rf_errors)[1]

# Residuals are treated as normal when p exceeds the 0.05 threshold
normality_results = (
    ('Ridge', ridge_p_value),
    ('Random Forest', rf_p_value),
)

print("\nNormality Test (Shapiro-Wilk):")
for model_label, p_value in normality_results:
    verdict = 'Normal' if p_value > 0.05 else 'Non-normal'
    print(f"{model_label} Model: p-value = {p_value:.6f} ({verdict} distribution)")

print("\nRecommended Metrics:")
for model_label, p_value in normality_results:
    if p_value > 0.05:
        advice = "RMSE is appropriate (errors follow normal distribution)"
    else:
        advice = "MAE may be more appropriate (errors don't follow normal distribution)"
    print(f"- For {model_label} Model: {advice}")

print("\nOverall Recommendation:")
for recommendation in (
    "- Use multiple metrics (RMSE, MAE, and R²) to provide a complete picture",
    "- Report RMSE/MAE ratio to give insight into error distribution",
    "- Consider log-space metrics for model comparison (more stable distribution)",
    "- Use original-space metrics for practical interpretation of results",
):
    print(recommendation)

print(f"\nAnalysis complete! All visualizations saved to {viz_dir}/")