# Individual Task Effort Estimation Model

This notebook focuses on estimating individual task effort using the cleaned Jira dataset. The goal is to build models that can predict how long it will take to complete a task based on its characteristics.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import spearmanr
from xgboost import XGBRegressor
import pickle
import time
import os
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

# Create directory for results
# Every figure, pickle and CSV written by this notebook is saved under this folder.
results_dir = 'individual_task_effort_results'
os.makedirs(results_dir, exist_ok=True)  # exist_ok=True: re-running the notebook does not fail if the folder is already there

# Set plot style
plt.style.use('ggplot')
# NOTE(review): sns.set() runs after the matplotlib style sheet, so the seaborn
# "whitegrid" settings presumably take precedence over 'ggplot' — confirm the intended look.
sns.set(style="whitegrid")

# Define a function to format column groups
def format_column_group(col):
    """Return the reporting category for a feature column name.

    The column is matched against fixed lists of known issue-type, priority,
    relationship, temporal and status columns, in that order. A name that is
    in none of the lists is reported as 'Other'.
    """
    column_groups = (
        ('Issue Type', ('is_type_bug', 'is_type_task', 'is_type_story',
                        'is_type_improvement', 'is_type_new_feature',
                        'is_type_epic', 'is_type_sub-task')),
        ('Priority', ('is_priority_blocker', 'is_priority_critical',
                      'is_priority_major', 'is_priority_minor',
                      'is_priority_trivial')),
        ('Relationships', ('inward_count', 'outward_count')),
        ('Temporal', ('age_days', 'created_is_weekend', 'created_hour',
                      'created_month', 'created_year')),
        ('Status', ('fields.creator.active', 'fields.issuetype.id',
                    'is_completed', 'is_resolved')),
    )

    # First group that lists the column wins; the groups do not overlap.
    for group_label, members in column_groups:
        if col in members:
            return group_label
    return 'Other'

## 1. Data Loading and Preparation

In [None]:
# Load the cleaned Jira dataset
# The CSV path is relative, so it is resolved against the notebook's working directory.
# The resulting frame `df` is the starting point for every later cell.
print("Loading cleaned Jira dataset...")
try:
    df = pd.read_csv('cleaned_jira_dataset.csv')
    print(f"Dataset loaded: {df.shape[0]} tasks, {df.shape[1]} features")
except Exception as e:
    # Report the failure, then re-raise so execution stops here instead of
    # continuing into cells that need `df`.
    print(f"Error loading dataset: {e}")
    raise

# Display the first few rows to understand the data structure
# (as the last expression of the cell, the preview is rendered by the notebook)
df.head()

In [None]:
# Check for missing values
print("\nChecking for missing values...")
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

# Per-column summary of the gaps: absolute count and share of rows.
missing_df = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percent
})

# Keep only columns that actually have gaps, most incomplete first.
# `missing_df` is reused by the cleaning cell to decide which columns to drop.
missing_df = missing_df[missing_df['Missing Values'] > 0].sort_values('Missing Values', ascending=False)

if missing_df.empty:
    # Nothing to report: skip the chart rather than drawing and saving an
    # empty bar plot.
    print("No missing values found in the dataset")
else:
    print(missing_df)

    # Visualize missing values (at most the 15 most incomplete columns)
    top_missing = missing_df.head(15)
    plt.figure(figsize=(12, 6))
    sns.barplot(x=top_missing.index, y=top_missing['Percentage'])
    plt.title('Top 15 Columns with Missing Values')
    plt.xlabel('Columns')
    plt.ylabel('Percentage Missing')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(f'{results_dir}/missing_values.png')
    plt.show()

In [None]:
# Get target variable distribution
# Two side-by-side histograms of the target: raw hours and log1p(hours).
plt.figure(figsize=(12, 6))

# Resolution hours distribution (before filtering)
plt.subplot(1, 2, 1)
sns.histplot(df['resolution_hours'], bins=50, kde=True)
plt.title('Resolution Hours Distribution')
plt.xlabel('Resolution Hours')
plt.ylabel('Frequency')

# Log-transformed resolution hours
plt.subplot(1, 2, 2)
# Side effect: this adds the 'log_resolution_hours' column to `df` itself.
# log1p is log(1 + x), so a resolution time of 0 hours maps to 0.
df['log_resolution_hours'] = np.log1p(df['resolution_hours'])
sns.histplot(df['log_resolution_hours'], bins=50, kde=True)
plt.title('Log(Resolution Hours) Distribution')
plt.xlabel('Log(Resolution Hours)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig(f'{results_dir}/target_distribution_raw.png')
plt.show()

# Summary statistics of the raw (unfiltered, uncapped) target.
print(f"\nTarget variable statistics (before filtering):")
print(f"  Mean: {df['resolution_hours'].mean():.2f} hours")
print(f"  Median: {df['resolution_hours'].median():.2f} hours")
print(f"  Min: {df['resolution_hours'].min():.2f} hours")
print(f"  Max: {df['resolution_hours'].max():.2f} hours")
print(f"  Standard Deviation: {df['resolution_hours'].std():.2f} hours")

In [None]:
# Data cleaning
# Produces `df_filtered`: rows with a usable target, long outliers capped,
# sparse columns removed and remaining gaps imputed.
print("\nCleaning the dataset...")

# 1. Filter out tasks with invalid resolution hours
# A missing target compares False against 0, but the notna() test is kept
# explicit. The copy detaches the result from `df` so later assignments do
# not write through a slice.
valid_target = df['resolution_hours'].notna() & (df['resolution_hours'] >= 0)
df_filtered = df[valid_target].copy()

# 2. Cap extremely long resolution times (e.g., > 6 months)
resolution_cap = 6 * 30 * 24  # 6 months in hours
long_tasks = df_filtered['resolution_hours'] > resolution_cap
print(f"Capped {long_tasks.sum()} tasks with resolution times > {resolution_cap} hours (6 months)")
df_filtered.loc[long_tasks, 'resolution_hours'] = resolution_cap
# Recompute the log target for every row from the capped hours. Patching only
# the capped rows relies on an earlier cell having created the column; if it
# had not, the uncapped rows would be NaN and then median-filled below.
df_filtered['log_resolution_hours'] = np.log1p(df_filtered['resolution_hours'])

# 3. Remove columns with high percentage of missing values (e.g., > 50%)
# The target columns are never dropped here: their missing rows were already
# removed in step 1, and every later step needs them.
protected_cols = ('resolution_hours', 'log_resolution_hours')
high_missing_cols = [
    col for col in missing_df[missing_df['Percentage'] > 50].index
    if col not in protected_cols
]
df_filtered = df_filtered.drop(columns=high_missing_cols, errors='ignore')
print(f"Removed {len(high_missing_cols)} columns with more than 50% missing values")

# 4. Fill remaining missing values
# For numeric columns, fill with median
numeric_cols = df_filtered.select_dtypes(include=['number']).columns
df_filtered[numeric_cols] = df_filtered[numeric_cols].fillna(df_filtered[numeric_cols].median())

# For categorical columns, fill with mode
categorical_cols = df_filtered.select_dtypes(include=['object']).columns
for col in categorical_cols:
    col_modes = df_filtered[col].mode()
    if col_modes.empty:
        # Column has no observed value at all, so there is no mode to impute
        # with; leave it and let the check below report it.
        continue
    df_filtered[col] = df_filtered[col].fillna(col_modes.iloc[0])

# 5. Check for remaining missing values
remaining_missing = df_filtered.isnull().sum().sum()
print(f"Remaining missing values after cleaning: {remaining_missing}")

# Display cleaned dataset info
print(f"\nCleaned dataset shape: {df_filtered.shape[0]} tasks, {df_filtered.shape[1]} features")

# Visualize cleaned target variable
# Same two-panel layout as the raw-target figure, drawn from `df_filtered`.
plt.figure(figsize=(12, 6))

# Resolution hours distribution (after cleaning)
plt.subplot(1, 2, 1)
sns.histplot(df_filtered['resolution_hours'], bins=50, kde=True)
plt.title('Resolution Hours Distribution (Cleaned)')
plt.xlabel('Resolution Hours')
plt.ylabel('Frequency')

# Log-transformed resolution hours
plt.subplot(1, 2, 2)
sns.histplot(df_filtered['log_resolution_hours'], bins=50, kde=True)
plt.title('Log(Resolution Hours) Distribution (Cleaned)')
plt.xlabel('Log(Resolution Hours)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig(f'{results_dir}/target_distribution_cleaned.png')
plt.show()

# Summary statistics of the filtered and capped target.
print(f"\nTarget variable statistics (after cleaning):")
print(f"  Mean: {df_filtered['resolution_hours'].mean():.2f} hours")
print(f"  Median: {df_filtered['resolution_hours'].median():.2f} hours")
print(f"  Min: {df_filtered['resolution_hours'].min():.2f} hours")
print(f"  Max: {df_filtered['resolution_hours'].max():.2f} hours")
print(f"  Standard Deviation: {df_filtered['resolution_hours'].std():.2f} hours")

## 2. Exploratory Data Analysis

In [None]:
# Explore relationships between task characteristics and resolution time
print("\nExploring relationships with resolution time...")

# 1. Issue type impact on resolution time
# One boxplot panel per issue-type flag, comparing tasks that have the flag
# set (Yes) against those that do not (No).
issue_type_columns = ['is_type_bug', 'is_type_task', 'is_type_story', 'is_type_improvement',
                      'is_type_new_feature', 'is_type_epic', 'is_type_sub-task']

plt.figure(figsize=(12, 6))
for panel_number, col in enumerate(issue_type_columns, start=1):
    if col not in df_filtered.columns:
        # The panel slot is tied to the flag's position in the list, so a
        # missing column simply leaves its slot empty.
        continue

    plt.subplot(2, 4, panel_number)
    type_label = col.replace("is_type_", "").replace("_", " ").title()

    # Split the target by whether the flag is set
    type_true = df_filtered.loc[df_filtered[col] == 1, 'resolution_hours']
    type_false = df_filtered.loc[df_filtered[col] == 0, 'resolution_hours']

    plt.boxplot([type_true, type_false], labels=['Yes', 'No'])
    plt.title(type_label)
    plt.ylabel('Resolution Hours')

    # Numeric companion to the plot: mean hours with and without the flag
    print(f"{type_label} - "
          f"Mean resolution time: Yes={type_true.mean():.2f}, No={type_false.mean():.2f}")

plt.tight_layout()
plt.savefig(f'{results_dir}/issue_type_impact.png')
plt.show()

# 2. Priority impact
# One boxplot panel per priority flag, comparing tasks that have the flag set
# (Yes) against those that do not (No).
priority_columns = ['is_priority_blocker', 'is_priority_critical', 'is_priority_major',
                    'is_priority_minor', 'is_priority_trivial']

plt.figure(figsize=(12, 6))
for panel_number, col in enumerate(priority_columns, start=1):
    if col not in df_filtered.columns:
        # Slot numbers follow the list order; an absent column leaves a gap.
        continue

    plt.subplot(2, 3, panel_number)
    priority_label = col.replace("is_priority_", "").replace("_", " ").title()

    # Split the target by whether the flag is set
    priority_true = df_filtered.loc[df_filtered[col] == 1, 'resolution_hours']
    priority_false = df_filtered.loc[df_filtered[col] == 0, 'resolution_hours']

    plt.boxplot([priority_true, priority_false], labels=['Yes', 'No'])
    plt.title(priority_label)
    plt.ylabel('Resolution Hours')

    # Numeric companion to the plot: mean hours with and without the flag
    print(f"{priority_label} - "
          f"Mean resolution time: Yes={priority_true.mean():.2f}, No={priority_false.mean():.2f}")

plt.tight_layout()
plt.savefig(f'{results_dir}/priority_impact.png')
plt.show()

# 3. Correlation between numeric features and resolution time
# `numeric_features` (every numeric column except the two target columns) is
# reused by the category analysis in the next cell.
numeric_features = df_filtered.select_dtypes(include=['number']).columns.tolist()
numeric_features.remove('resolution_hours')  # Remove target from features
if 'log_resolution_hours' in numeric_features:
    numeric_features.remove('log_resolution_hours')

# Calculate correlations with resolution time
# A column with no variation has an undefined (NaN) correlation. Such entries
# are left out: sorting on abs(NaN) gives an arbitrary order and could place
# them among the "top" features.
correlations = []
undefined_corr_features = []
for col in numeric_features:
    corr = df_filtered[col].corr(df_filtered['resolution_hours'])
    if pd.notna(corr):
        correlations.append((col, corr))
    else:
        undefined_corr_features.append(col)

if undefined_corr_features:
    print(f"Skipped {len(undefined_corr_features)} features with undefined correlation "
          f"(e.g. constant columns)")

# Sort by absolute correlation value
correlations.sort(key=lambda x: abs(x[1]), reverse=True)

# Display top 15 correlated features
top_corr = pd.DataFrame(correlations[:15], columns=['Feature', 'Correlation'])
print("\nTop 15 features correlated with resolution time:")
print(top_corr)

# Visualize top 10 correlations
plt.figure(figsize=(12, 6))
top10 = pd.DataFrame(correlations[:10], columns=['Feature', 'Correlation'])
sns.barplot(x='Correlation', y='Feature', data=top10)
plt.title('Top 10 Features Correlated with Resolution Time')
plt.tight_layout()
plt.savefig(f'{results_dir}/top_correlations.png')
plt.show()

# 4. Heatmap of correlations between top features
# Uses the first 15 entries of the sorted `correlations` list plus the target
# itself, so the target's row/column shows each feature's correlation with it.
top_features = [x[0] for x in correlations[:15]] + ['resolution_hours']
corr_matrix = df_filtered[top_features].corr()

plt.figure(figsize=(14, 12))
# annot/fmt: print each coefficient in its cell with two decimals
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Top Features')
plt.tight_layout()
plt.savefig(f'{results_dir}/correlation_heatmap.png')
plt.show()

In [None]:
# Analyze feature categories and their correlations with resolution time
print("\nAnalyzing feature categories...")

# Group features by category
category_features = {}
for col in numeric_features:
    category_features.setdefault(format_column_group(col), []).append(col)

# Calculate average correlation by category
# Features whose correlation is undefined (NaN, e.g. a constant column) are
# left out of the average so one of them cannot turn the whole category into
# NaN. 'Feature Count' still reports every feature in the category.
category_correlations = []
for category, features in category_features.items():
    abs_corrs = [abs(df_filtered[feat].corr(df_filtered['resolution_hours'])) for feat in features]
    defined_corrs = [value for value in abs_corrs if pd.notna(value)]
    category_corr = np.mean(defined_corrs) if defined_corrs else np.nan
    category_correlations.append((category, category_corr, len(features)))

# Sort by correlation strength
# Absolute correlations are >= 0, so mapping an undefined average to -1 for
# the sort key places those categories last without comparing against NaN.
category_correlations.sort(key=lambda x: -1.0 if pd.isna(x[1]) else x[1], reverse=True)

# Display category correlations
category_corr_df = pd.DataFrame(category_correlations, columns=['Category', 'Avg. Abs. Correlation', 'Feature Count'])
print("Average absolute correlation by feature category:")
print(category_corr_df)

# Visualize category correlations
plt.figure(figsize=(10, 6))
sns.barplot(x='Avg. Abs. Correlation', y='Category', data=category_corr_df)
plt.title('Average Correlation Strength by Feature Category')
plt.tight_layout()
plt.savefig(f'{results_dir}/category_correlations.png')
plt.show()

## 3. Feature Selection and Data Preparation

In [None]:
# Prepare features and target for modeling
print("\nPreparing features for modeling...")

# 1. Select features for model training
# Identifier columns and both target columns are always excluded.
id_columns = ['id', 'fields.issuetype.id', 'fields.project.id', 'fields.priority.id']
target_columns = ['resolution_hours', 'log_resolution_hours']

# Also excluded: the raw date columns (already converted to features) and any
# remaining string/object column that has not been encoded.
date_columns = ['fields.created', 'fields.updated']
unusable_columns = [
    col for col in df_filtered.columns
    if col in date_columns
    or (df_filtered[col].dtype == 'object' and col not in id_columns)
]

# Keep only the names that are actually present in the frame
columns_to_drop = [
    col for col in id_columns + target_columns + unusable_columns
    if col in df_filtered.columns
]

# Feature matrix and target (log-transformed for better model performance)
X = df_filtered.drop(columns=columns_to_drop)
y = df_filtered['log_resolution_hours']

# Display selected features
print(f"Selected {X.shape[1]} features for modeling")
print("Feature list:")
print(X.columns.tolist())

# 2. Check for multicollinearity
corr_matrix = X.corr()

# Find highly correlated pairs (|r| > 0.8)
# Only the lower triangle (j < i) is scanned, so each pair is listed once.
high_corr_pairs = []
corr_columns = corr_matrix.columns
for i, col_i in enumerate(corr_columns):
    for j in range(i):
        pair_corr = corr_matrix.iloc[i, j]
        if abs(pair_corr) > 0.8:
            high_corr_pairs.append((col_i, corr_columns[j], pair_corr))

if high_corr_pairs:
    print("\nHighly correlated feature pairs (|r| > 0.8):")
    for col1, col2, corr in high_corr_pairs:
        print(f"{col1} <-> {col2}: {corr:.3f}")

    # Create a set of features to drop (keep the one with higher correlation to target)
    to_drop = set()
    for col1, col2, _ in high_corr_pairs:
        if col1 in to_drop or col2 in to_drop:
            # One member is already being removed, so this pair no longer
            # exists in the final matrix. Dropping the other member as well
            # would discard a feature for no reason (chain A~B, B~C).
            continue
        corr1 = abs(df_filtered[col1].corr(df_filtered['resolution_hours']))
        corr2 = abs(df_filtered[col2].corr(df_filtered['resolution_hours']))
        if corr1 >= corr2:
            to_drop.add(col2)
        else:
            to_drop.add(col1)

    # Sorted so the printed list is the same on every run
    dropped_features = sorted(to_drop)
    print(f"\nDropping {len(dropped_features)} features due to multicollinearity:")
    print(dropped_features)
    X = X.drop(columns=dropped_features)

# 3. Split data into training, validation, and test sets
print("\nSplitting data into train, validation, and test sets...")

# First split: 80% train+validation, 20% test
# random_state is fixed so the same rows land in each set on every run.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second split: 75% train, 25% validation (60%/20% of original data)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Save data splits for reproducibility
# The y_* entries hold the log-transformed target (y is 'log_resolution_hours'),
# which the 'log_transform' flag records for whoever loads the pickle.
splits = {
    'X_train': X_train,
    'y_train': y_train,
    'X_val': X_val,
    'y_val': y_val,
    'X_test': X_test,
    'y_test': y_test,
    'feature_names': X.columns.tolist(),
    'log_transform': True  # We're using log-transformed target
}

with open(f'{results_dir}/data_splits.pkl', 'wb') as f:
    pickle.dump(splits, f)

## 4. Model Training and Evaluation (Initial Models)

In [None]:
# Train and evaluate regression models
print("\nTraining initial regression models...")

# Function to evaluate models and return metrics
def evaluate_model(model, X_val, y_val, model_name, is_val=True):
    """Score a fitted model on a hold-out set and save its diagnostic plots.

    The model is expected to predict in log1p space; predictions and targets
    are both mapped back with expm1 before any metric is computed, so MAE and
    RMSE are expressed in hours.

    Parameters:
        model: fitted estimator exposing ``predict``.
        X_val: feature matrix to predict on.
        y_val: log1p-transformed targets matching ``X_val``.
        model_name: label used in the printed report, plot titles and file names.
        is_val: only selects the "Validation"/"Test" wording of the report.

    Returns:
        dict with the model, its name, the four metrics ('mae', 'rmse', 'r2',
        'spearman'), the raw log-space predictions, and the back-transformed
        predictions and targets.

    Side effects: prints the metrics and writes ``<name>_predictions.png`` and
    ``<name>_residuals.png`` into ``results_dir``.
    """
    log_predictions = model.predict(X_val)

    # Back to the original scale (hours)
    actual_hours = np.expm1(y_val)
    predicted_hours = np.expm1(log_predictions)

    mae = mean_absolute_error(actual_hours, predicted_hours)
    rmse = np.sqrt(mean_squared_error(actual_hours, predicted_hours))
    r2 = r2_score(actual_hours, predicted_hours)
    # Rank correlation (handles non-linear relationships)
    spearman_corr = spearmanr(actual_hours, predicted_hours)[0]

    dataset = "Validation" if is_val else "Test"
    print(f"\n{model_name} ({dataset} Set):")
    print(f"  MAE: {mae:.2f} hours")
    print(f"  RMSE: {rmse:.2f} hours")
    print(f"  R²: {r2:.4f}")
    print(f"  Spearman Correlation: {spearman_corr:.4f}")

    # File names are derived from the model name: spaces -> underscores, lower case
    file_slug = model_name.replace(" ", "_").lower()

    # Actual vs. predicted, with the identity line as reference
    diagonal_end = actual_hours.max()
    plt.figure(figsize=(8, 6))
    plt.scatter(actual_hours, predicted_hours, alpha=0.3)
    plt.plot([0, diagonal_end], [0, diagonal_end], 'r--')
    plt.title(f'{model_name}: Actual vs. Predicted')
    plt.xlabel('Actual Resolution Hours')
    plt.ylabel('Predicted Resolution Hours')
    plt.savefig(f'{results_dir}/{file_slug}_predictions.png')
    plt.close()

    # Residuals against the prediction
    residuals = actual_hours - predicted_hours
    plt.figure(figsize=(8, 6))
    plt.scatter(predicted_hours, residuals, alpha=0.3)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.title(f'{model_name}: Residuals')
    plt.xlabel('Predicted Resolution Hours')
    plt.ylabel('Residuals (Actual - Predicted)')
    plt.savefig(f'{results_dir}/{file_slug}_residuals.png')
    plt.close()

    return {
        'model': model,
        'name': model_name,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'spearman': spearman_corr,
        'predictions': log_predictions,
        'original_predictions': predicted_hours,
        'original_values': actual_hours
    }

# Define base models
# Three tree-ensemble regressors with 100 estimators each and a fixed seed.
# `base_models` is reused later as the starting point for hyperparameter tuning.
base_models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate models
# model_results maps model name -> the dict returned by evaluate_model,
# extended with the wall-clock training time.
model_results = {}
for name, model in base_models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()
    model.fit(X_train, y_train)  # y_train is the log-transformed target
    train_time = time.time() - start_time
    print(f"Training completed in {train_time:.2f} seconds")
    
    # Evaluate on validation set
    results = evaluate_model(model, X_val, y_val, name)
    results['train_time'] = train_time
    model_results[name] = results
    
    # Save model
    with open(f'{results_dir}/original_{name.replace(" ", "_")}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

# Create ensemble predictions (average of all models)
# The average is taken over the models' log-space predictions and only then
# converted back to hours with expm1.
print("\nCreating ensemble prediction...")
# Accumulator shaped like y_val
# NOTE(review): assumes y_val has a float dtype so the in-place sum and division work — confirm
y_pred_ensemble = np.zeros_like(y_val)
for name, results in model_results.items():
    y_pred_ensemble += results['predictions']
y_pred_ensemble /= len(model_results)

# Evaluate ensemble
# Same keys as the per-model result dicts, except that there is no 'model'
# entry because the ensemble is not a fitted estimator.
ensemble_results = {
    'name': 'Ensemble',
    'predictions': y_pred_ensemble,
    'original_predictions': np.expm1(y_pred_ensemble),
    'original_values': np.expm1(y_val)
}

# Calculate ensemble metrics
# All metrics are computed on the original scale (hours), matching evaluate_model.
ensemble_results['mae'] = mean_absolute_error(
    ensemble_results['original_values'], 
    ensemble_results['original_predictions']
)
ensemble_results['rmse'] = np.sqrt(mean_squared_error(
    ensemble_results['original_values'], 
    ensemble_results['original_predictions']
))
ensemble_results['r2'] = r2_score(
    ensemble_results['original_values'], 
    ensemble_results['original_predictions']
)
ensemble_results['spearman'], _ = spearmanr(
    ensemble_results['original_values'], 
    ensemble_results['original_predictions']
)

# Print ensemble results
print(f"\nEnsemble (Validation Set):")
print(f"  MAE: {ensemble_results['mae']:.2f} hours")
print(f"  RMSE: {ensemble_results['rmse']:.2f} hours")
print(f"  R²: {ensemble_results['r2']:.4f}")
print(f"  Spearman Correlation: {ensemble_results['spearman']:.4f}")

# Create ensemble plots
# Actual vs. predicted scatter with the identity line as reference.
plt.figure(figsize=(8, 6))
plt.scatter(
    ensemble_results['original_values'], 
    ensemble_results['original_predictions'], 
    alpha=0.3
)
plt.plot([0, ensemble_results['original_values'].max()], 
         [0, ensemble_results['original_values'].max()], 'r--')
plt.title('Ensemble: Actual vs. Predicted')
plt.xlabel('Actual Resolution Hours')
plt.ylabel('Predicted Resolution Hours')
plt.savefig(f'{results_dir}/ensemble_predictions.png')
plt.close()

# Add ensemble to results
# From here on model_results contains an 'Ensemble' entry without a 'model' key.
model_results['Ensemble'] = ensemble_results

# Save original results
with open(f'{results_dir}/original_results.pkl', 'wb') as f:
    pickle.dump(model_results, f)

# Save original hyperparameters
# One copied parameter dict per base model (Random Forest, Gradient Boosting, XGBoost).
original_hyperparams = {
    model_name: dict(estimator.get_params())
    for model_name, estimator in base_models.items()
}

hyperparams_path = f'{results_dir}/original_hyperparameters.pkl'
with open(hyperparams_path, 'wb') as f:
    pickle.dump(original_hyperparams, f)

# Compare model performance
# One row per entry in model_results (the ensemble included).
comparison_rows = [
    (name, results['mae'], results['rmse'], results['r2'], results['spearman'])
    for name, results in model_results.items()
]
performance_comparison = pd.DataFrame(
    comparison_rows,
    columns=['Model', 'MAE', 'RMSE', 'R²', 'Spearman'],
)

print("\nModel Performance Comparison:")
print(performance_comparison.sort_values('Spearman', ascending=False))

In [None]:
# Feature importance analysis
print("\nAnalyzing feature importance...")

# Create function to visualize feature importance
def plot_feature_importance(model, model_name, top_n=20):
    """Rank a fitted model's features by importance and save a bar chart.

    Parameters:
        model: fitted estimator; must expose ``feature_importances_``.
        model_name: label used in the plot title and the output file name.
        top_n: number of highest-ranked features shown in the chart.

    Returns:
        DataFrame with columns 'Feature' and 'Importance' for ALL features,
        sorted by descending importance, or None (after printing a notice)
        when the model has no ``feature_importances_``.

    Side effects: writes ``<name>_feature_importance.png`` into ``results_dir``.
    """
    if not hasattr(model, 'feature_importances_'):
        print(f"{model_name} does not provide feature importances")
        return None
    
    # Get feature importances
    importances = model.feature_importances_

    # Prefer the column names recorded on the fitted model: they stay correct
    # even if the notebook-level X has changed since training. Fall back to
    # X.columns when the model carries no names or their count does not match.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None or len(feature_names) != len(importances):
        feature_names = X.columns

    feature_importance = pd.DataFrame({
        'Feature': list(feature_names),
        'Importance': importances
    })
    
    # Sort by importance
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    
    # Get top N features
    top_features = feature_importance.head(top_n)
    
    # Plot importance
    # Reversed so the most important feature is drawn at the top of the chart.
    plt.figure(figsize=(10, 8))
    plt.barh(top_features['Feature'][::-1], top_features['Importance'][::-1])
    plt.title(f'{model_name}: Top {top_n} Feature Importance')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.savefig(f'{results_dir}/{model_name.replace(" ", "_").lower()}_feature_importance.png')
    plt.close()
    
    return feature_importance

# Analyze each model's feature importance
# feature_importance_results maps model name -> full importance table.
# Entries without a fitted estimator (the averaged 'Ensemble') are skipped.
feature_importance_results = {}
for name, results in model_results.items():
    fitted_model = results.get('model')
    if fitted_model is None or not hasattr(fitted_model, 'feature_importances_'):
        continue

    print(f"\nAnalyzing feature importance for {name}...")
    importance = plot_feature_importance(fitted_model, name)

    if importance is not None:
        feature_importance_results[name] = importance

        # Print top 10 features
        print(f"Top 10 features for {name}:")
        print(importance.head(10))

# Find most common important features across models
if feature_importance_results:
    print("\nIdentifying most common important features across models...")

    # Get top 10 features from each model
    top_features_by_model = {
        name: importance.head(10)['Feature'].tolist()
        for name, importance in feature_importance_results.items()
    }
    all_top_features = set()
    for top10 in top_features_by_model.values():
        all_top_features.update(top10)

    # Count feature occurrences across models
    feature_counts = {
        feature: sum(feature in model_features for model_features in top_features_by_model.values())
        for feature in all_top_features
    }

    # Sort by count; ties are broken by name so the listing is identical on
    # every run (iteration order of a set of strings is not stable).
    sorted_features = sorted(feature_counts.items(), key=lambda x: (-x[1], str(x[0])))

    print("Features appearing in multiple models' top 10:")
    for feature, count in sorted_features:
        if count > 1:
            print(f"  {feature}: {count} models")

    # Create consolidated feature importance
    # The comparison table also contains the 'Ensemble' row, which has no
    # importance table. Ranking is therefore restricted to models that do;
    # otherwise a best-scoring ensemble would raise a KeyError below.
    rankable = performance_comparison[
        performance_comparison['Model'].isin(list(feature_importance_results))
    ]
    best_model = rankable.sort_values('R²', ascending=False).iloc[0]['Model']
    best_importance = feature_importance_results[best_model]

    # Save best feature importance
    best_importance.to_csv(f'{results_dir}/best_feature_importance.csv', index=False)

    # Create a plot of the best model's feature importance
    plt.figure(figsize=(12, 8))
    top_n = 20
    top_features = best_importance.head(top_n)
    plt.barh(top_features['Feature'][::-1], top_features['Importance'][::-1])
    plt.title(f'Top {top_n} Most Important Features for Task Effort Estimation')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.savefig(f'{results_dir}/best_feature_importance.png')
    plt.show()

## 5. Model Hyperparameter Tuning

In [None]:
# Hyperparameter tuning for the models
print("\nPerforming hyperparameter tuning...")

# Define hyperparameter search spaces
# Keys must match the keys of base_models; each value is the candidate grid
# that the randomized search samples from for that model.
param_spaces = {
    'Random Forest': {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        # 1.0 = consider all features at each split. This is what the former
        # 'auto' value meant for RandomForestRegressor; 'auto' itself is no
        # longer accepted by current scikit-learn, and candidates using it
        # would fail to fit during the search.
        'max_features': [1.0, 'sqrt']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.8, 0.9, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.2]
    }
}

# Define scoring metric for hyperparameter tuning
def spearman_scorer(estimator, X, y):
    """Score an estimator by Spearman rank correlation on the hours scale.

    Uses the ``(estimator, X, y)`` call signature so it can be passed as a
    ``scoring`` callable. Both the targets and the predictions are taken to
    be in log1p space and are mapped back with expm1 before correlating.
    Returns the correlation coefficient (higher is better).
    """
    predicted_hours = np.expm1(estimator.predict(X))
    actual_hours = np.expm1(y)
    # spearmanr returns (correlation, p-value); only the coefficient is used
    return spearmanr(actual_hours, predicted_hours)[0]

# Function to tune hyperparameters
def tune_model(model, params, model_name, X_train, y_train, X_val, y_val, n_iter=20):
    """
    Tune one model with a randomized search and evaluate the winner.

    Parameters:
    -----------
    model : estimator
        Unfitted estimator handed to RandomizedSearchCV
    params : dict
        Parameter distributions to sample from
    model_name : str
        Display name; also used (spaces replaced by underscores) in the
        pickle file name of the tuned model
    X_train, y_train : array-like
        Data the 5-fold search is fitted on
    X_val, y_val : array-like
        Data passed to ``evaluate_model`` for the best estimator
    n_iter : int
        Number of parameter settings sampled by the search

    Returns:
    --------
    tuple
        ``(tuned_results, cv_results)`` - the result of ``evaluate_model``
        extended with ``best_params``, ``cv_score`` and ``tuning_time``,
        and the search's ``cv_results_``
    """
    print(f"\nTuning hyperparameters for {model_name}...")

    # Candidates are ranked by Spearman correlation over 5 folds
    search_options = dict(
        param_distributions=params,
        n_iter=n_iter,
        scoring=spearman_scorer,
        cv=5,
        random_state=42,
        n_jobs=-1,
    )
    random_search = RandomizedSearchCV(model, **search_options)

    # Time the search itself
    started_at = time.time()
    random_search.fit(X_train, y_train)
    tuning_time = time.time() - started_at
    print(f"Tuning completed in {tuning_time:.2f} seconds")

    winner = random_search.best_estimator_
    print(f"Best CV score: {random_search.best_score_:.4f}")
    print(f"Best parameters: {random_search.best_params_}")

    # Evaluate the refitted best estimator and attach the search metadata
    tuned_results = evaluate_model(winner, X_val, y_val, f"Tuned {model_name}")
    search_metadata = (
        ('best_params', random_search.best_params_),
        ('cv_score', random_search.best_score_),
        ('tuning_time', tuning_time),
    )
    for key, value in search_metadata:
        tuned_results[key] = value

    # Persist the tuned estimator so later cells can reload it by name
    model_file = f'{results_dir}/tuned_{model_name.replace(" ", "_")}_model.pkl'
    with open(model_file, 'wb') as f:
        pickle.dump(winner, f)

    return tuned_results, random_search.cv_results_

# Tune every base model, keeping the validation results and the raw
# cross-validation table of each search under the model's name
tuned_model_results, cv_results_dict = {}, {}
for name, model in base_models.items():
    tuned_model_results[name], cv_results_dict[name] = tune_model(
        model, param_spaces[name], name,
        X_train, y_train, X_val, y_val
    )

# Persist both dictionaries alongside the other result artifacts
tuning_artifacts = (
    ('tuned_results.pkl', tuned_model_results),
    ('tuned_cv_results.pkl', cv_results_dict),
)
for artifact_name, payload in tuning_artifacts:
    with open(f'{results_dir}/{artifact_name}', 'wb') as f:
        pickle.dump(payload, f)

# Tuned ensemble: unweighted mean of the tuned models' validation
# predictions, averaged before the expm1 back-transform
print("\nCreating tuned ensemble prediction...")
member_predictions = np.asarray(
    [results['predictions'] for results in tuned_model_results.values()],
    dtype=np.asarray(y_val).dtype,
)
y_pred_tuned_ensemble = member_predictions.mean(axis=0)

# Back-transform once and reuse for every metric and for the plot
ensemble_actual = np.expm1(y_val)
ensemble_predicted = np.expm1(y_pred_tuned_ensemble)

# Same keys as the per-model result dictionaries read further down
tuned_ensemble_results = {
    'name': 'Tuned Ensemble',
    'predictions': y_pred_tuned_ensemble,
    'original_predictions': ensemble_predicted,
    'original_values': ensemble_actual,
    'mae': mean_absolute_error(ensemble_actual, ensemble_predicted),
    'rmse': np.sqrt(mean_squared_error(ensemble_actual, ensemble_predicted)),
    'r2': r2_score(ensemble_actual, ensemble_predicted),
    'spearman': spearmanr(ensemble_actual, ensemble_predicted)[0],
}

# Report validation metrics
print(f"\nTuned Ensemble (Validation Set):")
print(f"  MAE: {tuned_ensemble_results['mae']:.2f} hours")
print(f"  RMSE: {tuned_ensemble_results['rmse']:.2f} hours")
print(f"  R²: {tuned_ensemble_results['r2']:.4f}")
print(f"  Spearman Correlation: {tuned_ensemble_results['spearman']:.4f}")

# Actual vs. predicted scatter with the y = x reference line
diagonal_end = ensemble_actual.max()
plt.figure(figsize=(8, 6))
plt.scatter(ensemble_actual, ensemble_predicted, alpha=0.3)
plt.plot([0, diagonal_end], [0, diagonal_end], 'r--')
plt.title('Tuned Ensemble: Actual vs. Predicted')
plt.xlabel('Actual Resolution Hours')
plt.ylabel('Predicted Resolution Hours')
plt.savefig(f'{results_dir}/tuned_ensemble_predictions.png')
plt.close()

# Register the ensemble next to the individual tuned models
tuned_model_results['Ensemble'] = tuned_ensemble_results

# Compare original vs. tuned model performance
def _comparison_row(label, before, after):
    """Build one comparison row from the original and tuned metric dicts."""
    mae_gain = before['mae'] - after['mae']
    rmse_gain = before['rmse'] - after['rmse']
    return {
        'Model': label,
        'Original MAE': before['mae'],
        'Tuned MAE': after['mae'],
        'MAE Improvement': mae_gain,
        'MAE Improvement %': mae_gain / before['mae'] * 100,
        'Original RMSE': before['rmse'],
        'Tuned RMSE': after['rmse'],
        'RMSE Improvement': rmse_gain,
        'RMSE Improvement %': rmse_gain / before['rmse'] * 100,
        # For R² and Spearman higher is better, so improvement is tuned - original
        'Original R²': before['r2'],
        'Tuned R²': after['r2'],
        'R² Improvement': after['r2'] - before['r2'],
        'Original Spearman': before['spearman'],
        'Tuned Spearman': after['spearman'],
        'Spearman Improvement': after['spearman'] - before['spearman'],
    }


# One row per base model, followed by the ensemble row
model_comparison = [
    _comparison_row(name, model_results[name], tuned_model_results[name])
    for name in base_models
]
model_comparison.append(
    _comparison_row('Ensemble', model_results['Ensemble'], tuned_ensemble_results)
)

# Create comparison DataFrame
comparison_df = pd.DataFrame(model_comparison)
headline_columns = [
    'Model', 'Original MAE', 'Tuned MAE', 'MAE Improvement %',
    'Original R²', 'Tuned R²', 'R² Improvement',
    'Original Spearman', 'Tuned Spearman', 'Spearman Improvement',
]
print("\nComparison of Original vs. Tuned Models:")
print(comparison_df[headline_columns])

# Save the full table (all columns, not only the printed ones)
comparison_df.to_csv(f'{results_dir}/model_comparison.csv', index=False)

# All comparison figures go into their own sub-directory
comparison_dir = f'{results_dir}/comparison'
os.makedirs(comparison_dir, exist_ok=True)

# Shared geometry for the grouped bar charts: one pair of bars per model
bar_width = 0.35
original_positions = np.arange(len(comparison_df))
tuned_positions = original_positions + bar_width
tick_positions = original_positions + bar_width / 2

# (column suffix, title prefix, y-axis label, file stem) for each chart
metric_charts = [
    ('MAE', 'MAE', 'Mean Absolute Error', 'MAE'),
    ('RMSE', 'RMSE', 'Root Mean Squared Error', 'RMSE'),
    ('R²', 'R²', 'R²', 'R2'),
    ('Spearman', 'Spearman Correlation', 'Spearman Correlation', 'Spearman'),
]

for column_suffix, title_prefix, y_label, file_stem in metric_charts:
    plt.figure(figsize=(10, 6))
    plt.bar(original_positions, comparison_df[f'Original {column_suffix}'],
            width=bar_width, label='Original', color='skyblue')
    plt.bar(tuned_positions, comparison_df[f'Tuned {column_suffix}'],
            width=bar_width, label='Tuned', color='lightgreen')
    plt.title(f'{title_prefix} Comparison: Original vs. Tuned Models')
    plt.xlabel('Model')
    plt.ylabel(y_label)
    plt.xticks(tick_positions, comparison_df['Model'])
    plt.legend()
    plt.savefig(f'{comparison_dir}/{file_stem}_comparison.png')
    plt.close()

# Improvement heatmap: models as rows, improvement measures as columns
improvement_columns = ['MAE Improvement %', 'RMSE Improvement %',
                       'R² Improvement', 'Spearman Improvement']
improvement_data = comparison_df.set_index('Model')[improvement_columns]

plt.figure(figsize=(10, 6))
sns.heatmap(improvement_data, annot=True, cmap='RdYlGn', center=0, fmt='.2f')
plt.title('Model Improvement Heatmap')
plt.tight_layout()
plt.savefig(f'{comparison_dir}/improvement_heatmap.png')
plt.close()

# Best model comparison
plt.figure(figsize=(12, 6))

# Find best model based on tuned Spearman correlation. idxmax() returns an
# index *label*, so the row is fetched with .loc rather than positional .iloc.
best_model_idx = comparison_df['Tuned Spearman'].idxmax()
best_row = comparison_df.loc[best_model_idx]
best_model = best_row['Model']

metrics = ['MAE', 'RMSE', 'R²', 'Spearman']
original_values = [best_row[f'Original {metric}'] for metric in metrics]
tuned_values = [best_row[f'Tuned {metric}'] for metric in metrics]

# Each panel shows exactly two metrics, so both panels use the same two bar
# positions. Sizing the positions by len(metrics) (4) while passing only two
# heights makes plt.bar raise a shape-mismatch error.
barWidth = 0.35
r1 = np.arange(2)
r2 = r1 + barWidth
tick_positions = r1 + barWidth / 2

# (slice into metrics, title suffix, y-axis label) for the two panels:
# errors on the left (lower is better), correlations on the right
panels = [
    (slice(0, 2), 'Error Metrics', 'Error Value (lower is better)'),
    (slice(2, 4), 'Correlation Metrics', 'Correlation Value (higher is better)'),
]

for panel_number, (metric_slice, title_suffix, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.bar(r1, original_values[metric_slice], width=barWidth,
            label='Original', color='skyblue')
    plt.bar(r2, tuned_values[metric_slice], width=barWidth,
            label='Tuned', color='lightgreen')
    plt.title(f'Best Model: {best_model} ({title_suffix})')
    plt.ylabel(y_label)
    plt.xticks(tick_positions, metrics[metric_slice])
    plt.legend()

plt.tight_layout()
plt.savefig(f'{comparison_dir}/best_models_comparison.png')
plt.close()

## 6. Final Model Evaluation and Selection

In [None]:
# Final model evaluation on the test set
print("\nEvaluating best model on the test set...")

# Only the individual tuned models are candidates; the ensemble entry has no
# pickled estimator of its own to reload
model_names = list(tuned_model_results.keys())
model_names.remove('Ensemble')

# Highest validation Spearman wins; on ties the earliest model is kept
best_model_name = max(
    model_names,
    key=lambda candidate: tuned_model_results[candidate]['spearman'],
)
best_score = tuned_model_results[best_model_name]['spearman']

print(f"Best model is {best_model_name} with validation Spearman correlation of {best_score:.4f}")

# Reload the winning estimator from the pickle written during tuning
tuned_model_path = f'{results_dir}/tuned_{best_model_name.replace(" ", "_")}_model.pkl'
with open(tuned_model_path, 'rb') as f:
    best_model = pickle.load(f)

# Evaluate on the held-out test set
test_results = evaluate_model(
    best_model, X_test, y_test,
    f"Best Model ({best_model_name})",
    is_val=False,
)

# Store a copy under a fixed file name for the predictor function
with open(f'{results_dir}/best_tuned_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Generate feature importance for the best model.
# Both tables start as None so that every later step can tell whether they
# were actually produced for this model.
best_importance = None
category_importance = None
if hasattr(best_model, 'feature_importances_'):
    best_importance = plot_feature_importance(best_model, f"Best Model ({best_model_name})")

# plot_feature_importance may hand back None; export and plot only when an
# importance table is available (previously to_csv ran before this check).
if best_importance is not None:
    # Saved before the 'Category' column is added, so the CSV holds only the
    # columns returned by plot_feature_importance
    best_importance.to_csv(f'{results_dir}/best_tuned_model_feature_importance.csv', index=False)

    # Add category to importance DataFrame
    best_importance['Category'] = best_importance['Feature'].apply(format_column_group)

    # Aggregate by category, largest share first
    category_importance = best_importance.groupby('Category')['Importance'].sum().reset_index()
    category_importance = category_importance.sort_values('Importance', ascending=False)

    # Plot category importance
    plt.figure(figsize=(10, 6))
    plt.pie(category_importance['Importance'], labels=category_importance['Category'],
            autopct='%1.1f%%', startangle=90)
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
    plt.title(f'Feature Importance by Category')
    plt.tight_layout()
    plt.savefig(f'{results_dir}/best_tuned_model_feature_importance_by_category.png')
    plt.close()

    # Also create a bar chart of category importance
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Category', data=category_importance)
    plt.title('Feature Category Importance')
    plt.tight_layout()
    plt.savefig(f'{results_dir}/category_importance_bar.png')
    plt.close()

    # Save category importance
    category_importance.to_csv(f'{results_dir}/category_importance.csv', index=False)

# Create text summary of the best model
with open(f'{results_dir}/model_summary.txt', 'w') as f:
    f.write(f"# Task Effort Estimation Model Summary\n\n")
    f.write(f"## Best Model: {best_model_name}\n\n")

    f.write(f"### Test Set Performance\n")
    f.write(f"- MAE: {test_results['mae']:.2f} hours\n")
    f.write(f"- RMSE: {test_results['rmse']:.2f} hours\n")
    f.write(f"- R²: {test_results['r2']:.4f}\n")
    f.write(f"- Spearman Correlation: {test_results['spearman']:.4f}\n\n")

    # The importance sections are written only when the tables exist, so a
    # model without importances still gets a complete summary file
    if best_importance is not None:
        f.write(f"### Top 10 Most Important Features\n")
        for feature_row in best_importance.head(10).itertuples(index=False):
            f.write(f"- {feature_row.Feature}: {feature_row.Importance:.4f}\n")

        f.write(f"\n### Feature Importance by Category\n")
        for category_row in category_importance.itertuples(index=False):
            f.write(f"- {category_row.Category}: {category_row.Importance:.4f} ({category_row.Importance*100:.1f}%)\n")

    f.write(f"\n### Model Hyperparameters\n")
    for param, value in best_model.get_params().items():
        f.write(f"- {param}: {value}\n")

print("\nFinal model evaluation complete. Results saved to the 'model_summary.txt' file.")

## 7. Task Effort Predictor Function

In [None]:
# Create a prediction function for new tasks
def predict_task_effort(task_features, model_path=f'{results_dir}/best_tuned_model.pkl'):
    """
    Estimate the resolution time of a single task.

    Parameters:
    -----------
    task_features : dict
        Feature name -> value for the task; training features that are
        missing from the dictionary are filled with 0 and extra keys are
        ignored
    model_path : str
        Path to the pickled model (defaults to the best tuned model in the
        results directory)

    Returns:
    --------
    dict
        ``prediction_hours``, ``prediction_days`` (hours / 24),
        ``prediction_work_days`` (hours / 8), ``lower_bound`` and
        ``upper_bound`` (prediction scaled by 0.7 and 1.3)
    """
    # Trained estimator
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # Column order used at training time
    with open(f'{results_dir}/data_splits.pkl', 'rb') as splits_file:
        feature_names = pickle.load(splits_file)['feature_names']

    # One-row frame holding exactly the training columns, in training order;
    # anything the caller did not supply defaults to 0
    aligned_row = {feat: task_features.get(feat, 0) for feat in feature_names}
    task_df = pd.DataFrame([aligned_row])

    # The model output is back-transformed with expm1 to obtain hours
    pred_hours = np.expm1(model.predict(task_df)[0])

    # Rough +/-30% band around the point estimate (a fixed factor, not a
    # statistically derived interval)
    result = {
        'prediction_hours': pred_hours,
        'prediction_days': pred_hours / 24,
        'prediction_work_days': pred_hours / 8,  # Assuming 8-hour workdays
        'lower_bound': pred_hours * 0.7,
        'upper_bound': pred_hours * 1.3,
    }
    return result

# Example usage
print("\nExample Task Effort Prediction:")

# A major-priority bug with a few issue links, created on a weekday afternoon;
# features not listed here are filled with 0 by predict_task_effort
example_task = dict(
    is_type_bug=1,
    is_priority_major=1,
    inward_count=2,
    outward_count=1,
    age_days=5,
    created_is_weekend=0,
    created_hour=14,
    is_completed=0,
)

# Make prediction and report it in hours, calendar days and work days
prediction = predict_task_effort(example_task)
hours = prediction['prediction_hours']
print(f"Predicted resolution time: {hours:.2f} hours")
print(f"Equivalent to: {prediction['prediction_days']:.2f} days or {prediction['prediction_work_days']:.2f} work days")
print(f"Prediction interval: {prediction['lower_bound']:.2f} to {prediction['upper_bound']:.2f} hours")

# Save the predictor function as a separate Python module.
# The module source is wrapped in triple *single* quotes because it contains
# triple-double-quoted docstrings; wrapping it in """ would end the string at
# the first embedded docstring and make this cell a SyntaxError.
with open(f'{results_dir}/task_effort_predictor.py', 'w') as f:
    f.write('''
import pandas as pd
import numpy as np
import pickle
import os

def predict_task_effort(task_features, model_path='best_tuned_model.pkl'):
    """
    Predict resolution hours for a new task.
    
    Parameters:
    -----------
    task_features : dict
        Dictionary of task features
    model_path : str
        Path to the trained model pickle file
    
    Returns:
    --------
    dict
        Dictionary containing the prediction and additional information
    """
    # Load the trained model
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    
    # Load the feature names used during training
    with open('data_splits.pkl', 'rb') as f:
        splits = pickle.load(f)
        feature_names = splits['feature_names']
    
    # Convert task features to DataFrame
    task_df = pd.DataFrame([task_features])
    
    # Ensure all required features are present
    for feat in feature_names:
        if feat not in task_df.columns:
            task_df[feat] = 0  # Default value for missing features
    
    # Keep only the features used in training
    task_df = task_df[feature_names]
    
    # Make prediction
    pred_log = model.predict(task_df)[0]
    
    # Convert from log space
    pred_hours = np.expm1(pred_log)
    
    # Calculate prediction intervals (approximate)
    lower_bound = pred_hours * 0.7  # 30% lower than prediction
    upper_bound = pred_hours * 1.3  # 30% higher than prediction
    
    return {
        'prediction_hours': pred_hours,
        'prediction_days': pred_hours / 24,
        'prediction_work_days': pred_hours / 8,  # Assuming 8-hour workdays
        'lower_bound': lower_bound,
        'upper_bound': upper_bound
    }

def get_task_effort_range(task_type, priority, complexity):
    """
    Get predefined effort ranges based on task type, priority, and complexity.
    This is a simplified example that could be expanded with more detailed logic.
    
    Parameters:
    -----------
    task_type : str
        Type of task (e.g. 'bug', 'feature', 'improvement')
    priority : str
        Priority level (e.g. 'low', 'medium', 'high', 'critical')
    complexity : str
        Complexity level (e.g. 'low', 'medium', 'high')
    
    Returns:
    --------
    dict
        Dictionary with min, max, and typical effort in hours
    """
    # Define base hours by task type
    if task_type.lower() == 'bug':
        base_hours = 8  # 1 day
    elif task_type.lower() == 'feature':
        base_hours = 24  # 3 days
    elif task_type.lower() == 'improvement':
        base_hours = 16  # 2 days
    else:
        base_hours = 12  # 1.5 days (default)
    
    # Apply priority multiplier
    if priority.lower() == 'critical':
        priority_mult = 0.8  # Critical items might be fixed faster
    elif priority.lower() == 'high':
        priority_mult = 0.9
    elif priority.lower() == 'low':
        priority_mult = 1.2
    else:  # medium
        priority_mult = 1.0
    
    # Apply complexity multiplier
    if complexity.lower() == 'high':
        complexity_mult = 2.0
    elif complexity.lower() == 'low':
        complexity_mult = 0.5
    else:  # medium
        complexity_mult = 1.0
    
    # Calculate typical hours
    typical_hours = base_hours * priority_mult * complexity_mult
    
    # Define range
    return {
        'min_hours': typical_hours * 0.7,
        'typical_hours': typical_hours,
        'max_hours': typical_hours * 1.5,
        'work_days': typical_hours / 8
    }
''')

print("\nTask effort predictor module saved to 'task_effort_predictor.py'")

## 8. Conclusion and Insights

In [None]:
# Summarize key findings and insights
print("\nKey Findings and Insights:")

# Load the best model metrics
with open(f'{results_dir}/model_summary.txt', 'r') as f:
    model_summary = f.read()

# Extract metrics from summary.
# R² and Spearman can be negative, so their patterns accept an optional
# leading minus sign; without it a negative score does not match and is
# silently reported as 0.
import re
mae_match = re.search(r'MAE: ([0-9.]+)', model_summary)
rmse_match = re.search(r'RMSE: ([0-9.]+)', model_summary)
r2_match = re.search(r'R²: (-?[0-9.]+)', model_summary)
spearman_match = re.search(r'Spearman Correlation: (-?[0-9.]+)', model_summary)

# A metric that cannot be found in the summary falls back to 0
mae = float(mae_match.group(1)) if mae_match else 0
rmse = float(rmse_match.group(1)) if rmse_match else 0
r2 = float(r2_match.group(1)) if r2_match else 0
spearman = float(spearman_match.group(1)) if spearman_match else 0

# Print model metrics
print(f"1. Model Performance:")
print(f"   - Our best model achieves a Mean Absolute Error of {mae:.2f} hours")
print(f"   - The Spearman correlation of {spearman:.4f} indicates a moderate to strong ability to rank tasks by effort")
print(f"   - The model explains approximately {r2*100:.1f}% of the variance in task resolution time")

# Load category importance (optional: the CSV may not have been written).
# Still best-effort, but the reason is reported and KeyboardInterrupt /
# SystemExit are no longer swallowed as they were by the bare except.
try:
    category_importance = pd.read_csv(f'{results_dir}/category_importance.csv')
    print(f"\n2. Most Important Feature Categories:")
    for idx, row in category_importance.head(3).iterrows():
        print(f"   - {row['Category']}: {row['Importance']*100:.1f}% importance")
except Exception as exc:
    print(f"   (category importance not available: {exc})")

# Load feature importance (optional, same best-effort handling)
try:
    feature_importance = pd.read_csv(f'{results_dir}/best_tuned_model_feature_importance.csv')
    print(f"\n3. Top 5 Individual Features:")
    for idx, row in feature_importance.head(5).iterrows():
        print(f"   - {row['Feature']}: {row['Importance']*100:.1f}% importance")
except Exception as exc:
    print(f"   (feature importance not available: {exc})")

# Create final visualization summarizing actual vs. predicted
print("\nGenerating final visualization of model performance...")

# Test-set targets and best-model predictions, both mapped through expm1
y_test_orig = np.expm1(y_test)
y_pred = best_model.predict(X_test)
y_pred_orig = np.expm1(y_pred)

# Hexbin instead of a plain scatter so that dense regions stay readable;
# bins='log' puts the cell counts on a logarithmic color scale
fig, ax = plt.subplots(figsize=(10, 8))
density = ax.hexbin(y_test_orig, y_pred_orig, gridsize=50, cmap='viridis', bins='log')

# y = x reference line
upper_limit = y_test_orig.max()
ax.plot([0, upper_limit], [0, upper_limit], 'r--', linewidth=2)

ax.set_xlabel('Actual Resolution Hours', fontsize=14)
ax.set_ylabel('Predicted Resolution Hours', fontsize=14)
ax.set_title('Task Effort Estimation: Actual vs. Predicted', fontsize=16)

# Metrics box in the top-left corner of the axes
metrics_text = f"MAE: {mae:.1f} hours\nRMSE: {rmse:.1f} hours\nR²: {r2:.3f}\nSpearman: {spearman:.3f}"
ax.annotate(
    metrics_text,
    xy=(0.05, 0.95), xycoords='axes fraction',
    bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8),
    fontsize=12, verticalalignment='top'
)

cb = fig.colorbar(density, ax=ax)
cb.set_label('Log Count (Density)', fontsize=12)
fig.tight_layout()

# Written twice: once with the other results, once in the main directory
for figure_path in (f'{results_dir}/actual_vs_predicted.png', 'task_effort_prediction.png'):
    fig.savefig(figure_path, dpi=300)
plt.close(fig)

print("\nSummary visualization saved as 'task_effort_prediction.png'")
print("\nTask effort estimation model complete!")