In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import pickle
import time
import os
import warnings
import glob
from joblib import Parallel, delayed
warnings.filterwarnings('ignore')

# Output directory for every artefact (plots, pickles, CSVs) written below.
results_dir = 'task_estimation_results'
os.makedirs(results_dir, exist_ok=True)

# 1. Load the project-level metrics
print("Loading project-level metrics...")
# Best-effort load of the project clustering output plus discovery of the
# project-level model pickles. Any failure leaves both names empty so the
# rest of the notebook can run without cluster information.
try:
    with open('../Models/Results/project_classification_results/results/cluster_profiles.json', 'r') as f:
        import json
        project_clusters = json.load(f)
    print("Loaded project cluster profiles")

    # Project-level models are the pickles matching '<something>_<something>.pkl'.
    model_files = glob.glob('../Models/*_*.pkl')
    project_models = list(map(os.path.basename, model_files))
    print(f"Found {len(project_models)} project-level estimation models")
except Exception as e:
    print(f"Error loading project models: {e}")
    project_clusters, project_models = {}, []

# 2. Load the pre-processed scaled feature data
print("\nLoading scaled feature data...")
try:
    # A missing file is re-raised as ValueError with a short, explicit message.
    task_data = pd.read_csv('../prepared_processed_data/common_features_scaled_with_original_targets.csv')
except FileNotFoundError:
    raise ValueError("Could not find common_features_scaled_with_original_targets.csv file")
else:
    n_tasks, n_features = task_data.shape
    print(f"Task data loaded: {n_tasks} tasks, {n_features} features")

# Report (but do not yet transform) every column whose dtype is not numeric.
non_numeric_cols = list(task_data.select_dtypes(exclude=np.number).columns)
print(f"Found {len(non_numeric_cols)} non-numeric columns that need handling")

# 3. Identify project information from the data
print("\nIdentifying project information...")
if 'remainder__project_id' in task_data.columns:
    # Extract unique project IDs
    project_ids = task_data['remainder__project_id'].unique()
    print(f"Found {len(project_ids)} unique projects in the data")

    # Map projects to their cluster assignments if available
    if project_clusters:
        project_to_cluster = {}
        for cluster_id, cluster_info in project_clusters.items():
            # JSON object keys are always strings. Convert numeric ids to int
            # so that cluster labels share one type with the integer -1
            # "unknown" sentinel used below (a mixed str/int column cannot be
            # sorted and is not a numeric model feature). Non-numeric ids are
            # kept unchanged.
            try:
                cluster_label = int(cluster_id)
            except (TypeError, ValueError):
                cluster_label = cluster_id

            if 'projects' in cluster_info:
                for project in cluster_info['projects']:
                    # Keys are stringified because the lookup below uses str(x).
                    project_to_cluster[str(project)] = cluster_label

        # Add cluster information to the task data; -1 marks projects that
        # do not appear in any cluster profile.
        task_data['project_cluster'] = task_data['remainder__project_id'].map(
            lambda x: project_to_cluster.get(str(x), -1)
        )

        # Count tasks by cluster
        cluster_counts = task_data['project_cluster'].value_counts()
        print("Task distribution by project cluster:")
        print(cluster_counts)
    else:
        print("No project cluster information available")
else:
    print("No project ID column found in the data")
    project_ids = []

# 4. Prepare the target variable - resolution time in hours
print("\nPreparing target variable...")

# Prefer the average resolution time; otherwise take the first alternative
# column that exists, and fail loudly when none is available.
target_variable = 'avg_resolution_hours'
if target_variable not in task_data.columns:
    target_candidates = ['median_resolution_hours', 'resolution_hours', 'total_resolution_hours']
    target_variable = next(
        (name for name in target_candidates if name in task_data.columns), None
    )
    if target_variable is None:
        raise ValueError("No suitable resolution time target found in the dataset")

# Keep only rows with a known target inside [0, 10000] hours (~417 days);
# rows outside that range are dropped, not clipped.
task_data = task_data.dropna(subset=[target_variable])
task_data = task_data[task_data[target_variable].between(0, 10000)]

print(f"\nTarget variable statistics:")
target_column = task_data[target_variable]
for stat_label, stat_value in (
    ("Mean", target_column.mean()),
    ("Median", target_column.median()),
    ("Min", target_column.min()),
    ("Max", target_column.max()),
):
    print(f"  {stat_label}: {stat_value:.2f} hours")

# 5. Feature preparation
print("\nPreparing features...")
feature_data = task_data.copy()

# 5.1 Split features into categories
# Columns produced by one of the scaling transformers (matched by prefix text).
scaled_features = [col for col in feature_data.columns if any(prefix in col for prefix in
                                                             ['time_power__', 'pct_minmax__', 'count_std__',
                                                              'stat_robust__', 'link_std__'])]

# Columns passed through untouched by the preprocessing step.
remainder_features = [col for col in feature_data.columns if 'remainder__' in col]

# Resolution-time columns that must never be used as model inputs.
target_vars = ['avg_resolution_hours', 'median_resolution_hours', 'min_resolution_hours',
               'max_resolution_hours', 'resolution_hours_std', 'total_resolution_hours']
leakage_names = set(target_vars) | {target_variable}


def _base_feature_name(col):
    """Return `col` without its leading '<transformer>__' prefix, if any."""
    return col.split('__', 1)[1] if '__' in col else col


# Combine all features. Target columns are compared on their un-prefixed
# name: a plain `col not in target_vars` test can never match a column that
# carries a 'remainder__' (or scaler) prefix, which would leak the target
# into the feature matrix.
numeric_cols = []
skipped_target_cols = []
skipped_non_numeric_cols = []
for col in dict.fromkeys(scaled_features + remainder_features):  # ordered, de-duplicated
    if col == 'remainder__project_id':
        continue  # identifier, not a feature
    if col in leakage_names or _base_feature_name(col) in leakage_names:
        skipped_target_cols.append(col)
    elif not pd.api.types.is_numeric_dtype(feature_data[col]):
        # The regressors need numeric input; non-numeric columns are left out.
        skipped_non_numeric_cols.append(col)
    else:
        numeric_cols.append(col)

# Add project cluster if available
if 'project_cluster' in feature_data.columns and 'project_cluster' not in numeric_cols:
    numeric_cols.append('project_cluster')

if skipped_target_cols:
    print(f"Excluded {len(skipped_target_cols)} target-derived columns: {skipped_target_cols}")
if skipped_non_numeric_cols:
    print(f"Excluded {len(skipped_non_numeric_cols)} non-numeric columns: {skipped_non_numeric_cols}")

print(f"Using {len(numeric_cols)} features")
print(f"Final dataset shape: {feature_data.shape}")

# 6. Split features and target
print("\nPreparing data for modeling...")
X = feature_data[numeric_cols]
y = feature_data[target_variable]

# Model the target in log space to reduce skew; log1p keeps zero-hour
# tasks finite.
use_log_transform = True
if use_log_transform:
    print("Applying log transformation to target variable")
    y = np.log1p(y)  # log(1+x) to handle zeros

# 7. Data partitioning: 50% training, 25% validation, 25% testing
# Two successive 50/50 splits: the first carves out the training half, the
# second divides the remainder evenly into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.5, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Data partitioning complete:")
for partition_label, partition in (
    ("Training set", X_train),
    ("Validation set", X_val),
    ("Test set", X_test),
):
    print(f"  {partition_label}: {partition.shape[0]} samples ({partition.shape[0]/len(X)*100:.1f}%)")

# 8. Exploratory visualization
# One histogram per partition, side by side on a 1x3 grid.
plt.figure(figsize=(12, 4))
for panel, (split_name, split_target) in enumerate(
    (('Training', y_train), ('Validation', y_val), ('Test', y_test)), start=1
):
    plt.subplot(1, 3, panel)
    plt.hist(split_target, bins=30, alpha=0.7)
    plt.title(f'{split_name} Target Distribution')
    plt.xlabel('Log(Resolution Hours)')
    if panel == 1:
        # Only the left-most panel carries the y-axis label.
        plt.ylabel('Frequency')

plt.tight_layout()
plt.savefig(f'{results_dir}/target_distributions.png')
plt.show()

print(f"\nTarget variable statistics:")
for split_name, split_target in (('Training', y_train), ('Validation', y_val), ('Test', y_test)):
    print(f"  {split_name}: mean={split_target.mean():.2f}, median={split_target.median():.2f}, "
          f"min={split_target.min():.2f}, max={split_target.max():.2f}")

# Save the splits for reproducibility
splits = dict(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
    log_transform=use_log_transform,
)

with open(f'{results_dir}/data_splits.pkl', 'wb') as f:
    pickle.dump(splits, f)

print("\nInitial setup complete. Ready for model training.")

In [ ]:
# 9. Model Training and Evaluation
print("Training and evaluating regression models...")

# Cluster-stratified modeling is used only when the cluster column exists
# and holds more than one distinct value (missing values count as a value).
cluster_based_modeling = bool(
    'project_cluster' in X.columns
    and X['project_cluster'].nunique(dropna=False) > 1
)
if cluster_based_modeling:
    print(f"Using cluster-based modeling approach with {len(X['project_cluster'].unique())} clusters")

    # Best fitted model per cluster id, filled in during training.
    cluster_models = {}
else:
    print("Using global modeling approach (no cluster stratification)")

# Skip standardization - features are already scaled in the dataset
X_train_scaled, X_val_scaled = X_train, X_val

# Define a function to evaluate models
def evaluate_model(model, X_val, y_val, model_name, cluster_id=None):
    """Score a fitted model on validation data and plot actual vs. predicted.

    Predictions and targets are mapped back with ``expm1`` when the
    module-level ``use_log_transform`` flag is set, so every metric is
    computed on that back-transformed scale. The scatter plot is written to
    ``results_dir`` and shown.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_val, y_val : validation features and target
    model_name : str
        Display name; also used (spaces replaced by underscores) in the
        plot file name.
    cluster_id : optional
        When given, appended to the printed name, result name and file name.

    Returns
    -------
    dict
        Keys ``model``, ``name``, ``cluster``, ``mae``, ``rmse``,
        ``median_ae``, ``mape`` and ``r2``.
    """
    suffix = "" if cluster_id is None else f" (Cluster {cluster_id})"

    # Predict, then undo the log transform if one was applied to the target.
    y_pred = model.predict(X_val)
    if use_log_transform:
        actual, predicted = np.expm1(y_val), np.expm1(y_pred)
    else:
        actual, predicted = y_val, y_pred

    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    r2 = r2_score(actual, predicted)
    median_ae = np.median(np.abs(actual - predicted))
    # Percentage error uses (actual + 1) as denominator to avoid dividing by zero.
    mape = np.mean(np.abs((actual - predicted) / (actual + 1))) * 100  # Adding 1 to avoid division by zero

    print(f"\n{model_name}{suffix}:")
    print(f"  MAE: {mae:.2f} hours")
    print(f"  RMSE: {rmse:.2f} hours")
    print(f"  MedianAE: {median_ae:.2f} hours")
    print(f"  MAPE: {mape:.2f}%")
    print(f"  R²: {r2:.4f}")

    # File-name stem: model name without spaces, plus the cluster when known.
    model_file_name = model_name.replace(" ", "_")
    if cluster_id is not None:
        model_file_name += f"_cluster_{cluster_id}"

    # Scatter of actual vs. predicted with the y = x reference line.
    plt.figure(figsize=(8, 6))
    plt.scatter(actual, predicted, alpha=0.5)
    upper = actual.max()
    plt.plot([0, upper], [0, upper], 'r--')
    plt.title(f'{model_name}{suffix}: Actual vs. Predicted')
    plt.xlabel('Actual Resolution Hours')
    plt.ylabel('Predicted Resolution Hours')
    plt.savefig(f'{results_dir}/{model_file_name}_predictions.png')
    plt.show()

    return dict(
        model=model,
        name=f"{model_name}{suffix}",
        cluster=cluster_id,
        mae=mae,
        rmse=rmse,
        median_ae=median_ae,
        mape=mape,
        r2=r2,
    )

# Initialize models
# Settings shared by the tree ensembles below.
_ensemble_settings = {'n_estimators': 100, 'random_state': 42}
base_models = {
    'Random Forest': RandomForestRegressor(**_ensemble_settings),
    'Gradient Boosting': GradientBoostingRegressor(**_ensemble_settings),
    'XGBoost': XGBRegressor(learning_rate=0.1, **_ensemble_settings),
    'Extra Trees': ExtraTreesRegressor(**_ensemble_settings),
    'Elastic Net': ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42),
}

# Train and evaluate models
# Accumulates one result dict per (model, data subset) evaluation.
results = list()

# Define function to train models for a specific cluster or globally
def train_models_for_data(X_train, y_train, X_val, y_val, models, cluster_id=None):
    """Fit, evaluate and pickle one fresh copy of every model in ``models``.

    Each estimator in ``models`` is treated as an unfitted template: a clone
    is fitted for this call, so the estimator returned in each result dict
    belongs to this data subset only. Fitting the shared template in place
    would make every previously returned ``result['model']`` silently point
    at whatever data was fitted last.

    Parameters
    ----------
    X_train, y_train : training features and target
    X_val, y_val : validation features and target, passed to ``evaluate_model``
    models : dict
        Maps a display name to a scikit-learn compatible estimator.
    cluster_id : optional
        When given, used in log messages and in the pickle file name.

    Returns
    -------
    list of dict
        One ``evaluate_model`` result per model, extended with ``train_time``
        (seconds spent in ``fit``).
    """
    # Local import keeps this function self-contained; scikit-learn is
    # already a dependency of this file.
    from sklearn.base import clone

    local_results = []
    suffix = f" (Cluster {cluster_id})" if cluster_id is not None else ""

    for name, template in models.items():
        # Unfitted copy with identical hyper-parameters.
        model = clone(template)

        print(f"\nTraining {name}{suffix}...")
        start_time = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_time
        print(f"Training completed in {train_time:.2f} seconds")

        # Evaluate the model
        result = evaluate_model(model, X_val, y_val, name, cluster_id)
        result['train_time'] = train_time
        local_results.append(result)

        # Save the model
        model_file_name = name.replace(" ", "_")
        if cluster_id is not None:
            model_file_name += f"_cluster_{cluster_id}"

        with open(f'{results_dir}/{model_file_name}_model.pkl', 'wb') as f:
            pickle.dump(model, f)

    return local_results

# If using cluster-based modeling, train separate models per cluster
if cluster_based_modeling:
    for cluster_id in sorted(X_train['project_cluster'].unique()):
        # -1 marks tasks without a known cluster; they are handled after the
        # global models have been trained.
        if cluster_id == -1:  # Skip unknown cluster
            continue

        print(f"\n=== Training models for Cluster {cluster_id} ===")

        # Rows of this cluster; targets are aligned through the row index.
        X_train_cluster = X_train.loc[X_train['project_cluster'] == cluster_id]
        y_train_cluster = y_train.loc[X_train_cluster.index]

        X_val_cluster = X_val.loc[X_val['project_cluster'] == cluster_id]
        y_val_cluster = y_val.loc[X_val_cluster.index]

        print(f"Cluster {cluster_id} train size: {len(X_train_cluster)}, validation size: {len(X_val_cluster)}")

        # Require at least 50 training and 20 validation rows.
        if not (len(X_train_cluster) >= 50 and len(X_val_cluster) >= 20):
            print(f"Skipping Cluster {cluster_id} due to insufficient data")
            continue

        cluster_results = train_models_for_data(
            X_train_cluster, y_train_cluster,
            X_val_cluster, y_val_cluster,
            base_models, cluster_id
        )
        results.extend(cluster_results)

        # Keep the model with the highest validation R² for this cluster.
        best_model_idx, best_cluster_result = max(
            enumerate(cluster_results), key=lambda pair: pair[1]['r2']
        )
        cluster_models[cluster_id] = best_cluster_result['model']

# Train global model for all data
print("\n=== Training global models ===")
global_results = train_models_for_data(X_train_scaled, y_train, X_val_scaled, y_val, base_models)
results.extend(global_results)

# Determine if we should use global model for unknown clusters
if cluster_based_modeling:
    # Rows whose project has no known cluster (-1).
    X_train_unknown = X_train.loc[X_train['project_cluster'] == -1]
    y_train_unknown = y_train.loc[X_train_unknown.index]

    X_val_unknown = X_val.loc[X_val['project_cluster'] == -1]
    y_val_unknown = y_val.loc[X_val_unknown.index]

    if len(X_train_unknown) > 50 and len(X_val_unknown) > 20:
        print("\n=== Training models for Unknown Cluster ===")
        print(f"Unknown cluster train size: {len(X_train_unknown)}, validation size: {len(X_val_unknown)}")

        unknown_results = train_models_for_data(
            X_train_unknown, y_train_unknown,
            X_val_unknown, y_val_unknown,
            base_models, -1
        )
        results.extend(unknown_results)

        # Best dedicated model (highest R²) serves the unknown cluster.
        best_model_idx, best_unknown_result = max(
            enumerate(unknown_results), key=lambda pair: pair[1]['r2']
        )
        cluster_models[-1] = best_unknown_result['model']
    else:
        print("\nInsufficient data for Unknown Cluster. Will use global model.")

        # Too few unknown rows: reuse the best global model (highest R²).
        best_global_idx, best_global_result = max(
            enumerate(global_results), key=lambda pair: pair[1]['r2']
        )
        cluster_models[-1] = best_global_result['model']

# Summarize model performance
# Every result dict carries a 'cluster' key (None for global models), so a
# dict.get default would never apply; the None is mapped to 'Global' here.
summary = pd.DataFrame(
    [
        (
            r['name'],
            'Global' if r.get('cluster') is None else r['cluster'],
            r['mae'], r['rmse'], r['r2'], r['mape'], r['train_time'],
        )
        for r in results
    ],
    columns=['Model', 'Cluster', 'MAE (hours)', 'RMSE (hours)', 'R²', 'MAPE (%)', 'Train Time (s)'],
)

# Sort by R² (descending)
summary = summary.sort_values('R²', ascending=False)
print("\nModel Performance Summary:")
print(summary)

# Save the summary
summary.to_csv(f'{results_dir}/model_performance_summary.csv', index=False)

# Identify the best model
best_model_idx = summary['R²'].idxmax()
best_model_row = summary.loc[best_model_idx]
best_model_name = best_model_row['Model']
best_cluster = best_model_row['Cluster']

print(f"\nBest model overall: {best_model_name} with R² = {best_model_row['R²']:.4f}")

# Find the actual model object. `summary` was built row-for-row from
# `results` and sorting keeps the original index labels, so the label
# returned by idxmax is the position of the winning entry in `results`.
best_model = results[best_model_idx]['model']

# Save the best model separately
with open(f'{results_dir}/best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# If using cluster-based approach, save the cluster models dictionary
if cluster_based_modeling:
    with open(f'{results_dir}/cluster_models.pkl', 'wb') as f:
        pickle.dump(cluster_models, f)
    print(f"Saved {len(cluster_models)} cluster-specific models")

print("Model training complete.")

In [ ]:
# 10. Feature Importance Analysis
print("Analyzing feature importance...")

# Create function to analyze feature importance for a model
def analyze_feature_importance(model, feature_names, title="Feature Importance", output_prefix="feature_importance"):
    """Report, plot and save the feature importances of ``model``.

    Parameters
    ----------
    model : estimator
        Analysed only when it exposes ``feature_importances_``.
    feature_names : sequence of str
        Names matching the order of ``model.feature_importances_``.
    title : str
        Label used in printed headings and plot titles.
    output_prefix : str
        File-name stem (inside ``results_dir``) for the PNG/CSV outputs.

    Returns
    -------
    pandas.DataFrame or None
        Importances sorted in descending order (with an extra ``Category``
        column when any feature name contains ``'__'``), or ``None`` when
        the model offers no importances.
    """
    if not hasattr(model, 'feature_importances_'):
        print(f"The model does not provide feature importances.")
        return None

    # Importance table, most important feature first.
    feature_importance = pd.DataFrame(
        {'Feature': feature_names, 'Importance': model.feature_importances_}
    ).sort_values('Importance', ascending=False)

    top_features = feature_importance.head(20)
    print(f"\nTop 20 Most Important Features for {title}:")
    print(top_features)

    # Horizontal bars, reversed so the most important feature is on top.
    plt.figure(figsize=(12, 8))
    plt.barh(top_features['Feature'][::-1], top_features['Importance'][::-1])
    plt.title(f'Top 20 Feature Importance - {title}')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.savefig(f'{results_dir}/{output_prefix}.png')
    plt.show()

    # The per-feature CSV is written before the Category column is added.
    feature_importance.to_csv(f'{results_dir}/{output_prefix}.csv', index=False)

    # Feature names of the form 'category__feature_name' allow a roll-up
    # by category; names without '__' are grouped under 'Other'.
    if any('__' in feat for feat in feature_names):
        print(f"\nFeature importance by category for {title}:")
        feature_importance['Category'] = [
            name.split('__')[0] if '__' in name else 'Other'
            for name in feature_importance['Feature']
        ]

        per_category = feature_importance.groupby('Category')['Importance'].sum()
        category_importance = per_category.reset_index().sort_values('Importance', ascending=False)

        print(category_importance)

        plt.figure(figsize=(10, 6))
        plt.bar(category_importance['Category'], category_importance['Importance'])
        plt.title(f'Feature Category Importance - {title}')
        plt.xlabel('Category')
        plt.ylabel('Importance')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'{results_dir}/{output_prefix}_by_category.png')
        plt.show()

        category_importance.to_csv(f'{results_dir}/{output_prefix}_by_category.csv', index=False)

    return feature_importance

# Analyze the best overall model
try:
    with open(f'{results_dir}/best_model.pkl', 'rb') as f:
        best_model = pickle.load(f)

    # Get best model name from model_performance_summary.csv
    summary = pd.read_csv(f'{results_dir}/model_performance_summary.csv')
    best_model_idx = summary['R²'].idxmax()
    best_model_row = summary.loc[best_model_idx]
    best_model_name = best_model_row['Model']
    best_cluster = best_model_row['Cluster']

    # After the CSV round trip a global model's cluster is either the literal
    # 'Global' or an empty cell read back as NaN; both mean "no cluster".
    is_global_model = pd.isna(best_cluster) or best_cluster == 'Global'
    # A numeric column containing NaN is read as float; show 0.0 as 0.
    if isinstance(best_cluster, float) and not is_global_model and best_cluster.is_integer():
        best_cluster = int(best_cluster)

    # Analyze feature importance for best model
    feature_names = X.columns
    title = f"Best Model ({best_model_name})"
    if not is_global_model:
        title += f" - Cluster {best_cluster}"

    overall_importance = analyze_feature_importance(best_model, feature_names, title, "best_model_feature_importance")

    # If we have cluster-specific models, analyze each cluster
    try:
        with open(f'{results_dir}/cluster_models.pkl', 'rb') as f:
            cluster_models = pickle.load(f)

        # Compare feature importance across clusters
        if len(cluster_models) > 1:
            print("\nComparing feature importance across clusters...")

            # Collect top 10 features from each cluster
            top_features_by_cluster = {}
            all_top_features = set()

            for cluster_id, model in cluster_models.items():
                if hasattr(model, 'feature_importances_'):
                    importances = model.feature_importances_
                    imp_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
                    top10 = imp_df.sort_values('Importance', ascending=False).head(10)
                    top_features_by_cluster[cluster_id] = top10
                    all_top_features.update(top10['Feature'].tolist())

                    # Analyze this cluster's model
                    analyze_feature_importance(
                        model, X.columns,
                        f"Cluster {cluster_id}",
                        f"cluster_{cluster_id}_feature_importance"
                    )

            # Create comparison chart of top features across clusters
            if all_top_features:
                all_top_features = list(all_top_features)
                comparison_data = []

                for feature in all_top_features:
                    row = {'Feature': feature}
                    for cluster_id, top10 in top_features_by_cluster.items():
                        feat_importance = top10[top10['Feature'] == feature]['Importance'].values
                        # 0 means "not among this cluster's top 10", not
                        # necessarily zero importance.
                        row[f'Cluster {cluster_id}'] = feat_importance[0] if len(feat_importance) > 0 else 0
                    comparison_data.append(row)

                comparison_df = pd.DataFrame(comparison_data)
                comparison_df = comparison_df.sort_values('Feature')

                # Save comparison (alphabetical by feature name)
                comparison_df.to_csv(f'{results_dir}/feature_importance_by_cluster.csv', index=False)

                cluster_cols = [col for col in comparison_df.columns if 'Cluster' in col]

                # Visualize comparison (top 15): rank features by their
                # highest importance in any cluster, so the chart shows the
                # most important features rather than the alphabetically
                # first ones.
                peak_importance = comparison_df[cluster_cols].max(axis=1)
                top15_labels = peak_importance.sort_values(ascending=False).head(15).index
                top15_comparison = comparison_df.loc[top15_labels]
                plt.figure(figsize=(14, 8))

                # Plot as grouped bar chart: one bar per cluster, side by
                # side within each feature slot.
                bar_width = 0.8 / len(cluster_cols)
                for i, col in enumerate(cluster_cols):
                    plt.bar(
                        [x + i * bar_width for x in range(len(top15_comparison))],
                        top15_comparison[col],
                        width=bar_width,
                        label=col
                    )

                plt.xlabel('Feature')
                plt.ylabel('Importance')
                plt.title('Feature Importance Comparison Across Clusters')
                # Centre each tick under its group of bars.
                plt.xticks(
                    [x + bar_width * (len(cluster_cols) - 1) / 2 for x in range(len(top15_comparison))],
                    top15_comparison['Feature'],
                    rotation=90
                )
                plt.legend()
                plt.tight_layout()
                plt.savefig(f'{results_dir}/feature_importance_cluster_comparison.png')
                plt.show()

    except Exception as e:
        # Cluster analysis is optional (e.g. no cluster_models.pkl exists).
        print(f"Error analyzing cluster models: {e}")

except Exception as e:
    print(f"Error loading models: {e}")
    
# If available, check if any feature categories are particularly important for certain clusters
# Runs only when the previous step produced an importance table that
# includes the per-feature 'Category' column.
if (
    'overall_importance' in locals()
    and overall_importance is not None
    and 'Category' in overall_importance.columns
):
    top_categories = (
        overall_importance.groupby('Category')['Importance']
        .sum()
        .sort_values(ascending=False)
    )

    print("\nTop feature categories for task estimation:")
    print(top_categories.head(5))

    # Analyze correlation between top features and resolution time
    try:
        # The 20 first rows of the importance table (sorted by importance).
        top_features = overall_importance['Feature'].head(20).tolist()

        # Undo the log transform so correlations refer to the original target.
        y_orig = np.expm1(y) if use_log_transform else y

        top_features_df = X[top_features].copy()
        top_features_df['target'] = y_orig

        # One correlation matrix serves both the printed ranking and the heatmap.
        corr_matrix = top_features_df.corr()
        correlations = corr_matrix['target'].sort_values(ascending=False)

        print("\nCorrelation of top features with resolution time:")
        print(correlations)

        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0)
        plt.title('Correlation Matrix of Top Features')
        plt.tight_layout()
        plt.savefig(f'{results_dir}/top_features_correlation.png')
        plt.show()

    except Exception as e:
        print(f"Error analyzing correlations: {e}")

In [ ]:
# 11. Final Model Evaluation on Test Set
print("Final model evaluation on test set...")

# Load the best model
with open(f'{results_dir}/best_model.pkl', 'rb') as f:
    best_model = pickle.load(f)

# Check if we're using cluster-based modeling: the presence of a loadable
# cluster_models.pkl switches it on. `except Exception` (instead of a bare
# `except:`) keeps the best-effort fallback without swallowing
# KeyboardInterrupt/SystemExit.
try:
    with open(f'{results_dir}/cluster_models.pkl', 'rb') as f:
        cluster_models = pickle.load(f)
except Exception as exc:
    cluster_based_modeling = False
    if not isinstance(exc, FileNotFoundError):
        # A missing file is the normal "global only" case; anything else is
        # worth reporting before falling back.
        print(f"Could not load cluster models ({exc}); falling back to global model")
    print("Using global model for all test data")
else:
    cluster_based_modeling = True
    print(f"Using cluster-based modeling with {len(cluster_models)} clusters")

# Define function to make predictions using cluster-based or global approach
def predict_with_appropriate_model(X_data):
    """Predict with the per-cluster models, falling back to the global model.

    Uses the module-level ``cluster_based_modeling``, ``cluster_models`` and
    ``best_model``.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Feature rows; the ``project_cluster`` column (when present) selects
        the model for each row.

    Returns
    -------
    numpy.ndarray
        One prediction per row of ``X_data``, on the scale the models were
        trained on. Rows whose cluster has no entry in ``cluster_models``
        are predicted with ``best_model`` instead of being left at 0.
    """
    if not cluster_based_modeling or 'project_cluster' not in X_data.columns:
        # Use global model for all predictions
        return best_model.predict(X_data)

    predictions = np.zeros(len(X_data))
    # Tracks which rows have already received a cluster-specific prediction.
    covered = np.zeros(len(X_data), dtype=bool)

    for cluster_id, model in cluster_models.items():
        cluster_mask = (X_data['project_cluster'] == cluster_id).to_numpy()
        if cluster_mask.any():
            predictions[cluster_mask] = model.predict(X_data[cluster_mask])
            covered |= cluster_mask

    # Clusters without a dedicated model (e.g. skipped for lack of data)
    # fall back to the global best model, matching the per-cluster
    # evaluation that follows.
    uncovered = ~covered
    if uncovered.any():
        predictions[uncovered] = best_model.predict(X_data[uncovered])

    return predictions

# Apply to test data
X_test_scaled = X_test  # No scaling needed - features already scaled

# Make predictions using cluster-based approach if available
y_pred = predict_with_appropriate_model(X_test_scaled)

# Convert predictions back from log space
if use_log_transform:
    y_test_orig, y_pred_orig = np.expm1(y_test), np.expm1(y_pred)
else:
    y_test_orig, y_pred_orig = y_test, y_pred

# Calculate metrics on the back-transformed values
mae = mean_absolute_error(y_test_orig, y_pred_orig)
rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
r2 = r2_score(y_test_orig, y_pred_orig)
median_ae = np.median(np.abs(y_test_orig - y_pred_orig))
# Percentage error uses (actual + 1) as denominator to avoid division by zero.
mape = np.mean(np.abs((y_test_orig - y_pred_orig) / (y_test_orig + 1))) * 100  # Adding 1 to avoid division by zero

print(f"\nFinal Model Test Results:")
print(f"  MAE: {mae:.2f} hours")
print(f"  RMSE: {rmse:.2f} hours")
print(f"  MedianAE: {median_ae:.2f} hours")
print(f"  MAPE: {mape:.2f}%")
print(f"  R²: {r2:.4f}")

# Scatter of actual vs. predicted values with the y = x reference line
plt.figure(figsize=(10, 8))
plt.scatter(y_test_orig, y_pred_orig, alpha=0.5)
diagonal_end = y_test_orig.max()
plt.plot([0, diagonal_end], [0, diagonal_end], 'r--')
plt.title('Final Model: Actual vs. Predicted')
plt.xlabel('Actual Resolution Hours')
plt.ylabel('Predicted Resolution Hours')
plt.savefig(f'{results_dir}/final_model_predictions.png')
plt.show()

# If we have cluster models, evaluate each cluster separately
if cluster_based_modeling and 'project_cluster' in X_test.columns:
    print("\nAnalyzing performance by cluster:")

    cluster_metrics = []
    for cluster_id in sorted(X_test['project_cluster'].unique()):
        cluster_mask = X_test['project_cluster'] == cluster_id

        # Clusters with fewer than 20 test rows are not evaluated.
        if cluster_mask.sum() < 20:
            print(f"  Skipping Cluster {cluster_id} - insufficient test data ({cluster_mask.sum()} samples)")
            continue

        # Rows of this cluster; targets are aligned through the row index.
        X_test_cluster = X_test[cluster_mask]
        y_test_cluster = y_test.loc[X_test_cluster.index]

        # Cluster-specific model when one exists, otherwise the global best.
        model = cluster_models.get(cluster_id, best_model)

        y_pred_cluster = model.predict(X_test_cluster)

        # Convert back from log space
        if use_log_transform:
            y_test_cluster_orig, y_pred_cluster_orig = np.expm1(y_test_cluster), np.expm1(y_pred_cluster)
        else:
            y_test_cluster_orig, y_pred_cluster_orig = y_test_cluster, y_pred_cluster

        # Same metric definitions as the overall test evaluation.
        cluster_mae = mean_absolute_error(y_test_cluster_orig, y_pred_cluster_orig)
        cluster_rmse = np.sqrt(mean_squared_error(y_test_cluster_orig, y_pred_cluster_orig))
        cluster_r2 = r2_score(y_test_cluster_orig, y_pred_cluster_orig)
        cluster_median_ae = np.median(np.abs(y_test_cluster_orig - y_pred_cluster_orig))
        cluster_mape = np.mean(np.abs((y_test_cluster_orig - y_pred_cluster_orig) / (y_test_cluster_orig + 1))) * 100

        print(f"\n  Cluster {cluster_id} ({cluster_mask.sum()} test samples):")
        print(f"    MAE: {cluster_mae:.2f} hours")
        print(f"    RMSE: {cluster_rmse:.2f} hours")
        print(f"    MedianAE: {cluster_median_ae:.2f} hours")
        print(f"    MAPE: {cluster_mape:.2f}%")
        print(f"    R²: {cluster_r2:.4f}")

        cluster_metrics.append(dict(
            cluster_id=cluster_id,
            test_samples=cluster_mask.sum(),
            mae=cluster_mae,
            rmse=cluster_rmse,
            median_ae=cluster_median_ae,
            mape=cluster_mape,
            r2=cluster_r2,
        ))

        # Cluster-specific actual vs. predicted scatter with y = x line
        plt.figure(figsize=(8, 6))
        plt.scatter(y_test_cluster_orig, y_pred_cluster_orig, alpha=0.5)
        cluster_diagonal_end = y_test_cluster_orig.max()
        plt.plot([0, cluster_diagonal_end], [0, cluster_diagonal_end], 'r--')
        plt.title(f'Cluster {cluster_id}: Actual vs. Predicted')
        plt.xlabel('Actual Resolution Hours')
        plt.ylabel('Predicted Resolution Hours')
        plt.savefig(f'{results_dir}/cluster_{cluster_id}_final_predictions.png')
        plt.show()

    # Save cluster metrics
    if cluster_metrics:
        cluster_metrics_df = pd.DataFrame(cluster_metrics)
        cluster_metrics_df.to_csv(f'{results_dir}/cluster_performance_metrics.csv', index=False)

        # Bar chart of R² per cluster with the overall test R² as reference
        plt.figure(figsize=(10, 6))
        cluster_ids = list(cluster_metrics_df['cluster_id'])
        r2_values = list(cluster_metrics_df['r2'])

        plt.bar(cluster_ids, r2_values)
        plt.axhline(y=r2, color='r', linestyle='--', label=f'Overall R² = {r2:.4f}')
        plt.title('Model Performance (R²) by Project Cluster')
        plt.xlabel('Cluster')
        plt.ylabel('R²')
        # The axis is fixed to [0, 1]; a negative R² is therefore not visible.
        plt.ylim(0, 1)  # R² typically ranges from 0 to 1
        plt.legend()
        plt.savefig(f'{results_dir}/cluster_performance_comparison.png')
        plt.show()

# Save the test results
# Overall test metrics plus a flag recording whether cluster models were used.
final_metrics = dict(
    mae=mae,
    rmse=rmse,
    r2=r2,
    median_ae=median_ae,
    mape=mape,
    cluster_based=cluster_based_modeling,
)

with open(f'{results_dir}/final_test_metrics.json', 'w') as f:
    import json
    f.write(json.dumps(final_metrics, indent=4))

print("\nFinal model evaluation complete.")

In [ ]:
# 12. Example: Predict Effort for New Tasks
print("Example: Predicting effort for new tasks")

# Check if we're using cluster-based modeling: a loadable cluster_models.pkl
# switches it on, otherwise the single best model is loaded. `except
# Exception` (instead of a bare `except:`) keeps the fallback without
# swallowing KeyboardInterrupt/SystemExit.
try:
    with open(f'{results_dir}/cluster_models.pkl', 'rb') as f:
        cluster_models = pickle.load(f)
except Exception as exc:
    cluster_based_modeling = False
    if not isinstance(exc, FileNotFoundError):
        # A missing file is the normal "global only" case; anything else is
        # worth reporting before falling back.
        print(f"Could not load cluster models ({exc}); falling back to global model")
    with open(f'{results_dir}/best_model.pkl', 'rb') as f:
        best_model = pickle.load(f)
    print("Using global model for all predictions")
else:
    cluster_based_modeling = True
    print(f"Using cluster-based prediction with {len(cluster_models)} clusters")

# Define function to predict task effort
def predict_task_effort(task_features, project_cluster=None):
    """
    Predict resolution hours for a new task using the pre-scaled feature space.

    Model choice: the model of ``project_cluster`` when cluster-based
    modeling is active and that cluster has one; otherwise the unknown
    cluster (-1) model when a cluster was given; otherwise the global best
    model loaded from ``best_model.pkl``.

    Parameters:
    -----------
    task_features : dict
        Dictionary of task features (must match the feature names in the scaled dataset).
        Features missing from the dict default to 0; extra keys are ignored.
        The dict is not modified.
    project_cluster : int, optional
        Project cluster ID, if known

    Returns:
    --------
    float
        Predicted resolution hours
    """
    # Work on a copy so the caller's dictionary is never mutated.
    features = dict(task_features)

    # Add cluster information if provided
    if project_cluster is not None and cluster_based_modeling:
        features['project_cluster'] = project_cluster

    # Align to the training columns in one step: columns absent from the
    # input are created with 0, columns unknown to the model are dropped.
    task_df = pd.DataFrame([features]).reindex(columns=X.columns, fill_value=0)

    # Choose appropriate model based on cluster
    if cluster_based_modeling and project_cluster is not None and project_cluster in cluster_models:
        model = cluster_models[project_cluster]
        print(f"Using cluster {project_cluster} specific model")
    elif cluster_based_modeling and project_cluster is not None and -1 in cluster_models:
        model = cluster_models[-1]  # Use unknown cluster model
        print("Using unknown cluster model")
    else:
        # Use best global model
        with open(f'{results_dir}/best_model.pkl', 'rb') as f:
            model = pickle.load(f)
        print("Using global model")

    # Make prediction (no scaling needed as we're already in the scaled feature space)
    pred_log = model.predict(task_df)[0]

    # Transform prediction back from log space if needed
    if use_log_transform:
        prediction = np.expm1(pred_log)
    else:
        prediction = pred_log

    return prediction

# Get a list of available project clusters (-1 is the "unknown cluster" bucket)
available_clusters = []
if cluster_based_modeling:
    available_clusters = sorted(cid for cid in cluster_models if cid != -1)
    print(f"Available project clusters: {available_clusters}")

print("\nWARNING: These are example predictions. In practice, you would need to:")
print("1. Transform new raw data into the same scaled feature space")
print("2. Ensure feature names match those in the training dataset exactly")
print("3. Apply the same preprocessing steps used on the original data\n")

# Pick the ten most influential features so the examples vary the inputs that matter
top_feature_names = []
try:
    with open(f'{results_dir}/best_model_feature_importance.csv', 'r') as f:
        feature_importance = pd.read_csv(f)
    top_feature_names = feature_importance.head(10)['Feature'].tolist()
except Exception:
    # No usable importance file: derive the ranking from the model if possible.
    # `best_model` is only assigned earlier in this cell when the cluster models
    # could not be loaded, so look it up defensively instead of risking NameError.
    fallback_model = globals().get('best_model')
    if hasattr(fallback_model, 'feature_importances_'):
        feat_importance = pd.DataFrame({
            'Feature': X.columns,
            'Importance': fallback_model.feature_importances_,
        })
        top_feature_names = (
            feat_importance.sort_values('Importance', ascending=False)
            .head(10)['Feature']
            .tolist()
        )
    else:
        # Last resort: the first ten training columns
        top_feature_names = X.columns[:10].tolist()

print(f"Top 10 influential features: {top_feature_names}")

# Example tasks use values in the range of the training data: each profile maps a
# label to the statistic taken over every top feature (25th pct / median / 75th pct).
complexity_profiles = [
    ("Low complexity task", lambda column: column.quantile(0.25)),
    ("Medium complexity task", lambda column: column.median()),
    ("High complexity task", lambda column: column.quantile(0.75)),
]
example_tasks = [
    {
        "name": profile_name,
        "features": {feat: summarize(X[feat]) for feat in top_feature_names},
        "cluster": None,
    }
    for profile_name, summarize in complexity_profiles
]

# Add a cluster-specific example if clusters are available
if available_clusters:
    # Choose a cluster for demonstration
    demo_cluster = available_clusters[0]

    # The cluster's own rows can only be selected when X carries the cluster column
    if 'project_cluster' in X.columns:
        X_cluster = X[X['project_cluster'] == demo_cluster]

        if not X_cluster.empty:
            example_tasks.append({
                "name": f"Cluster {demo_cluster} specific task",
                "features": {feat: X_cluster[feat].median() for feat in top_feature_names},
                "cluster": demo_cluster,
            })

print("\nPredicted resolution times:")
for task in example_tasks:
    # Pass a copy so the stored example features stay exactly as constructed
    hours = predict_task_effort(dict(task["features"]), task["cluster"])
    days = hours / 24
    workdays = hours / 8  # Assuming 8-hour workdays

    cluster_info = f" (Cluster {task['cluster']})" if task["cluster"] is not None else ""
    print(f"{task['name']}{cluster_info}: {hours:.2f} hours ({days:.2f} calendar days, {workdays:.2f} work days)")

# Save the prediction function
# The text below is written verbatim as a standalone module, so it cannot rely on
# any notebook globals: everything it needs is loaded from pickle files.
with open(f'{results_dir}/task_effort_predictor.py', 'w') as f:
    f.write("""
import pandas as pd
import numpy as np
import pickle
import os

def predict_task_effort(task_features, model_path='best_model.pkl', cluster_models_path='cluster_models.pkl', project_cluster=None, splits_path='data_splits.pkl'):
    \"\"\"
    Predict resolution hours for a new task using pre-scaled features.
    
    Relative paths are resolved against the current working directory.
    NOTE: pickle files can execute arbitrary code when loaded; only pass
    paths to artifacts you trust.
    
    Parameters:
    -----------
    task_features : dict
        Dictionary of task features in the scaled feature space.
        The dictionary is copied and never modified.
    model_path : str
        Path to the trained model pickle file
    cluster_models_path : str
        Path to the cluster models pickle file (optional)
    project_cluster : int, optional
        Project cluster ID, if known
    splits_path : str
        Path to the data splits pickle file; its 'X_train' columns define the
        expected features and its 'log_transform' entry (default True) tells
        whether predictions are in log space
    
    Returns:
    --------
    float
        Predicted resolution hours
    \"\"\"
    # Load the cluster models when the file exists and can be read
    cluster_models = {}
    cluster_based_modeling = False
    if os.path.exists(cluster_models_path):
        try:
            with open(cluster_models_path, 'rb') as f:
                cluster_models = pickle.load(f)
            cluster_based_modeling = True
        except Exception:
            # Unreadable cluster file: fall back to the global model
            cluster_models = {}
    
    # Copy so adding the cluster id does not modify the caller's dict
    features = dict(task_features)
    if project_cluster is not None and cluster_based_modeling:
        features['project_cluster'] = project_cluster
    
    # Get list of expected features and the target transform from the data splits
    with open(splits_path, 'rb') as f:
        splits = pickle.load(f)
    expected_features = splits['X_train'].columns
    use_log_transform = splits.get('log_transform', True)
    
    # One-row frame aligned to the training columns:
    # missing features default to 0, unknown keys are dropped
    task_df = pd.DataFrame([features]).reindex(columns=expected_features, fill_value=0)
    
    # Choose appropriate model based on cluster
    if cluster_based_modeling and project_cluster is not None and project_cluster in cluster_models:
        model = cluster_models[project_cluster]
    elif cluster_based_modeling and -1 in cluster_models:
        model = cluster_models[-1]  # Use unknown cluster model
    else:
        # Use best global model
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
    
    # Make prediction (no scaling needed for pre-scaled features)
    raw_prediction = model.predict(task_df)[0]
    
    # Transform prediction back from log space if needed
    if use_log_transform:
        return np.expm1(raw_prediction)
    return raw_prediction

def estimate_project_task_effort(project_features, task_features):
    \"\"\"
    Combine project-level and task-level features to estimate task effort.
    
    Parameters:
    -----------
    project_features : dict
        Dictionary of project features to determine cluster and project context
        (currently unused - see the placeholder notes below)
    task_features : dict
        Dictionary of task-specific features
    
    Returns:
    --------
    dict
        Dictionary containing prediction and confidence interval
    \"\"\"
    # This is a placeholder for a more sophisticated function that would:
    # 1. Determine project cluster from project features
    # 2. Adjust task features based on project context
    # 3. Generate prediction with confidence intervals
    
    # For now, we'll do a simple implementation on a copy of the task features
    all_features = dict(task_features)
    
    # Detect project cluster (placeholder)
    project_cluster = None
    
    # Make prediction
    hours = predict_task_effort(all_features, project_cluster=project_cluster)
    
    # Return results with placeholder confidence interval
    return {
        'prediction': hours,
        'days': hours / 24,
        'work_days': hours / 8,
        'confidence_low': hours * 0.7,  # Placeholder - would be calculated from model
        'confidence_high': hours * 1.3  # Placeholder - would be calculated from model
    }
""")

print("\nPrediction function saved to task_effort_predictor.py")
print("Model training and evaluation complete.")