In [9]:

import os
import time
import logging
from Classification.Classification import ProjectClassificationSystem
from ClusterRegression.ClusterRegression import ClusterRegressionModule

# Configure root logging once: every record goes both to a log file in the
# current working directory and to the console stream.
_log_file_handler = logging.FileHandler("see_pipeline.log")
_log_console_handler = logging.StreamHandler()

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[_log_file_handler, _log_console_handler],
)

# Module-level logger used by the pipeline functions below.
logger = logging.getLogger(__name__)

def _log_banner(title):
    """Log *title* framed by two 50-character '=' separator lines."""
    separator = "=" * 50
    logger.info(separator)
    logger.info(title)
    logger.info(separator)


def run_pipeline(data_files, target_column="total_resolution_hours", output_dir="see_results"):
    """
    Run the two-stage software effort estimation pipeline

    Stage 1 runs ``ProjectClassificationSystem.execute_classification_pipeline``
    on the input files; Stage 2 runs ``ClusterRegressionModule.run`` on the
    classified-projects CSV expected under
    ``<output_dir>/classification/results/classified_projects.csv``.

    Parameters:
    -----------
    data_files : list
        List of data files to process
    target_column : str
        Target column to predict
    output_dir : str
        Output directory for results

    Returns:
    --------
    dict
        ``{'classification': {'classified_df', 'optimal_k',
        'cluster_profiles'}, 'regression': <value returned by
        ClusterRegressionModule.run>}``
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    class_dir = os.path.join(output_dir, "classification")
    reg_dir = os.path.join(output_dir, "regression")

    # Location where Stage 2 looks for the Stage 1 output.
    # NOTE(review): assumes the classifier writes this file itself - confirm.
    classified_path = os.path.join(class_dir, "results", "classified_projects.csv")

    # Stage 1: Classification
    _log_banner("STAGE 1: PROJECT CLASSIFICATION")

    classifier = ProjectClassificationSystem(output_dir=class_dir)

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    stage_start = time.perf_counter()
    classified_df, optimal_k, cluster_profiles = classifier.execute_classification_pipeline(
        file_paths=data_files,
        target_column=target_column
    )
    class_time = time.perf_counter() - stage_start
    logger.info("Classification completed in %.2f seconds", class_time)

    # Surface a missing hand-off file early; the regression stage is still
    # invoked so existing behaviour is unchanged.
    if not os.path.isfile(classified_path):
        logger.warning("Expected classification output not found: %s", classified_path)

    # Stage 2: Regression
    _log_banner("STAGE 2: CLUSTER-BASED REGRESSION")

    regression = ClusterRegressionModule(output_dir=reg_dir)

    stage_start = time.perf_counter()
    regression_results = regression.run(
        classified_projects_path=classified_path,
        target_col=target_column
    )
    reg_time = time.perf_counter() - stage_start
    logger.info("Regression completed in %.2f seconds", reg_time)

    # Summary
    _log_banner("PIPELINE EXECUTION COMPLETE")
    logger.info("Total execution time: %.2f seconds", class_time + reg_time)
    logger.info("Results saved to: %s", output_dir)

    return {
        'classification': {
            'classified_df': classified_df,
            'optimal_k': optimal_k,
            'cluster_profiles': cluster_profiles
        },
        'regression': regression_results
    }

if __name__ == "__main__":
    # Example usage: run both stages on a single exported dataset.
    # A second export, "../DataSets/data_export_1741772203780.csv", is
    # available but currently left out of the run.
    data_files = ["../DataSets/data_export_1741699774916.csv"]

    results = run_pipeline(
        data_files,
        target_column="total_resolution_hours",
        output_dir="see_results",
    )

2025-03-12 23:51:13,432 - INFO - STAGE 1: PROJECT CLASSIFICATION




Loaded ../DataSets/data_export_1741699774916.csv: 159 rows, 100 columns
Starting data cleaning...
Replacing 1 negative values in min_resolution_hours with 0
Dropped 6 columns with constant values
Cleaning complete. Resulting dataset: 159 rows, 94 columns
Added team metrics: ['creator_count', 'reporter_count', 'team_size_estimate', 'issues_per_team_member', 'resolution_hours_per_team_member', 'team_role_diversity']

Analyzing feature importance for predicting total_resolution_hours...
Identified 8 features that account for 90% of importance
Top 5 most important features: issue_count, issuetype.name_Suggestion, issuetype.name_Bug, issuetype.name_Support Request, issuetype.name_Public Security Vulnerability
Analyzed correlations between 8 key features

Using Elbow Method to determine optimal number of clusters...
Elbow method suggests optimal number of clusters: 3

Using Silhouette Analysis to determine optimal number of clusters...
Silhouette analysis suggests optimal number of cluster

2025-03-12 23:51:15,723 - INFO - Classification completed in 2.29 seconds
2025-03-12 23:51:15,724 - INFO - STAGE 2: CLUSTER-BASED REGRESSION



Medium High-effort Projects (Cluster 0) (156 projects, 98.1%):
  Sample projects: WINDUP - Red Hat Application Migration Toolkit, Atlassian Marketplace, apiman (API Management)
  Distinguishing characteristics:
    - issuetype.name_Public Security Vulnerability: 72.2% lower than average
    - creator_count: 65.6% lower than average
    - reporter_count: 63.3% lower than average
    - issuetype.name_Support Request: 59.2% lower than average
    - issuetype.name_Bug: 52.2% lower than average

Large High-effort Projects (Cluster 1) (2 projects, 1.3%):
  Sample projects: Minecraft: Java Edition, Minecraft (Bedrock codebase)
  Distinguishing characteristics:
    - creator_count: 4917.0% higher than average
    - reporter_count: 4745.5% higher than average
    - issuetype.name_Bug: 3911.2% higher than average
    - issue_count: 2564.0% higher than average
    - issuetype.name_Suggestion: 100.0% lower than average

Large High-effort Projects (Cluster 2) (1 projects, 0.6%):
  Sample projects:

TypeError: Could not convert ['Cluster 0'] to numeric