## BMC

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import os
import glob

class BenchmarkComparator:
    def __init__(self):
        self.dataframes = {}
        self.comparison_dfs = {}
        
    def parse_xml_file(self, file_path, algorithm_name, input_type):
        """Parse XML file and extract benchmark results"""
        tree = ET.parse(file_path)
        root = tree.getroot()
        
        data = []
        for run in root.findall('run'):
            # Extract benchmark name and properties
            run_name = run.get('name', '')
            properties = run.get('properties', '')
            expected_verdict = run.get('expectedVerdict', '')
            
            # Extract column values
            columns = {}
            for col in run.findall('column'):
                title = col.get('title')
                value = col.get('value')
                columns[title] = value
            
            # Extract file information
            files_elem = run.get('files', '[]')
            
            # Extract algorithm from filename or use provided name
            if 'BMC' in algorithm_name.upper():
                algo_type = 'BMC'
            elif 'CEGAR' in algorithm_name.upper():
                if 'predicative' in algorithm_name.lower():
                    algo_type = 'CEGAR_Predicative'
                elif 'explicit' in algorithm_name.lower():
                    algo_type = 'CEGAR_Explicit'
                else:
                    algo_type = 'CEGAR'
            elif 'IMC' in algorithm_name.upper():
                algo_type = 'IMC'
            elif 'K-INDUCTION' in algorithm_name.upper() or 'K_INDUCTION' in algorithm_name.upper():
                algo_type = 'K_Induction'
            else:
                algo_type = algorithm_name
            
            row_data = {
                'algorithm': algo_type,
                'input_type': input_type,
                'run_name': run_name,
                'properties': properties,
                'expected_verdict': expected_verdict,
                'status': columns.get('status', ''),
                'cputime': float(columns.get('cputime', '0').replace('s', '')) if columns.get('cputime') else 0,
                'walltime': float(columns.get('walltime', '0').replace('s', '')) if columns.get('walltime') else 0,
                'memory': int(columns.get('memory', '0').replace('B', '')) if columns.get('memory') else 0,
                'host': columns.get('host', ''),
                'files': files_elem
            }
            
            data.append(row_data)
        
        return pd.DataFrame(data)
    
    def load_all_data(self, result_dir="result_xmls"):
        """Load all XML files from directory"""
        xml_files = glob.glob(os.path.join(result_dir, "*.xml"))
        
        if not xml_files:
            print(f"No XML files found in {result_dir}")
            return
        
        for xml_file in xml_files:
            filename = os.path.basename(xml_file)
            
            # Extract algorithm name from filename
            algo_name = filename
            for pattern in ['BMC', 'CEGAR', 'IMC', 'K-Induction', 'K_Induction']:
                if pattern in filename:
                    algo_name = pattern
                    break
            
            # Determine input type
            if 'btor2' in filename.lower():
                input_type = 'btor2'
            elif 'c-bit' in filename.lower() or '.c' in filename.lower():
                input_type = 'c'
            else:
                input_type = 'unknown'
            
            print(f"Loading {algo_name} ({input_type}) from {filename}")
            
            try:
                df = self.parse_xml_file(xml_file, algo_name, input_type)
                key = f"{algo_name}_{input_type}"
                self.dataframes[key] = df
                print(f"  Loaded {len(df)} benchmarks")
            except Exception as e:
                print(f"  Error loading {filename}: {e}")
    
    def load_specific_files(self, file_dict):
        """Load specific XML files with algorithm names"""
        for algo_name, file_info in file_dict.items():
            file_path = file_info['path']
            input_type = file_info.get('input_type', 'unknown')
            
            print(f"Loading {algo_name} ({input_type}) from {file_path}")
            
            try:
                df = self.parse_xml_file(file_path, algo_name, input_type)
                key = f"{algo_name}_{input_type}"
                self.dataframes[key] = df
                print(f"  Loaded {len(df)} benchmarks")
            except Exception as e:
                print(f"  Error loading {file_path}: {e}")
    
    def create_comparison_dataframes(self):
        """Create comparison dataframes for each algorithm across input types"""
        # Group dataframes by algorithm
        algorithm_groups = {}
        
        for key, df in self.dataframes.items():
            # Extract algorithm name (remove input type suffix)
            parts = key.split('_')
            algo_name = '_'.join(parts[:-1])  # Everything except the last part
            
            if algo_name not in algorithm_groups:
                algorithm_groups[algo_name] = {}
            
            # Store dataframe with input type
            input_type = parts[-1]
            algorithm_groups[algo_name][input_type] = df
        
        # Create comparison for each algorithm
        for algo_name, input_dfs in algorithm_groups.items():
            if 'btor2' in input_dfs and 'c' in input_dfs:
                # Extract benchmark identifiers for matching
                def extract_benchmark_id(run_name):
                    start = run_name.rfind("/") + 1
                    end = run_name.rfind(".yml")
                    return run_name[start:end] if end != -1 else run_name[start:]
                
                # Add benchmark IDs
                df_btor2 = input_dfs['btor2'].copy()
                df_c = input_dfs['c'].copy()
                
                df_btor2['benchmark_id'] = df_btor2['run_name'].apply(extract_benchmark_id)
                df_c['benchmark_id'] = df_c['run_name'].apply(extract_benchmark_id)
                
                # Merge dataframes on benchmark_id
                merged = pd.merge(
                    df_btor2, 
                    df_c, 
                    on='benchmark_id', 
                    suffixes=('_btor2', '_c'),
                    how='inner'
                )
                
                self.comparison_dfs[algo_name] = merged
                print(f"Created comparison for {algo_name}: {len(merged)} matched benchmarks")
        
        return self.comparison_dfs
    
    def create_algorithm_comparison_plots(self):
        """Create comparison plots for each algorithm"""
        if not self.comparison_dfs:
            self.create_comparison_dataframes()
        
        # Create individual plots for each algorithm
        for algo_name, comp_df in self.comparison_dfs.items():
            self._create_single_algorithm_plots(algo_name, comp_df)
        
        # Create aggregated comparison across all algorithms
        self._create_aggregated_comparison()
    
    def _create_single_algorithm_plots(self, algo_name, comp_df):
        """Write all BTOR2-vs-C plots for one algorithm to ``plots_<algo_name>``.

        The performance figure is a 2x2 grid: CPU time and wall time scatter
        plots on the top row, memory scatter plot and status bar chart on the
        bottom row. The speedup analysis and the status matrix are produced
        afterwards in the same directory.

        Args:
            algo_name: Algorithm label used in titles, the directory name and
                the file names.
            comp_df: Joined comparison table with ``*_btor2`` / ``*_c`` columns.
        """
        algo_dir = f"plots_{algo_name}"
        os.makedirs(algo_dir, exist_ok=True)

        def scatter_with_diagonal(ax, btor2_values, c_values, axis_label, title):
            # Scatter of C against BTOR2 plus the y = x line; points above the
            # line are benchmarks where C needed more than BTOR2.
            ax.scatter(btor2_values, c_values, alpha=0.6)
            upper = pd.concat([btor2_values, c_values]).max()
            ax.plot([0, upper], [0, upper], 'r--', alpha=0.8)
            ax.set_xlabel(f'BTOR2 {axis_label}')
            ax.set_ylabel(f'C {axis_label}')
            ax.set_title(title)
            ax.grid(True, alpha=0.3)

        # 1. Performance Comparison Plot
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle(f'Performance Comparison: {algo_name} - BTOR2 vs C Input Types', fontsize=16)

        scatter_with_diagonal(axes[0, 0], comp_df['cputime_btor2'], comp_df['cputime_c'],
                              'CPU Time (s)', 'CPU Time Comparison')
        # This cell of the grid used to stay blank; wall time was parsed but
        # never plotted.
        scatter_with_diagonal(axes[0, 1], comp_df['walltime_btor2'], comp_df['walltime_c'],
                              'Wall Time (s)', 'Wall Time Comparison')
        # Memory values are divided by 1e9 for display in GB.
        scatter_with_diagonal(axes[1, 0], comp_df['memory_btor2'] / 1e9, comp_df['memory_c'] / 1e9,
                              'Memory (GB)', 'Memory Usage Comparison')

        # Status Distribution: a status seen on only one side counts as 0 on
        # the other.
        status_counts = pd.DataFrame({
            'BTOR2': comp_df['status_btor2'].value_counts(),
            'C': comp_df['status_c'].value_counts()
        }).fillna(0)

        status_counts.plot(kind='bar', ax=axes[1, 1])
        axes[1, 1].set_title('Status Distribution Comparison')
        axes[1, 1].set_ylabel('Count')
        axes[1, 1].tick_params(axis='x', rotation=45)
        axes[1, 1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(algo_dir, f'{algo_name}_performance_comparison.png'),
                   dpi=300, bbox_inches='tight')
        plt.close()

        # 2. Speedup Analysis
        self._create_speedup_analysis(algo_name, comp_df, algo_dir)

        # 3. Status Comparison Matrix
        self._create_status_matrix(algo_name, comp_df, algo_dir)
    
    def _create_speedup_analysis(self, algo_name, comp_df, output_dir):
        """Plot histograms of the BTOR2/C ratios and print their averages.

        Adds the columns ``cpu_speedup``, ``wall_speedup`` and ``memory_ratio``
        (each the BTOR2 value divided by the C value) to ``comp_df`` in place.
        Infinite and undefined ratios are left out of the histograms and of the
        printed averages.

        Args:
            algo_name: Algorithm label used in titles and the file name.
            comp_df: Joined comparison table; modified in place.
            output_dir: Directory that receives ``<algo_name>_speedup_analysis.png``.
        """
        ratio_sources = (
            ('cpu_speedup', 'cputime'),
            ('wall_speedup', 'walltime'),
            ('memory_ratio', 'memory'),
        )
        for ratio_column, metric in ratio_sources:
            comp_df[ratio_column] = comp_df[f'{metric}_btor2'] / comp_df[f'{metric}_c']

        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        fig.suptitle(f'Speedup Analysis: {algo_name} - BTOR2 vs C', fontsize=16)

        # (ratio column, bar color, label of the ratio-1 line, x label, title)
        panels = (
            ('cpu_speedup', 'skyblue', 'Equal Performance',
             'CPU Speedup (BTOR2/C)', 'CPU Time Speedup Distribution'),
            ('wall_speedup', 'lightgreen', 'Equal Performance',
             'Wall Speedup (BTOR2/C)', 'Wall Time Speedup Distribution'),
            ('memory_ratio', 'salmon', 'Equal Memory',
             'Memory Ratio (BTOR2/C)', 'Memory Usage Ratio Distribution'),
        )

        finite_ratios = {}
        for ax, (column, color, baseline_label, xlabel, title) in zip(axes, panels):
            # Division by zero yields +/-inf (or NaN for 0/0); drop those rows.
            values = comp_df[column].replace([np.inf, -np.inf], np.nan).dropna()
            finite_ratios[column] = values

            ax.hist(values, bins=50, alpha=0.7, color=color)
            ax.axvline(1, color='red', linestyle='--', label=baseline_label)
            ax.set_xlabel(xlabel)
            ax.set_ylabel('Frequency')
            ax.set_title(title)
            ax.legend()
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        target = os.path.join(output_dir, f'{algo_name}_speedup_analysis.png')
        plt.savefig(target, dpi=300, bbox_inches='tight')
        plt.close()

        # Print summary statistics
        print(f"\n{algo_name} Speedup Analysis Summary:")
        print(f"Average CPU Speedup: {finite_ratios['cpu_speedup'].mean():.2f}")
        print(f"Average Wall Speedup: {finite_ratios['wall_speedup'].mean():.2f}")
        print(f"Average Memory Ratio: {finite_ratios['memory_ratio'].mean():.2f}")
    
    def _create_status_matrix(self, algo_name, comp_df, output_dir):
        """Save a heatmap that cross-tabulates BTOR2 status against C status.

        Args:
            algo_name: Algorithm label used in the title and the file name.
            comp_df: Joined comparison table with ``status_btor2`` / ``status_c``.
            output_dir: Directory that receives ``<algo_name>_status_matrix.png``.
        """
        transitions = pd.crosstab(
            comp_df['status_btor2'],
            comp_df['status_c'],
            margins=True
        )
        # The last row and column hold the margin totals; only the counts
        # themselves are drawn.
        counts_only = transitions.iloc[:-1, :-1]

        plt.figure(figsize=(10, 8))
        sns.heatmap(counts_only, annot=True, fmt='d', cmap='YlOrRd')
        plt.title(f'Status Transition Matrix: {algo_name} - BTOR2 vs C')
        plt.xlabel('C Status')
        plt.ylabel('BTOR2 Status')
        plt.tight_layout()

        target = os.path.join(output_dir, f'{algo_name}_status_matrix.png')
        plt.savefig(target, dpi=300, bbox_inches='tight')
        plt.close()
    
    def _create_aggregated_comparison(self):
        """Create aggregated comparison across all algorithms"""
        if len(self.comparison_dfs) < 2:
            print("Need at least 2 algorithms for aggregated comparison")
            return
        
        # Prepare aggregated data
        aggregated_data = []
        for algo_name, comp_df in self.comparison_dfs.items():
            # Calculate average metrics
            avg_cpu_btor2 = comp_df['cputime_btor2'].mean()
            avg_cpu_c = comp_df['cputime_c'].mean()
            avg_mem_btor2 = comp_df['memory_btor2'].mean() / 1e9  # GB
            avg_mem_c = comp_df['memory_c'].mean() / 1e9  # GB
            
            # Count statuses
            success_btor2 = len(comp_df[comp_df['status_btor2'] != 'TIMEOUT'])
            success_c = len(comp_df[comp_df['status_c'] != 'TIMEOUT'])
            total = len(comp_df)
            
            aggregated_data.append({
                'Algorithm': algo_name,
                'Avg_CPU_BTOR2': avg_cpu_btor2,
                'Avg_CPU_C': avg_cpu_c,
                'Avg_Mem_BTOR2': avg_mem_btor2,
                'Avg_Mem_C': avg_mem_c,
                'Success_BTOR2': success_btor2,
                'Success_C': success_c,
                'Total': total
            })
        
        agg_df = pd.DataFrame(aggregated_data)
        
        # Create aggregated plots
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        
        # CPU Time comparison across algorithms
        x = np.arange(len(agg_df))
        width = 0.35
        axes[0, 0].bar(x - width/2, agg_df['Avg_CPU_BTOR2'], width, label='BTOR2')
        axes[0, 0].bar(x + width/2, agg_df['Avg_CPU_C'], width, label='C')
        axes[0, 0].set_xlabel('Algorithm')
        axes[0, 0].set_ylabel('Average CPU Time (s)')
        axes[0, 0].set_title('Average CPU Time Comparison Across Algorithms')
        axes[0, 0].set_xticks(x)
        axes[0, 0].set_xticklabels(agg_df['Algorithm'], rotation=45)
        axes[0, 0].legend()
        axes[0, 0].grid(True, alpha=0.3)
        
        # Memory comparison across algorithms
        axes[0, 1].bar(x - width/2, agg_df['Avg_Mem_BTOR2'], width, label='BTOR2')
        axes[0, 1].bar(x + width/2, agg_df['Avg_Mem_C'], width, label='C')
        axes[0, 1].set_xlabel('Algorithm')
        axes[0, 1].set_ylabel('Average Memory (GB)')
        axes[0, 1].set_title('Average Memory Usage Comparison Across Algorithms')
        axes[0, 1].set_xticks(x)
        axes[0, 1].set_xticklabels(agg_df['Algorithm'], rotation=45)
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.3)
        
        # Success rate comparison
        success_rate_btor2 = agg_df['Success_BTOR2'] / agg_df['Total'] * 100
        success_rate_c = agg_df['Success_C'] / agg_df['Total'] * 100
        axes[1, 0].bar(x - width/2, success_rate_btor2, width, label='BTOR2')
        axes[1, 0].bar(x + width/2, success_rate_c, width, label='C')
        axes[1, 0].set_xlabel('Algorithm')
        axes[1, 0].set_ylabel('Success Rate (%)')
        axes[1, 0].set_title('Success Rate Comparison Across Algorithms')
        axes[1, 0].set_xticks(x)
        axes[1, 0].set_xticklabels(agg_df['Algorithm'], rotation=45)
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)
        
        # Speedup factors across algorithms
        cpu_speedup = agg_df['Avg_CPU_BTOR2'] / agg_df['Avg_CPU_C']
        mem_speedup = agg_df['Avg_Mem_BTOR2'] / agg_df['Avg_Mem_C']
        axes[1, 1].plot(agg_df['Algorithm'], cpu_speedup, 'o-', label='CPU Speedup')
        axes[1, 1].plot(agg_df['Algorithm'], mem_speedup, 's-', label='Memory Speedup')
        axes[1, 1].axhline(1, color='red', linestyle='--', label='Baseline (1.0)')
        axes[1, 1].set_xlabel('Algorithm')
        axes[1, 1].set_ylabel('Speedup Ratio (BTOR2/C)')
        axes[1, 1].set_title('Speedup Factors Across Algorithms')
        axes[1, 1].set_xticklabels(agg_df['Algorithm'], rotation=45)
        axes[1, 1].legend()
        axes[1, 1].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.savefig('aggregated_algorithm_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()
        
        return agg_df
    
    def generate_summary_reports(self):
        """Generate summary reports for each algorithm"""
        if not self.comparison_dfs:
            self.create_comparison_dataframes()
        
        for algo_name, comp_df in self.comparison_dfs.items():
            print("\n" + "="*60)
            print(f"SUMMARY REPORT: {algo_name}")
            print("="*60)
            
            print(f"\nTotal Benchmarks Compared: {len(comp_df)}")
            
            # Status summary
            print("\nStatus Distribution:")
            print("BTOR2:")
            print(comp_df['status_btor2'].value_counts())
            print("\nC:")
            print(comp_df['status_c'].value_counts())
            
            # Performance metrics
            print("\nPerformance Metrics (Average):")
            metrics = ['cputime', 'walltime', 'memory']
            for metric in metrics:
                btor2_avg = comp_df[f'{metric}_btor2'].mean()
                c_avg = comp_df[f'{metric}_c'].mean()
                ratio = btor2_avg / c_avg if c_avg > 0 else float('inf')
                print(f"{metric.upper()}: BTOR2={btor2_avg:.2f}, C={c_avg:.2f}, Ratio={ratio:.2f}")
            
            # Success rate comparison
            btor2_success = len(comp_df[comp_df['status_btor2'] != 'TIMEOUT'])
            c_success = len(comp_df[comp_df['status_c'] != 'TIMEOUT'])
            
            print(f"\nSuccess Rates:")
            print(f"BTOR2: {btor2_success/len(comp_df)*100:.1f}%")
            print(f"C: {c_success/len(comp_df)*100:.1f}%")
            print("="*60)


# Example usage
def main():
    """Run the example pipeline: load the result files, then plot and report."""
    bench = BenchmarkComparator()

    # Option 1: Load all XML files from directory
    print("Loading all XML files from result_xmls directory...")
    bench.load_all_data("result_xmls")

    # Option 2: Or load specific files with algorithm names
    """
    file_dict = {
        'BMC': {
            'path': "result_xmls\\theta_algos.2025-10-29_06-24-29.results.btor2-bounded.btor2.xml",
            'input_type': 'btor2'
        },
        'BMC_C': {
            'path': "result_xmls\\theta_algos.2025-10-29_06-24-29.results.c-bit-bounded.c-bit.xml",
            'input_type': 'c'
        },
        'CEGAR_Explicit': {
            'path': "path_to_cegar_explicit_btor2.xml",
            'input_type': 'btor2'
        },
        # ... add more algorithms
    }
    comparator.load_specific_files(file_dict)
    """

    # Nothing was loaded: say so and stop.
    if not bench.dataframes:
        print("No data loaded. Please check your XML files.")
        return

    # Each analysis step is announced before it runs.
    steps = (
        ("\nCreating comparison dataframes...", bench.create_comparison_dataframes),
        ("\nGenerating plots for each algorithm...", bench.create_algorithm_comparison_plots),
        ("\nGenerating summary reports...", bench.generate_summary_reports),
    )
    for announcement, run_step in steps:
        print(announcement)
        run_step()

    print("\nAll analyses completed! Check the generated plot directories.")

# Run the example pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()

## All Algos

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

class AlgorithmComparator:
    def __init__(self, folder_path):
        self.folder_path = folder_path
        self.all_data = pd.DataFrame()
        self.algorithm_results = {}
        
    def detect_algorithm_from_filename(self, filename):
        """Detect algorithm type from filename patterns"""
        filename_lower = filename.lower()
        
        if 'c-bit' in filename_lower:
            input_type = 'C'
        elif 'btor2' in filename_lower:
            input_type = 'BTOR2'
        else:
            input_type = 'UNKNOWN'
            
        # Detect algorithm
        if 'cegar' in filename_lower and 'pred' in filename_lower:
            algorithm = 'CEGAR_PRED'
        elif 'cegar' in filename_lower and 'expl' in filename_lower:
            algorithm = 'CEGAR_EXPL'
        elif 'bounded' in filename_lower:
            algorithm = 'BMC'
        elif 'imc' in filename_lower:
            algorithm = 'IMC'
        elif 'kind' in filename_lower:
            algorithm = 'K-IND'
        elif 'ic3' in filename_lower:
            algorithm = 'IC3'
        else:
            algorithm = 'UNKNOWN'
            
        return algorithm, input_type
    
    def parse_xml_file(self, file_path, algorithm, input_type):
        """Parse XML file and extract benchmark results"""
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
            
            data = []
            for run in root.findall('run'):
                # Extract benchmark name and properties
                run_name = run.get('name', '')
                properties = run.get('properties', '')
                expected_verdict = run.get('expectedVerdict', '')
                
                # Extract column values
                columns = {}
                for col in run.findall('column'):
                    title = col.get('title')
                    value = col.get('value')
                    columns[title] = value
                
                # Extract benchmark category
                benchmark_category = self.extract_benchmark_category(run_name)
                
                # FIX: Correct success detection - consider both 'true' and 'false' status as successful if they complete
                status = columns.get('status', '')
                # Consider runs with status 'true', 'false', or 'correct' as successful completions
                is_successful = status in ['true', 'false', 'correct', 'false(unreach-call)', ]
                # Consider timeouts and OOM as failures
                is_timeout = status == 'timeout' or 'timeout' in status.lower()
                is_oom = status == 'out of memory' or 'memory' in status.lower()
                
                row_data = {
                    'algorithm': algorithm,
                    'input_type': input_type,
                    'file_name': os.path.basename(file_path),
                    'run_name': run_name,
                    'benchmark_category': benchmark_category,
                    'benchmark_name': self.extract_benchmark_name(run_name),
                    'properties': properties,
                    'expected_verdict': expected_verdict,
                    'status': status,
                    'cputime': self.safe_float(columns.get('cputime', '0').replace('s', '')),
                    'walltime': self.safe_float(columns.get('walltime', '0').replace('s', '')),
                    'memory': self.safe_int(columns.get('memory', '0').replace('B', '')),
                    'host': columns.get('host', ''),
                    'success': 1 if is_successful else 0,
                    'timeout': 1 if is_timeout else 0,
                    'oom': 1 if is_oom else 0
                }
                
                data.append(row_data)
            
            return pd.DataFrame(data)
            
        except Exception as e:
            print(f"Error parsing {file_path}: {e}")
            return pd.DataFrame()
    
    def safe_float(self, value):
        """Safely convert to float"""
        try:
            return float(value)
        except:
            return 0.0
    
    def safe_int(self, value):
        """Safely convert to int"""
        try:
            return int(value)
        except:
            return 0
    
    def extract_benchmark_category(self, run_name):
        """Extract benchmark category from run name"""
        categories = ['adding', 'anderson', 'at', 'bakery', 'blocks', 'bridge', 'brp', 'brp2']
        for category in categories:
            if category in run_name.lower():
                return category
        return 'other'
    
    def extract_benchmark_name(self, run_name):
        """Extract simplified benchmark name"""
        # Extract the core benchmark identifier
        start = run_name.rfind("/") + 1
        end = run_name.rfind(".yml")
        return run_name[start:end] if end != -1 else run_name[start:]
    
    def load_all_data(self):
        """Load and parse all XML files in the folder"""
        xml_files = list(Path(self.folder_path).glob('*.xml'))
        print(f"Found {len(xml_files)} XML files")
        
        all_dfs = []
        
        for xml_file in xml_files:
            algorithm, input_type = self.detect_algorithm_from_filename(xml_file.name)
            print(f"Processing: {xml_file.name} -> {algorithm}/{input_type}")
            
            df = self.parse_xml_file(xml_file, algorithm, input_type)
            if not df.empty:
                all_dfs.append(df)
                print(f"  Loaded {len(df)} runs, {df['success'].sum()} successful")
        
        if all_dfs:
            self.all_data = pd.concat(all_dfs, ignore_index=True)
            print(f"\nLoaded {len(self.all_data)} benchmark results")
            print(f"Algorithms: {self.all_data['algorithm'].unique()}")
            print(f"Input types: {self.all_data['input_type'].unique()}")
            
            # Print success statistics by algorithm and input type
            print("\nSuccess counts by algorithm and input type:")
            success_counts = self.all_data.groupby(['algorithm', 'input_type'])['success'].agg(['sum', 'count'])
            success_counts['success_rate'] = (success_counts['sum'] / success_counts['count'] * 100).round(1)
            print(success_counts)
        else:
            print("No data loaded!")
        
        return self.all_data
    
    def calculate_algorithm_metrics(self):
        """Calculate performance metrics for each algorithm"""
        if self.all_data.empty:
            self.load_all_data()
        
        metrics = []
        
        for (algorithm, input_type), group in self.all_data.groupby(['algorithm', 'input_type']):
            total_benchmarks = len(group)
            successful_count = group['success'].sum()
            success_rate = (successful_count / total_benchmarks) * 100 if total_benchmarks > 0 else 0
            timeout_rate = group['timeout'].mean() * 100
            oom_rate = group['oom'].mean() * 100
            
            # Average performance metrics (only for successful runs)
            successful_runs = group[group['success'] == 1]
            avg_cputime = successful_runs['cputime'].mean() if len(successful_runs) > 0 else 0
            avg_walltime = successful_runs['walltime'].mean() if len(successful_runs) > 0 else 0
            avg_memory = successful_runs['memory'].mean() if len(successful_runs) > 0 else 0
            
            metrics.append({
                'algorithm': algorithm,
                'input_type': input_type,
                'total_benchmarks': total_benchmarks,
                'success_rate': success_rate,
                'timeout_rate': timeout_rate,
                'oom_rate': oom_rate,
                'avg_cputime': avg_cputime,
                'avg_walltime': avg_walltime,
                'avg_memory_gb': avg_memory / 1e9,
                'successful_runs': successful_count
            })
        
        self.algorithm_metrics = pd.DataFrame(metrics)
        return self.algorithm_metrics
    
    def plot_success_rates(self):
        """Plot success rate, timeout rate, CPU time and memory per algorithm.

        Draws a 2x2 grid of grouped bar charts (one bar group per algorithm,
        one bar per input type), saves it as
        ``algorithm_performance_comparison.png`` in the current directory and
        shows it. The two lower panels only include (algorithm, input type)
        pairs that have at least one successful run.

        The font settings apply to this figure only: they are set through a
        matplotlib rc context, so the caller's rcParams are restored afterwards
        (also when plotting fails) instead of being reset to the library
        defaults.
        """
        if not hasattr(self, 'algorithm_metrics'):
            self.calculate_algorithm_metrics()

        # Times New Roman with enlarged text sizes.
        font_settings = {
            'font.family': 'Times New Roman',
            'font.size': 14,
            'axes.titlesize': 16,
            'axes.labelsize': 14,
            'xtick.labelsize': 12,
            'ytick.labelsize': 12,
            'legend.fontsize': 12,
        }

        all_metrics = self.algorithm_metrics
        successful_metrics = all_metrics[all_metrics['successful_runs'] > 0]

        with plt.rc_context(font_settings):
            fig, axes = plt.subplots(2, 2, figsize=(18, 14))
            fig.suptitle('Algorithm Performance Comparison', fontsize=20, fontweight='bold')

            # (axes, source table, value column, bar colors, title, y label)
            panels = (
                (axes[0, 0], all_metrics, 'success_rate', ['skyblue', 'lightcoral'],
                 'Success Rate by Algorithm and Input Type', 'Success Rate (%)'),
                (axes[0, 1], all_metrics, 'timeout_rate', ['skyblue', 'lightcoral'],
                 'Timeout Rate by Algorithm and Input Type', 'Timeout Rate (%)'),
                (axes[1, 0], successful_metrics, 'avg_cputime', ['lightgreen', 'orange'],
                 'Average CPU Time (Successful Runs)', 'CPU Time (s)'),
                (axes[1, 1], successful_metrics, 'avg_memory_gb', ['lightgreen', 'orange'],
                 'Average Memory Usage (Successful Runs)', 'Memory (GB)'),
            )

            for ax, table, column, colors, title, ylabel in panels:
                if table.empty:
                    # Nothing to draw; leave the panel blank.
                    continue
                # Rows: algorithms, columns: input types.
                pivoted = table.pivot(index='algorithm', columns='input_type', values=column)
                pivoted.plot(kind='bar', ax=ax, color=colors)
                ax.set_title(title, fontweight='bold', fontsize=16)
                ax.set_ylabel(ylabel, fontsize=14)
                ax.set_xlabel('Algorithm', fontsize=14)
                ax.tick_params(axis='x', rotation=45)
                ax.legend(title='Input Type', title_fontsize=12)
                ax.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.savefig('algorithm_performance_comparison.png', dpi=300, bbox_inches='tight')
            plt.show()
    
    def plot_benchmark_category_analysis(self):
        """Plot a success-rate heatmap per benchmark category and return the stats.

        Loads the data first when ``self.all_data`` is empty. The runs are then
        grouped by (benchmark_category, algorithm, input_type); ``success``,
        ``cputime`` and ``memory`` are averaged and ``run_name`` is counted.
        The success rates (in percent) are drawn as a heatmap, written to
        ``category_success_heatmap.png`` and shown.

        Returns:
            DataFrame with one row per (benchmark_category, algorithm,
            input_type) holding the aggregated columns plus ``success_rate``.
        """
        if self.all_data.empty:
            self.load_all_data()

        # Calculate success rates by category and algorithm
        category_stats = self.all_data.groupby(
            ['benchmark_category', 'algorithm', 'input_type']
        ).agg({
            'success': 'mean',
            'cputime': 'mean',
            'memory': 'mean',
            'run_name': 'count'
        }).reset_index()

        category_stats['success_rate'] = category_stats['success'] * 100

        # Pivot for heatmap; combinations without any run are filled with 0
        pivot_success = category_stats.pivot_table(
            index='benchmark_category',
            columns=['algorithm', 'input_type'],
            values='success_rate',
            aggfunc='mean'
        ).fillna(0)

        # Scope the font override to this figure only. rc_context restores the
        # caller's rcParams on exit (also when plotting raises), whereas
        # rcParams.update(rcParamsDefault) would discard every setting the
        # session had customised.
        font_settings = {'font.family': 'Times New Roman', 'font.size': 12}
        with plt.rc_context(font_settings):
            plt.figure(figsize=(16, 12))
            sns.heatmap(pivot_success, annot=True, fmt='.1f', cmap='RdYlGn',
                        center=50, vmin=0, vmax=100, annot_kws={"size": 10})
            plt.title('Success Rate by Benchmark Category and Algorithm (%)',
                      fontsize=16, fontweight='bold')
            plt.tight_layout()
            plt.savefig('category_success_heatmap.png', dpi=300, bbox_inches='tight')
            plt.show()

        return category_stats
    
    def rank_algorithms(self):
        """Rank algorithms based on multiple criteria"""
        if not hasattr(self, 'algorithm_metrics'):
            self.calculate_algorithm_metrics()
        
        # Create scoring system
        ranking_data = self.algorithm_metrics.copy()
        
        # Normalize metrics (higher is better for success, lower is better for time/memory)
        ranking_data['score_success'] = ranking_data['success_rate'] / 100
        
        # Only calculate time/memory scores for algorithms with successful runs
        max_cputime = ranking_data['avg_cputime'].max()
        max_memory = ranking_data['avg_memory_gb'].max()
        
        ranking_data['score_time'] = ranking_data.apply(
            lambda x: 1 - (x['avg_cputime'] / max_cputime) if max_cputime > 0 and x['successful_runs'] > 0 else 0, 
            axis=1
        )
        ranking_data['score_memory'] = ranking_data.apply(
            lambda x: 1 - (x['avg_memory_gb'] / max_memory) if max_memory > 0 and x['successful_runs'] > 0 else 0, 
            axis=1
        )
        
        # Combined score (weighted)
        ranking_data['combined_score'] = (
            0.5 * ranking_data['score_success'] +  # Success rate is most important
            0.3 * ranking_data['score_time'] +     # Time performance
            0.2 * ranking_data['score_memory']     # Memory efficiency
        )
        
        # Rank within each input type
        ranking_data['rank'] = ranking_data.groupby('input_type')['combined_score'].rank(ascending=False)
        
        # Sort by input type and rank
        ranking_data = ranking_data.sort_values(['input_type', 'rank'])
        
        return ranking_data[['algorithm', 'input_type', 'success_rate', 'avg_cputime', 
                           'avg_memory_gb', 'combined_score', 'rank', 'successful_runs']]
    
    def generate_comprehensive_report(self):
        """Print a textual performance report and save the results as CSV.

        Loads the data first when ``self.all_data`` is empty. The report
        covers the dataset size, the overall success / timeout /
        out-of-memory rates, the per-input-type rankings returned by
        ``rank_algorithms`` and the row with the highest combined score.

        Side effects:
            Writes ``all_benchmark_results.csv`` and
            ``algorithm_rankings.csv`` to the working directory. The two PNG
            names in the closing summary are only listed; this method does
            not write them.
        """
        print("="*80)
        print("COMPREHENSIVE ALGORITHM PERFORMANCE ANALYSIS")
        print("="*80)

        if self.all_data.empty:
            self.load_all_data()

        # Basic statistics
        total_benchmarks = len(self.all_data)
        unique_algorithms = self.all_data['algorithm'].nunique()
        unique_benchmarks = self.all_data['benchmark_name'].nunique()

        print("\nDataset Overview:")
        print(f"Total benchmark runs: {total_benchmarks}")
        print(f"Unique algorithms: {unique_algorithms}")
        print(f"Unique benchmarks: {unique_benchmarks}")

        # Overall success rates
        overall_success = self.all_data['success'].mean() * 100
        overall_timeout = self.all_data['timeout'].mean() * 100
        overall_oom = self.all_data['oom'].mean() * 100

        print("\nOverall Statistics:")
        print(f"Success rate: {overall_success:.1f}%")
        print(f"Timeout rate: {overall_timeout:.1f}%")
        print(f"Out-of-memory rate: {overall_oom:.1f}%")

        # Algorithm rankings
        rankings = self.rank_algorithms()

        print("\n🏆 ALGORITHM RANKINGS:")
        for input_type in ['BTOR2', 'C']:
            print(f"\n{input_type} Input Type:")
            input_rankings = rankings[rankings['input_type'] == input_type]
            for _, row in input_rankings.iterrows():
                print(f"  {row['rank']:.0f}. {row['algorithm']}: "
                      f"Success={row['success_rate']:.1f}% ({row['successful_runs']} runs), "
                      f"CPU Time={row['avg_cputime']:.1f}s, "
                      f"Score={row['combined_score']:.3f}")

        # Best overall algorithm. Rows without a score are ignored so that
        # idxmax never has to deal with an all-NaN column.
        scored = rankings.dropna(subset=['combined_score'])
        if not scored.empty:
            best_overall = scored.loc[scored['combined_score'].idxmax()]
            print("\n🎯 BEST OVERALL ALGORITHM:")
            print(f"  {best_overall['algorithm']} ({best_overall['input_type']})")
            print(f"  Success Rate: {best_overall['success_rate']:.1f}%")
            print(f"  Successful Runs: {best_overall['successful_runs']}")
            print(f"  Average CPU Time: {best_overall['avg_cputime']:.1f}s")
            print(f"  Performance Score: {best_overall['combined_score']:.3f}")

        # Save detailed results
        self.all_data.to_csv('all_benchmark_results.csv', index=False)
        rankings.to_csv('algorithm_rankings.csv', index=False)

        print("\n📊 Detailed results saved to:")
        print("  - all_benchmark_results.csv")
        print("  - algorithm_rankings.csv")
        print("  - algorithm_performance_comparison.png")
        print("  - category_success_heatmap.png")
    
    def run_complete_analysis(self):
        """Run the whole pipeline: load, compute metrics, plot, report.

        Calls, in order, ``load_all_data``, ``calculate_algorithm_metrics``,
        ``plot_success_rates``, ``plot_benchmark_category_analysis`` and
        ``generate_comprehensive_report``, printing a start and a completion
        message around them.
        """
        print("Starting comprehensive algorithm analysis...")

        # Load data
        self.load_all_data()

        # Calculate metrics
        self.calculate_algorithm_metrics()

        # Generate visualizations
        self.plot_success_rates()
        self.plot_benchmark_category_analysis()

        # Generate report
        self.generate_comprehensive_report()

        print("\n✅ Analysis complete!")

# Example usage
def main(folder_path="result_xmls"):
    """Run the complete analysis on the XML files in *folder_path*.

    Args:
        folder_path: Folder containing all XML files to analyse. Defaults to
            ``"result_xmls"``; pass another path to analyse a different folder.
    """
    # Initialize comparator
    comparator = AlgorithmComparator(folder_path)

    # Run complete analysis
    comparator.run_complete_analysis()

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()