# # Performance Analysis: Tree Training and Cross-Validation
# 
# This notebook compares the performance of the serial and parallel implementations of:
# 1. Decision Tree Training
# 2. Cross-Validation
# 
# We'll examine performance across different tree depths, thread counts, and datasets (cancer, hmeq).


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Global plot appearance: matplotlib style sheet first, then the seaborn palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Notebook-wide defaults for figure size and base font size
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

In [None]:
def load_tree_benchmark_data(thread_counts=(1, 2, 3, 4), data_dir='.'):
    """Load the serial and parallel tree benchmark CSV files.

    Args:
        thread_counts: Iterable of thread counts. For each value ``n`` the file
            ``benchmark_results_parallel_{n}threads.csv`` is read. Defaults to
            ``(1, 2, 3, 4)``.
        data_dir: Directory that contains the CSV files. Defaults to the
            current working directory.

    Returns:
        Tuple ``(serial_df, parallel_df)``. ``serial_df`` is the content of
        ``benchmark_results_serial.csv``. ``parallel_df`` is the row-wise
        concatenation of the per-thread-count files (with a fresh 0..n-1
        index) plus a ``threads`` column holding the thread count each row
        was loaded for.

    Raises:
        ValueError: If ``thread_counts`` is empty (``pd.concat`` has nothing
            to concatenate).
        Errors raised by ``pd.read_csv`` (for example for a missing file)
        propagate unchanged.
    """
    # Local import keeps this block self-contained; pathlib is standard library.
    from pathlib import Path

    base_dir = Path(data_dir)

    serial_df = pd.read_csv(base_dir / 'benchmark_results_serial.csv')

    # Tag every per-thread file with its thread count so the rows remain
    # distinguishable after concatenation.
    parallel_frames = [
        pd.read_csv(
            base_dir / f'benchmark_results_parallel_{n_threads}threads.csv'
        ).assign(threads=n_threads)
        for n_threads in thread_counts
    ]
    parallel_df = pd.concat(parallel_frames, ignore_index=True)

    return serial_df, parallel_df

def load_cv_benchmark_data(thread_counts=(1, 2, 3, 4), data_dir='.'):
    """Load the serial and parallel cross-validation benchmark CSV files.

    Args:
        thread_counts: Iterable of thread counts. For each value ``n`` the file
            ``cv_results_parallel_{n}threads.csv`` is read. Defaults to
            ``(1, 2, 3, 4)``.
        data_dir: Directory that contains the CSV files. Defaults to the
            current working directory.

    Returns:
        Tuple ``(serial_cv_df, parallel_cv_df)``. ``serial_cv_df`` is the
        content of ``cv_results_serial.csv``. ``parallel_cv_df`` is the
        row-wise concatenation of the per-thread-count files (with a fresh
        0..n-1 index) plus a ``threads`` column holding the thread count each
        row was loaded for.

    Raises:
        ValueError: If ``thread_counts`` is empty (``pd.concat`` has nothing
            to concatenate).
        Errors raised by ``pd.read_csv`` (for example for a missing file)
        propagate unchanged.
    """
    # Local import keeps this block self-contained; pathlib is standard library.
    from pathlib import Path

    base_dir = Path(data_dir)

    serial_cv_df = pd.read_csv(base_dir / 'cv_results_serial.csv')

    # Tag every per-thread file with its thread count so the rows remain
    # distinguishable after concatenation.
    parallel_cv_frames = [
        pd.read_csv(
            base_dir / f'cv_results_parallel_{n_threads}threads.csv'
        ).assign(threads=n_threads)
        for n_threads in thread_counts
    ]
    parallel_cv_df = pd.concat(parallel_cv_frames, ignore_index=True)

    return serial_cv_df, parallel_cv_df

In [None]:
# Read every benchmark CSV into module-level frames and report the row counts
print("Loading tree benchmark data...")
serial_tree_df, parallel_tree_df = load_tree_benchmark_data()

print("Loading CV benchmark data...")
serial_cv_df, parallel_cv_df = load_cv_benchmark_data()

n_serial_tree, n_parallel_tree = len(serial_tree_df), len(parallel_tree_df)
n_serial_cv, n_parallel_cv = len(serial_cv_df), len(parallel_cv_df)
print(f"Tree data loaded: {n_serial_tree} serial rows, {n_parallel_tree} parallel rows")
print(f"CV data loaded: {n_serial_cv} serial CV rows, {n_parallel_cv} parallel CV rows")


## Data Overview


In [None]:
# Column names, dataset names and depth range of the tree benchmark frames
print("=== Tree Benchmark Data Structure ===")
print("Serial tree data columns:", list(serial_tree_df.columns))
print("Parallel tree data columns:", list(parallel_tree_df.columns))
print("\nUnique datasets:", serial_tree_df['dataset'].unique())
tree_depths = serial_tree_df['max_depth']
print("Tree depth range:", f"{tree_depths.min()} - {tree_depths.max()}")


In [None]:
# Column names, dataset names and depth range of the CV benchmark frames
print("=== CV Benchmark Data Structure ===")
print("Serial CV data columns:", list(serial_cv_df.columns))
print("Parallel CV data columns:", list(parallel_cv_df.columns))
print("\nUnique datasets:", serial_cv_df['dataset'].unique())
cv_depths = serial_cv_df['max_depth']
print("CV depth range:", f"{cv_depths.min()} - {cv_depths.max()}")


In [None]:

# Show the first three rows of each serial frame under its own heading
for heading, frame in (
    ("=== Sample Tree Data ===", serial_tree_df),
    ("\n=== Sample CV Data ===", serial_cv_df),
):
    print(heading)
    display(frame.head(3))

In [None]:
def plot_tree_performance():
    """Plot tree-training time against tree depth, one panel per dataset.

    Each panel ('cancer', 'hmeq') shows the serial run as a red line with
    circle markers and one line with square markers per parallel thread count
    (1-4), on a logarithmic time axis. Data is read from the module-level
    ``serial_tree_df`` and ``parallel_tree_df`` frames. The figure is saved as
    'tree_performance_comparison.png' and then shown.
    """
    # Thread count -> line colour for the parallel runs
    thread_colors = {1: 'orange', 2: 'green', 3: 'blue', 4: 'purple'}

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle('Tree Training Performance: Serial vs Parallel', fontsize=16, fontweight='bold')

    for ax, dataset in zip(axes, ('cancer', 'hmeq')):
        # Serial baseline for this dataset
        serial_rows = serial_tree_df[serial_tree_df['dataset'] == dataset]
        ax.plot(
            serial_rows['max_depth'],
            serial_rows['train_time_ms'],
            marker='o',
            linewidth=2,
            label='Serial Tree',
            color='red',
        )

        # One line per thread count for the parallel implementation
        for n_threads, line_color in thread_colors.items():
            in_dataset = parallel_tree_df['dataset'] == dataset
            with_threads = parallel_tree_df['threads'] == n_threads
            parallel_rows = parallel_tree_df[in_dataset & with_threads]
            ax.plot(
                parallel_rows['max_depth'],
                parallel_rows['train_time_ms'],
                marker='s',
                linewidth=2,
                label=f'Parallel Tree ({n_threads} threads)',
                color=line_color,
                alpha=0.8,
            )

        ax.set_xlabel('Tree Depth', fontsize=12)
        ax.set_ylabel('Training Time (ms)', fontsize=12)
        ax.set_title(f'{dataset.capitalize()} Dataset', fontsize=14, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)
        # Log scale keeps small and large timings readable on the same axis
        ax.set_yscale('log')

    plt.tight_layout()
    plt.savefig('tree_performance_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

plot_tree_performance()


In [None]:
def plot_cv_performance():
    """Plot cross-validation time against tree depth, one panel per dataset.

    Each panel ('cancer', 'hmeq') shows the serial run as a red line with
    circle markers and one line with square markers per parallel thread count
    (1-4), on a logarithmic time axis. Data is read from the module-level
    ``serial_cv_df`` and ``parallel_cv_df`` frames. The figure is saved as
    'cv_performance_comparison.png' and then shown.
    """
    # Thread count -> line colour for the parallel runs
    thread_colors = {1: 'orange', 2: 'green', 3: 'blue', 4: 'purple'}

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle('Cross-Validation Performance: Serial vs Parallel', fontsize=16, fontweight='bold')

    for ax, dataset in zip(axes, ('cancer', 'hmeq')):
        # Serial baseline for this dataset
        serial_rows = serial_cv_df[serial_cv_df['dataset'] == dataset]
        ax.plot(
            serial_rows['max_depth'],
            serial_rows['cv_time_ms'],
            marker='o',
            linewidth=2,
            label='Serial CV',
            color='red',
        )

        # One line per thread count for the parallel implementation
        for n_threads, line_color in thread_colors.items():
            in_dataset = parallel_cv_df['dataset'] == dataset
            with_threads = parallel_cv_df['threads'] == n_threads
            parallel_rows = parallel_cv_df[in_dataset & with_threads]
            ax.plot(
                parallel_rows['max_depth'],
                parallel_rows['cv_time_ms'],
                marker='s',
                linewidth=2,
                label=f'Parallel CV ({n_threads} threads)',
                color=line_color,
                alpha=0.8,
            )

        ax.set_xlabel('Tree Depth', fontsize=12)
        ax.set_ylabel('Cross-Validation Time (ms)', fontsize=12)
        ax.set_title(f'{dataset.capitalize()} Dataset', fontsize=14, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)
        # Log scale keeps small and large timings readable on the same axis
        ax.set_yscale('log')

    plt.tight_layout()
    plt.savefig('cv_performance_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

# Execute the CV performance analysis
plot_cv_performance()

In [None]:
def plot_speedup_analysis():
    """Plot the 4-thread speedup over the serial run for each dataset.

    Draws a 2x2 grid: the top row shows tree-training speedup and the bottom
    row cross-validation speedup, with one column per dataset ('cancer',
    'hmeq'). Speedup is the serial time divided by the 4-thread parallel time
    at the same ``max_depth``; a dashed red line marks a speedup of 1.

    Serial and parallel timings are matched on ``max_depth`` rather than on
    row position, so a different row order or row count in the two frames
    cannot pair up timings of different depths. Depths present in only one of
    the two frames are omitted, and repeated rows for one depth are averaged
    before dividing.

    Data is read from the module-level ``serial_tree_df``,
    ``parallel_tree_df``, ``serial_cv_df`` and ``parallel_cv_df`` frames. The
    figure is saved as 'speedup_analysis.png' and then shown.
    """
    datasets = ['cancer', 'hmeq']
    thread_count = 4

    def speedup_by_depth(serial_rows, parallel_rows, time_col):
        """Return ``(depths, speedup)`` arrays aligned on ``max_depth``."""
        serial_time = serial_rows.groupby('max_depth')[time_col].mean()
        parallel_time = parallel_rows.groupby('max_depth')[time_col].mean()
        # Series division aligns on the max_depth index; depths missing on
        # either side become NaN and are dropped.
        ratio = (serial_time / parallel_time).dropna().sort_index()
        return ratio.index.to_numpy(), ratio.to_numpy()

    # (axes row, serial frame, parallel frame, time column, marker, colour, title)
    panels = (
        (0, serial_tree_df, parallel_tree_df, 'train_time_ms', 'o', 'blue',
         'Tree Training Speedup'),
        (1, serial_cv_df, parallel_cv_df, 'cv_time_ms', 's', 'green',
         'Cross-Validation Speedup'),
    )

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Speedup Analysis: Serial vs Parallel (4 Threads)', fontsize=16, fontweight='bold')

    for col, dataset in enumerate(datasets):
        for row, serial_df, parallel_df, time_col, marker, color, title in panels:
            ax = axes[row, col]
            serial_rows = serial_df[serial_df['dataset'] == dataset]
            parallel_rows = parallel_df[(parallel_df['dataset'] == dataset) &
                                        (parallel_df['threads'] == thread_count)]

            depths, speedup = speedup_by_depth(serial_rows, parallel_rows, time_col)

            ax.plot(depths, speedup, marker=marker, linewidth=2, color=color)
            ax.axhline(y=1, color='red', linestyle='--', alpha=0.7, label='No speedup')
            ax.set_xlabel('Tree Depth')
            ax.set_ylabel('Speedup Factor')
            ax.set_title(f'{title} - {dataset.capitalize()}')
            ax.legend()
            ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('speedup_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

# Execute speedup analysis
plot_speedup_analysis()

In [None]:
def plot_thread_scaling():
    """Plot run time against thread count for a fixed set of tree depths.

    Draws a 2x2 grid: the top row shows tree-training time and the bottom row
    cross-validation time, with one column per dataset ('cancer', 'hmeq').
    Each panel has one line per depth in 5, 10, 15 and 20, using the first
    matching row for every (dataset, depth, thread count) combination and
    skipping combinations with no rows. Time axes are logarithmic.

    Data is read from the module-level ``parallel_tree_df`` and
    ``parallel_cv_df`` frames. The figure is saved as
    'thread_scaling_analysis.png' and then shown.
    """
    depths_to_analyze = [5, 10, 15, 20]
    thread_counts = [1, 2, 3, 4]

    def draw_depth_lines(ax, frame, dataset, time_col, marker):
        """Add one time-vs-threads line per analysed depth to ``ax``."""
        for depth in depths_to_analyze:
            points = []
            for n_threads in thread_counts:
                rows = frame[(frame['dataset'] == dataset) &
                             (frame['max_depth'] == depth) &
                             (frame['threads'] == n_threads)]
                measured = None if rows.empty else rows[time_col].iloc[0]
                if measured is not None:
                    points.append((n_threads, measured))

            # Only draw a line when at least one thread count had data
            if points:
                xs, ys = zip(*points)
                ax.plot(xs, ys, marker=marker, linewidth=2, label=f'Depth {depth}')

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Thread Scaling Analysis', fontsize=16, fontweight='bold')

    for col, dataset in enumerate(['cancer', 'hmeq']):
        # Top row: tree training
        tree_ax = axes[0, col]
        draw_depth_lines(tree_ax, parallel_tree_df, dataset, 'train_time_ms', 'o')
        tree_ax.set_xlabel('Number of Threads')
        tree_ax.set_ylabel('Training Time (ms)')
        tree_ax.set_title(f'Tree Training - {dataset.capitalize()}')
        tree_ax.legend()
        tree_ax.grid(True, alpha=0.3)
        tree_ax.set_yscale('log')

        # Bottom row: cross-validation
        cv_ax = axes[1, col]
        draw_depth_lines(cv_ax, parallel_cv_df, dataset, 'cv_time_ms', 's')
        cv_ax.set_xlabel('Number of Threads')
        cv_ax.set_ylabel('CV Time (ms)')
        cv_ax.set_title(f'Cross-Validation - {dataset.capitalize()}')
        cv_ax.legend()
        cv_ax.grid(True, alpha=0.3)
        cv_ax.set_yscale('log')

    plt.tight_layout()
    plt.savefig('thread_scaling_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

# Execute thread scaling analysis
plot_thread_scaling()



## 5. Performance Summary Statistics

In [None]:



# %%
def calculate_performance_stats():
    """Print mean serial and 4-thread times, and their ratio, per dataset.

    For each of 'cancer' and 'hmeq' this prints the mean serial time, the mean
    4-thread parallel time and the ratio of those two means, first for tree
    training (``train_time_ms``) and then for cross-validation
    (``cv_time_ms``). Data is read from the module-level ``serial_tree_df``,
    ``parallel_tree_df``, ``serial_cv_df`` and ``parallel_cv_df`` frames.
    """
    print("=== PERFORMANCE SUMMARY STATISTICS ===\n")

    for dataset in ('cancer', 'hmeq'):
        print(f"📊 {dataset.upper()} DATASET")
        print("=" * 50)

        # Tree training: mean over all rows of this dataset
        tree_serial_rows = serial_tree_df[serial_tree_df['dataset'] == dataset]
        tree_parallel_rows = parallel_tree_df[(parallel_tree_df['dataset'] == dataset) &
                                              (parallel_tree_df['threads'] == 4)]
        tree_serial_mean = tree_serial_rows['train_time_ms'].mean()
        tree_parallel_mean = tree_parallel_rows['train_time_ms'].mean()
        tree_ratio = tree_serial_mean / tree_parallel_mean

        print("🌳 Tree Training:")
        print(f"   Average serial time: {tree_serial_mean:.2f} ms")
        print(f"   Average parallel time (4 threads): {tree_parallel_mean:.2f} ms")
        print(f"   Average speedup: {tree_ratio:.2f}x")

        # Cross-validation: same calculation on the CV frames
        cv_serial_rows = serial_cv_df[serial_cv_df['dataset'] == dataset]
        cv_parallel_rows = parallel_cv_df[(parallel_cv_df['dataset'] == dataset) &
                                          (parallel_cv_df['threads'] == 4)]
        cv_serial_mean = cv_serial_rows['cv_time_ms'].mean()
        cv_parallel_mean = cv_parallel_rows['cv_time_ms'].mean()
        cv_ratio = cv_serial_mean / cv_parallel_mean

        print("🔄 Cross-Validation:")
        print(f"   Average serial time: {cv_serial_mean:.2f} ms")
        print(f"   Average parallel time (4 threads): {cv_parallel_mean:.2f} ms")
        print(f"   Average speedup: {cv_ratio:.2f}x")
        print()

# Calculate and display summary statistics
calculate_performance_stats()


## 6. Best Performance Configurations


In [None]:
def find_best_configurations():
    """Print the fastest parallel tree and CV configuration for each dataset.

    For each of 'cancer' and 'hmeq' this selects the row with the smallest
    ``train_time_ms`` in ``parallel_tree_df`` and the row with the smallest
    ``cv_time_ms`` in ``parallel_cv_df`` (both module-level frames), then
    prints that row's depth, thread count, time and accuracy column.
    """
    print("=== BEST PERFORMANCE CONFIGURATIONS ===\n")

    for dataset in ('cancer', 'hmeq'):
        print(f"🏆 {dataset.upper()} DATASET - OPTIMAL CONFIGURATIONS")
        print("=" * 60)

        # Row with the minimum training time among this dataset's parallel runs
        tree_rows = parallel_tree_df[parallel_tree_df['dataset'] == dataset]
        fastest_tree_label = tree_rows['train_time_ms'].idxmin()
        fastest_tree = tree_rows.loc[fastest_tree_label]

        print("🌳 Fastest Tree Training:")
        print(f"   Depth: {fastest_tree['max_depth']}")
        print(f"   Threads: {fastest_tree['threads']}")
        print(f"   Time: {fastest_tree['train_time_ms']:.2f} ms")
        print(f"   Accuracy: {fastest_tree['test_accuracy']:.3f}")

        # Row with the minimum CV time among this dataset's parallel runs
        cv_rows = parallel_cv_df[parallel_cv_df['dataset'] == dataset]
        fastest_cv_label = cv_rows['cv_time_ms'].idxmin()
        fastest_cv = cv_rows.loc[fastest_cv_label]

        print("🔄 Fastest Cross-Validation:")
        print(f"   Depth: {fastest_cv['max_depth']}")
        print(f"   Threads: {fastest_cv['threads']}")
        print(f"   Time: {fastest_cv['cv_time_ms']:.2f} ms")
        print(f"   CV Accuracy: {fastest_cv['mean_cv_accuracy']:.3f}")
        print()

# Find and display best configurations
find_best_configurations()
