# Enhanced Arabic Dialect PEFT Training Metrics Analysis

This notebook provides comprehensive analysis of training metrics recorded during Arabic dialect fine-tuning experiments.
It includes advanced LoRA effectiveness analysis, real-time monitoring capabilities, and detailed visualization of PEFT impact.

**Enhanced Features:**
- LoRA adapter effectiveness quantification
- Layer-wise adaptation analysis
- Parameter efficiency tracking  
- Cross-dialect generalization assessment
- Real-time training monitoring
- Interactive visualization dashboard

**Data Source:** Enhanced metrics automatically recorded by `dialect_peft_training.py`

## 1. Import Enhanced Libraries
Import comprehensive libraries including advanced visualization, statistical analysis, and interactive plotting capabilities.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.offline as pyo
import json
import glob
from pathlib import Path
import warnings
import datetime
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import networkx as nx

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Base look: seaborn-flavoured matplotlib style sheet plus the "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Matplotlib overrides layered on top of the style sheet, grouped by concern.
_figure_rc = {
    'figure.figsize': (14, 10),
    'figure.dpi': 100,
    'savefig.dpi': 300,
    'figure.facecolor': 'white',
}
_text_rc = {
    'font.size': 12,
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
}
_grid_rc = {
    'axes.grid': True,
    'grid.alpha': 0.3,
}
plt.rcParams.update({**_figure_rc, **_text_rc, **_grid_rc})

# Enable inline Plotly rendering inside the notebook.
pyo.init_notebook_mode(connected=True)

print("Enhanced libraries imported successfully!")
print(f"Analysis timestamp: {datetime.datetime.now()}")

## 2. Enhanced Data Loading and Processing
Load comprehensive training data with support for detailed LoRA metrics and step-by-step analysis.

In [None]:
# Metric keys copied verbatim from a results JSON; each defaults to 0 when absent.
_ZERO_DEFAULT_METRIC_KEYS = (
    'peak_memory_mb',
    'trainable_params',
    'total_params',
    'final_loss',
    # Enhanced LORA metrics (if available)
    'lora_rank',
    'lora_alpha',
    'lora_dropout',
    'parameter_efficiency_ratio',
    'memory_efficiency_ratio',
    'training_efficiency_score',
    'convergence_step',
    'effective_rank',
    'adaptation_magnitude',
    'performance_per_param',
)


def _read_json(path):
    """Read and decode one JSON file (explicitly as UTF-8)."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _parse_results_stem(stem):
    """Split a results-file stem into ``(method, dialect, seed)``.

    The stem is split on underscores: the second field is the model
    descriptor, the third the dialect and the optional fourth a ``seed<N>``
    tag. Returns ``None`` when the stem has fewer than three fields.

    Raises:
        ValueError: If the fourth field contains "seed" but what remains
            after removing it is not an integer.
    """
    parts = stem.split('_')
    if len(parts) < 3:
        return None

    model_part, dialect = parts[1], parts[2]

    # Seed falls back to 42 when the filename carries no "seed<N>" field.
    if len(parts) > 3 and "seed" in parts[3]:
        seed = int(parts[3].replace("seed", ""))
    else:
        seed = 42

    if "peft" in model_part:
        method = "PEFT_LoRA"
    elif "finetune" in model_part:
        method = "Full_FineTune"
    else:
        method = "Unknown"

    return method, dialect, seed


def _build_metrics_record(file_path, data, method, dialect, seed):
    """Flatten one results JSON object into the row layout used by the analysis."""
    record = {
        'experiment_name': file_path.stem,
        'dialect': dialect,
        'model_type': method,
        'seed': seed,
        # Each of these accepts two alternative key spellings.
        'final_wer': data.get('wer', data.get('final_wer', 0)),
        'final_cer': data.get('cer', data.get('final_cer', 0)),
        'training_time_seconds': data.get('training_time_seconds', data.get('training_time', 0)),
    }
    for key in _ZERO_DEFAULT_METRIC_KEYS:
        record[key] = data.get(key, 0)
    record['source_file'] = str(file_path)
    return record


def load_enhanced_training_metrics(base_dir="./results"):
    """Load comprehensive training metrics including detailed LORA analysis.

    Scans the known experiment sub-directories of ``base_dir`` for
    ``results_whisper-*.json`` files and, inside an optional ``detailed/``
    folder, ``detailed_metrics_*.json`` files. Loading is best-effort: a file
    that cannot be read or parsed is reported on stdout and skipped, and a
    results file whose name does not follow the
    ``results_<model>_<dialect>[_seed<N>]`` pattern is reported and skipped.
    Files are processed in sorted order so the output is reproducible.

    Args:
        base_dir: Root directory holding the experiment sub-directories.

    Returns:
        A ``(metrics_df, detailed_metrics)`` tuple. ``metrics_df`` has one row
        per successfully loaded results file (an empty ``DataFrame`` when none
        were found). ``detailed_metrics`` is a list of
        ``{'filename': <stem>, 'data': <parsed JSON>}`` dicts.
    """
    # Look in enhanced directory structure
    subdirs = ["ex_finetune", "ex_scratch", "ex_peft", "ex_comparison_dialectal"]
    all_metrics = []
    detailed_metrics = []

    for subdir in subdirs:
        subdir_path = Path(base_dir) / subdir
        if not subdir_path.exists():
            continue

        # Standard per-experiment results files
        for file_path in sorted(subdir_path.glob("results_whisper-*.json")):
            try:
                data = _read_json(file_path)
                if not isinstance(data, dict):
                    raise ValueError("expected a JSON object at the top level")
                parsed = _parse_results_stem(file_path.stem)
            except (OSError, ValueError) as e:
                # ValueError also covers malformed JSON and undecodable bytes.
                print(f"Error loading {file_path}: {e}")
                continue

            if parsed is None:
                print(f"Skipped (unrecognized filename): {file_path}")
                continue

            method, dialect, seed = parsed
            all_metrics.append(_build_metrics_record(file_path, data, method, dialect, seed))
            print(f"Loaded: {file_path} -> {method} {dialect}")

        # Optional detailed metrics, kept as raw parsed JSON
        detailed_dir = subdir_path / "detailed"
        if detailed_dir.exists():
            detailed_files = sorted(detailed_dir.glob("detailed_metrics_*.json"))
        else:
            detailed_files = []

        for detailed_file in detailed_files:
            try:
                detailed_data = _read_json(detailed_file)
            except (OSError, ValueError) as e:
                print(f"Error loading detailed {detailed_file}: {e}")
                continue
            detailed_metrics.append({
                'filename': detailed_file.stem,
                'data': detailed_data
            })
            print(f"Loaded detailed: {detailed_file}")

    # Convert to DataFrames
    metrics_df = pd.DataFrame(all_metrics) if all_metrics else pd.DataFrame()

    if not metrics_df.empty:
        print(f"\nLoaded {len(metrics_df)} experiments")
        print(f"Methods found: {metrics_df['model_type'].unique()}")
        print(f"Dialects found: {metrics_df['dialect'].unique()}")

        # Enhanced data validation
        # NOTE(review): keys missing from the JSON are filled with 0 above, so
        # these counts only drop for values stored as explicit nulls.
        total = len(metrics_df)
        print(f"\nData completeness:")
        print(f"- Basic metrics: {metrics_df[['final_wer', 'final_cer', 'training_time_seconds']].notna().all(axis=1).sum()}/{total}")
        print(f"- LORA metrics: {metrics_df[['lora_rank', 'parameter_efficiency_ratio']].notna().all(axis=1).sum()}/{total}")
        print(f"- Efficiency metrics: {metrics_df[['memory_efficiency_ratio', 'training_efficiency_score']].notna().all(axis=1).sum()}/{total}")

    else:
        print("No metrics files found.")
        print("Make sure you've run enhanced training experiments that save results in:")
        print("  - ./results/ex_peft/")
        print("  - ./results/ex_finetune/")

    return metrics_df, detailed_metrics

# Pull every recorded experiment from the default results directory.
metrics_df, detailed_metrics = load_enhanced_training_metrics()

if metrics_df.empty:
    print("No data loaded. Please run enhanced training experiments first.")
else:
    # Banner rule reused above and below the section title.
    _overview_rule = "="*60
    print("\n" + _overview_rule)
    print("ENHANCED DATASET OVERVIEW")
    print(_overview_rule)
    print(metrics_df.head())
    print(f"\nColumns: {list(metrics_df.columns)}")
    print(f"\nDataset shape: {metrics_df.shape}")

## 3. Enhanced Data Preprocessing and Feature Engineering
Advanced preprocessing including efficiency metrics calculation, normalization, and feature engineering for LoRA analysis.

In [None]:
def enhanced_preprocess_metrics(df):
    """Enhanced preprocessing with comprehensive feature engineering.

    Adds unit conversions, efficiency scores, LORA-specific scores and tier
    labels as new columns. The frame is modified in place and also returned,
    so callers may use either the return value or the original object.

    Args:
        df: Metrics frame with at least ``training_time_seconds``,
            ``peak_memory_mb``, ``total_params``, ``trainable_params``,
            ``final_wer``, ``final_cer`` and ``model_type``. Rows whose
            ``model_type`` is ``'PEFT_LoRA'`` additionally need
            ``parameter_efficiency_ratio``, ``effective_rank``, ``lora_rank``
            and ``adaptation_magnitude``.

    Returns:
        The same ``DataFrame`` with the derived columns added; an empty frame
        is returned untouched.
    """
    if df.empty:
        return df

    # Convert time units
    df['training_time_minutes'] = df['training_time_seconds'] / 60
    df['training_time_hours'] = df['training_time_seconds'] / 3600

    # Convert memory units
    df['peak_memory_gb'] = df['peak_memory_mb'] / 1024

    # Convert parameters to millions
    df['total_params_millions'] = df['total_params'] / 1_000_000
    df['trainable_params_millions'] = df['trainable_params'] / 1_000_000

    # Calculate trainable percentage. A zero total is treated as missing so
    # the ratio becomes NaN rather than +/-inf.
    total_params = df['total_params'].replace(0, np.nan)
    df['trainable_percentage'] = (df['trainable_params'] / total_params) * 100

    # Enhanced efficiency metrics
    df['wer_score'] = 100 - df['final_wer']  # Higher is better
    df['cer_score'] = 100 - df['final_cer']  # Higher is better

    # Memory efficiency (performance per GB); the small constants below keep
    # the denominators away from zero.
    df['memory_efficiency'] = df['wer_score'] / (df['peak_memory_gb'] + 0.1)

    # Parameter efficiency (performance per million trainable parameters)
    df['param_efficiency'] = df['wer_score'] / (df['trainable_params_millions'] + 0.1)

    # Time efficiency (performance per hour)
    df['time_efficiency'] = df['wer_score'] / (df['training_time_hours'] + 0.01)

    # LORA-specific metrics (only created when PEFT rows exist; non-PEFT rows
    # are left as NaN in these columns)
    peft_mask = df['model_type'] == 'PEFT_LoRA'

    if peft_mask.any():
        # LORA effectiveness score
        df.loc[peft_mask, 'lora_effectiveness'] = (
            df.loc[peft_mask, 'wer_score'] *
            df.loc[peft_mask, 'parameter_efficiency_ratio'] * 100
        )

        # Rank efficiency
        df.loc[peft_mask, 'rank_efficiency'] = (
            df.loc[peft_mask, 'effective_rank'] / (df.loc[peft_mask, 'lora_rank'] + 1)
        )

        # Adaptation efficiency
        df.loc[peft_mask, 'adaptation_efficiency'] = (
            df.loc[peft_mask, 'wer_score'] / (df.loc[peft_mask, 'adaptation_magnitude'] + 1e-6)
        )

    # Composite efficiency score
    df['composite_efficiency'] = (
        0.4 * df['param_efficiency'] +
        0.3 * df['memory_efficiency'] +
        0.3 * df['time_efficiency']
    )

    # Performance tier classification. Bins are right-closed, so
    # include_lowest is needed for a WER of exactly 0, and the open-ended top
    # bin keeps WER values above 100 in the 'Poor' tier instead of NaN.
    df['performance_tier'] = pd.cut(
        df['final_wer'],
        bins=[0, 30, 50, 70, float('inf')],
        labels=['Excellent', 'Good', 'Fair', 'Poor'],
        include_lowest=True
    )

    # Resource usage tier (include_lowest so a recorded peak of 0 is 'Low')
    df['resource_tier'] = pd.cut(
        df['peak_memory_gb'],
        bins=[0, 2, 5, 10, float('inf')],
        labels=['Low', 'Medium', 'High', 'Very High'],
        include_lowest=True
    )

    return df

# Derive the engineered features, then print a short summary of them.
if not metrics_df.empty:
    metrics_df = enhanced_preprocess_metrics(metrics_df)

    print("Enhanced preprocessing complete!")
    print("\nEnhanced dataset overview:")

    # Efficiency columns worth summarising; only those actually present are shown.
    efficiency_cols = [
        'param_efficiency',
        'memory_efficiency',
        'time_efficiency',
        'composite_efficiency',
        'performance_tier',
        'resource_tier',
    ]
    available_efficiency_cols = [name for name in efficiency_cols if name in metrics_df.columns]
    if available_efficiency_cols:
        print("\nEfficiency Metrics Summary:")
        _efficiency_summary = metrics_df[available_efficiency_cols].describe()
        print(_efficiency_summary)

    # Same summary restricted to the PEFT LoRA runs and their LoRA columns.
    _is_peft_row = metrics_df['model_type'] == 'PEFT_LoRA'
    peft_data = metrics_df[_is_peft_row]
    if not peft_data.empty:
        print(f"\nLORA Experiments: {len(peft_data)}")
        lora_cols = [
            'lora_rank',
            'lora_alpha',
            'parameter_efficiency_ratio',
            'lora_effectiveness',
            'rank_efficiency',
        ]
        available_lora_cols = [name for name in lora_cols if name in peft_data.columns]
        if available_lora_cols:
            print("LORA Metrics Summary:")
            _lora_summary = peft_data[available_lora_cols].describe()
            print(_lora_summary)

    # Closing overview of the processed frame.
    _method_counts = metrics_df['model_type'].value_counts().to_dict()
    _dialect_counts = metrics_df['dialect'].value_counts().to_dict()
    print(f"\nFinal dataset shape: {metrics_df.shape}")
    print(f"Methods: {_method_counts}")
    print(f"Dialects: {_dialect_counts}")

## 4. Enhanced Performance Analysis
Comprehensive performance analysis including LoRA effectiveness, efficiency metrics, and cross-dialect comparisons.

In [None]:
def create_enhanced_performance_analysis(df):
    """Create comprehensive performance analysis with enhanced visualizations.

    Draws a 3x3 matplotlib grid (WER heatmap, efficiency scatter, time vs
    WER, LoRA rank vs WER, trainable-parameter and memory box plots, WER
    histogram, per-dialect efficiency, performance-tier counts), shows it,
    and prints summary statistics. Panels whose optional columns are missing
    are left empty.

    Args:
        df: Metrics frame. ``final_wer``, ``model_type`` and ``dialect`` are
            read unconditionally; the derived efficiency/tier columns are
            used when present.

    Returns:
        None. When ``df`` is empty or lacks ``final_wer`` a message is
        printed and nothing is plotted.
    """
    if df.empty or 'final_wer' not in df.columns:
        print("No WER data available for analysis.")
        return

    # Create figure with multiple subplots
    plt.figure(figsize=(20, 16))

    # 1. Performance comparison by method and dialect
    ax1 = plt.subplot(3, 3, 1)
    if 'model_type' in df.columns and 'dialect' in df.columns:
        pivot_wer = df.pivot_table(values='final_wer', index='dialect', columns='model_type', aggfunc='mean')
        sns.heatmap(pivot_wer, annot=True, fmt='.1f', cmap='RdYlGn_r', ax=ax1)
        ax1.set_title('Average WER by Method and Dialect', fontweight='bold')

    # 2. Efficiency scatter plot (points coloured by WER)
    ax2 = plt.subplot(3, 3, 2)
    if 'param_efficiency' in df.columns and 'memory_efficiency' in df.columns:
        scatter = ax2.scatter(df['param_efficiency'], df['memory_efficiency'],
                              c=df['final_wer'], s=80, alpha=0.7, cmap='RdYlGn_r')
        ax2.set_xlabel('Parameter Efficiency')
        ax2.set_ylabel('Memory Efficiency')
        ax2.set_title('Efficiency Analysis', fontweight='bold')
        plt.colorbar(scatter, ax=ax2, label='WER')

    # 3. Training time vs performance
    ax3 = plt.subplot(3, 3, 3)
    if 'training_time_hours' in df.columns:
        for method in df['model_type'].unique():
            method_data = df[df['model_type'] == method]
            ax3.scatter(method_data['training_time_hours'], method_data['final_wer'],
                        label=method, alpha=0.7, s=60)
        ax3.set_xlabel('Training Time (hours)')
        ax3.set_ylabel('WER (%)')
        ax3.set_title('Training Time vs Performance', fontweight='bold')
        ax3.legend()

    # 4. LORA-specific analysis (mean WER per rank, std as error bars)
    ax4 = plt.subplot(3, 3, 4)
    peft_data = df[df['model_type'] == 'PEFT_LoRA']
    if not peft_data.empty and 'lora_rank' in peft_data.columns:
        rank_performance = peft_data.groupby('lora_rank')['final_wer'].agg(['mean', 'std']).reset_index()
        ax4.errorbar(rank_performance['lora_rank'], rank_performance['mean'],
                     yerr=rank_performance['std'], marker='o', capsize=5)
        ax4.set_xlabel('LoRA Rank')
        ax4.set_ylabel('Average WER (%)')
        ax4.set_title('LoRA Rank vs Performance', fontweight='bold')
        ax4.grid(True, alpha=0.3)

    # 5. Parameter efficiency comparison
    ax5 = plt.subplot(3, 3, 5)
    if 'trainable_percentage' in df.columns:
        df.boxplot(column='trainable_percentage', by='model_type', ax=ax5)
        ax5.set_xlabel('Method')
        ax5.set_ylabel('Trainable Parameters (%)')
        ax5.set_title('Parameter Efficiency by Method', fontweight='bold')
        plt.suptitle('')  # Remove the title DataFrame.boxplot adds to the figure

    # 6. Memory usage analysis
    ax6 = plt.subplot(3, 3, 6)
    if 'peak_memory_gb' in df.columns:
        df.boxplot(column='peak_memory_gb', by='model_type', ax=ax6)
        ax6.set_xlabel('Method')
        ax6.set_ylabel('Peak Memory (GB)')
        ax6.set_title('Memory Usage by Method', fontweight='bold')
        plt.suptitle('')

    # 7. Performance distribution
    ax7 = plt.subplot(3, 3, 7)
    for method in df['model_type'].unique():
        method_data = df[df['model_type'] == method]['final_wer']
        ax7.hist(method_data, alpha=0.6, label=method, bins=15)
    ax7.set_xlabel('WER (%)')
    ax7.set_ylabel('Frequency')
    ax7.set_title('Performance Distribution', fontweight='bold')
    ax7.legend()

    # 8. Composite efficiency analysis
    ax8 = plt.subplot(3, 3, 8)
    if 'composite_efficiency' in df.columns:
        df.boxplot(column='composite_efficiency', by='dialect', ax=ax8)
        ax8.set_xlabel('Dialect')
        ax8.set_ylabel('Composite Efficiency Score')
        ax8.set_title('Efficiency by Dialect', fontweight='bold')
        ax8.tick_params(axis='x', rotation=45)
        plt.suptitle('')

    # 9. Performance tier analysis
    ax9 = plt.subplot(3, 3, 9)
    if 'performance_tier' in df.columns:
        tier_counts = df.groupby(['model_type', 'performance_tier']).size().unstack(fill_value=0)
        tier_counts.plot(kind='bar', stacked=True, ax=ax9)
        ax9.set_xlabel('Method')
        ax9.set_ylabel('Count')
        ax9.set_title('Performance Tier Distribution', fontweight='bold')
        ax9.tick_params(axis='x', rotation=45)
        ax9.legend(title='Performance Tier')

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("\n" + "="*80)
    print("ENHANCED PERFORMANCE ANALYSIS SUMMARY")
    print("="*80)

    # Overall statistics
    print("\nOverall Statistics:")
    print(f"Total experiments: {len(df)}")
    print(f"Methods: {df['model_type'].value_counts().to_dict()}")
    print(f"Dialects: {df['dialect'].value_counts().to_dict()}")

    # Performance by method
    print("\nPerformance by Method (WER %):")
    method_performance = df.groupby('model_type')['final_wer'].agg(['mean', 'std', 'min', 'max'])
    print(method_performance.round(2))

    # LORA effectiveness (reuses the PEFT subset selected for panel 4)
    if not peft_data.empty:
        print("\nLORA Effectiveness Analysis:")
        if 'parameter_efficiency_ratio' in peft_data.columns:
            avg_param_efficiency = peft_data['parameter_efficiency_ratio'].mean()
            print(f"Average parameter efficiency: {avg_param_efficiency:.4f} ({avg_param_efficiency*100:.2f}% of base model)")

        if 'lora_effectiveness' in peft_data.columns:
            print(f"Average LORA effectiveness score: {peft_data['lora_effectiveness'].mean():.2f}")

        if 'rank_efficiency' in peft_data.columns:
            print(f"Average rank efficiency: {peft_data['rank_efficiency'].mean():.2f}")

# Plot and summarise performance, provided any experiments were loaded.
if metrics_df.empty:
    print("No data available for performance analysis.")
else:
    create_enhanced_performance_analysis(metrics_df)

## 5. Interactive LORA Analysis Dashboard
Interactive visualizations using Plotly for deep LoRA effectiveness analysis.

In [None]:
def create_interactive_lora_dashboard(df):
    """Show a 2x2 Plotly dashboard for the PEFT LoRA runs, then a radar chart.

    Panels: LoRA rank vs parameter-efficiency ratio (coloured by WER),
    parameter vs memory efficiency per method, per-dialect PEFT training time
    with WER on a secondary axis, and PEFT vs full fine-tuning WER per
    dialect. Panels whose columns or data are unavailable stay empty.

    Args:
        df: Metrics frame with at least ``model_type``, ``dialect`` and
            ``final_wer``.

    Returns:
        None. Prints a message and returns early when ``df`` is empty or
        holds no ``'PEFT_LoRA'`` rows.
    """
    if df.empty:
        print("No data available for interactive analysis.")
        return

    lora_runs = df[df['model_type'] == 'PEFT_LoRA'].copy()
    full_runs = df[df['model_type'] == 'Full_FineTune'].copy()

    if lora_runs.empty:
        print("No PEFT data available for interactive analysis.")
        return

    # Grid layout: only the bottom-left panel carries a secondary y-axis.
    panel_titles = ('LORA Parameter Efficiency', 'Performance vs Efficiency',
                    'Training Dynamics', 'Cross-Method Comparison')
    panel_specs = [
        [{"secondary_y": False}, {"secondary_y": False}],
        [{"secondary_y": True}, {"secondary_y": False}],
    ]
    dashboard = make_subplots(rows=2, cols=2, subplot_titles=panel_titles, specs=panel_specs)

    # Panel (1, 1): LoRA rank against parameter-efficiency ratio, coloured by WER.
    if {'lora_rank', 'parameter_efficiency_ratio'}.issubset(lora_runs.columns):
        wer_marker = dict(
            size=10,
            color=lora_runs['final_wer'],
            colorscale='RdYlGn_r',
            showscale=True,
            colorbar=dict(title="WER (%)", x=0.45),
        )
        rank_trace = go.Scatter(
            x=lora_runs['lora_rank'],
            y=lora_runs['parameter_efficiency_ratio'] * 100,
            mode='markers',
            marker=wer_marker,
            text=lora_runs['dialect'],
            hovertemplate='<b>%{text}</b><br>Rank: %{x}<br>Param Efficiency: %{y:.3f}%<br>WER: %{marker.color:.1f}%<extra></extra>',
            name='LORA Experiments',
        )
        dashboard.add_trace(rank_trace, row=1, col=1)

    # Panel (1, 2): one scatter series per training method.
    if {'param_efficiency', 'memory_efficiency'}.issubset(df.columns):
        for method_name in df['model_type'].unique():
            method_rows = df[df['model_type'] == method_name]
            efficiency_trace = go.Scatter(
                x=method_rows['param_efficiency'],
                y=method_rows['memory_efficiency'],
                mode='markers',
                marker=dict(size=8),
                name=method_name,
                text=method_rows['dialect'],
                hovertemplate='<b>%{text}</b><br>Param Efficiency: %{x:.2f}<br>Memory Efficiency: %{y:.2f}<extra></extra>',
            )
            dashboard.add_trace(efficiency_trace, row=1, col=2)

    # Panel (2, 1): PEFT training time as bars, WER overlaid on the secondary axis.
    if 'training_time_hours' in df.columns:
        time_bars = go.Bar(
            x=lora_runs['dialect'],
            y=lora_runs['training_time_hours'],
            name='PEFT Training Time',
            marker_color='lightblue',
            yaxis='y3',
        )
        dashboard.add_trace(time_bars, row=2, col=1)

        wer_line = go.Scatter(
            x=lora_runs['dialect'],
            y=lora_runs['final_wer'],
            mode='markers+lines',
            name='PEFT WER',
            marker=dict(color='red', size=8),
            yaxis='y4',
        )
        dashboard.add_trace(wer_line, row=2, col=1, secondary_y=True)

    # Panel (2, 2): mean WER per dialect for dialects covered by both methods.
    if not full_runs.empty:
        paired_rows = []
        for dialect_name in df['dialect'].unique():
            lora_wer = lora_runs.loc[lora_runs['dialect'] == dialect_name, 'final_wer'].mean()
            full_wer = full_runs.loc[full_runs['dialect'] == dialect_name, 'final_wer'].mean()

            # A NaN mean means one method has no runs for this dialect.
            if pd.isna(lora_wer) or pd.isna(full_wer):
                continue

            paired_rows.append({
                'dialect': dialect_name,
                'PEFT_WER': lora_wer,
                'FullFT_WER': full_wer,
                'improvement': full_wer - lora_wer
            })

        if paired_rows:
            paired_df = pd.DataFrame(paired_rows)
            bar_series = (
                ('PEFT_WER', 'PEFT LoRA', 'lightgreen'),
                ('FullFT_WER', 'Full Fine-tune', 'lightcoral'),
            )
            for column, label, colour in bar_series:
                dashboard.add_trace(
                    go.Bar(
                        x=paired_df['dialect'],
                        y=paired_df[column],
                        name=label,
                        marker_color=colour,
                    ),
                    row=2, col=2
                )

    # Figure-level layout
    dashboard.update_layout(
        title_text="Interactive LORA Effectiveness Dashboard",
        title_font_size=20,
        height=800,
        showlegend=True
    )

    # Axis titles, keyed by (row, col)
    x_titles = {
        (1, 1): "LoRA Rank",
        (1, 2): "Parameter Efficiency",
        (2, 1): "Dialect",
        (2, 2): "Dialect",
    }
    y_titles = {
        (1, 1): "Parameter Efficiency (%)",
        (1, 2): "Memory Efficiency",
        (2, 1): "Training Time (hours)",
        (2, 2): "WER (%)",
    }
    for (grid_row, grid_col), title in x_titles.items():
        dashboard.update_xaxes(title_text=title, row=grid_row, col=grid_col)
    for (grid_row, grid_col), title in y_titles.items():
        dashboard.update_yaxes(title_text=title, row=grid_row, col=grid_col)
    # Must come after the (2, 1) title above so the secondary axis keeps its own label.
    dashboard.update_yaxes(title_text="WER (%)", row=2, col=1, secondary_y=True)

    dashboard.show()

    # Follow up with the effectiveness radar (PEFT rows are known to exist here).
    create_lora_effectiveness_radar(lora_runs)

def create_lora_effectiveness_radar(peft_data):
    """Create radar chart for LORA effectiveness metrics.

    Averages each available effectiveness metric over all rows, min-max
    scales the averages against each other to the 0-1 range and shows them
    as a filled Plotly polar trace.

    Args:
        peft_data: Frame of PEFT experiments; the metric columns listed below
            are used when present.

    Returns:
        None. Prints a message and returns without plotting when fewer than
        three of the metric columns are available.
    """
    candidate_metrics = ['param_efficiency', 'memory_efficiency', 'time_efficiency',
                         'lora_effectiveness', 'rank_efficiency']
    effectiveness_metrics = [metric for metric in candidate_metrics if metric in peft_data.columns]

    if len(effectiveness_metrics) < 3:
        print("Insufficient effectiveness metrics for radar chart.")
        return

    # Calculate average metrics across all experiments
    avg_metrics = peft_data[effectiveness_metrics].mean()

    # Normalize to 0-1 scale
    lowest = avg_metrics.min()
    span = avg_metrics.max() - lowest
    if span == 0:
        # All averages are identical: min-max scaling would be 0/0 and blank
        # the chart, so place every available metric at mid-scale instead.
        normalized_metrics = avg_metrics.where(avg_metrics.isna(), 0.5)
    else:
        normalized_metrics = (avg_metrics - lowest) / span

    # Create radar chart
    fig = go.Figure()

    fig.add_trace(go.Scatterpolar(
        r=normalized_metrics.values,
        theta=[metric.replace('_', ' ').title() for metric in effectiveness_metrics],
        fill='toself',
        name='Average LORA Effectiveness',
        line_color='blue'
    ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1]
            )),
        title="LORA Effectiveness Radar Chart",
        title_font_size=16
    )

    fig.show()

# Render the interactive dashboard, provided any experiments were loaded.
if metrics_df.empty:
    print("No data available for interactive dashboard.")
else:
    create_interactive_lora_dashboard(metrics_df)

## 6. Comprehensive Method Comparison
Statistical comparison between PEFT LoRA and Full Fine-tuning with significance testing.

In [None]:
def _relative_pct(delta, baseline):
    """Return ``delta`` as a percentage of ``baseline`` (NaN for a zero baseline)."""
    if baseline == 0:
        return float('nan')
    return (delta / baseline) * 100


def comprehensive_method_comparison(df):
    """Perform comprehensive statistical comparison between methods.

    Compares ``'PEFT_LoRA'`` rows against ``'Full_FineTune'`` rows: prints
    mean/std, relative improvement and an independent two-sample t-test per
    metric, then efficiency gains, resource utilization and per-dialect WER,
    and finally hands the results to ``create_comparison_visualizations``
    (defined elsewhere in this file).

    Every metric in the performance section (error rates, training time,
    peak memory, trainable parameters) is lower-is-better, so "improvement"
    is always the relative reduction achieved by PEFT versus full
    fine-tuning; positive means PEFT is better.

    Args:
        df: Preprocessed metrics frame with ``model_type``, ``dialect`` and
            ``final_wer`` columns; other metric columns are used when present.

    Returns:
        ``(comparison_results, dialect_comparison)`` where the first maps
        metric name to a dict of statistics and the second is a list of
        per-dialect dicts. Returns ``None`` (after printing a message) when
        the frame is empty, lacks ``model_type``, or is missing either method.
    """
    if df.empty or 'model_type' not in df.columns:
        print("Insufficient data for method comparison.")
        return

    peft_data = df[df['model_type'] == 'PEFT_LoRA']
    full_data = df[df['model_type'] == 'Full_FineTune']

    if peft_data.empty or full_data.empty:
        print("Need both PEFT and Full Fine-tuning data for comparison.")
        return

    print("="*80)
    print("COMPREHENSIVE METHOD COMPARISON ANALYSIS")
    print("="*80)

    # Performance comparison
    print("\n1. PERFORMANCE COMPARISON")
    print(f"{'-'*40}")

    metrics_to_compare = ['final_wer', 'final_cer', 'training_time_hours',
                          'peak_memory_gb', 'trainable_params_millions']

    comparison_results = {}

    for metric in metrics_to_compare:
        if metric not in peft_data.columns or metric not in full_data.columns:
            continue

        peft_values = peft_data[metric].dropna()
        full_values = full_data[metric].dropna()
        if len(peft_values) == 0 or len(full_values) == 0:
            continue

        # Calculate statistics
        peft_mean = peft_values.mean()
        full_mean = full_values.mean()
        peft_std = peft_values.std()
        full_std = full_values.std()

        # Statistical test (needs at least two samples per group)
        if len(peft_values) > 1 and len(full_values) > 1:
            t_stat, p_value = stats.ttest_ind(peft_values, full_values)
            significant = p_value < 0.05
        else:
            t_stat, p_value, significant = None, None, False

        # Relative reduction versus full fine-tuning (lower is better for
        # every metric compared here).
        improvement = _relative_pct(full_mean - peft_mean, full_mean)

        comparison_results[metric] = {
            'peft_mean': peft_mean,
            'full_mean': full_mean,
            'peft_std': peft_std,
            'full_std': full_std,
            'improvement_pct': improvement,
            't_stat': t_stat,
            'p_value': p_value,
            'significant': significant
        }

        print(f"\n{metric.replace('_', ' ').title()}:")
        print(f"  PEFT LoRA:     {peft_mean:.3f} ± {peft_std:.3f}")
        print(f"  Full Finetune: {full_mean:.3f} ± {full_std:.3f}")
        print(f"  Improvement:   {improvement:+.1f}%")
        # Test against None explicitly: a p-value of 0.0 is a real result.
        if p_value is None:
            print("  Statistical significance: N/A")
        elif significant:
            print(f"  Statistical significance: ✓ (p={p_value:.3f})")
        else:
            print(f"  Statistical significance: ✗ (p={p_value:.3f})")

    # Efficiency analysis (higher is better for these scores)
    print("\n2. EFFICIENCY ANALYSIS")
    print(f"{'-'*40}")

    efficiency_metrics = ['param_efficiency', 'memory_efficiency', 'time_efficiency', 'composite_efficiency']

    for metric in efficiency_metrics:
        if metric in peft_data.columns and metric in full_data.columns:
            peft_eff = peft_data[metric].mean()
            full_eff = full_data[metric].mean()
            efficiency_gain = _relative_pct(peft_eff - full_eff, full_eff)

            print(f"\n{metric.replace('_', ' ').title()}:")
            print(f"  PEFT LoRA:     {peft_eff:.3f}")
            print(f"  Full Finetune: {full_eff:.3f}")
            print(f"  Efficiency Gain: {efficiency_gain:+.1f}%")

    # Resource utilization
    print("\n3. RESOURCE UTILIZATION")
    print(f"{'-'*40}")

    if 'trainable_percentage' in peft_data.columns:
        avg_peft_params = peft_data['trainable_percentage'].mean()
        avg_full_params = full_data['trainable_percentage'].mean() if 'trainable_percentage' in full_data.columns else 100

        print("\nTrainable Parameters:")
        print(f"  PEFT LoRA:     {avg_peft_params:.2f}% of base model")
        print(f"  Full Finetune: {avg_full_params:.2f}% of base model")
        print(f"  Parameter Reduction: {100 - avg_peft_params:.1f}%")

    if 'peak_memory_gb' in peft_data.columns and 'peak_memory_gb' in full_data.columns:
        full_memory = full_data['peak_memory_gb'].mean()
        memory_reduction = _relative_pct(full_memory - peft_data['peak_memory_gb'].mean(), full_memory)
        print(f"\nMemory Usage Reduction: {memory_reduction:.1f}%")

    # Per-dialect analysis (only dialects that have runs for both methods)
    print("\n4. PER-DIALECT PERFORMANCE")
    print(f"{'-'*40}")

    dialect_comparison = []
    for dialect in df['dialect'].unique():
        peft_dialect = peft_data[peft_data['dialect'] == dialect]['final_wer']
        full_dialect = full_data[full_data['dialect'] == dialect]['final_wer']

        if len(peft_dialect) > 0 and len(full_dialect) > 0:
            peft_wer = peft_dialect.mean()
            full_wer = full_dialect.mean()
            improvement = _relative_pct(full_wer - peft_wer, full_wer)

            dialect_comparison.append({
                'dialect': dialect,
                'peft_wer': peft_wer,
                'full_wer': full_wer,
                'improvement': improvement
            })

            print(f"\n{dialect.capitalize()}:")
            print(f"  PEFT WER:      {peft_wer:.2f}%")
            print(f"  Full FT WER:   {full_wer:.2f}%")
            print(f"  Improvement:   {improvement:+.1f}%")

    # Create visualization
    create_comparison_visualizations(comparison_results, dialect_comparison)

    return comparison_results, dialect_comparison

def _plot_method_bars(ax, peft_values, full_values, tick_labels, width,
                      xlabel, ylabel, title, **tick_label_kwargs):
    """Draw side-by-side PEFT LoRA / Full Fine-tune bars on ``ax``."""
    x = np.arange(len(peft_values))
    ax.bar(x - width/2, peft_values, width, label='PEFT LoRA', alpha=0.7)
    ax.bar(x + width/2, full_values, width, label='Full Fine-tune', alpha=0.7)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(tick_labels, **tick_label_kwargs)
    ax.legend()


def create_comparison_visualizations(comparison_results, dialect_comparison):
    """Create a 2x2 figure comparing PEFT LoRA with full fine-tuning.

    Panels: error-rate metrics, resource metrics, relative improvements and
    per-dialect WER. A panel whose input data is missing is left empty.

    Args:
        comparison_results: Mapping of metric name to a dict providing the
            keys ``'peft_mean'``, ``'full_mean'`` and ``'improvement_pct'``.
        dialect_comparison: List of dicts with the keys ``'dialect'``,
            ``'peft_wer'`` and ``'full_wer'``; may be empty.
    """
    # Shared by every grouped-bar panel. It must be defined up front: it used
    # to be set only inside the first panel's branch, which made the later
    # panels fail with NameError whenever no WER/CER metrics were available.
    width = 0.35

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Method Comparison Analysis', fontsize=16, fontweight='bold')

    # 1. Performance metrics comparison
    performance_metrics = [m for m in ('final_wer', 'final_cer') if m in comparison_results]
    if performance_metrics:
        _plot_method_bars(
            axes[0, 0],
            [comparison_results[m]['peft_mean'] for m in performance_metrics],
            [comparison_results[m]['full_mean'] for m in performance_metrics],
            [m.replace('_', ' ').title() for m in performance_metrics],
            width,
            xlabel='Metrics',
            ylabel='Error Rate (%)',
            title='Performance Comparison',
        )

    # 2. Resource usage comparison
    resource_metrics = [m for m in ('training_time_hours', 'peak_memory_gb') if m in comparison_results]
    if resource_metrics:
        _plot_method_bars(
            axes[0, 1],
            [comparison_results[m]['peft_mean'] for m in resource_metrics],
            [comparison_results[m]['full_mean'] for m in resource_metrics],
            [m.replace('_', ' ').title() for m in resource_metrics],
            width,
            xlabel='Metrics',
            ylabel='Resource Usage',
            title='Resource Usage Comparison',
        )

    # 3. Improvement percentages (green = positive, red = zero or negative)
    ax3 = axes[1, 0]
    improvements = [comparison_results[m]['improvement_pct'] for m in comparison_results]
    metrics_names = [m.replace('_', ' ').title() for m in comparison_results.keys()]

    colors = ['green' if imp > 0 else 'red' for imp in improvements]
    ax3.barh(metrics_names, improvements, color=colors, alpha=0.7)
    ax3.set_xlabel('Improvement (%)')
    ax3.set_title('PEFT vs Full Fine-tune Improvements')
    ax3.axvline(x=0, color='black', linestyle='-', alpha=0.5)

    # 4. Per-dialect comparison
    if dialect_comparison:
        _plot_method_bars(
            axes[1, 1],
            [d['peft_wer'] for d in dialect_comparison],
            [d['full_wer'] for d in dialect_comparison],
            [d['dialect'].capitalize() for d in dialect_comparison],
            width,
            xlabel='Dialect',
            ylabel='WER (%)',
            title='Per-Dialect Performance',
            rotation=45,
        )

    plt.tight_layout()
    plt.show()

# Run the PEFT vs full fine-tuning comparison only when metrics were loaded
if metrics_df.empty:
    print("No data available for method comparison.")
else:
    comparison_results, dialect_comparison = comprehensive_method_comparison(metrics_df)

In [None]:
def plot_performance_comparison(df):
    """Plot performance comparison between PEFT and Full Fine-tuning.

    Draws a 2x2 figure: WER by dialect, final-loss distribution, WER against
    training time, and the per-method averages. Each panel is drawn only when
    every column it needs is present, so a partial metrics table produces a
    partial figure rather than a KeyError.

    Args:
        df: Metrics table with one row per experiment. ``final_wer`` is
            required; ``model_type``, ``dialect``, ``final_loss`` and
            ``training_time_minutes`` are optional.

    Returns:
        None. Prints a notice and returns early when ``df`` is empty or has
        no ``final_wer`` column.
    """
    if df.empty or 'final_wer' not in df.columns:
        print("No WER data available for plotting.")
        return

    has_method = 'model_type' in df.columns

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('PEFT vs Full Fine-tuning: Performance Comparison', fontsize=16, fontweight='bold')

    # 1. WER by Dialect and Method
    if 'dialect' in df.columns and has_method:
        sns.barplot(data=df, x='dialect', y='final_wer', hue='model_type', ax=axes[0,0])
        axes[0,0].set_title('Word Error Rate (WER) by Dialect')
        axes[0,0].set_ylabel('WER (%)')
        axes[0,0].tick_params(axis='x', rotation=45)

    # 2. Training Loss by Method
    if 'final_loss' in df.columns and has_method:
        sns.boxplot(data=df, x='model_type', y='final_loss', ax=axes[0,1])
        axes[0,1].set_title('Final Training Loss Distribution')
        axes[0,1].set_ylabel('Training Loss')

    # 3. WER vs Training Time (grouping needs the method column as well)
    if 'training_time_minutes' in df.columns and has_method:
        scatter_colors = {'PEFT_LoRA': 'blue', 'Full_FineTune': 'red'}
        for model_type, group in df.groupby('model_type'):
            color = scatter_colors.get(model_type, 'gray')
            axes[1,0].scatter(group['training_time_minutes'], group['final_wer'],
                            label=model_type, alpha=0.7, s=80, color=color)

        axes[1,0].set_xlabel('Training Time (minutes)')
        axes[1,0].set_ylabel('WER (%)')
        axes[1,0].set_title('Performance vs Training Time')
        axes[1,0].legend()
        axes[1,0].grid(True, alpha=0.3)

    # 4. Average Performance Summary
    if has_method:
        # Only average the columns that exist, and label the legend to match
        legend_labels = {'final_wer': 'WER (%)', 'final_loss': 'Training Loss'}
        summary_cols = [col for col in legend_labels if col in df.columns]
        avg_metrics = df.groupby('model_type')[summary_cols].mean()
        avg_metrics.plot(kind='bar', ax=axes[1,1], rot=45)
        axes[1,1].set_title('Average Performance Metrics')
        axes[1,1].set_ylabel('Value')
        axes[1,1].legend([legend_labels[col] for col in summary_cols])

    plt.tight_layout()
    plt.show()

# Draw the performance figure, or explain why there is nothing to draw
if metrics_df.empty:
    print("No data available for plotting. Please run training experiments first.")
else:
    plot_performance_comparison(metrics_df)

## 5. Generate Resource Usage Plots
Plot resource usage metrics including memory consumption and parameter efficiency.

In [None]:
def plot_resource_usage(df):
    """Plot resource usage comparison.

    Draws a 2x2 figure: peak memory, trainable parameters, training time by
    dialect, and the efficiency-score distribution. Each panel is drawn only
    when every column it needs is present.

    Args:
        df: Metrics table with one row per experiment. Optional columns:
            ``model_type``, ``dialect``, ``peak_memory_gb``,
            ``trainable_params_millions``, ``training_time_minutes`` and
            ``efficiency_score``.

    Returns:
        None. Prints a notice and returns early when ``df`` is empty.
    """
    if df.empty:
        print("No data available for resource plotting.")
        return

    has_method = 'model_type' in df.columns

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Resource Usage Comparison: PEFT vs Full Fine-tuning', fontsize=16, fontweight='bold')

    # 1. Memory Usage
    if 'peak_memory_gb' in df.columns and has_method:
        sns.barplot(data=df, x='model_type', y='peak_memory_gb', ax=axes[0,0])
        axes[0,0].set_title('Peak GPU Memory Usage')
        axes[0,0].set_ylabel('Memory (GB)')

        # Add value labels on bars
        for bar in axes[0,0].patches:
            height = bar.get_height()
            axes[0,0].text(bar.get_x() + bar.get_width()/2., height + 0.1,
                          f'{height:.1f}GB', ha='center', va='bottom')

    # 2. Trainable Parameters
    if 'trainable_params_millions' in df.columns and has_method:
        sns.barplot(data=df, x='model_type', y='trainable_params_millions', ax=axes[0,1])
        axes[0,1].set_title('Trainable Parameters')
        axes[0,1].set_ylabel('Parameters (Millions)')

        # Add value labels (offset scales with the bar height)
        for bar in axes[0,1].patches:
            height = bar.get_height()
            axes[0,1].text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                          f'{height:.1f}M', ha='center', va='bottom')

    # 3. Training Time by Dialect (the hue needs the method column as well)
    if 'training_time_minutes' in df.columns and 'dialect' in df.columns and has_method:
        sns.barplot(data=df, x='dialect', y='training_time_minutes', hue='model_type', ax=axes[1,0])
        axes[1,0].set_title('Training Time by Dialect')
        axes[1,0].set_ylabel('Time (minutes)')
        axes[1,0].tick_params(axis='x', rotation=45)

    # 4. Efficiency Score (Performance per GB)
    if 'efficiency_score' in df.columns and has_method:
        sns.boxplot(data=df, x='model_type', y='efficiency_score', ax=axes[1,1])
        axes[1,1].set_title('Efficiency Score\n(Performance per GB Memory)')
        axes[1,1].set_ylabel('Efficiency Score')

    plt.tight_layout()
    plt.show()

# Draw the resource-usage figure, or explain why there is nothing to draw
if metrics_df.empty:
    print("No data available for plotting. Please run training experiments first.")
else:
    plot_resource_usage(metrics_df)

## 6. Create Efficiency Summary Table
Generate a summary table comparing PEFT vs Full Fine-tuning efficiency.

In [None]:
def create_efficiency_summary(df):
    """Print an efficiency summary table comparing the training methods.

    The table holds the per-``model_type`` mean of every available summary
    column, rounded to two decimals. When both a row whose method name
    contains "PEFT" and one containing "Full" exist, the relative change of
    PEFT against full fine-tuning is printed for each available metric.

    Args:
        df: Metrics table with one row per experiment; needs a ``model_type``
            column and at least one of the summary columns.

    Returns:
        None. All output goes to stdout.
    """
    if df.empty or 'model_type' not in df.columns:
        print("No data available for efficiency summary.")
        return

    # Group by model type and calculate averages
    summary_cols = ['final_wer', 'training_time_minutes', 'peak_memory_gb',
                   'trainable_params_millions', 'total_params_millions']

    available_cols = [col for col in summary_cols if col in df.columns]

    if not available_cols:
        print("No relevant columns available for summary.")
        return

    summary = df.groupby('model_type')[available_cols].mean().round(2)

    print("\n" + "="*80)
    print("EFFICIENCY SUMMARY: PEFT LoRA vs Full Fine-tuning")
    print("="*80)

    # Display the summary table
    print(summary.to_string())

    # Calculate improvement percentages
    if len(summary) >= 2:
        print("\n" + "-"*50)
        print("PEFT LoRA IMPROVEMENTS OVER FULL FINE-TUNING:")
        print("-"*50)

        peft_row = summary.loc[summary.index.str.contains('PEFT', case=False, na=False)]
        full_row = summary.loc[summary.index.str.contains('Full', case=False, na=False)]

        if not peft_row.empty and not full_row.empty:
            peft_values = peft_row.iloc[0]
            full_values = full_row.iloc[0]

            # Relative change of PEFT vs full fine-tuning
            # (negative = better for WER, time, memory)
            metric_labels = {
                'final_wer': 'WER',
                'training_time_minutes': 'Training Time',
                'peak_memory_gb': 'Memory Usage',
                'trainable_params_millions': 'Trainable Parameters',
            }

            for column, metric in metric_labels.items():
                # A metric that was never recorded must not show up as a
                # misleading "+0.0%" line.
                if column not in available_cols:
                    continue

                baseline = full_values[column]
                if pd.isna(baseline) or baseline == 0:
                    continue  # relative change is undefined

                improvement = ((peft_values[column] - baseline) / baseline) * 100
                if not np.isnan(improvement) and abs(improvement) < 1000:  # Sanity check
                    sign = "↓" if improvement < 0 else "↑"
                    print(f"{metric:20}: {improvement:+6.1f}% {sign}")

    print("\n" + "="*80)

# Print the efficiency summary, or explain why there is nothing to summarise
if metrics_df.empty:
    print("No data available for summary. Please run training experiments first.")
else:
    create_efficiency_summary(metrics_df)

## 7. Generate Model Performance Comparison Charts
Create detailed comparison charts showing performance across different dialects and configurations.

In [None]:
def plot_detailed_comparison(df):
    """Create detailed comparison charts.

    Builds one 3x3-grid figure with: WER by dialect (spanning two cells),
    memory vs WER scatter, trainable-parameter share, training-time
    distribution, an inverse-metric efficiency comparison and a text box
    summarising the experiments. Each chart is drawn only when every column
    it needs is present.

    Args:
        df: Metrics table with one row per experiment. Optional columns:
            ``model_type``, ``dialect``, ``final_wer``, ``peak_memory_gb``,
            ``trainable_percentage`` and ``training_time_minutes``.

    Returns:
        None. Prints a notice and returns early when ``df`` is empty.
    """
    if df.empty:
        print("No data available for detailed comparison.")
        return

    has_method = 'model_type' in df.columns
    has_wer = 'final_wer' in df.columns

    # Create a comprehensive comparison figure
    fig = plt.figure(figsize=(20, 15))

    # 1. Performance by Dialect (Large subplot)
    ax1 = plt.subplot(3, 3, (1, 4))
    if 'dialect' in df.columns and has_wer and has_method:
        dialect_performance = df.groupby(['dialect', 'model_type'])['final_wer'].mean().unstack()
        dialect_performance.plot(kind='bar', ax=ax1, rot=45, width=0.8)
        ax1.set_title('WER Performance by Dialect', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Word Error Rate (%)')
        ax1.legend(title='Method')
        ax1.grid(True, alpha=0.3)

    # 2. Memory vs Performance Scatter
    ax2 = plt.subplot(3, 3, 3)
    if 'peak_memory_gb' in df.columns and has_wer and has_method:
        for model_type, group in df.groupby('model_type'):
            ax2.scatter(group['peak_memory_gb'], group['final_wer'],
                       label=model_type, alpha=0.7, s=80)
        ax2.set_xlabel('Memory (GB)')
        ax2.set_ylabel('WER (%)')
        ax2.set_title('Memory vs Performance')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

    # 3. Parameter Efficiency
    ax3 = plt.subplot(3, 3, 6)
    if 'trainable_percentage' in df.columns and has_method:
        sns.barplot(data=df, x='model_type', y='trainable_percentage', ax=ax3)
        ax3.set_title('Parameter Efficiency')
        ax3.set_ylabel('Trainable Parameters (%)')
        ax3.tick_params(axis='x', rotation=45)

    # 4. Training Time Distribution
    ax4 = plt.subplot(3, 3, 7)
    if 'training_time_minutes' in df.columns and has_method:
        df.boxplot(column='training_time_minutes', by='model_type', ax=ax4)
        ax4.set_title('Training Time Distribution')
        ax4.set_ylabel('Time (minutes)')
        ax4.tick_params(axis='x', rotation=45)

    # 5. Overall efficiency comparison (if we have multiple methods)
    ax5 = plt.subplot(3, 3, 8)
    if has_method and len(df['model_type'].unique()) >= 2:
        metrics = ['final_wer', 'training_time_minutes', 'peak_memory_gb']
        available_metrics = [m for m in metrics if m in df.columns]

        if available_metrics:
            efficiency_data = df.groupby('model_type')[available_metrics].mean()
            # Lower is better for all these metrics, so invert them to make
            # higher better. A zero mean has no finite inverse; blank it out
            # instead of plotting an infinite bar.
            normalized_data = 100 / efficiency_data.replace(0, np.nan)

            normalized_data.plot(kind='bar', ax=ax5, rot=45)
            ax5.set_title('Efficiency Comparison\n(Higher = Better)')
            ax5.set_ylabel('Efficiency Score')
            ax5.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # 6. Summary Statistics
    ax6 = plt.subplot(3, 3, 9)
    ax6.axis('off')

    # Create summary text; labels go through str() so that numeric or
    # missing values cannot break the join.
    summary_text = "EXPERIMENT SUMMARY\n\n"
    summary_text += f"Total Experiments: {len(df)}\n"
    if 'dialect' in df.columns:
        summary_text += f"Dialects: {', '.join(map(str, df['dialect'].unique()))}\n"
    if has_method:
        summary_text += f"Methods: {', '.join(map(str, df['model_type'].unique()))}\n\n"

    # Best performance (skipped when no WER value was recorded at all)
    if has_wer and df['final_wer'].notna().any():
        best_wer = df.loc[df['final_wer'].idxmin()]
        summary_text += f"Best WER: {best_wer['final_wer']:.2f}%\n"
        summary_text += f"Method: {best_wer.get('model_type', 'N/A')}\n"
        summary_text += f"Dialect: {best_wer.get('dialect', 'N/A')}\n"

    ax6.text(0.1, 0.9, summary_text, transform=ax6.transAxes,
             fontsize=11, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))

    plt.suptitle('Comprehensive PEFT vs Full Fine-tuning Analysis',
                 fontsize=16, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.show()

# Draw the detailed comparison figure, or explain why there is nothing to draw
if metrics_df.empty:
    print("No data available for plotting. Please run training experiments first.")
else:
    plot_detailed_comparison(metrics_df)

## 8. Save Analysis Results
Save the analysis results and plots for documentation.

In [None]:
def save_analysis_results(df, output_dir="./analysis_results"):
    """Save analysis results to files.

    Writes ``training_metrics_processed.csv`` (the full table) and, when a
    ``model_type`` column and at least one summary column exist,
    ``efficiency_summary.csv`` (per-method mean and standard deviation).

    Args:
        df: Metrics table with one row per experiment.
        output_dir: Directory to write into; created, including any missing
            parent directories, when it does not exist.

    Returns:
        None. Prints a notice and returns early when ``df`` is empty.
    """
    if df.empty:
        print("No data to save.")
        return

    output_path = Path(output_dir)
    # parents=True so that a nested output directory does not raise
    output_path.mkdir(parents=True, exist_ok=True)

    # Save processed data
    csv_path = output_path / "training_metrics_processed.csv"
    df.to_csv(csv_path, index=False)
    print(f"Processed data saved to: {csv_path}")

    # Save summary statistics
    if 'model_type' in df.columns:
        summary_cols = ['final_wer', 'training_time_minutes', 'peak_memory_gb',
                       'trainable_params_millions']
        available_cols = [col for col in summary_cols if col in df.columns]

        if available_cols:
            summary = df.groupby('model_type')[available_cols].agg(['mean', 'std']).round(3)
            summary_path = output_path / "efficiency_summary.csv"
            summary.to_csv(summary_path)
            print(f"Summary statistics saved to: {summary_path}")

    print(f"\nAll analysis results saved to: {output_path}")

# Persist the processed metrics when there is anything to persist
if not metrics_df.empty:
    save_analysis_results(metrics_df)

# Closing banner with a short how-to; printed whether or not data was saved
closing_rule = "=" * 80
print("\n" + closing_rule)
print("ANALYSIS COMPLETE!")
print("\nTo generate this analysis:")
for instruction in (
    "1. Run training experiments with the updated script",
    "2. The script automatically saves metrics_*.json files",
    "3. This notebook loads and analyzes all available metrics",
    "4. Comparison plots show PEFT vs Full Fine-tuning efficiency",
):
    print(instruction)
print(closing_rule)

## 9. Export Comprehensive Analysis Report
Generate and export a comprehensive analysis report with all findings and visualizations.

In [None]:
def generate_comprehensive_report(df, output_dir="./analysis_results"):
    """Generate a comprehensive analysis report with executive summary.

    The same Markdown text is written to ``comprehensive_analysis_report.md``
    and ``analysis_summary.txt``. When more than one method is present, the
    table from ``create_comparison_table`` is additionally written to
    ``method_comparison_table.csv``. Sections whose input columns are missing
    are left out instead of raising.

    Args:
        df: Metrics table with one row per experiment. Every column is
            optional; the ones read here are ``model_type``, ``dialect``,
            ``final_wer``, ``final_cer``, ``training_time_hours``,
            ``peak_memory_gb``, ``trainable_percentage``, ``lora_rank``,
            ``parameter_efficiency_ratio``, ``lora_effectiveness``,
            ``base_model`` and ``batch_size``.
        output_dir: Directory to write into; created, including any missing
            parent directories, when it does not exist.

    Returns:
        Path of the Markdown report, or None when ``df`` is empty.
    """
    if df.empty:
        print("No data available for report generation.")
        return

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    has_method = 'model_type' in df.columns
    has_dialect = 'dialect' in df.columns
    has_wer = 'final_wer' in df.columns
    several_methods = has_method and len(df['model_type'].unique()) > 1

    # Everything that more than one section reads is bound here, once, so
    # that no section depends on a name another section's branch may have
    # skipped.
    if has_method:
        peft_data = df[df['model_type'] == 'PEFT_LoRA']
        full_data = df[df['model_type'] == 'Full_FineTune']
    else:
        peft_data = full_data = df.iloc[0:0]
    both_methods = not peft_data.empty and not full_data.empty

    peft_wer = full_wer = None
    if both_methods and has_wer:
        peft_wer = peft_data['final_wer'].mean()
        full_wer = full_data['final_wer'].mean()

    dialect_performance = None
    if has_dialect and has_wer:
        dialect_performance = df.groupby('dialect')['final_wer'].mean().sort_values()
        if dialect_performance.empty:
            dialect_performance = None

    # Executive Summary
    report_content = [
        "# Arabic Dialect PEFT Training Analysis Report",
        "=" * 60,
        "",
        "## Executive Summary",
        "",
    ]

    # Basic statistics (labels go through str() so non-string values join)
    total_experiments = len(df)
    methods = df['model_type'].value_counts().to_dict() if has_method else {}
    dialects = df['dialect'].value_counts().to_dict() if has_dialect else {}

    report_content.append(f"**Total Experiments Analyzed:** {total_experiments}")
    report_content.append(f"**Methods Compared:** {', '.join(map(str, methods))}")
    report_content.append(f"**Dialects Tested:** {', '.join(map(str, dialects))}")
    report_content.append("")

    # Performance summary
    if has_wer:
        best_wer = df['final_wer'].min()
        avg_wer = df['final_wer'].mean()
        worst_wer = df['final_wer'].max()

        report_content.append("### Performance Overview")
        report_content.append(f"- **Best WER achieved:** {best_wer:.2f}%")
        report_content.append(f"- **Average WER:** {avg_wer:.2f}%")
        report_content.append(f"- **WER range:** {best_wer:.2f}% - {worst_wer:.2f}%")
        report_content.append("")

    # Method comparison
    if several_methods:
        report_content.append("### Method Comparison Summary")

        if both_methods:
            if peft_wer is not None:
                wer_improvement = ((full_wer - peft_wer) / full_wer) * 100

                report_content.append(f"- **PEFT LoRA average WER:** {peft_wer:.2f}%")
                report_content.append(f"- **Full Fine-tune average WER:** {full_wer:.2f}%")
                report_content.append(f"- **WER improvement:** {wer_improvement:+.1f}%")

            # Resource efficiency
            if 'trainable_percentage' in peft_data.columns:
                avg_params = peft_data['trainable_percentage'].mean()
                report_content.append(f"- **Parameter reduction:** {100-avg_params:.1f}%")

            if 'training_time_hours' in df.columns:
                peft_time = peft_data['training_time_hours'].mean()
                full_time = full_data['training_time_hours'].mean()
                time_saving = ((full_time - peft_time) / full_time) * 100
                report_content.append(f"- **Training time saving:** {time_saving:.1f}%")

            report_content.append("")

    # Key findings
    report_content.append("### Key Findings")

    # Best performing combination (needs at least one recorded WER value)
    if has_dialect and has_method and has_wer and df['final_wer'].notna().any():
        best_combination = df.loc[df['final_wer'].idxmin()]
        report_content.append(f"- **Best performing combination:** {best_combination['model_type']} on {best_combination['dialect']} dialect")

    # Dialect-specific analysis (series is sorted ascending by mean WER)
    if dialect_performance is not None:
        best_dialect = dialect_performance.index[0]
        worst_dialect = dialect_performance.index[-1]

        report_content.append(f"- **Easiest dialect:** {best_dialect} (avg WER: {dialect_performance.iloc[0]:.2f}%)")
        report_content.append(f"- **Most challenging dialect:** {worst_dialect} (avg WER: {dialect_performance.iloc[-1]:.2f}%)")

    # LORA-specific insights
    if not peft_data.empty:
        report_content.append("")
        report_content.append("### LORA-Specific Insights")

        if 'lora_rank' in peft_data.columns and has_wer:
            rank_performance = peft_data.groupby('lora_rank')['final_wer'].mean()
            if rank_performance.notna().any():
                best_rank = rank_performance.idxmin()
                report_content.append(f"- **Optimal LoRA rank:** {best_rank} (WER: {rank_performance[best_rank]:.2f}%)")

        if 'parameter_efficiency_ratio' in peft_data.columns:
            avg_efficiency = peft_data['parameter_efficiency_ratio'].mean()
            report_content.append(f"- **Average parameter efficiency:** {avg_efficiency:.4f} ({avg_efficiency*100:.2f}% of base model)")

        if 'lora_effectiveness' in peft_data.columns:
            avg_effectiveness = peft_data['lora_effectiveness'].mean()
            report_content.append(f"- **Average LoRA effectiveness score:** {avg_effectiveness:.2f}")

    # Recommendations
    report_content.append("")
    report_content.append("### Recommendations")

    if peft_wer is not None:
        if peft_wer <= full_wer * 1.05:  # Within 5% of full fine-tuning
            report_content.append("- **Recommended approach:** PEFT LoRA for production use")
            report_content.append("  - Achieves comparable performance to full fine-tuning")
            report_content.append("  - Significantly reduces computational requirements")
            report_content.append("  - Faster training and deployment")
        else:
            report_content.append("- **Consider hybrid approach:** PEFT for prototyping, full fine-tuning for production")
            report_content.append("  - Use PEFT for rapid experimentation")
            report_content.append("  - Full fine-tuning for maximum performance")

    if dialect_performance is not None:
        # Dialect-specific recommendations: the two highest mean WERs
        difficult_dialects = dialect_performance.tail(2).index.tolist()
        if difficult_dialects:
            report_content.append(f"- **Focus additional research on:** {', '.join(map(str, difficult_dialects))} dialects")
            report_content.append("  - Consider dialect-specific data augmentation")
            report_content.append("  - Explore cross-dialect transfer learning")

    # Technical details
    report_content.append("")
    report_content.append("## Detailed Technical Analysis")
    report_content.append("")

    # Statistical summary
    if has_method:
        summary_cols = ['final_wer', 'final_cer', 'training_time_hours', 'peak_memory_gb']
        available_cols = [col for col in summary_cols if col in df.columns]

        if available_cols:
            summary_stats = df.groupby('model_type')[available_cols].agg(['mean', 'std']).round(3)

            report_content.append("### Performance Statistics by Method")
            report_content.append("")
            report_content.append("```")
            report_content.append(summary_stats.to_string())
            report_content.append("```")
            report_content.append("")

    # Experimental setup
    report_content.append("### Experimental Setup")
    if 'base_model' in df.columns:
        base_models = df['base_model'].unique()
        report_content.append(f"- **Base Models:** {', '.join(map(str, base_models))}")

    # .tolist() yields plain Python numbers, so the list renders as "[8, 16]"
    if 'lora_rank' in peft_data.columns:
        ranks_tested = sorted(peft_data['lora_rank'].unique().tolist())
        report_content.append(f"- **LoRA Ranks Tested:** {ranks_tested}")

    if 'batch_size' in df.columns:
        batch_sizes = sorted(df['batch_size'].unique().tolist())
        report_content.append(f"- **Batch Sizes:** {batch_sizes}")

    # Save report
    report_text = "\n".join(report_content)
    report_path = output_path / "comprehensive_analysis_report.md"

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report_text)

    print(f"Comprehensive analysis report saved to: {report_path}")

    # Also save as text file
    txt_path = output_path / "analysis_summary.txt"
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(report_text)

    print(f"Text summary saved to: {txt_path}")

    # Generate comparison table
    if several_methods:
        comparison_table = create_comparison_table(df)
        table_path = output_path / "method_comparison_table.csv"
        comparison_table.to_csv(table_path)
        print(f"Comparison table saved to: {table_path}")

    return report_path

def create_comparison_table(df):
    """Build a per-method table of descriptive statistics.

    Each row describes one value of ``model_type``: its name (``Method``),
    the number of rows it has (``Experiments``) and, for every known metric
    column present in ``df`` that has at least one non-missing value for that
    method, the ``<metric>_mean``, ``<metric>_std``, ``<metric>_min`` and
    ``<metric>_max`` columns.

    Returns:
        The table as a DataFrame; an empty DataFrame when ``df`` has no
        ``model_type`` column or none of the known metric columns.
    """
    known_metrics = ('final_wer', 'final_cer', 'training_time_hours', 'peak_memory_gb',
                     'trainable_params_millions', 'trainable_percentage')
    statistics = ('mean', 'std', 'min', 'max')

    if 'model_type' not in df.columns:
        return pd.DataFrame()

    present_metrics = [name for name in known_metrics if name in df.columns]
    if not present_metrics:
        return pd.DataFrame()

    records = []
    for method_name in df['model_type'].unique():
        subset = df[df['model_type'] == method_name]
        record = {'Method': method_name, 'Experiments': len(subset)}

        for metric_name in present_metrics:
            observed = subset[metric_name].dropna()
            if observed.empty:
                # No usable values for this method: leave its columns out
                continue
            for statistic in statistics:
                record[f'{metric_name}_{statistic}'] = getattr(observed, statistic)()

        records.append(record)

    return pd.DataFrame(records)

def export_visualizations(output_dir="./analysis_results"):
    """Prepare the directory that exported plot images should go into.

    Only the ``plots`` sub-directory is created here; no figure is written
    by this function.

    Args:
        output_dir: Base results directory. It and the ``plots``
            sub-directory are created when missing.

    Returns:
        Path of the ``plots`` directory.
    """
    output_path = Path(output_dir)
    plots_dir = output_path / "plots"
    # parents=True: output_dir itself may not exist yet when this runs first
    plots_dir.mkdir(parents=True, exist_ok=True)

    print(f"Visualization plots will be saved to: {plots_dir}")
    print("Note: Run the analysis cells above to generate and save plots.")

    return plots_dir

# Build the final report, or explain why nothing could be generated
if metrics_df.empty:
    print("No data available for comprehensive report generation.")
    print("Please run training experiments first to generate metrics data.")
else:
    print("Generating comprehensive analysis report...")
    report_path = generate_comprehensive_report(metrics_df)
    plots_dir = export_visualizations()

    banner_rule = "=" * 80
    print("\n" + banner_rule)
    print("COMPREHENSIVE ANALYSIS COMPLETE!")
    print(banner_rule)
    print(f"📊 Main report: {report_path}")
    print(f"📈 Plots directory: {plots_dir}")
    print(f"📋 CSV data: ./analysis_results/training_metrics_processed.csv")
    print(f"📝 Summary table: ./analysis_results/method_comparison_table.csv")
    print("\nFiles generated:")
    # One line per artefact the analysis cells are expected to have written
    for artefact_line in (
        "- comprehensive_analysis_report.md (Markdown report)",
        "- analysis_summary.txt (Plain text summary)",
        "- training_metrics_processed.csv (Processed data)",
        "- method_comparison_table.csv (Detailed comparison)",
        "- efficiency_summary.csv (Statistical summary)",
    ):
        print(artefact_line)
    print(banner_rule)