# Arabic Dialect PEFT Training Metrics Analysis

This notebook analyzes the training metrics recorded during Arabic dialect fine-tuning experiments.
It compares PEFT LoRA vs Full Fine-tuning across different dialects.

**Data Source:** Metrics automatically recorded by `dialect_peft_training.py`

## 1. Import Required Libraries
Import necessary libraries including matplotlib, pandas, numpy, and seaborn for data visualization and analysis.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import glob
from pathlib import Path
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot appearance: seaborn-flavoured matplotlib style plus the "husl" palette.
sns.set_palette("husl") if False else plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Default figure geometry and font sizes, applied in a single batch.
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
})

print("Libraries imported successfully!")

## 2. Load Training Data
Load the recorded training data from the JSON result files generated during the fine-tuning process.

In [None]:
def _parse_results_filename(stem):
    """Extract ``(method, dialect, seed)`` from a results file stem.

    The stem is split on underscores and is expected to look like
    ``results_whisper-small-peft_egyptian_seed42``: a prefix, the model tag,
    the dialect and an optional seed part.

    Returns:
        A ``(method, dialect, seed)`` tuple, or ``None`` when the stem has
        fewer than three underscore-separated parts.

    Raises:
        ValueError: If a seed part is present but is not an integer.
    """
    parts = stem.split('_')
    if len(parts) < 3:
        return None

    model_part, dialect = parts[1], parts[2]

    # The seed part is optional; files without one are recorded as seed 42.
    if len(parts) > 3 and "seed" in parts[3]:
        seed = int(parts[3].replace("seed", ""))
    else:
        seed = 42

    # The training method is encoded in the model tag.
    if "peft" in model_part:
        method = "PEFT_LoRA"
    elif "finetune" in model_part:
        method = "Full_FineTune"
    else:
        method = "Unknown"

    return method, dialect, seed


def load_training_metrics(base_dir="./results"):
    """Load all training metrics from JSON files under *base_dir*.

    Two kinds of files are read:

    * ``results_whisper-*.json`` inside the experiment subdirectories
      (``ex_finetune``, ``ex_scratch``, ``ex_peft``,
      ``ex_comparison_dialectal``). Method, dialect and seed come from the
      file name; metric values come from the JSON body, with ``0`` used for
      any metric key that is absent.
    * ``metrics_*.json`` directly inside *base_dir*. Their parsed content is
      appended as-is.

    Files that cannot be read or parsed are reported on stdout and skipped,
    so one bad file does not abort the whole load.

    Args:
        base_dir: Root directory holding the experiment results.

    Returns:
        A ``pandas.DataFrame`` with one row per loaded file, or an empty
        DataFrame when nothing was found.
    """
    base_path = Path(base_dir)

    # Look in original repository subdirectories
    subdirs = ["ex_finetune", "ex_scratch", "ex_peft", "ex_comparison_dialectal"]
    all_metrics = []

    for subdir in subdirs:
        subdir_path = base_path / subdir
        if not subdir_path.exists():
            continue

        # Sorted so the row order is reproducible across runs and platforms.
        for file_path in sorted(subdir_path.glob("results_whisper-*.json")):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # e.g. results_whisper-small-peft_egyptian_seed42
                filename = file_path.stem
                parsed = _parse_results_filename(filename)
                if parsed is None:
                    print(f"Skipped {file_path}: unexpected file name format")
                    continue
                method, dialect, seed = parsed

                # Create consistent data structure
                metrics = {
                    'experiment_name': filename,
                    'dialect': dialect,
                    'model_type': method,
                    'seed': seed,
                    'final_wer': data.get('wer', data.get('final_wer', 0)),
                    'final_cer': data.get('cer', data.get('final_cer', 0)),
                    'training_time_seconds': data.get('training_time_seconds', data.get('training_time', 0)),
                    'peak_memory_mb': data.get('peak_memory_mb', 0),
                    'trainable_params': data.get('trainable_params', 0),
                    'total_params': data.get('total_params', 0),
                    'final_loss': data.get('final_loss', 0),
                    'source_file': str(file_path)
                }
            except Exception as e:
                # Best-effort load: report the failure and move on.
                print(f"Error loading {file_path}: {e}")
                continue

            all_metrics.append(metrics)
            print(f"Loaded: {file_path} -> {method} {dialect}")

    # Also check for any metrics files in main results directory
    for file_path in sorted(base_path.glob("metrics_*.json")):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue
        all_metrics.append(data)
        print(f"Loaded: {file_path}")

    if not all_metrics:
        print("No metrics files found.")
        print("Make sure you've run training experiments that save results in:")
        print("  - ./results/ex_peft/")
        print("  - ./results/ex_finetune/")
        print("  - ./results/ex_scratch/")
        return pd.DataFrame()

    df = pd.DataFrame(all_metrics)
    print(f"\nLoaded {len(df)} experiments")
    print(f"Methods found: {df['model_type'].unique() if 'model_type' in df.columns else 'N/A'}")
    print(f"Dialects found: {df['dialect'].unique() if 'dialect' in df.columns else 'N/A'}")
    return df

# Read every available metrics file into a single DataFrame
metrics_df = load_training_metrics()

# Report what was loaded, or point the user at the missing prerequisite
if metrics_df.empty:
    print("No data loaded. Please run some training experiments first.")
else:
    print("\nDataset overview:")
    print(metrics_df.head())
    print(f"\nColumns: {metrics_df.columns.tolist()}")

## 3. Data Preprocessing and Cleaning
Clean and preprocess the training data, handle missing values, and format the data for visualization.

In [None]:
def preprocess_metrics(df):
    """Add unit-converted and derived columns to the metrics data.

    For each source column that is present, a derived column is added:

    * ``training_time_seconds`` -> ``training_time_minutes`` (divided by 60)
    * ``peak_memory_mb`` -> ``peak_memory_gb`` (divided by 1024)
    * ``total_params`` -> ``total_params_millions``
    * ``trainable_params`` -> ``trainable_params_millions``

    When both ``final_wer`` and ``peak_memory_gb`` are available,
    ``efficiency_score`` is computed as ``(100 - final_wer) / peak_memory_gb``.
    Rows whose memory value is not positive get ``NaN`` instead of an
    infinite score.

    Args:
        df: DataFrame of raw metrics. It is modified in place.

    Returns:
        The same DataFrame object, with the extra columns added. An empty
        DataFrame is returned unchanged.
    """
    if df.empty:
        return df

    # (source column, derived column, divisor)
    unit_conversions = (
        ('training_time_seconds', 'training_time_minutes', 60),
        ('peak_memory_mb', 'peak_memory_gb', 1024),
        ('total_params', 'total_params_millions', 1_000_000),
        ('trainable_params', 'trainable_params_millions', 1_000_000),
    )
    for source, derived, divisor in unit_conversions:
        if source in df.columns:
            df[derived] = df[source] / divisor

    # Create efficiency ratio (performance / resources)
    if 'final_wer' in df.columns and 'peak_memory_gb' in df.columns:
        memory_gb = df['peak_memory_gb']
        # Lower WER is better, so use (100 - WER) for efficiency. Masking
        # non-positive memory avoids inf scores from division by zero.
        df['efficiency_score'] = (100 - df['final_wer']) / memory_gb.where(memory_gb > 0)

    return df

# Derive the display columns, then print a short sanity report
if not metrics_df.empty:
    metrics_df = preprocess_metrics(metrics_df)

    print("Data preprocessing complete!")
    print("\nSample of processed data:")

    # Headline columns; only the ones actually present are displayed
    key_cols = [
        'experiment_name', 'dialect', 'model_type', 'final_wer',
        'training_time_minutes', 'peak_memory_gb', 'trainable_params_millions',
    ]
    available_cols = [name for name in key_cols if name in metrics_df.columns]
    if len(available_cols) > 0:
        print(metrics_df.loc[:, available_cols].head())

    print(f"\nDataset shape: {metrics_df.shape}")

    if 'model_type' in metrics_df.columns:
        print(f"Model types: {metrics_df['model_type'].unique()}")
    else:
        print("Model types: N/A")

    if 'dialect' in metrics_df.columns:
        print(f"Dialects: {metrics_df['dialect'].unique()}")
    else:
        print("Dialects: N/A")

## 4. Generate Training Performance Plots
Create plots showing training performance metrics like WER and training loss.

In [None]:
def plot_performance_comparison(df):
    """Plot a 2x2 performance overview of the recorded experiments.

    Panels (each drawn only when its columns are present, hidden otherwise):

    1. Bar chart of ``final_wer`` per ``dialect``, split by ``model_type``.
    2. Box plot of ``final_loss`` per ``model_type``.
    3. Scatter of ``final_wer`` against ``training_time_minutes``, coloured
       by ``model_type`` when that column exists.
    4. Bar chart of the per-``model_type`` mean of ``final_wer`` and, when
       available, ``final_loss``.

    Args:
        df: Metrics DataFrame. Must contain ``final_wer``; otherwise a
            message is printed and nothing is drawn.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if df.empty or 'final_wer' not in df.columns:
        print("No WER data available for plotting.")
        return

    has_model_type = 'model_type' in df.columns

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('PEFT vs Full Fine-tuning: Performance Comparison', fontsize=16, fontweight='bold')

    # 1. WER by Dialect and Method
    ax = axes[0, 0]
    if 'dialect' in df.columns and has_model_type:
        sns.barplot(data=df, x='dialect', y='final_wer', hue='model_type', ax=ax)
        ax.set_title('Word Error Rate (WER) by Dialect')
        ax.set_ylabel('WER (%)')
        ax.tick_params(axis='x', rotation=45)
    else:
        ax.set_visible(False)

    # 2. Training Loss by Method
    ax = axes[0, 1]
    if 'final_loss' in df.columns and has_model_type:
        sns.boxplot(data=df, x='model_type', y='final_loss', ax=ax)
        ax.set_title('Final Training Loss Distribution')
        ax.set_ylabel('Training Loss')
    else:
        ax.set_visible(False)

    # 3. WER vs Training Time
    ax = axes[1, 0]
    if 'training_time_minutes' in df.columns:
        scatter_colors = {'PEFT_LoRA': 'blue', 'Full_FineTune': 'red'}
        # Without a method column all runs form one unlabeled gray group.
        groups = df.groupby('model_type') if has_model_type else [(None, df)]
        for model_type, group in groups:
            ax.scatter(group['training_time_minutes'], group['final_wer'],
                       label=model_type, alpha=0.7, s=80,
                       color=scatter_colors.get(model_type, 'gray'))

        ax.set_xlabel('Training Time (minutes)')
        ax.set_ylabel('WER (%)')
        ax.set_title('Performance vs Training Time')
        if has_model_type:
            ax.legend()
        ax.grid(True, alpha=0.3)
    else:
        ax.set_visible(False)

    # 4. Average Performance Summary
    ax = axes[1, 1]
    if has_model_type:
        # Only average the metrics that exist; the legend follows the columns.
        metric_labels = {'final_wer': 'WER (%)', 'final_loss': 'Training Loss'}
        present = [col for col in metric_labels if col in df.columns]
        avg_metrics = df.groupby('model_type')[present].mean().rename(columns=metric_labels)
        avg_metrics.plot(kind='bar', ax=ax, rot=45)
        ax.set_title('Average Performance Metrics')
        ax.set_ylabel('Value')
    else:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw the performance figure, or explain why there is nothing to draw
if metrics_df.empty:
    print("No data available for plotting. Please run training experiments first.")
else:
    plot_performance_comparison(metrics_df)

## 5. Generate Resource Usage Plots
Plot resource usage metrics including memory consumption and parameter efficiency.

In [None]:
def _label_bars(ax, suffix, abs_pad=0.0, rel_pad=0.0):
    """Write each bar's height (one decimal plus *suffix*) above the bar.

    The label sits ``abs_pad + height * rel_pad`` above the bar top. Bars
    with a non-finite height are skipped.
    """
    for bar in ax.patches:
        height = bar.get_height()
        if not np.isfinite(height):
            continue
        ax.text(bar.get_x() + bar.get_width() / 2.,
                height + abs_pad + height * rel_pad,
                f'{height:.1f}{suffix}', ha='center', va='bottom')


def plot_resource_usage(df):
    """Plot a 2x2 overview of the resources used by each training method.

    Panels (each drawn only when its columns are present, hidden otherwise):

    1. Bar chart of ``peak_memory_gb`` per ``model_type``, with value labels.
    2. Bar chart of ``trainable_params_millions`` per ``model_type``, with
       value labels.
    3. Bar chart of ``training_time_minutes`` per ``dialect``, split by
       ``model_type`` when that column exists.
    4. Box plot of ``efficiency_score`` per ``model_type``.

    Args:
        df: Metrics DataFrame. When empty, a message is printed and nothing
            is drawn.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if df.empty:
        print("No data available for resource plotting.")
        return

    has_model_type = 'model_type' in df.columns

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Resource Usage Comparison: PEFT vs Full Fine-tuning', fontsize=16, fontweight='bold')

    # 1. Memory Usage
    ax = axes[0, 0]
    if 'peak_memory_gb' in df.columns and has_model_type:
        sns.barplot(data=df, x='model_type', y='peak_memory_gb', ax=ax)
        ax.set_title('Peak GPU Memory Usage')
        ax.set_ylabel('Memory (GB)')
        _label_bars(ax, 'GB', abs_pad=0.1)
    else:
        ax.set_visible(False)

    # 2. Trainable Parameters
    ax = axes[0, 1]
    if 'trainable_params_millions' in df.columns and has_model_type:
        sns.barplot(data=df, x='model_type', y='trainable_params_millions', ax=ax)
        ax.set_title('Trainable Parameters')
        ax.set_ylabel('Parameters (Millions)')
        _label_bars(ax, 'M', rel_pad=0.01)
    else:
        ax.set_visible(False)

    # 3. Training Time by Dialect
    ax = axes[1, 0]
    if 'training_time_minutes' in df.columns and 'dialect' in df.columns:
        # Only split by method when the method column actually exists.
        hue = 'model_type' if has_model_type else None
        sns.barplot(data=df, x='dialect', y='training_time_minutes', hue=hue, ax=ax)
        ax.set_title('Training Time by Dialect')
        ax.set_ylabel('Time (minutes)')
        ax.tick_params(axis='x', rotation=45)
    else:
        ax.set_visible(False)

    # 4. Efficiency Score (Performance per GB)
    ax = axes[1, 1]
    if 'efficiency_score' in df.columns and has_model_type:
        sns.boxplot(data=df, x='model_type', y='efficiency_score', ax=ax)
        ax.set_title('Efficiency Score\n(Performance per GB Memory)')
        ax.set_ylabel('Efficiency Score')
    else:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw the resource-usage figure, or explain why there is nothing to draw
if metrics_df.empty:
    print("No data available for plotting. Please run training experiments first.")
else:
    plot_resource_usage(metrics_df)

## 6. Create Efficiency Summary Table
Generate a summary table comparing PEFT vs Full Fine-tuning efficiency.

In [None]:
def create_efficiency_summary(df):
    """Print a per-method summary table and PEFT-vs-full relative changes.

    The table holds the per-``model_type`` mean (rounded to two decimals) of
    whichever of these columns are present: ``final_wer``,
    ``training_time_minutes``, ``peak_memory_gb``,
    ``trainable_params_millions``, ``total_params_millions``.

    When at least two methods are present and one index label contains
    "PEFT" and another contains "Full" (case-insensitive), the relative
    change of the PEFT mean against the full fine-tuning mean is printed
    for each available metric. A metric is left out when its column is
    missing, when the baseline is zero or NaN, or when the change is NaN or
    not below 1000% in magnitude.

    Args:
        df: Metrics DataFrame. Must contain ``model_type``.

    Returns:
        None. All output goes to stdout.
    """
    if df.empty or 'model_type' not in df.columns:
        print("No data available for efficiency summary.")
        return

    # Group by model type and calculate averages
    summary_cols = ['final_wer', 'training_time_minutes', 'peak_memory_gb',
                    'trainable_params_millions', 'total_params_millions']
    available_cols = [col for col in summary_cols if col in df.columns]

    if not available_cols:
        print("No relevant columns available for summary.")
        return

    # Keep the unrounded means for the percentage maths; round for display.
    means = df.groupby('model_type')[available_cols].mean()
    summary = means.round(2)

    print("\n" + "="*80)
    print("EFFICIENCY SUMMARY: PEFT LoRA vs Full Fine-tuning")
    print("="*80)

    # Display the summary table
    print(summary.to_string())

    # Calculate improvement percentages
    if len(means) >= 2:
        print("\n" + "-"*50)
        print("PEFT LoRA IMPROVEMENTS OVER FULL FINE-TUNING:")
        print("-"*50)

        peft_rows = means.loc[means.index.str.contains('PEFT', case=False, na=False)]
        full_rows = means.loc[means.index.str.contains('Full', case=False, na=False)]

        if not peft_rows.empty and not full_rows.empty:
            peft_values = peft_rows.iloc[0]
            full_values = full_rows.iloc[0]

            metric_labels = {
                'final_wer': 'WER',
                'training_time_minutes': 'Training Time',
                'peak_memory_gb': 'Memory Usage',
                'trainable_params_millions': 'Trainable Parameters',
            }

            # Negative change = PEFT is lower (better for WER, time, memory)
            for column, label in metric_labels.items():
                if column not in means.columns:
                    continue  # never report a metric that was not recorded
                baseline = full_values[column]
                if pd.isna(baseline) or baseline == 0:
                    continue  # relative change is undefined
                improvement = (peft_values[column] - baseline) / baseline * 100
                if np.isnan(improvement) or abs(improvement) >= 1000:  # Sanity check
                    continue
                sign = "↓" if improvement < 0 else "↑"
                print(f"{label:20}: {improvement:+6.1f}% {sign}")

    print("\n" + "="*80)

# Print the efficiency summary, or explain why there is nothing to summarize
if metrics_df.empty:
    print("No data available for summary. Please run training experiments first.")
else:
    create_efficiency_summary(metrics_df)

## 7. Generate Model Performance Comparison Charts
Create detailed comparison charts showing performance across different dialects and configurations.

In [None]:
def plot_detailed_comparison(df):
    """Draw a multi-panel comparison figure on a 3x3 grid.

    Panels (each drawn only when the columns it needs are present):

    1. Mean ``final_wer`` per ``dialect`` and ``model_type`` (tall subplot).
    2. Scatter of ``final_wer`` against ``peak_memory_gb``, grouped by
       ``model_type`` when that column exists.
    3. Trainable-parameter percentage per ``model_type``. Uses the
       ``trainable_percentage`` column when present, otherwise derives it
       from ``trainable_params`` / ``total_params``.
    4. Box plot of ``training_time_minutes`` per ``model_type``.
    5. Bar chart of ``100 / mean`` for WER, training time and memory per
       ``model_type`` (so higher is better); zero means are left blank.
    6. Text box with the experiment count, dialects, methods and the run
       with the lowest ``final_wer``.

    Args:
        df: Metrics DataFrame. When empty, a message is printed and nothing
            is drawn.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if df.empty:
        print("No data available for detailed comparison.")
        return

    has_model_type = 'model_type' in df.columns
    has_wer = 'final_wer' in df.columns

    # Create a comprehensive comparison figure
    fig = plt.figure(figsize=(20, 15))

    # 1. Performance by Dialect (Large subplot)
    ax1 = plt.subplot(3, 3, (1, 4))
    if 'dialect' in df.columns and has_wer and has_model_type:
        dialect_performance = df.groupby(['dialect', 'model_type'])['final_wer'].mean().unstack()
        dialect_performance.plot(kind='bar', ax=ax1, rot=45, width=0.8)
        ax1.set_title('WER Performance by Dialect', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Word Error Rate (%)')
        ax1.legend(title='Method')
        ax1.grid(True, alpha=0.3)

    # 2. Memory vs Performance Scatter
    ax2 = plt.subplot(3, 3, 3)
    if 'peak_memory_gb' in df.columns and has_wer:
        # Without a method column all runs form one unlabeled group.
        groups = df.groupby('model_type') if has_model_type else [(None, df)]
        for model_type, group in groups:
            ax2.scatter(group['peak_memory_gb'], group['final_wer'],
                        label=model_type, alpha=0.7, s=80)
        ax2.set_xlabel('Memory (GB)')
        ax2.set_ylabel('WER (%)')
        ax2.set_title('Memory vs Performance')
        if has_model_type:
            ax2.legend()
        ax2.grid(True, alpha=0.3)

    # 3. Parameter Efficiency
    ax3 = plt.subplot(3, 3, 6)
    if 'trainable_percentage' in df.columns:
        trainable_pct = df['trainable_percentage']
    elif 'trainable_params' in df.columns and 'total_params' in df.columns:
        # Derive the share here; a non-positive total gives NaN, not inf.
        total_params = df['total_params']
        trainable_pct = df['trainable_params'] / total_params.where(total_params > 0) * 100
    else:
        trainable_pct = None
    if has_model_type and trainable_pct is not None and trainable_pct.notna().any():
        sns.barplot(data=df.assign(trainable_percentage=trainable_pct),
                    x='model_type', y='trainable_percentage', ax=ax3)
        ax3.set_title('Parameter Efficiency')
        ax3.set_ylabel('Trainable Parameters (%)')
        ax3.tick_params(axis='x', rotation=45)

    # 4. Training Time Distribution
    ax4 = plt.subplot(3, 3, 7)
    if 'training_time_minutes' in df.columns and has_model_type:
        sns.boxplot(data=df, x='model_type', y='training_time_minutes', ax=ax4)
        ax4.set_title('Training Time Distribution')
        ax4.set_ylabel('Time (minutes)')
        ax4.tick_params(axis='x', rotation=45)

    # 5. Overall efficiency bar chart (needs at least two methods)
    ax5 = plt.subplot(3, 3, 8)
    if has_model_type and df['model_type'].nunique() >= 2:
        metrics = ['final_wer', 'training_time_minutes', 'peak_memory_gb']
        available_metrics = [m for m in metrics if m in df.columns]

        if available_metrics:
            efficiency_data = df.groupby('model_type')[available_metrics].mean()
            # Lower is better for all these metrics, so invert them. Masking
            # non-positive means avoids inf bars from division by zero.
            normalized_data = 100 / efficiency_data.where(efficiency_data > 0)

            normalized_data.plot(kind='bar', ax=ax5, rot=45)
            ax5.set_title('Efficiency Comparison\n(Higher = Better)')
            ax5.set_ylabel('Efficiency Score')
            ax5.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # 6. Summary Statistics
    ax6 = plt.subplot(3, 3, 9)
    ax6.axis('off')

    # Create summary text
    summary_text = "EXPERIMENT SUMMARY\n\n"
    summary_text += f"Total Experiments: {len(df)}\n"
    if 'dialect' in df.columns:
        # Rows loaded without a dialect hold NaN, which str.join rejects.
        dialects = df['dialect'].dropna().astype(str).unique()
        summary_text += f"Dialects: {', '.join(dialects)}\n"
    if has_model_type:
        methods = df['model_type'].dropna().astype(str).unique()
        summary_text += f"Methods: {', '.join(methods)}\n\n"

    # Best performance
    if has_wer:
        valid_wer = df['final_wer'].dropna()
        if not valid_wer.empty:
            best_wer = df.loc[valid_wer.idxmin()]
            summary_text += f"Best WER: {best_wer['final_wer']:.2f}%\n"
            summary_text += f"Method: {best_wer.get('model_type', 'N/A')}\n"
            summary_text += f"Dialect: {best_wer.get('dialect', 'N/A')}\n"

    ax6.text(0.1, 0.9, summary_text, transform=ax6.transAxes,
             fontsize=11, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))

    fig.suptitle('Comprehensive PEFT vs Full Fine-tuning Analysis',
                 fontsize=16, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.show()

# Draw the detailed comparison figure, or explain why there is nothing to draw
if metrics_df.empty:
    print("No data available for plotting. Please run training experiments first.")
else:
    plot_detailed_comparison(metrics_df)

## 8. Save Analysis Results
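Save the analysis results (processed metrics and summary statistics) as CSV files for documentation. Plots are displayed inline and are not written to disk.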

In [None]:
def save_analysis_results(df, output_dir="./analysis_results"):
    """Write the processed metrics and a per-method summary to CSV files.

    Two files are written into *output_dir*:

    * ``training_metrics_processed.csv``: the full DataFrame, without index.
    * ``efficiency_summary.csv``: per-``model_type`` mean and standard
      deviation (rounded to three decimals) of whichever of ``final_wer``,
      ``training_time_minutes``, ``peak_memory_gb`` and
      ``trainable_params_millions`` are present. Written only when
      ``model_type`` and at least one of those columns exist.

    Args:
        df: Metrics DataFrame. When empty, a message is printed and nothing
            is written.
        output_dir: Target directory. It is created, including any missing
            parent directories, if it does not exist yet.

    Returns:
        None.
    """
    if df.empty:
        print("No data to save.")
        return

    output_path = Path(output_dir)
    # parents=True so a nested output_dir works on a fresh checkout.
    output_path.mkdir(parents=True, exist_ok=True)

    # Save processed data
    csv_path = output_path / "training_metrics_processed.csv"
    df.to_csv(csv_path, index=False)
    print(f"Processed data saved to: {csv_path}")

    # Save summary statistics
    if 'model_type' in df.columns:
        summary_cols = ['final_wer', 'training_time_minutes', 'peak_memory_gb',
                        'trainable_params_millions']
        available_cols = [col for col in summary_cols if col in df.columns]

        if available_cols:
            summary = df.groupby('model_type')[available_cols].agg(['mean', 'std']).round(3)
            summary_path = output_path / "efficiency_summary.csv"
            summary.to_csv(summary_path)
            print(f"Summary statistics saved to: {summary_path}")

    print(f"\nAll analysis results saved to: {output_path}")

# Persist the results when there is anything to persist
if not metrics_df.empty:
    save_analysis_results(metrics_df)

# Closing banner with a reminder of how this analysis is produced
print(
    "\n" + "="*80,
    "ANALYSIS COMPLETE!",
    "\nTo generate this analysis:",
    "1. Run training experiments with the updated script",
    "2. The script automatically saves metrics_*.json files",
    "3. This notebook loads and analyzes all available metrics",
    "4. Comparison plots show PEFT vs Full Fine-tuning efficiency",
    "="*80,
    sep="\n",
)