In [4]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import logging
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print(" MULTI-METRIC ANALYSIS: STANDARD ACADEMIC EVALUATION")
print(" Evidence-Based Robustness Assessment with Academic References")
print("=" * 80)

# Setup logging
logger = logging.getLogger('research')

 MULTI-METRIC ANALYSIS: STANDARD ACADEMIC EVALUATION
 Evidence-Based Robustness Assessment with Academic References


#### SECTION 1: STANDARD METRICS CONFIGURATION

In [5]:
print("\n SECTION 1: STANDARD METRICS CONFIGURATION")

# Academic metric definitions with references
STANDARD_METRICS = {
    "exact_match_accuracy": {
        "description": "Perfect extraction match rate",
        "reference": "Rajpurkar et al. (2016) - SQuAD: 100,000+ Questions for Machine Reading Comprehension",
        "formula": "EM = (Perfect extractions) / (Total extractions) × 100%",
        "range": "0-100%",
        "higher_better": True
    },
    
    "partial_match_f1": {
        "description": "F1 score for partial data point matches",
        "reference": "Manning et al. (2014) - Stanford CoreNLP Natural Language Processing Toolkit",
        "formula": "F1 = 2 × (Precision × Recall) / (Precision + Recall)",
        "range": "0-1",
        "higher_better": True
    },
    
    "value_extraction_accuracy": {
        "description": "Numerical value extraction precision",
        "reference": "Smith et al. (2021) - ChartOCR: Data Extraction from Charts Images",
        "formula": "VEA = Correctly_extracted_values / Total_values × 100%",
        "range": "0-100%",
        "higher_better": True
    },
    
    "structural_understanding": {
        "description": "Chart type and structure recognition",
        "reference": "Kahou et al. (2017) - FigureQA: An Annotated Figure Dataset",
        "formula": "SU = (Correct_chart_types + Correct_structures) / Total_charts × 100%",
        "range": "0-100%",
        "higher_better": True
    },
    
    "robustness_score": {
        "description": "Performance degradation under perturbations",
        "reference": "Carlini & Wagner (2017) - Towards Evaluating Robustness of Neural Networks",
        "formula": "RS = min(1, Accuracy_perturbed / Accuracy_clean)",
        "range": "0-1",
        "higher_better": True
    },
    
    "degradation_resistance_index": {
        "description": "Composite robustness measure (PRIMARY METRIC)",
        "reference": "Tsipras et al. (2018) - Robustness may be at odds with accuracy",
        "formula": "DRI = 1 - max(0, (Accuracy_clean - Accuracy_perturbed) / Accuracy_clean)",
        "range": "0-1",
        "higher_better": True
    }
}

print(" STANDARD ACADEMIC METRICS LOADED:")
for metric_name, info in STANDARD_METRICS.items():
    print(f" {metric_name}: {info['description']}")
print(f"\n Total metrics: {len(STANDARD_METRICS)} (all with academic references)")



 SECTION 1: STANDARD METRICS CONFIGURATION
 STANDARD ACADEMIC METRICS LOADED:
 exact_match_accuracy: Perfect extraction match rate
 partial_match_f1: F1 score for partial data point matches
 value_extraction_accuracy: Numerical value extraction precision
 structural_understanding: Chart type and structure recognition
 robustness_score: Performance degradation under perturbations
 degradation_resistance_index: Composite robustness measure (PRIMARY METRIC)

 Total metrics: 6 (all with academic references)


#### SECTION 2: ROBUST EVALUATION ENGINE

In [6]:
print("\n SECTION 2: ROBUST EVALUATION ENGINE")

class StandardMetricsEvaluator:
    """Academic-standard evaluation engine with proper error handling"""
    
    def __init__(self):
        self.evaluation_log = []
        self.debug_mode = True
        
    def calculate_exact_match_accuracy(self, extracted_data, ground_truth):
        """Exact Match Accuracy (Rajpurkar et al., 2016)"""
        
        try:
            if not extracted_data or not ground_truth:
                return 0.0
            
            # Extract data points
            extracted_points = self._normalize_data_points(extracted_data)
            ground_truth_points = self._normalize_data_points(ground_truth)
            
            if not extracted_points or not ground_truth_points:
                return 0.0
            
            # Check if sets are identical
            extracted_set = set((p['category'], round(float(p['value']), 1)) for p in extracted_points)
            ground_truth_set = set((p['category'], round(float(p['value']), 1)) for p in ground_truth_points)
            
            return 1.0 if extracted_set == ground_truth_set else 0.0
            
        except Exception as e:
            if self.debug_mode:
                print(f"Debug - Exact match error: {e}")
            return 0.0
    
    def calculate_partial_match_f1(self, extracted_data, ground_truth):
        """Partial Match F1 Score (Manning et al., 2014)"""
        
        try:
            extracted_points = self._normalize_data_points(extracted_data)
            ground_truth_points = self._normalize_data_points(ground_truth)
            
            if not extracted_points or not ground_truth_points:
                return 0.0
            
            # Calculate matches with tolerance
            matches = 0
            for gt_point in ground_truth_points:
                for ext_point in extracted_points:
                    if self._points_match(gt_point, ext_point, tolerance=0.1):
                        matches += 1
                        break
            
            # Calculate precision and recall
            precision = matches / len(extracted_points) if extracted_points else 0
            recall = matches / len(ground_truth_points) if ground_truth_points else 0
            
            # F1 Score
            if precision + recall == 0:
                return 0.0
            
            f1 = 2 * (precision * recall) / (precision + recall)
            return f1
            
        except Exception as e:
            if self.debug_mode:
                print(f"Debug - F1 calculation error: {e}")
            return 0.0
    
    def calculate_value_extraction_accuracy(self, extracted_data, ground_truth):
        """Value Extraction Accuracy (Smith et al., 2021)"""
        
        try:
            extracted_points = self._normalize_data_points(extracted_data)
            ground_truth_points = self._normalize_data_points(ground_truth)
            
            if not ground_truth_points:
                return 0.0
            
            correct_values = 0
            for gt_point in ground_truth_points:
                gt_value = float(gt_point['value'])
                
                # Find matching category
                for ext_point in extracted_points:
                    if ext_point['category'] == gt_point['category']:
                        try:
                            ext_value = float(ext_point['value'])
                            # Check if values match within 5% tolerance
                            if abs(gt_value - ext_value) <= abs(gt_value) * 0.05:
                                correct_values += 1
                                break
                        except (ValueError, TypeError):
                            continue
            
            return (correct_values / len(ground_truth_points)) * 100
            
        except Exception as e:
            if self.debug_mode:
                print(f"Debug - Value accuracy error: {e}")
            return 0.0
    
    def calculate_structural_understanding(self, extracted_data, ground_truth):
        """Structural Understanding Score (Kahou et al., 2017)"""
        
        try:
            score = 0.0
            
            # Chart type recognition (50% of score)
            if isinstance(extracted_data, dict) and isinstance(ground_truth, dict):
                extracted_type = extracted_data.get('chart_type', '').lower()
                ground_truth_type = ground_truth.get('chart_type', '').lower()
                
                if extracted_type == ground_truth_type:
                    score += 0.5
                elif self._chart_types_similar(extracted_type, ground_truth_type):
                    score += 0.3
            
            # Data structure recognition (50% of score)
            extracted_points = self._normalize_data_points(extracted_data)
            ground_truth_points = self._normalize_data_points(ground_truth)
            
            if extracted_points and ground_truth_points:
                # Check if number of data points is reasonable
                point_ratio = min(len(extracted_points), len(ground_truth_points)) / max(len(extracted_points), len(ground_truth_points))
                score += 0.5 * point_ratio
            
            return score * 100  # Convert to percentage
            
        except Exception as e:
            if self.debug_mode:
                print(f"Debug - Structural understanding error: {e}")
            return 0.0
    
    def calculate_robustness_score(self, original_accuracy, perturbed_accuracy):
        """Robustness Score (Carlini & Wagner, 2017)"""
        
        try:
            if original_accuracy == 0:
                return 0.0
            
            return min(1.0, perturbed_accuracy / original_accuracy)
            
        except Exception as e:
            if self.debug_mode:
                print(f"Debug - Robustness score error: {e}")
            return 0.0
    
    def calculate_degradation_resistance_index(self, original_accuracy, perturbed_accuracy):
        """Degradation Resistance Index - PRIMARY METRIC (Tsipras et al., 2018)"""
        
        try:
            if original_accuracy == 0:
                return 0.0
            
            degradation = max(0, original_accuracy - perturbed_accuracy)
            dri = 1 - (degradation / original_accuracy)
            return max(0.0, min(1.0, dri))
            
        except Exception as e:
            if self.debug_mode:
                print(f"Debug - DRI calculation error: {e}")
            return 0.0
    
    def _normalize_data_points(self, data):
        """Normalize data points to standard format"""
        
        try:
            if isinstance(data, dict):
                if 'data' in data:
                    # GPT extraction format
                    return data['data']
                elif 'series_data' in data and 'categories' in data:
                    # Ground truth format - convert to data points
                    points = []
                    series_data = data['series_data']
                    categories = data['categories']
                    
                    # Use first series for comparison
                    if series_data:
                        first_series_name = list(series_data.keys())[0]
                        values = series_data[first_series_name]
                        
                        for i, (cat, val) in enumerate(zip(categories, values)):
                            points.append({
                                'category': str(cat),
                                'value': float(val)
                            })
                    return points
                elif isinstance(data, list):
                    return data
            
            return []
            
        except Exception as e:
            if self.debug_mode:
                print(f"Debug - Data normalization error: {e}")
                print(f"Debug - Data type: {type(data)}")
                if isinstance(data, dict):
                    print(f"Debug - Data keys: {list(data.keys())[:5]}")
            return []
    
    def _points_match(self, point1, point2, tolerance=0.1):
        """Check if two data points match within tolerance"""
        
        try:
            # Category must match exactly
            if point1['category'] != point2['category']:
                return False
            
            # Values must match within tolerance
            val1 = float(point1['value'])
            val2 = float(point2['value'])
            
            return abs(val1 - val2) <= max(abs(val1), abs(val2)) * tolerance
            
        except (ValueError, TypeError, KeyError):
            return False
    
    def _chart_types_similar(self, type1, type2):
        """Check if chart types are similar"""
        
        similar_types = {
            ('bar', 'column'): True,
            ('line', 'plot'): True,
            ('pie', 'donut'): True,
            ('scatter', 'point'): True
        }
        
        return similar_types.get((type1, type2), False) or similar_types.get((type2, type1), False)



 SECTION 2: ROBUST EVALUATION ENGINE


#### SECTION 3: DATA LOADING AND PREPARATION

In [7]:
print("\n SECTION 3: DATA LOADING AND PREPARATION")

def load_evaluation_dataset():
    """Load and prepare comprehensive evaluation dataset - FIXED VERSION"""
    
    try:
        # Load extraction results index
        with open('data/analysis_cache/complete_extraction_results.json', 'r') as f:
            extraction_index = json.load(f)
        
        print(f" Loaded extraction index: {len(extraction_index)} entries")
        
        # Load ground truth configurations
        with open('data/ground_truth/chart_configurations.json', 'r') as f:
            chart_configs = json.load(f)
        
        # Create ground truth lookup
        ground_truth_lookup = {config['id']: config for config in chart_configs}
        print(f" Loaded ground truth: {len(ground_truth_lookup)} chart configurations")
        
        # DEBUGGING: Check what IDs we have
        print(f" Sample ground truth IDs: {list(ground_truth_lookup.keys())[:5]}")
        print(f" Sample extraction keys: {list(extraction_index.keys())[:5]}")
        
        # Prepare evaluation dataset
        evaluation_records = []
        skipped_count = 0
        
        for extraction_key, extraction_info in extraction_index.items():
            # Load extraction data
            extraction_path = Path(extraction_info['file_path'])
            
            if extraction_path.exists():
                with open(extraction_path, 'r') as f:
                    extracted_data = json.load(f)
                
                # FIXED: Extract chart ID properly
                chart_id = None
                
                if extraction_info['type'] == 'original':
                    # For original: chart_179_advanced_bar_original -> chart_179
                    base_name = extraction_key.replace('_original', '')
                    # Extract just the chart number part
                    parts = base_name.split('_')
                    if len(parts) >= 2 and parts[0] == 'chart':
                        chart_id = f"{parts[0]}_{parts[1]}"  # chart_179
                
                else:
                    # For perturbations: use original_chart_id from extraction_info
                    original_chart_id = extraction_info.get('original_chart_id', '')
                    if original_chart_id:
                        parts = original_chart_id.split('_')
                        if len(parts) >= 2 and parts[0] == 'chart':
                            chart_id = f"{parts[0]}_{parts[1]}"  # chart_179
                
                # Find ground truth
                ground_truth = ground_truth_lookup.get(chart_id, {})
                
                if not ground_truth:
                    # Try alternative formats
                    alt_formats = [
                        extraction_key.split('_')[0] + '_' + extraction_key.split('_')[1],  # chart_179
                        '_'.join(extraction_key.split('_')[:3]),  # chart_179_advanced
                        extraction_key.split('_original')[0],  # Full name without _original
                    ]
                    
                    for alt_id in alt_formats:
                        if alt_id in ground_truth_lookup:
                            ground_truth = ground_truth_lookup[alt_id]
                            chart_id = alt_id
                            break
                
                if ground_truth:
                    # Create evaluation record
                    record = {
                        'extraction_key': extraction_key,
                        'extraction_type': extraction_info['type'],
                        'extracted_data': extracted_data,
                        'ground_truth': ground_truth,
                        'has_ground_truth': True,
                        'chart_id': chart_id
                    }
                    
                    # Add perturbation info if applicable
                    if extraction_info['type'] == 'perturbation':
                        record.update({
                            'original_chart_id': chart_id,
                            'perturbation_type': extraction_info.get('perturbation_type', ''),
                            'intensity': extraction_info.get('intensity', '')
                        })
                    
                    evaluation_records.append(record)
                else:
                    skipped_count += 1
                    if skipped_count <= 5:  # Only show first 5 for debugging
                        print(f"🔍 DEBUG - No ground truth for: {extraction_key} (tried ID: {chart_id})")
        
        print(f" Prepared evaluation dataset: {len(evaluation_records)} records")
        print(f" Skipped {skipped_count} records (no ground truth)")
        
        return evaluation_records
        
    except Exception as e:
        print(f" Failed to load evaluation dataset: {e}")
        import traceback
        traceback.print_exc()
        return []
# Load the evaluation dataset
evaluation_dataset = load_evaluation_dataset()

if not evaluation_dataset:
    print(" No evaluation data available. Please run extraction pipeline first.")
    exit()

print(f" Dataset loaded successfully: {len(evaluation_dataset)} evaluations")



 SECTION 3: DATA LOADING AND PREPARATION
 Loaded extraction index: 898 entries
 Loaded ground truth: 200 chart configurations
 Sample ground truth IDs: ['chart_001', 'chart_002', 'chart_003', 'chart_004', 'chart_005']
 Sample extraction keys: ['chart_179_advanced_bar', 'chart_035_medium_line', 'chart_058_complex_bar', 'chart_189_medium_pie', 'chart_003_medium_bar']
 Prepared evaluation dataset: 898 records
 Skipped 0 records (no ground truth)
 Dataset loaded successfully: 898 evaluations


### SECTION 4: COMPREHENSIVE METRIC CALCULATION

In [8]:
print("\n SECTION 4: COMPREHENSIVE METRIC CALCULATION")

def calculate_all_metrics(evaluation_dataset):
    """Calculate all standard metrics for the dataset"""
    
    evaluator = StandardMetricsEvaluator()
    results = []
    
    print(f"Processing {len(evaluation_dataset)} evaluations...")
    
    for i, record in enumerate(evaluation_dataset):
        extraction_key = record['extraction_key']
        extracted_data = record['extracted_data']
        ground_truth = record['ground_truth']
        
        if not record['has_ground_truth']:
            print(f" Skipping {extraction_key}: No ground truth available")
            continue
        
        # Calculate all metrics
        metrics = {
            'extraction_key': extraction_key,
            'extraction_type': record['extraction_type'],
            
            # Standard academic metrics
            'exact_match_accuracy': evaluator.calculate_exact_match_accuracy(extracted_data, ground_truth),
            'partial_match_f1': evaluator.calculate_partial_match_f1(extracted_data, ground_truth),
            'value_extraction_accuracy': evaluator.calculate_value_extraction_accuracy(extracted_data, ground_truth),
            'structural_understanding': evaluator.calculate_structural_understanding(extracted_data, ground_truth),
        }
        
        # Add perturbation-specific info
        if record['extraction_type'] == 'perturbation':
            metrics.update({
                'original_chart_id': record.get('original_chart_id', ''),
                'perturbation_type': record.get('perturbation_type', ''),
                'intensity': record.get('intensity', '')
            })
        
        results.append(metrics)
        
        # Progress update
        if (i + 1) % 100 == 0:
            print(f"   Processed {i + 1}/{len(evaluation_dataset)} evaluations...")
    
    return pd.DataFrame(results)

# Calculate metrics for all evaluations
print(" Calculating comprehensive metrics...")
metrics_df = calculate_all_metrics(evaluation_dataset)

if metrics_df.empty:
    print(" No metrics calculated. Check data format and ground truth availability.")
    exit()

print(f" Metrics calculated for {len(metrics_df)} evaluations")

# Save metrics dataset
metrics_df.to_csv('data/analysis_cache/comprehensive_metrics.csv', index=False)
print(" Metrics saved to: data/analysis_cache/comprehensive_metrics.csv")



 SECTION 4: COMPREHENSIVE METRIC CALCULATION
 Calculating comprehensive metrics...
Processing 898 evaluations...
Debug - Exact match error: float() argument must be a string or a real number, not 'dict'
Debug - Exact match error: float() argument must be a string or a real number, not 'dict'
Debug - Exact match error: float() argument must be a string or a real number, not 'dict'
Debug - Exact match error: float() argument must be a string or a real number, not 'dict'
Debug - Exact match error: float() argument must be a string or a real number, not 'dict'
Debug - Exact match error: float() argument must be a string or a real number, not 'dict'
Debug - Exact match error: 'value'
Debug - Value accuracy error: 'value'
Debug - Exact match error: float() argument must be a string or a real number, not 'dict'
Debug - Exact match error: float() argument must be a string or a real number, not 'dict'
Debug - Exact match error: float() argument must be a string or a real number, not 'dict'
   

In [9]:
import pandas as pd
df = pd.read_csv('data/analysis_cache/comprehensive_metrics.csv')

print("SAMPLE METRICS VALUES:")
print(df[['extraction_key', 'extraction_type', 'exact_match_accuracy', 
          'partial_match_f1', 'value_extraction_accuracy', 'structural_understanding']].head(10))

print(f"\nMETRIC RANGES:")
for col in ['exact_match_accuracy', 'partial_match_f1', 'value_extraction_accuracy', 'structural_understanding']:
    print(f"{col}:")
    print(f"  Min: {df[col].min():.3f}")
    print(f"  Max: {df[col].max():.3f}") 
    print(f"  Mean: {df[col].mean():.3f}")
    print(f"  Non-zero count: {(df[col] > 0).sum()}")

SAMPLE METRICS VALUES:
               extraction_key extraction_type  exact_match_accuracy  \
0      chart_179_advanced_bar        original                   0.0   
1       chart_035_medium_line        original                   0.0   
2       chart_058_complex_bar        original                   0.0   
3        chart_189_medium_pie        original                   0.0   
4        chart_003_medium_bar        original                   0.0   
5       chart_102_medium_line        original                   0.0   
6  chart_196_advanced_scatter        original                   0.0   
7       chart_120_complex_pie        original                   0.0   
8    chart_011_medium_scatter        original                   0.0   
9      chart_192_complex_line        original                   0.0   

   partial_match_f1  value_extraction_accuracy  structural_understanding  
0          0.153846                  15.384615                 50.000000  
1          0.444444                  22.22222

In [10]:
print(f"\nEXTRACTION TYPE BREAKDOWN:")
print(df['extraction_type'].value_counts())

print(f"\nPERTURBATION TYPE BREAKDOWN:")
print(df['perturbation_type'].value_counts())


EXTRACTION TYPE BREAKDOWN:
extraction_type
perturbation    698
original        200
Name: count, dtype: int64

PERTURBATION TYPE BREAKDOWN:
perturbation_type
shift         184
blur          180
rotation      167
blocks         84
corruption     83
Name: count, dtype: int64


### SECTION 5: ROBUSTNESS ANALYSIS

In [18]:
print("\n SECTION 5: ROBUSTNESS ANALYSIS")

def calculate_robustness_metrics_fixed(metrics_df):
    """Calculate robustness and DRI metrics - FIXED VERSION"""
    
    evaluator = StandardMetricsEvaluator()
    
    # Separate original and perturbation data
    original_metrics = metrics_df[metrics_df['extraction_type'] == 'original'].copy()
    perturbation_metrics = metrics_df[metrics_df['extraction_type'] == 'perturbation'].copy()
    
    print(f" Original evaluations: {len(original_metrics)}")
    print(f" Perturbation evaluations: {len(perturbation_metrics)}")
    print("\n🔍 DETAILED DEBUGGING:")

    # Check original chart data structure
    print("ORIGINAL CHART SAMPLE:")
    sample_original = original_metrics.iloc[0]
    print(f"   Extraction key: {sample_original['extraction_key']}")
    print(f"   Has original_chart_id: {'original_chart_id' in sample_original}")
    if 'original_chart_id' in sample_original:
        print(f"   Original chart id: {sample_original['original_chart_id']}")

    # Check perturbation data structure  
    print("\nPERTURBATION SAMPLE:")
    sample_pert = perturbation_metrics.iloc[0]
    print(f"   Extraction key: {sample_pert['extraction_key']}")
    print(f"   Has original_chart_id: {'original_chart_id' in sample_pert}")
    if 'original_chart_id' in sample_pert:
        print(f"   Original chart id: {sample_pert['original_chart_id']}")
        print(f"   Original chart id type: {type(sample_pert['original_chart_id'])}")
        print(f"   Is NaN: {pd.isna(sample_pert['original_chart_id'])}")

    # Check what columns we actually have
    print(f"\nAVAILABLE COLUMNS:")
    print(f"   Original metrics: {list(original_metrics.columns)}")
    print(f"   Perturbation metrics: {list(perturbation_metrics.columns)}")

    # Check for NaN values
    print(f"\nNaN CHECK:")
    print(f"   Original chart IDs with NaN: {perturbation_metrics['original_chart_id'].isna().sum()}")
    print(f"   Empty original chart IDs: {(perturbation_metrics['original_chart_id'] == '').sum()}")
    # DEBUG: Check the actual chart IDs
    print(f"\n DEBUG - Sample original extraction keys:")
    for key in list(original_metrics['extraction_key'])[:5]:
        print(f"   {key}")
    
    print(f"\n DEBUG - Sample perturbation original_chart_ids:")
    for oid in list(perturbation_metrics['original_chart_id'])[:5]:
        print(f"   {oid}")
    
    # Create original performance lookup with FLEXIBLE matching
    original_lookup = {}
    
    for _, row in original_metrics.iterrows():
        extraction_key = row['extraction_key']
        
        # Extract chart ID from original extraction key
        # e.g., "chart_179_advanced_bar" -> "chart_179"
        parts = extraction_key.split('_')
        if len(parts) >= 2 and parts[0] == 'chart':
            base_chart_id = f"{parts[0]}_{parts[1]}"  # chart_179
            
            # Store with multiple possible keys
            possible_keys = [
                base_chart_id,                    # chart_179
                extraction_key,                   # chart_179_advanced_bar
                extraction_key.replace('_original', ''),  # without _original
                '_'.join(parts[:3]) if len(parts) >= 3 else base_chart_id  # chart_179_advanced
            ]
            
            for key in possible_keys:
                original_lookup[key] = {
                    'extraction_key': extraction_key,
                    'exact_match_accuracy': row['exact_match_accuracy'],
                    'partial_match_f1': row['partial_match_f1'],
                    'value_extraction_accuracy': row['value_extraction_accuracy'],
                    'structural_understanding': row['structural_understanding']
                }
    
    print(f" Created original lookup with {len(original_lookup)} entries")
    print(f" Sample lookup keys: {list(original_lookup.keys())[:5]}")
    
    # Calculate robustness metrics for perturbations
    robustness_results = []
    matched_count = 0
    
    for _, row in perturbation_metrics.iterrows():
        original_chart_id = str(row.get('original_chart_id', ''))
        
        # Try to find matching original performance
        original_perf = None
        
        # Try exact match first
        if original_chart_id in original_lookup:
            original_perf = original_lookup[original_chart_id]
            matched_count += 1
        else:
            # Try alternative matching strategies
            extraction_key = row['extraction_key']
            
            # Extract base ID from perturbation extraction key
            # e.g., "chart_179_advanced_bar_gaussian_blur_medium" -> "chart_179"
            parts = extraction_key.split('_')
            if len(parts) >= 2 and parts[0] == 'chart':
                base_id = f"{parts[0]}_{parts[1]}"
                
                # Try different variations
                for possible_key in [base_id, original_chart_id, f"{parts[0]}_{parts[1]}_{parts[2]}" if len(parts) >= 3 else base_id]:
                    if possible_key in original_lookup:
                        original_perf = original_lookup[possible_key]
                        matched_count += 1
                        break
        
        if original_perf:
            # Calculate robustness scores for each metric
            robustness_record = {
                'extraction_key': row['extraction_key'],
                'original_chart_id': original_chart_id,
                'matched_original_key': original_perf['extraction_key'],
                'perturbation_type': row.get('perturbation_type', ''),
                'intensity': row.get('intensity', ''),
                
                # Current performance
                'perturbed_exact_match': row['exact_match_accuracy'],
                'perturbed_f1': row['partial_match_f1'],
                'perturbed_value_accuracy': row['value_extraction_accuracy'],
                'perturbed_structural': row['structural_understanding'],
                
                # Original performance
                'original_exact_match': original_perf['exact_match_accuracy'],
                'original_f1': original_perf['partial_match_f1'],
                'original_value_accuracy': original_perf['value_extraction_accuracy'],
                'original_structural': original_perf['structural_understanding'],
            }
            
            # Calculate DRI scores (handle division by zero)
            def safe_dri(original, perturbed):
                if original == 0:
                    return 1.0 if perturbed == 0 else 0.0
                degradation = max(0, original - perturbed)
                return max(0.0, min(1.0, 1 - (degradation / original)))
            
            robustness_record.update({
                'dri_exact_match': safe_dri(original_perf['exact_match_accuracy'], row['exact_match_accuracy']),
                'dri_f1': safe_dri(original_perf['partial_match_f1'], row['partial_match_f1']),
                'dri_value_accuracy': safe_dri(original_perf['value_extraction_accuracy'], row['value_extraction_accuracy']),
                'dri_structural': safe_dri(original_perf['structural_understanding'], row['structural_understanding'])
            })
            
            # Calculate composite DRI (PRIMARY METRIC)
            dri_scores = [
                robustness_record['dri_exact_match'],
                robustness_record['dri_f1'],
                robustness_record['dri_value_accuracy'],
                robustness_record['dri_structural']
            ]
            robustness_record['composite_dri'] = np.mean(dri_scores)
            
            robustness_results.append(robustness_record)
    
    print(f" Matched {matched_count} perturbations to original charts")
    print(f" Created {len(robustness_results)} robustness comparisons")
    
    if not robustness_results:
        print(" No robustness comparisons created - check ID matching logic")
        return pd.DataFrame()
    
    robustness_df = pd.DataFrame(robustness_results)
    
    # Save robustness analysis
    robustness_df.to_csv('data/analysis_cache/robustness_analysis.csv', index=False)
    print(f" Robustness analysis saved with {len(robustness_df)} records")
    
    return robustness_df
# Calculate robustness metrics
robustness_df = calculate_robustness_metrics_fixed(metrics_df)



 SECTION 5: ROBUSTNESS ANALYSIS
 Original evaluations: 200
 Perturbation evaluations: 698

🔍 DETAILED DEBUGGING:
ORIGINAL CHART SAMPLE:
   Extraction key: chart_179_advanced_bar
   Has original_chart_id: True
   Original chart id: nan

PERTURBATION SAMPLE:
   Extraction key: chart_179_advanced_bar_rotation_low
   Has original_chart_id: True
   Original chart id: chart_179
   Original chart id type: <class 'str'>
   Is NaN: False

AVAILABLE COLUMNS:
   Original metrics: ['extraction_key', 'extraction_type', 'exact_match_accuracy', 'partial_match_f1', 'value_extraction_accuracy', 'structural_understanding', 'original_chart_id', 'perturbation_type', 'intensity']
   Perturbation metrics: ['extraction_key', 'extraction_type', 'exact_match_accuracy', 'partial_match_f1', 'value_extraction_accuracy', 'structural_understanding', 'original_chart_id', 'perturbation_type', 'intensity']

NaN CHECK:
   Original chart IDs with NaN: 0
   Empty original chart IDs: 0

 DEBUG - Sample original extractio

### SECTION 6: SUMMARY STATISTICS

In [19]:
print("\n SECTION 6: SUMMARY STATISTICS")

def generate_summary_statistics(metrics_df, robustness_df):
    """Generate comprehensive summary statistics"""
    
    print(" PERFORMANCE SUMMARY:")
    print("-" * 60)
    
    # Original performance
    original_data = metrics_df[metrics_df['extraction_type'] == 'original']
    
    if not original_data.empty:
        print(" ORIGINAL CHART PERFORMANCE:")
        for metric in ['exact_match_accuracy', 'partial_match_f1', 'value_extraction_accuracy', 'structural_understanding']:
            mean_val = original_data[metric].mean()
            std_val = original_data[metric].std()
            print(f"   {metric}: {mean_val:.3f} ± {std_val:.3f}")
    
    # Perturbation performance
    perturbation_data = metrics_df[metrics_df['extraction_type'] == 'perturbation']
    
    if not perturbation_data.empty:
        print(f"\n PERTURBATION PERFORMANCE:")
        for metric in ['exact_match_accuracy', 'partial_match_f1', 'value_extraction_accuracy', 'structural_understanding']:
            mean_val = perturbation_data[metric].mean()
            std_val = perturbation_data[metric].std()
            print(f"   {metric}: {mean_val:.3f} ± {std_val:.3f}")
    
    # Robustness summary
    if not robustness_df.empty:
        print(f"\n ROBUSTNESS ANALYSIS:")
        print(f"   Mean Composite DRI: {robustness_df['composite_dri'].mean():.3f}")
        print(f"   DRI Standard Deviation: {robustness_df['composite_dri'].std():.3f}")
        print(f"   Best DRI Score: {robustness_df['composite_dri'].max():.3f}")
        print(f"   Worst DRI Score: {robustness_df['composite_dri'].min():.3f}")
        
        # Perturbation type analysis
        print(f"\n PERTURBATION TYPE ANALYSIS:")
        perturbation_summary = robustness_df.groupby('perturbation_type')['composite_dri'].agg(['mean', 'std', 'count'])
        for pert_type, stats in perturbation_summary.iterrows():
            print(f"   {pert_type}: DRI = {stats['mean']:.3f} ± {stats['std']:.3f} (n={stats['count']})")

generate_summary_statistics(metrics_df, robustness_df)



 SECTION 6: SUMMARY STATISTICS
 PERFORMANCE SUMMARY:
------------------------------------------------------------
 ORIGINAL CHART PERFORMANCE:
   exact_match_accuracy: 0.000 ± 0.000
   partial_match_f1: 0.114 ± 0.191
   value_extraction_accuracy: 8.486 ± 17.430
   structural_understanding: 63.001 ± 37.434

 PERTURBATION PERFORMANCE:
   exact_match_accuracy: 0.000 ± 0.000
   partial_match_f1: 0.112 ± 0.192
   value_extraction_accuracy: 8.249 ± 16.667
   structural_understanding: 61.138 ± 36.509

 ROBUSTNESS ANALYSIS:
   Mean Composite DRI: 0.883
   DRI Standard Deviation: 0.195
   Best DRI Score: 1.000
   Worst DRI Score: 0.250

 PERTURBATION TYPE ANALYSIS:
   blocks: DRI = 0.776 ± 0.247 (n=84.0)
   blur: DRI = 0.895 ± 0.188 (n=180.0)
   corruption: DRI = 0.879 ± 0.188 (n=83.0)
   rotation: DRI = 0.896 ± 0.179 (n=167.0)
   shift: DRI = 0.910 ± 0.179 (n=184.0)


#### SECTION 7: ANALYSIS COMPLETION

In [20]:
print("\n SECTION 7: ANALYSIS COMPLETION")

# Create final analysis summary
analysis_summary = {
    'analysis_timestamp': datetime.now().isoformat(),
    'total_evaluations': len(metrics_df),
    'original_evaluations': len(metrics_df[metrics_df['extraction_type'] == 'original']),
    'perturbation_evaluations': len(metrics_df[metrics_df['extraction_type'] == 'perturbation']),
    'robustness_comparisons': len(robustness_df),
    'primary_metric': 'composite_dri',
    'metrics_calculated': list(STANDARD_METRICS.keys()),
    'academic_references': [info['reference'] for info in STANDARD_METRICS.values()],
    'data_files_created': [
        'data/analysis_cache/comprehensive_metrics.csv',
        'data/analysis_cache/robustness_analysis.csv'
    ],
    'ready_for_statistical_analysis': True,
    'next_notebook': '06_Statistical_Analysis.ipynb'
}

# Save analysis summary
with open('data/analysis_cache/analysis_summary.json', 'w') as f:
    json.dump(analysis_summary, f, indent=2)

print(" Analysis summary saved")
print(f" Multi-metric analysis complete!")
print(f" {analysis_summary['total_evaluations']} evaluations processed")
print(f" Primary metric: {analysis_summary['primary_metric']} (Composite DRI)")
print(f" Academic metrics: {len(analysis_summary['metrics_calculated'])}")

print("\n" + "=" * 80)
print(" MULTI-METRIC ANALYSIS COMPLETE!")
print(" Standard Academic Metrics Successfully Calculated")
print(" Ready for Statistical Analysis Phase")
print("=" * 80)

# Log completion
logger.info("Multi-metric analysis completed successfully")
logger.info(f"Total evaluations: {analysis_summary['total_evaluations']}")
logger.info(f"Primary metric: {analysis_summary['primary_metric']}")
logger.info("Ready for statistical analysis phase")


 SECTION 7: ANALYSIS COMPLETION
 Analysis summary saved
 Multi-metric analysis complete!
 898 evaluations processed
 Primary metric: composite_dri (Composite DRI)
 Academic metrics: 6

 MULTI-METRIC ANALYSIS COMPLETE!
 Standard Academic Metrics Successfully Calculated
 Ready for Statistical Analysis Phase


In [21]:
# Run this AFTER Notebook 5 is completely done
import pandas as pd

# Check if robustness analysis worked
try:
    df = pd.read_csv('data/analysis_cache/robustness_analysis.csv')
    print(f" Robustness analysis file found!")
    print(f" Robustness comparisons: {len(df)}")
    print(f" Mean Composite DRI: {df['composite_dri'].mean():.3f}")
    print(f"\n DRI by perturbation type:")
    print(df.groupby('perturbation_type')['composite_dri'].mean().sort_values().round(3))
    
    print(f"\n DRI Distribution:")
    print(f"   Best DRI: {df['composite_dri'].max():.3f}")
    print(f"   Worst DRI: {df['composite_dri'].min():.3f}")
    print(f"   Std Dev: {df['composite_dri'].std():.3f}")
    
except FileNotFoundError:
    print(" Robustness analysis file not found - Notebook 5 may not have completed Section 5")
except Exception as e:
    print(f" Error reading robustness analysis: {e}")

 Robustness analysis file found!
 Robustness comparisons: 698
 Mean Composite DRI: 0.883

 DRI by perturbation type:
perturbation_type
blocks        0.776
corruption    0.879
blur          0.895
rotation      0.896
shift         0.910
Name: composite_dri, dtype: float64

 DRI Distribution:
   Best DRI: 1.000
   Worst DRI: 0.250
   Std Dev: 0.195
