In [8]:
#!/usr/bin/env python
# coding: utf-8

# # Fixed Relative Accuracy Analysis - Handles Format Mismatch
# 
# This version correctly handles the format differences between extractions and ground truth

import json
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set paths
EXTRACTION_PATH = Path(r"E:\langchain\Dissertation\data\extractions")
GROUND_TRUTH_PATH = Path(r"E:\langchain\Dissertation\data\ground_truth")
RESULTS_PATH = Path(r"E:\langchain\Dissertation\results")
RESULTS_PATH.mkdir(exist_ok=True)

print("GPT-4V Chart Extraction Relative Accuracy Analysis (FIXED)")
print("=" * 60)
print(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

GPT-4V Chart Extraction Relative Accuracy Analysis (FIXED)
Analysis Date: 2025-07-30 13:06:08


#### SECTION 1. Load Data Functions

In [9]:
def load_extraction_results():
    """Load all extraction results from JSON files"""
    extractions = {}
    json_files = list(EXTRACTION_PATH.glob("*.json"))
    
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                extractions[json_file.stem] = data
        except Exception as e:
            print(f"Error loading {json_file.name}: {e}")
    
    print(f"\nLoaded {len(extractions)} extraction results")
    return extractions

def load_ground_truth():
    """Load ground truth data from chart_configurations.json"""
    ground_truth = {}
    
    config_file = GROUND_TRUTH_PATH / "chart_configurations.json"
    if config_file.exists():
        with open(config_file, 'r') as f:
            configs = json.load(f)
            for config in configs:
                ground_truth[config['id']] = config
        print(f"Loaded {len(ground_truth)} ground truth entries")
    else:
        print("WARNING: chart_configurations.json not found!")
    
    return ground_truth

# Load data
extractions = load_extraction_results()
ground_truth = load_ground_truth()


Loaded 1399 extraction results
Loaded 200 ground truth entries


#### 2. Metric Calculation Functions

In [10]:
def normalize_extraction_data(extracted_data):
    """Normalize extraction data to standard format"""
    normalized = []
    
    if isinstance(extracted_data, list):
        for item in extracted_data:
            if isinstance(item, dict) and 'category' in item and 'value' in item:
                # Split combined category-series format
                category = item['category']
                value = float(item['value']) if item['value'] is not None else 0.0
                normalized.append({'category': category, 'value': value})
    
    elif isinstance(extracted_data, dict):
        for key, value in extracted_data.items():
            normalized.append({'category': key, 'value': float(value) if value is not None else 0.0})
    
    return normalized

def normalize_ground_truth_data(gt_config):
    """Convert ground truth to same format as extraction"""
    normalized = []
    
    # Handle the categories + series_data format
    if 'categories' in gt_config and 'series_data' in gt_config:
        categories = gt_config['categories']
        series_data = gt_config['series_data']
        
        # Check if multi-series
        if series_data and isinstance(series_data[0], dict) and 'name' in series_data[0]:
            # Multi-series format
            for series in series_data:
                series_name = series['name']
                values = series['values']
                for i, cat in enumerate(categories):
                    if i < len(values):
                        # Create combined category name to match extraction format
                        combined_cat = f"{cat} - {series_name}"
                        normalized.append({'category': combined_cat, 'value': float(values[i])})
        else:
            # Single series - just categories and values
            if 'values' in gt_config:
                values = gt_config['values']
            else:
                # Try to extract values from series_data
                values = series_data[0]['values'] if series_data else []
            
            for i, cat in enumerate(categories):
                if i < len(values):
                    normalized.append({'category': cat, 'value': float(values[i])})
    
    return normalized

def calculate_exact_match_fixed(extracted, expected):
    """Calculate exact match with normalized data"""
    if not extracted or not expected:
        return 0.0
    
    # Normalize both datasets
    extracted_norm = normalize_extraction_data(extracted)
    expected_norm = normalize_ground_truth_data(expected) if isinstance(expected, dict) else expected
    
    if len(extracted_norm) != len(expected_norm):
        return 0.0
    
    # Create value maps for comparison
    extracted_map = {item['category'].lower(): item['value'] for item in extracted_norm}
    expected_map = {item['category'].lower(): item['value'] for item in expected_norm}
    
    # Check if all values match (with tolerance)
    if set(extracted_map.keys()) != set(expected_map.keys()):
        return 0.0
    
    for key in expected_map:
        if abs(extracted_map.get(key, -999) - expected_map[key]) >= 0.1:
            return 0.0
    
    return 1.0

def calculate_f1_score_fixed(extracted, expected):
    """Calculate F1 score with better matching"""
    if not extracted or not expected:
        return 0.0
    
    # Normalize data
    extracted_norm = normalize_extraction_data(extracted)
    expected_norm = normalize_ground_truth_data(expected) if isinstance(expected, dict) else expected
    
    # Extract values for comparison
    extracted_values = [(item['category'].lower(), item['value']) for item in extracted_norm]
    expected_values = [(item['category'].lower(), item['value']) for item in expected_norm]
    
    # Calculate matches
    true_positives = 0
    
    for e_cat, e_val in extracted_values:
        for g_cat, g_val in expected_values:
            # Flexible matching: partial category match OR exact value match
            cat_match = any(part in e_cat for part in g_cat.split()) or any(part in g_cat for part in e_cat.split())
            val_match = abs(e_val - g_val) < 0.1
            
            if cat_match and val_match:
                true_positives += 1
                break
    
    precision = true_positives / len(extracted_values) if extracted_values else 0
    recall = true_positives / len(expected_values) if expected_values else 0
    
    if precision + recall == 0:
        return 0.0
    
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

def calculate_value_accuracy_fixed(extracted, expected):
    """Calculate value accuracy with 10% tolerance"""
    if not extracted or not expected:
        return 0.0
    
    # Normalize data
    extracted_norm = normalize_extraction_data(extracted)
    expected_norm = normalize_ground_truth_data(expected) if isinstance(expected, dict) else expected
    
    # Get all values
    extracted_values = [item['value'] for item in extracted_norm]
    expected_values = [item['value'] for item in expected_norm]
    
    if not expected_values:
        return 0.0
    
    # Match values with tolerance
    matched = 0
    for e_val in expected_values:
        tolerance = abs(e_val * 0.1)  # 10% tolerance
        for x_val in extracted_values:
            if abs(x_val - e_val) <= tolerance:
                matched += 1
                break
    
    return (matched / len(expected_values)) * 100

def calculate_chart_type_accuracy_fixed(extracted_type, expected_type):
    """Fixed chart type matching with common variations"""
    if not extracted_type or not expected_type:
        return 0.0
    
    # Normalize types
    extracted_lower = extracted_type.lower().strip()
    expected_lower = expected_type.lower().strip()
    
    # Direct match
    if extracted_lower == expected_lower:
        return 1.0
    
    # Common equivalences
    equivalences = {
        'bar': ['bar', 'column', 'grouped_bar', 'stacked_bar'],
        'line': ['line', 'lines', 'line_chart'],
        'scatter': ['scatter', 'scatterplot', 'scatter_plot'],
        'pie': ['pie', 'pie_chart'],
        'area': ['area', 'area_chart', 'stacked_area']
    }
    
    # Check equivalences
    for key, variants in equivalences.items():
        if expected_lower in variants and extracted_lower in variants:
            return 1.0
    
    return 0.0

#### 3. Process All Extractions

In [11]:
def evaluate_extraction_fixed(extraction_key, extraction_data, ground_truth_data):
    """Evaluate extraction with fixed format handling"""
    
    # Parse extraction key
    parts = extraction_key.split('_')
    chart_id = f"{parts[0]}_{parts[1]}"  # e.g., chart_001
    
    # Determine if original or perturbation
    is_original = extraction_key.endswith('_original')
    
    # Get perturbation type
    perturbation_type = 'none'
    if not is_original:
        # Extract perturbation type from filename
        # Format: chart_XXX_[complexity]_[chart_type]_[perturbation]_medium
        if len(parts) >= 5:
            # Find the perturbation type (skip chart_XXX, complexity, chart_type)
            perturbation_candidates = ['gaussian_blur', 'rotation', 'random_blocks', 
                                     'brightness_shift', 'legend_corruption', 'grayscale_conversion']
            for i, part in enumerate(parts[2:], 2):
                if part in perturbation_candidates:
                    perturbation_type = part
                    break
    
    # Get ground truth
    gt = ground_truth_data.get(chart_id, {})
    if not gt:
        return None
    
    # Extract data
    extracted_data = extraction_data.get('data', [])
    extracted_chart_type = extraction_data.get('chart_type', '')
    expected_chart_type = gt.get('chart_type', '')
    
    # Calculate metrics with fixed functions
    exact_match = calculate_exact_match_fixed(extracted_data, gt)
    f1_score = calculate_f1_score_fixed(extracted_data, gt)
    value_accuracy = calculate_value_accuracy_fixed(extracted_data, gt)
    chart_type_accuracy = calculate_chart_type_accuracy_fixed(extracted_chart_type, expected_chart_type)
    
    return {
        'extraction_key': extraction_key,
        'chart_id': chart_id,
        'is_original': is_original,
        'perturbation_type': perturbation_type,
        'exact_match_accuracy': exact_match,
        'f1_score': f1_score,
        'value_accuracy': value_accuracy,
        'chart_type_accuracy': chart_type_accuracy,
        'extraction_confidence': extraction_data.get('extraction_confidence', 'unknown')
    }

# Process all extractions
print("\n" + "="*60)
print("Processing extractions with fixed format handling...")

results = []
failed_count = 0

for key, extraction in extractions.items():
    result = evaluate_extraction_fixed(key, extraction, ground_truth)
    if result:
        results.append(result)
    else:
        failed_count += 1

# Add failed extractions
expected_total = 1400
current_total = len(results)
missing_count = expected_total - current_total

print(f"\nExtraction Summary:")
print(f"  Expected: {expected_total}")
print(f"  Processed: {current_total}")
print(f"  Failed/Missing: {missing_count}")

# Add failed extractions
if missing_count > 0:
    for i in range(missing_count):
        results.append({
            'extraction_key': f'failed_extraction_{i}',
            'chart_id': 'failed',
            'is_original': False,
            'perturbation_type': 'failed',
            'exact_match_accuracy': 0.0,
            'f1_score': 0.0,
            'value_accuracy': 0.0,
            'chart_type_accuracy': 0.0,
            'extraction_confidence': 'failed'
        })

# Convert to DataFrame
df_results = pd.DataFrame(results)



Processing extractions with fixed format handling...


KeyError: 0

### 4. Calculate Relative Accuracy

In [6]:
print("\n" + "="*60)
print("Calculating Relative Accuracy...")

# Calculate composite score for each extraction
df_results['composite_score'] = (
    df_results['exact_match_accuracy'] + 
    df_results['f1_score'] + 
    df_results['value_accuracy'] / 100 +  # Normalize to 0-1 scale
    df_results['chart_type_accuracy']
) / 4 * 100  # Average of 4 metrics, converted to percentage

# Calculate RA for each perturbation
ra_results = []

for chart_id in df_results['chart_id'].unique():
    if chart_id == 'failed':
        continue
    
    # Get original performance
    original_data = df_results[(df_results['chart_id'] == chart_id) & 
                              (df_results['is_original'] == True)]
    
    if len(original_data) == 0:
        continue
    
    original_score = original_data['composite_score'].iloc[0]
    
    # Calculate RA for each perturbation of this chart
    perturbed_data = df_results[(df_results['chart_id'] == chart_id) & 
                               (df_results['is_original'] == False)]
    
    for _, pert_row in perturbed_data.iterrows():
        if original_score > 0:
            ra = (pert_row['composite_score'] / original_score) * 100
        else:
            ra = 0.0  # If original failed, RA is 0
        
        ra_results.append({
            'extraction_key': pert_row['extraction_key'],
            'relative_accuracy': min(ra, 100.0)  # Cap at 100%
        })

# Merge RA back to main dataframe
ra_df = pd.DataFrame(ra_results)
df_results = df_results.merge(ra_df, on='extraction_key', how='left')

# Fill RA for originals (always 100%) and failed (always 0%)
df_results.loc[df_results['is_original'] == True, 'relative_accuracy'] = 100.0
df_results.loc[df_results['perturbation_type'] == 'failed', 'relative_accuracy'] = 0.0



Calculating Relative Accuracy...


### 5. Summary Statistics

In [7]:
print("\n" + "="*60)
print("RELATIVE ACCURACY ANALYSIS RESULTS")
print("="*60)

# Overall statistics
print("\n1. Overall Performance:")
originals = df_results[df_results['is_original'] == True]
perturbations = df_results[df_results['is_original'] == False]

print(f"\nOriginal Charts (Clean):")
print(f"  Exact Match: {originals['exact_match_accuracy'].mean():.1%}")
print(f"  F1 Score: {originals['f1_score'].mean():.1%}")
print(f"  Value Accuracy: {originals['value_accuracy'].mean():.1f}%")
print(f"  Chart Type: {originals['chart_type_accuracy'].mean():.1%}")

print(f"\nPerturbed Charts:")
print(f"  Exact Match: {perturbations['exact_match_accuracy'].mean():.1%}")
print(f"  F1 Score: {perturbations['f1_score'].mean():.1%}")
print(f"  Value Accuracy: {perturbations['value_accuracy'].mean():.1f}%")
print(f"  Chart Type: {perturbations['chart_type_accuracy'].mean():.1%}")

# RA by perturbation type
print("\n2. Relative Accuracy by Perturbation Type:")
ra_by_type = perturbations.groupby('perturbation_type')['relative_accuracy'].agg(['mean', 'std', 'count'])
ra_by_type = ra_by_type.sort_values('mean', ascending=False)

print("\nPerturbation Type    | Mean RA | Std Dev | Count | Interpretation")
print("-" * 70)
for pert_type, row in ra_by_type.iterrows():
    if pert_type != 'failed':
        interpretation = (
            "Very Robust" if row['mean'] >= 90 else
            "Robust" if row['mean'] >= 80 else
            "Moderate" if row['mean'] >= 70 else
            "Vulnerable" if row['mean'] >= 60 else
            "Severe Impact"
        )
        print(f"{pert_type:18} | {row['mean']:6.1f}% | {row['std']:6.1f}% | {row['count']:5.0f} | {interpretation}")

# Overall RA
overall_ra = perturbations[perturbations['perturbation_type'] != 'failed']['relative_accuracy'].mean()
print(f"\nOverall Relative Accuracy: {overall_ra:.1f}%")
print(f"Interpretation: GPT-4V retains {overall_ra:.1f}% of its performance on average")


RELATIVE ACCURACY ANALYSIS RESULTS

1. Overall Performance:

Original Charts (Clean):
  Exact Match: 0.0%
  F1 Score: 0.0%
  Value Accuracy: 0.0%
  Chart Type: 54.0%

Perturbed Charts:
  Exact Match: 0.0%
  F1 Score: 0.0%
  Value Accuracy: 0.0%
  Chart Type: 54.1%

2. Relative Accuracy by Perturbation Type:

Perturbation Type    | Mean RA | Std Dev | Count | Interpretation
----------------------------------------------------------------------
area               |    nan% |    nan% |     0 | Severe Impact
bar                |    nan% |    nan% |     0 | Severe Impact
line               |    nan% |    nan% |     0 | Severe Impact
pie                |    nan% |    nan% |     0 | Severe Impact
scatter            |    nan% |    nan% |     0 | Severe Impact

Overall Relative Accuracy: nan%
Interpretation: GPT-4V retains nan% of its performance on average


In [10]:
print(f"\nEXTRACTION TYPE BREAKDOWN:")
print(df['extraction_type'].value_counts())

print(f"\nPERTURBATION TYPE BREAKDOWN:")
print(df['perturbation_type'].value_counts())


EXTRACTION TYPE BREAKDOWN:
extraction_type
perturbation    698
original        200
Name: count, dtype: int64

PERTURBATION TYPE BREAKDOWN:
perturbation_type
shift         184
blur          180
rotation      167
blocks         84
corruption     83
Name: count, dtype: int64


### SECTION 5: ROBUSTNESS ANALYSIS

In [18]:
print("\n SECTION 5: ROBUSTNESS ANALYSIS")

def calculate_robustness_metrics_fixed(metrics_df):
    """Calculate robustness and DRI metrics - FIXED VERSION"""
    
    evaluator = StandardMetricsEvaluator()
    
    # Separate original and perturbation data
    original_metrics = metrics_df[metrics_df['extraction_type'] == 'original'].copy()
    perturbation_metrics = metrics_df[metrics_df['extraction_type'] == 'perturbation'].copy()
    
    print(f" Original evaluations: {len(original_metrics)}")
    print(f" Perturbation evaluations: {len(perturbation_metrics)}")
    print("\n🔍 DETAILED DEBUGGING:")

    # Check original chart data structure
    print("ORIGINAL CHART SAMPLE:")
    sample_original = original_metrics.iloc[0]
    print(f"   Extraction key: {sample_original['extraction_key']}")
    print(f"   Has original_chart_id: {'original_chart_id' in sample_original}")
    if 'original_chart_id' in sample_original:
        print(f"   Original chart id: {sample_original['original_chart_id']}")

    # Check perturbation data structure  
    print("\nPERTURBATION SAMPLE:")
    sample_pert = perturbation_metrics.iloc[0]
    print(f"   Extraction key: {sample_pert['extraction_key']}")
    print(f"   Has original_chart_id: {'original_chart_id' in sample_pert}")
    if 'original_chart_id' in sample_pert:
        print(f"   Original chart id: {sample_pert['original_chart_id']}")
        print(f"   Original chart id type: {type(sample_pert['original_chart_id'])}")
        print(f"   Is NaN: {pd.isna(sample_pert['original_chart_id'])}")

    # Check what columns we actually have
    print(f"\nAVAILABLE COLUMNS:")
    print(f"   Original metrics: {list(original_metrics.columns)}")
    print(f"   Perturbation metrics: {list(perturbation_metrics.columns)}")

    # Check for NaN values
    print(f"\nNaN CHECK:")
    print(f"   Original chart IDs with NaN: {perturbation_metrics['original_chart_id'].isna().sum()}")
    print(f"   Empty original chart IDs: {(perturbation_metrics['original_chart_id'] == '').sum()}")
    # DEBUG: Check the actual chart IDs
    print(f"\n DEBUG - Sample original extraction keys:")
    for key in list(original_metrics['extraction_key'])[:5]:
        print(f"   {key}")
    
    print(f"\n DEBUG - Sample perturbation original_chart_ids:")
    for oid in list(perturbation_metrics['original_chart_id'])[:5]:
        print(f"   {oid}")
    
    # Create original performance lookup with FLEXIBLE matching
    original_lookup = {}
    
    for _, row in original_metrics.iterrows():
        extraction_key = row['extraction_key']
        
        # Extract chart ID from original extraction key
        # e.g., "chart_179_advanced_bar" -> "chart_179"
        parts = extraction_key.split('_')
        if len(parts) >= 2 and parts[0] == 'chart':
            base_chart_id = f"{parts[0]}_{parts[1]}"  # chart_179
            
            # Store with multiple possible keys
            possible_keys = [
                base_chart_id,                    # chart_179
                extraction_key,                   # chart_179_advanced_bar
                extraction_key.replace('_original', ''),  # without _original
                '_'.join(parts[:3]) if len(parts) >= 3 else base_chart_id  # chart_179_advanced
            ]
            
            for key in possible_keys:
                original_lookup[key] = {
                    'extraction_key': extraction_key,
                    'exact_match_accuracy': row['exact_match_accuracy'],
                    'partial_match_f1': row['partial_match_f1'],
                    'value_extraction_accuracy': row['value_extraction_accuracy'],
                    'structural_understanding': row['structural_understanding']
                }
    
    print(f" Created original lookup with {len(original_lookup)} entries")
    print(f" Sample lookup keys: {list(original_lookup.keys())[:5]}")
    
    # Calculate robustness metrics for perturbations
    robustness_results = []
    matched_count = 0
    
    for _, row in perturbation_metrics.iterrows():
        original_chart_id = str(row.get('original_chart_id', ''))
        
        # Try to find matching original performance
        original_perf = None
        
        # Try exact match first
        if original_chart_id in original_lookup:
            original_perf = original_lookup[original_chart_id]
            matched_count += 1
        else:
            # Try alternative matching strategies
            extraction_key = row['extraction_key']
            
            # Extract base ID from perturbation extraction key
            # e.g., "chart_179_advanced_bar_gaussian_blur_medium" -> "chart_179"
            parts = extraction_key.split('_')
            if len(parts) >= 2 and parts[0] == 'chart':
                base_id = f"{parts[0]}_{parts[1]}"
                
                # Try different variations
                for possible_key in [base_id, original_chart_id, f"{parts[0]}_{parts[1]}_{parts[2]}" if len(parts) >= 3 else base_id]:
                    if possible_key in original_lookup:
                        original_perf = original_lookup[possible_key]
                        matched_count += 1
                        break
        
        if original_perf:
            # Calculate robustness scores for each metric
            robustness_record = {
                'extraction_key': row['extraction_key'],
                'original_chart_id': original_chart_id,
                'matched_original_key': original_perf['extraction_key'],
                'perturbation_type': row.get('perturbation_type', ''),
                'intensity': row.get('intensity', ''),
                
                # Current performance
                'perturbed_exact_match': row['exact_match_accuracy'],
                'perturbed_f1': row['partial_match_f1'],
                'perturbed_value_accuracy': row['value_extraction_accuracy'],
                'perturbed_structural': row['structural_understanding'],
                
                # Original performance
                'original_exact_match': original_perf['exact_match_accuracy'],
                'original_f1': original_perf['partial_match_f1'],
                'original_value_accuracy': original_perf['value_extraction_accuracy'],
                'original_structural': original_perf['structural_understanding'],
            }
            
            # Calculate DRI scores (handle division by zero)
            def safe_dri(original, perturbed):
                if original == 0:
                    return 1.0 if perturbed == 0 else 0.0
                degradation = max(0, original - perturbed)
                return max(0.0, min(1.0, 1 - (degradation / original)))
            
            robustness_record.update({
                'dri_exact_match': safe_dri(original_perf['exact_match_accuracy'], row['exact_match_accuracy']),
                'dri_f1': safe_dri(original_perf['partial_match_f1'], row['partial_match_f1']),
                'dri_value_accuracy': safe_dri(original_perf['value_extraction_accuracy'], row['value_extraction_accuracy']),
                'dri_structural': safe_dri(original_perf['structural_understanding'], row['structural_understanding'])
            })
            
            # Calculate composite DRI (PRIMARY METRIC)
            dri_scores = [
                robustness_record['dri_exact_match'],
                robustness_record['dri_f1'],
                robustness_record['dri_value_accuracy'],
                robustness_record['dri_structural']
            ]
            robustness_record['composite_dri'] = np.mean(dri_scores)
            
            robustness_results.append(robustness_record)
    
    print(f" Matched {matched_count} perturbations to original charts")
    print(f" Created {len(robustness_results)} robustness comparisons")
    
    if not robustness_results:
        print(" No robustness comparisons created - check ID matching logic")
        return pd.DataFrame()
    
    robustness_df = pd.DataFrame(robustness_results)
    
    # Save robustness analysis
    robustness_df.to_csv('data/analysis_cache/robustness_analysis.csv', index=False)
    print(f" Robustness analysis saved with {len(robustness_df)} records")
    
    return robustness_df
# Calculate robustness metrics
robustness_df = calculate_robustness_metrics_fixed(metrics_df)



 SECTION 5: ROBUSTNESS ANALYSIS
 Original evaluations: 200
 Perturbation evaluations: 698

🔍 DETAILED DEBUGGING:
ORIGINAL CHART SAMPLE:
   Extraction key: chart_179_advanced_bar
   Has original_chart_id: True
   Original chart id: nan

PERTURBATION SAMPLE:
   Extraction key: chart_179_advanced_bar_rotation_low
   Has original_chart_id: True
   Original chart id: chart_179
   Original chart id type: <class 'str'>
   Is NaN: False

AVAILABLE COLUMNS:
   Original metrics: ['extraction_key', 'extraction_type', 'exact_match_accuracy', 'partial_match_f1', 'value_extraction_accuracy', 'structural_understanding', 'original_chart_id', 'perturbation_type', 'intensity']
   Perturbation metrics: ['extraction_key', 'extraction_type', 'exact_match_accuracy', 'partial_match_f1', 'value_extraction_accuracy', 'structural_understanding', 'original_chart_id', 'perturbation_type', 'intensity']

NaN CHECK:
   Original chart IDs with NaN: 0
   Empty original chart IDs: 0

 DEBUG - Sample original extractio

### SECTION 6: SUMMARY STATISTICS

In [19]:
print("\n SECTION 6: SUMMARY STATISTICS")

def generate_summary_statistics(metrics_df, robustness_df):
    """Generate comprehensive summary statistics"""
    
    print(" PERFORMANCE SUMMARY:")
    print("-" * 60)
    
    # Original performance
    original_data = metrics_df[metrics_df['extraction_type'] == 'original']
    
    if not original_data.empty:
        print(" ORIGINAL CHART PERFORMANCE:")
        for metric in ['exact_match_accuracy', 'partial_match_f1', 'value_extraction_accuracy', 'structural_understanding']:
            mean_val = original_data[metric].mean()
            std_val = original_data[metric].std()
            print(f"   {metric}: {mean_val:.3f} ± {std_val:.3f}")
    
    # Perturbation performance
    perturbation_data = metrics_df[metrics_df['extraction_type'] == 'perturbation']
    
    if not perturbation_data.empty:
        print(f"\n PERTURBATION PERFORMANCE:")
        for metric in ['exact_match_accuracy', 'partial_match_f1', 'value_extraction_accuracy', 'structural_understanding']:
            mean_val = perturbation_data[metric].mean()
            std_val = perturbation_data[metric].std()
            print(f"   {metric}: {mean_val:.3f} ± {std_val:.3f}")
    
    # Robustness summary
    if not robustness_df.empty:
        print(f"\n ROBUSTNESS ANALYSIS:")
        print(f"   Mean Composite DRI: {robustness_df['composite_dri'].mean():.3f}")
        print(f"   DRI Standard Deviation: {robustness_df['composite_dri'].std():.3f}")
        print(f"   Best DRI Score: {robustness_df['composite_dri'].max():.3f}")
        print(f"   Worst DRI Score: {robustness_df['composite_dri'].min():.3f}")
        
        # Perturbation type analysis
        print(f"\n PERTURBATION TYPE ANALYSIS:")
        perturbation_summary = robustness_df.groupby('perturbation_type')['composite_dri'].agg(['mean', 'std', 'count'])
        for pert_type, stats in perturbation_summary.iterrows():
            print(f"   {pert_type}: DRI = {stats['mean']:.3f} ± {stats['std']:.3f} (n={stats['count']})")

generate_summary_statistics(metrics_df, robustness_df)



 SECTION 6: SUMMARY STATISTICS
 PERFORMANCE SUMMARY:
------------------------------------------------------------
 ORIGINAL CHART PERFORMANCE:
   exact_match_accuracy: 0.000 ± 0.000
   partial_match_f1: 0.114 ± 0.191
   value_extraction_accuracy: 8.486 ± 17.430
   structural_understanding: 63.001 ± 37.434

 PERTURBATION PERFORMANCE:
   exact_match_accuracy: 0.000 ± 0.000
   partial_match_f1: 0.112 ± 0.192
   value_extraction_accuracy: 8.249 ± 16.667
   structural_understanding: 61.138 ± 36.509

 ROBUSTNESS ANALYSIS:
   Mean Composite DRI: 0.883
   DRI Standard Deviation: 0.195
   Best DRI Score: 1.000
   Worst DRI Score: 0.250

 PERTURBATION TYPE ANALYSIS:
   blocks: DRI = 0.776 ± 0.247 (n=84.0)
   blur: DRI = 0.895 ± 0.188 (n=180.0)
   corruption: DRI = 0.879 ± 0.188 (n=83.0)
   rotation: DRI = 0.896 ± 0.179 (n=167.0)
   shift: DRI = 0.910 ± 0.179 (n=184.0)


#### SECTION 7: ANALYSIS COMPLETION

In [20]:
print("\n SECTION 7: ANALYSIS COMPLETION")

# Create final analysis summary
analysis_summary = {
    'analysis_timestamp': datetime.now().isoformat(),
    'total_evaluations': len(metrics_df),
    'original_evaluations': len(metrics_df[metrics_df['extraction_type'] == 'original']),
    'perturbation_evaluations': len(metrics_df[metrics_df['extraction_type'] == 'perturbation']),
    'robustness_comparisons': len(robustness_df),
    'primary_metric': 'composite_dri',
    'metrics_calculated': list(STANDARD_METRICS.keys()),
    'academic_references': [info['reference'] for info in STANDARD_METRICS.values()],
    'data_files_created': [
        'data/analysis_cache/comprehensive_metrics.csv',
        'data/analysis_cache/robustness_analysis.csv'
    ],
    'ready_for_statistical_analysis': True,
    'next_notebook': '06_Statistical_Analysis.ipynb'
}

# Save analysis summary
with open('data/analysis_cache/analysis_summary.json', 'w') as f:
    json.dump(analysis_summary, f, indent=2)

print(" Analysis summary saved")
print(f" Multi-metric analysis complete!")
print(f" {analysis_summary['total_evaluations']} evaluations processed")
print(f" Primary metric: {analysis_summary['primary_metric']} (Composite DRI)")
print(f" Academic metrics: {len(analysis_summary['metrics_calculated'])}")

print("\n" + "=" * 80)
print(" MULTI-METRIC ANALYSIS COMPLETE!")
print(" Standard Academic Metrics Successfully Calculated")
print(" Ready for Statistical Analysis Phase")
print("=" * 80)

# Log completion
logger.info("Multi-metric analysis completed successfully")
logger.info(f"Total evaluations: {analysis_summary['total_evaluations']}")
logger.info(f"Primary metric: {analysis_summary['primary_metric']}")
logger.info("Ready for statistical analysis phase")


 SECTION 7: ANALYSIS COMPLETION
 Analysis summary saved
 Multi-metric analysis complete!
 898 evaluations processed
 Primary metric: composite_dri (Composite DRI)
 Academic metrics: 6

 MULTI-METRIC ANALYSIS COMPLETE!
 Standard Academic Metrics Successfully Calculated
 Ready for Statistical Analysis Phase


In [21]:
# Run this AFTER Notebook 5 is completely done
import pandas as pd

# Check if robustness analysis worked
try:
    df = pd.read_csv('data/analysis_cache/robustness_analysis.csv')
    print(f" Robustness analysis file found!")
    print(f" Robustness comparisons: {len(df)}")
    print(f" Mean Composite DRI: {df['composite_dri'].mean():.3f}")
    print(f"\n DRI by perturbation type:")
    print(df.groupby('perturbation_type')['composite_dri'].mean().sort_values().round(3))
    
    print(f"\n DRI Distribution:")
    print(f"   Best DRI: {df['composite_dri'].max():.3f}")
    print(f"   Worst DRI: {df['composite_dri'].min():.3f}")
    print(f"   Std Dev: {df['composite_dri'].std():.3f}")
    
except FileNotFoundError:
    print(" Robustness analysis file not found - Notebook 5 may not have completed Section 5")
except Exception as e:
    print(f" Error reading robustness analysis: {e}")

 Robustness analysis file found!
 Robustness comparisons: 698
 Mean Composite DRI: 0.883

 DRI by perturbation type:
perturbation_type
blocks        0.776
corruption    0.879
blur          0.895
rotation      0.896
shift         0.910
Name: composite_dri, dtype: float64

 DRI Distribution:
   Best DRI: 1.000
   Worst DRI: 0.250
   Std Dev: 0.195
