### GPT-4o Robustness Analysis: Comprehensive Extraction Pipeline


In [1]:
import os
import json
import base64
import pandas as pd
import numpy as np
from datetime import datetime
import time
import logging
from pathlib import Path
import openai
from dotenv import load_dotenv
import random
from PIL import Image
import traceback

# Set random seed for reproducibility
random.seed(42)

print("=" * 80)
print(" GPT-4O EXTRACTION PIPELINE")
print(" Systematic Chart Data Extraction for Robustness Analysis")
print("=" * 80)

# Setup logging
logger = logging.getLogger('research')


 GPT-4O EXTRACTION PIPELINE
 Systematic Chart Data Extraction for Robustness Analysis


### SECTION 1: EXTRACTION CONFIGURATION

In [2]:
print("\n SECTION 1: EXTRACTION CONFIGURATION")

# Load environment and API setup
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

if not OPENAI_API_KEY or OPENAI_API_KEY == "your-openai-api-key-here":
    print(" OpenAI API key not configured!")
    print(" Please add your API key to .env file")
    print(" Create .env file with: OPENAI_API_KEY=your-actual-key")
    exit(1)

# Initialize OpenAI client
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Load previous phase summaries
try:
    with open('data/analysis_cache/perturbation_summary.json', 'r') as f:
        perturbation_summary = json.load(f)
    print(" Loaded perturbation summary")
except FileNotFoundError:
    print(" Perturbation summary not found - using defaults")
    perturbation_summary = {
        'total_perturbations_available': 800,
        'budget_planning': {'estimated_total_cost': 36.00}
    }

try:
    with open('research_config.json', 'r') as f:
        research_config = json.load(f)
    print(" Loaded research configuration")
except FileNotFoundError:
    print(" Research configuration not found!")
    exit(1)

# Extraction configuration
EXTRACTION_CONFIG = {
    "model_settings": {
        "model": "gpt-4o",
        "max_tokens": 1500,
        "temperature": 0.1,
        "timeout": 30,
        "max_retries": 3,
        "retry_delay": 2
    },
    
    "budget_management": {
        "total_budget": research_config['experimental_design']['budget_limit'],
        "cost_per_extraction": 0.03,
        "safety_buffer": 5.00,
        "usable_budget": research_config['experimental_design']['budget_limit'] - 5.00
    },
    
    "extraction_strategy": {
        "phase_1_originals": 200,
        "phase_2_priority_perturbations": 600, 
        "phase_3_intensity_analysis": 300,
        "phase_4_deep_analysis": 200
    },
    
    "quality_controls": {
        "min_data_points": 2,
        "max_extraction_time": 45,
        "validate_json": True,
        "save_failed_attempts": True
    }
}

print(f" Budget Management:")
print(f"   Total Budget: ${EXTRACTION_CONFIG['budget_management']['total_budget']:.2f}")
print(f"   Usable Budget: ${EXTRACTION_CONFIG['budget_management']['usable_budget']:.2f}")
print(f"   Cost per Extraction: ${EXTRACTION_CONFIG['budget_management']['cost_per_extraction']:.2f}")

max_extractions = int(EXTRACTION_CONFIG['budget_management']['usable_budget'] / 
                     EXTRACTION_CONFIG['budget_management']['cost_per_extraction'])
print(f"   Max Possible Extractions: {max_extractions}")



 SECTION 1: EXTRACTION CONFIGURATION
 Loaded perturbation summary
 Loaded research configuration
 Budget Management:
   Total Budget: $45.00
   Usable Budget: $40.00
   Cost per Extraction: $0.03
   Max Possible Extractions: 1333


### SECTION 2: PROFESSIONAL EXTRACTION ENGINE

In [3]:
print("\n SECTION 2: PROFESSIONAL EXTRACTION ENGINE")

class GPT4VisionExtractor:
    """Professional GPT-4o Vision extraction system with comprehensive features"""
    
    def __init__(self, client, config):
        self.client = client
        self.config = config
        
        # Performance tracking
        self.extraction_stats = {
            'total_attempts': 0,
            'successful_extractions': 0,
            'failed_extractions': 0,
            'total_cost': 0.0,
            'avg_response_time': 0.0,
            'response_times': []
        }
        
        # Error tracking
        self.error_log = []
        
        # Professional extraction prompt
        self.extraction_prompt = """
You are a professional data analyst. Your task is to extract numerical values and associated labels from this chart image with maximum precision and accuracy.

CRITICAL REQUIREMENTS:
1. Extract ALL visible data points with exact numerical values
2. Use the EXACT category/label names shown in the chart
3. Identify the chart type accurately
4. Report your confidence level honestly
5. Note any extraction difficulties

Return ONLY a valid JSON object in this EXACT format:
{
  "chart_title": "Exact title from the chart",
  "chart_type": "bar/pie/line/scatter/area/stacked_bar/grouped_bar",
  "data": [
    {"category": "Category_1", "value": numeric_value},
    {"category": "Category_2", "value": numeric_value}
  ],
  "extraction_confidence": "high/medium/low",
  "extraction_notes": "Any issues, ambiguities, or observations about the chart",
  "data_completeness": "complete/partial/incomplete"
}

IMPORTANT: 
- For pie charts, ensure percentages sum to approximately 100%
- For numeric values, use appropriate precision (1-2 decimal places)
- If text is unclear, report "low" confidence and note the issue
- Extract ALL visible data series if multiple exist
"""
    
    def extract_data(self, image_path, chart_metadata=None):
        """Extract data from a single chart with comprehensive error handling"""
        
        extraction_id = f"ext_{int(time.time())}_{random.randint(1000, 9999)}"
        start_time = time.time()
        
        try:
            # Validate image
            if not Path(image_path).exists():
                raise FileNotFoundError(f"Image not found: {image_path}")
            
            # Encode image
            base64_image = self._encode_image(image_path)
            
            # Prepare API request
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.extraction_prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}",
                                "detail": "high"
                            }
                        }
                    ]
                }
            ]
            
            # Execute extraction with retries
            for attempt in range(self.config['model_settings']['max_retries']):
                try:
                    self.extraction_stats['total_attempts'] += 1
                    
                    response = self.client.chat.completions.create(
                        model=self.config['model_settings']['model'],
                        messages=messages,
                        max_tokens=self.config['model_settings']['max_tokens'],
                        temperature=self.config['model_settings']['temperature'],
                        timeout=self.config['model_settings']['timeout']
                    )
                    
                    # Parse response
                    content = response.choices[0].message.content
                    extracted_data = self._parse_json_response(content)
                    
                    if extracted_data:
                        # Add metadata
                        extracted_data['_extraction_metadata'] = {
                            'extraction_id': extraction_id,
                            'image_path': str(image_path),
                            'extraction_timestamp': datetime.now().isoformat(),
                            'model': self.config['model_settings']['model'],
                            'attempt_number': attempt + 1,
                            'response_time': time.time() - start_time,
                            'chart_metadata': chart_metadata
                        }
                        
                        # Validate extraction
                        validation_result = self._validate_extraction(extracted_data)
                        extracted_data['_validation'] = validation_result
                        
                        # Update stats
                        response_time = time.time() - start_time
                        self.extraction_stats['successful_extractions'] += 1
                        self.extraction_stats['total_cost'] += self.config['budget_management']['cost_per_extraction']
                        self.extraction_stats['response_times'].append(response_time)
                        self.extraction_stats['avg_response_time'] = np.mean(self.extraction_stats['response_times'])
                        
                        logger.info(f"Successful extraction: {image_path} (attempt {attempt + 1})")
                        return extracted_data
                    
                except Exception as api_error:
                    logger.warning(f"API attempt {attempt + 1} failed for {image_path}: {api_error}")
                    if attempt < self.config['model_settings']['max_retries'] - 1:
                        time.sleep(self.config['model_settings']['retry_delay'] * (attempt + 1))
                    else:
                        raise api_error
            
            # If we get here, all retries failed
            raise Exception("All retry attempts exhausted")
            
        except Exception as e:
            # Log failure
            self.extraction_stats['failed_extractions'] += 1
            error_entry = {
                'extraction_id': extraction_id,
                'image_path': str(image_path),
                'error_timestamp': datetime.now().isoformat(),
                'error_type': type(e).__name__,
                'error_message': str(e),
                'response_time': time.time() - start_time
            }
            self.error_log.append(error_entry)
            
            logger.error(f"Extraction failed for {image_path}: {e}")
            
            # Save failed attempt if configured
            if self.config['quality_controls']['save_failed_attempts']:
                failed_path = f"data/extractions/failed_{extraction_id}.json"
                with open(failed_path, 'w') as f:
                    json.dump(error_entry, f, indent=2)
            
            return None
    
    def _encode_image(self, image_path):
        """Encode image to base64 with validation"""
        try:
            # Validate image first
            with Image.open(image_path) as img:
                # Ensure RGB mode
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                
                # Check reasonable size
                if img.size[0] * img.size[1] > 10000000:  # 10MP limit
                    # Resize if too large
                    img.thumbnail((3000, 3000), Image.Resampling.LANCZOS)
            
            # Encode to base64
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
                
        except Exception as e:
            raise Exception(f"Image encoding failed: {e}")
    
    def _parse_json_response(self, content):
        """Parse JSON from GPT-4o response with multiple strategies"""
        
        if not content:
            return None
        
        # Strategy 1: Direct JSON parsing
        try:
            return json.loads(content.strip())
        except json.JSONDecodeError:
            pass
        
        # Strategy 2: Extract JSON block
        try:
            json_start = content.find('{')
            json_end = content.rfind('}') + 1
            if json_start != -1 and json_end > json_start:
                json_str = content[json_start:json_end]
                return json.loads(json_str)
        except json.JSONDecodeError:
            pass
        
        # Strategy 3: Look for ```json blocks
        try:
            if '```json' in content:
                start = content.find('```json') + 7
                end = content.find('```', start)
                if end != -1:
                    json_str = content[start:end].strip()
                    return json.loads(json_str)
        except json.JSONDecodeError:
            pass
        
        # Strategy 4: Look for any ``` blocks
        try:
            if '```' in content:
                start = content.find('```') + 3
                end = content.find('```', start)
                if end != -1:
                    json_str = content[start:end].strip()
                    return json.loads(json_str)
        except json.JSONDecodeError:
            pass
        
        logger.warning(f"Could not parse JSON from response: {content[:200]}...")
        return None
    
    def _validate_extraction(self, extracted_data):
        """Validate extracted data quality"""
        
        validation = {
            'is_valid': True,
            'warnings': [],
            'errors': []
        }
        
        # Check required fields
        required_fields = ['chart_title', 'chart_type', 'data']
        for field in required_fields:
            if field not in extracted_data:
                validation['errors'].append(f"Missing required field: {field}")
                validation['is_valid'] = False
        
        # Validate data structure
        if 'data' in extracted_data:
            data_points = extracted_data['data']
            
            if not isinstance(data_points, list):
                validation['errors'].append("Data field must be a list")
                validation['is_valid'] = False
            elif len(data_points) < self.config['quality_controls']['min_data_points']:
                validation['warnings'].append(f"Only {len(data_points)} data points extracted")
            
            # Validate individual data points
            for i, point in enumerate(data_points):
                if not isinstance(point, dict):
                    validation['errors'].append(f"Data point {i} is not a dictionary")
                    continue
                
                if 'category' not in point:
                    validation['errors'].append(f"Data point {i} missing category")
                if 'value' not in point:
                    validation['errors'].append(f"Data point {i} missing value")
                elif not isinstance(point['value'], (int, float)):
                    try:
                        float(point['value'])
                    except (ValueError, TypeError):
                        validation['errors'].append(f"Data point {i} has invalid value: {point['value']}")
        
        # Chart-specific validations
        if 'chart_type' in extracted_data and 'data' in extracted_data:
            chart_type = extracted_data['chart_type']
            data_points = extracted_data['data']
            
            if chart_type == 'pie':
                # Pie chart values should sum to approximately 100%
                total_value = sum(point.get('value', 0) for point in data_points 
                                if isinstance(point.get('value'), (int, float)))
                if not (85 <= total_value <= 115):
                    validation['warnings'].append(f"Pie chart values sum to {total_value:.1f}%, expected ~100%")
        
        return validation
    
    def get_extraction_statistics(self):
        """Get comprehensive extraction statistics"""
        
        total_attempts = self.extraction_stats['total_attempts']
        successful = self.extraction_stats['successful_extractions']
        failed = self.extraction_stats['failed_extractions']
        
        stats = {
            'total_attempts': total_attempts,
            'successful_extractions': successful,
            'failed_extractions': failed,
            'success_rate': (successful / max(1, total_attempts)) * 100,
            'total_cost': self.extraction_stats['total_cost'],
            'avg_response_time': self.extraction_stats['avg_response_time'],
            'remaining_budget': (self.config['budget_management']['usable_budget'] - 
                               self.extraction_stats['total_cost']),
            'estimated_remaining_extractions': int((self.config['budget_management']['usable_budget'] - 
                                                  self.extraction_stats['total_cost']) / 
                                                 self.config['budget_management']['cost_per_extraction'])
        }
        
        return stats



 SECTION 2: PROFESSIONAL EXTRACTION ENGINE


### SECTION 3: STRATEGIC EXTRACTION EXECUTION

In [5]:
print("\n SECTION 3: STRATEGIC EXTRACTION EXECUTION")

def execute_strategic_extraction():
    """Execute extraction with strategic budget management"""
    
    # Initialize extractor
    extractor = GPT4VisionExtractor(client, EXTRACTION_CONFIG)
    
    # Load chart configurations for ground truth
    chart_configs = {}
    try:
        with open('data/ground_truth/chart_configurations.json', 'r') as f:
            chart_config_list = json.load(f)
            chart_configs = {config['id']: config for config in chart_config_list}
        print(f" Loaded {len(chart_configs)} chart configurations")
    except FileNotFoundError:
        print(" Chart configurations not found - proceeding without ground truth")
    
    # Extraction results storage
    extraction_results = {}
    
    print(f"\n STARTING STRATEGIC EXTRACTION")
    print(f" Budget: ${EXTRACTION_CONFIG['budget_management']['usable_budget']:.2f}")
    print(f" Target: ~{max_extractions} extractions")
    
    # PHASE 1: Original Charts Extraction
    print(f"\n PHASE 1: ORIGINAL CHARTS EXTRACTION")
    
    original_charts = list(Path('data/raw_charts').glob('*.png'))
    if len(original_charts) > EXTRACTION_CONFIG['extraction_strategy']['phase_1_originals']:
        # Sample strategically if too many
        random.shuffle(original_charts)
        original_charts = original_charts[:EXTRACTION_CONFIG['extraction_strategy']['phase_1_originals']]
    
    print(f"Processing {len(original_charts)} original charts...")
    
    for i, chart_path in enumerate(original_charts):
        chart_id = chart_path.stem
        
        # Check budget
        stats = extractor.get_extraction_statistics()
        if stats['remaining_budget'] < EXTRACTION_CONFIG['budget_management']['cost_per_extraction']:
            print(f" Budget exhausted after {i} charts")
            break
        
        print(f"Extracting {i+1}/{len(original_charts)}: {chart_id}")
        
        # Get chart metadata if available
        chart_metadata = chart_configs.get(chart_id, {})
        
        # Extract data
        extracted_data = extractor.extract_data(chart_path, chart_metadata)
        
        if extracted_data:
            # Save extraction result
            result_path = f"data/extractions/{chart_id}_original.json"
            with open(result_path, 'w') as f:
                json.dump(extracted_data, f, indent=2)
            
            extraction_results[chart_id] = {
                'type': 'original',
                'extracted_data': extracted_data,
                'ground_truth': chart_metadata.get('series_data', {}),
                'file_path': result_path
            }
            
            print(f"    Success (Cost: ${stats['total_cost']:.2f})")
        else:
            print(f"    Failed")
        
        # Progress update
        if (i + 1) % 10 == 0:
            current_stats = extractor.get_extraction_statistics()
            print(f"    Progress: {current_stats['successful_extractions']}/{current_stats['total_attempts']} "
                  f"(${current_stats['total_cost']:.2f} spent)")
        
        # Rate limiting
        time.sleep(1)
    
    phase_1_stats = extractor.get_extraction_statistics()
    print(f" PHASE 1 COMPLETE: {phase_1_stats['successful_extractions']} extractions, "
          f"${phase_1_stats['total_cost']:.2f} spent")
    
    # PHASE 2: Priority Perturbations
    print(f"\n PHASE 2: PRIORITY PERTURBATIONS")
    
    # Select priority perturbations
    perturbation_files = list(Path('data/perturbations').glob('*.png'))
    
    # Priority perturbation types (most important for robustness analysis)
    priority_types = ['gaussian_blur', 'rotation', 'brightness_shift', 'random_blocks', 'legend_corruption']
    
    priority_perturbations = []
    for pert_file in perturbation_files:
        if any(ptype in pert_file.name for ptype in priority_types):
            priority_perturbations.append(pert_file)
    
    # Limit based on budget and strategy
    max_phase_2 = min(
        EXTRACTION_CONFIG['extraction_strategy']['phase_2_priority_perturbations'],
        len(priority_perturbations),
        extractor.get_extraction_statistics()['estimated_remaining_extractions']
    )
    
    if max_phase_2 > 0:
        random.shuffle(priority_perturbations)
        selected_perturbations = priority_perturbations[:max_phase_2]
        
        print(f"Processing {len(selected_perturbations)} priority perturbations...")
        
        for i, pert_path in enumerate(selected_perturbations):
            # Check budget
            if extractor.get_extraction_statistics()['remaining_budget'] < EXTRACTION_CONFIG['budget_management']['cost_per_extraction']:
                print(f" Budget exhausted after {i} perturbations")
                break
            
            print(f"Extracting perturbation {i+1}/{len(selected_perturbations)}: {pert_path.name}")
            
            # Extract original chart ID and perturbation info
            filename_parts = pert_path.stem.split('_')
            if len(filename_parts) >= 3:
                original_chart_id = '_'.join(filename_parts[:-2])
                perturbation_type = filename_parts[-2]
                intensity = filename_parts[-1]
            else:
                original_chart_id = pert_path.stem
                perturbation_type = "unknown"
                intensity = "unknown"
            
            # Get original chart metadata
            original_metadata = chart_configs.get(original_chart_id, {})
            
            # Extract data
            pert_metadata = {
                'original_chart_id': original_chart_id,
                'perturbation_type': perturbation_type,
                'intensity': intensity,
                'original_chart_metadata': original_metadata
            }
            
            extracted_data = extractor.extract_data(pert_path, pert_metadata)
            
            if extracted_data:
                # Save extraction result
                result_key = f"{original_chart_id}_{perturbation_type}_{intensity}"
                result_path = f"data/extractions/{result_key}.json"
                
                with open(result_path, 'w') as f:
                    json.dump(extracted_data, f, indent=2)
                
                extraction_results[result_key] = {
                    'type': 'perturbation',
                    'original_chart_id': original_chart_id,
                    'perturbation_type': perturbation_type,
                    'intensity': intensity,
                    'extracted_data': extracted_data,
                    'ground_truth': original_metadata.get('series_data', {}),
                    'file_path': result_path
                }
                
                print(f"    Success")
            else:
                print(f"    Failed")
            
            # Progress update
            if (i + 1) % 25 == 0:
                current_stats = extractor.get_extraction_statistics()
                print(f"    Progress: {current_stats['successful_extractions']} total extractions, "
                      f"${current_stats['total_cost']:.2f} spent, "
                      f"${current_stats['remaining_budget']:.2f} remaining")
            
            # Rate limiting
            time.sleep(1)
    
    phase_2_stats = extractor.get_extraction_statistics()
    print(f"PHASE 2 COMPLETE: {phase_2_stats['successful_extractions'] - phase_1_stats['successful_extractions']} "
          f"perturbations extracted")
    
    # PHASE 3: Intensity Analysis (if budget allows)
    remaining_budget = extractor.get_extraction_statistics()['remaining_budget']
    remaining_extractions = extractor.get_extraction_statistics()['estimated_remaining_extractions']
    
    if remaining_extractions >= 50:
        print(f"\n PHASE 3: INTENSITY ANALYSIS")
        
        # Find intensity variants (low/high) for charts we already processed
        intensity_variants = []
        processed_base_names = set()
        
        for result_key in extraction_results.keys():
            if '_' in result_key and extraction_results[result_key]['type'] == 'perturbation':
                parts = result_key.split('_')
                if len(parts) >= 3:
                    base_name = '_'.join(parts[:-1])  # Everything except intensity
                    processed_base_names.add(base_name)
        
        # Find corresponding low/high intensity files
        for base_name in processed_base_names:
            for intensity in ['low', 'high']:
                intensity_file = Path(f"data/perturbations/{base_name}_{intensity}.png")
                if intensity_file.exists():
                    intensity_variants.append(intensity_file)
        
        # Limit based on remaining budget
        max_phase_3 = min(
            EXTRACTION_CONFIG['extraction_strategy']['phase_3_intensity_analysis'],
            len(intensity_variants),
            remaining_extractions
        )
        
        if max_phase_3 > 0:
            random.shuffle(intensity_variants)
            selected_intensity = intensity_variants[:max_phase_3]
            
            print(f"Processing {len(selected_intensity)} intensity variants...")
            
            for i, intensity_path in enumerate(selected_intensity):
                if extractor.get_extraction_statistics()['remaining_budget'] < EXTRACTION_CONFIG['budget_management']['cost_per_extraction']:
                    break
                
                print(f"Extracting intensity variant {i+1}/{len(selected_intensity)}: {intensity_path.name}")
                
                # Parse filename
                filename_parts = intensity_path.stem.split('_')
                if len(filename_parts) >= 3:
                    original_chart_id = '_'.join(filename_parts[:-2])
                    perturbation_type = filename_parts[-2]
                    intensity = filename_parts[-1]
                    
                    pert_metadata = {
                        'original_chart_id': original_chart_id,
                        'perturbation_type': perturbation_type,
                        'intensity': intensity,
                        'original_chart_metadata': chart_configs.get(original_chart_id, {})
                    }
                    
                    extracted_data = extractor.extract_data(intensity_path, pert_metadata)
                    
                    if extracted_data:
                        result_key = f"{original_chart_id}_{perturbation_type}_{intensity}"
                        result_path = f"data/extractions/{result_key}.json"
                        
                        with open(result_path, 'w') as f:
                            json.dump(extracted_data, f, indent=2)
                        
                        extraction_results[result_key] = {
                            'type': 'perturbation',
                            'original_chart_id': original_chart_id,
                            'perturbation_type': perturbation_type,
                            'intensity': intensity,
                            'extracted_data': extracted_data,
                            'ground_truth': chart_configs.get(original_chart_id, {}).get('series_data', {}),
                            'file_path': result_path
                        }
                        
                        print(f"    Success")
                    else:
                        print(f"    Failed")
                
                time.sleep(1)
        
        phase_3_stats = extractor.get_extraction_statistics()
        phase_3_new = phase_3_stats['successful_extractions'] - phase_2_stats['successful_extractions']
        print(f" PHASE 3 COMPLETE: {phase_3_new} intensity variants extracted")
    
    else:
        print(f"\n PHASE 3 SKIPPED: Insufficient budget (${remaining_budget:.2f} remaining)")
    
    return extraction_results, extractor

# Execute the strategic extraction
print(" Starting strategic extraction pipeline...")
all_results, extraction_engine = execute_strategic_extraction()



 SECTION 3: STRATEGIC EXTRACTION EXECUTION
 Starting strategic extraction pipeline...
 Loaded 200 chart configurations

 STARTING STRATEGIC EXTRACTION
 Budget: $40.00
 Target: ~1333 extractions

 PHASE 1: ORIGINAL CHARTS EXTRACTION
Processing 200 original charts...
Extracting 1/200: chart_179_advanced_bar
    Success (Cost: $0.00)
Extracting 2/200: chart_035_medium_line
    Success (Cost: $0.03)
Extracting 3/200: chart_058_complex_bar
    Success (Cost: $0.06)
Extracting 4/200: chart_189_medium_pie
    Success (Cost: $0.09)
Extracting 5/200: chart_003_medium_bar
    Success (Cost: $0.12)
Extracting 6/200: chart_102_medium_line
    Success (Cost: $0.15)
Extracting 7/200: chart_196_advanced_scatter
    Success (Cost: $0.18)
Extracting 8/200: chart_120_complex_pie
    Success (Cost: $0.21)
Extracting 9/200: chart_011_medium_scatter
    Success (Cost: $0.24)
Extracting 10/200: chart_192_complex_line
    Success (Cost: $0.27)
    Progress: 10/10 ($0.30 spent)
Extracting 11/200: chart_153_a

API attempt 1 failed for data\raw_charts\chart_071_advanced_bar.png: Request timed out.


    Success (Cost: $1.41)
Extracting 49/200: chart_122_advanced_scatter
    Success (Cost: $1.44)
Extracting 50/200: chart_109_advanced_bar
    Success (Cost: $1.47)
    Progress: 50/51 ($1.50 spent)
Extracting 51/200: chart_077_complex_area
    Success (Cost: $1.50)
Extracting 52/200: chart_144_complex_bar
    Success (Cost: $1.53)
Extracting 53/200: chart_062_advanced_line
    Success (Cost: $1.56)
Extracting 54/200: chart_098_complex_bar
    Success (Cost: $1.59)
Extracting 55/200: chart_194_medium_line
    Success (Cost: $1.62)
Extracting 56/200: chart_108_medium_bar
    Success (Cost: $1.65)
Extracting 57/200: chart_124_medium_scatter
    Success (Cost: $1.68)
Extracting 58/200: chart_041_medium_line
    Success (Cost: $1.71)
Extracting 59/200: chart_001_complex_bar
    Success (Cost: $1.74)
Extracting 60/200: chart_200_complex_pie
    Success (Cost: $1.77)
    Progress: 60/61 ($1.80 spent)
Extracting 61/200: chart_193_medium_line
    Success (Cost: $1.80)
Extracting 62/200: chart

API attempt 1 failed for data\raw_charts\chart_180_advanced_bar.png: Request timed out.


    Success (Cost: $4.14)
Extracting 140/200: chart_146_complex_line
    Success (Cost: $4.17)
    Progress: 140/142 ($4.20 spent)
Extracting 141/200: chart_072_advanced_line
    Success (Cost: $4.20)
Extracting 142/200: chart_056_medium_bar
    Success (Cost: $4.23)
Extracting 143/200: chart_151_complex_line
    Success (Cost: $4.26)
Extracting 144/200: chart_015_complex_line
    Success (Cost: $4.29)
Extracting 145/200: chart_047_complex_bar
    Success (Cost: $4.32)
Extracting 146/200: chart_090_complex_area
    Success (Cost: $4.35)
Extracting 147/200: chart_073_medium_area
    Success (Cost: $4.38)
Extracting 148/200: chart_139_medium_bar
    Success (Cost: $4.41)
Extracting 149/200: chart_018_complex_line
    Success (Cost: $4.44)
Extracting 150/200: chart_094_complex_scatter
    Success (Cost: $4.47)
    Progress: 150/152 ($4.50 spent)
Extracting 151/200: chart_029_medium_pie
    Success (Cost: $4.50)
Extracting 152/200: chart_135_medium_bar
    Success (Cost: $4.53)
Extracting 

Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart. Here's a template you can use to fill in the data:

```json
{
  "chart_tit...


    Success (Cost: $5.73)
Extracting 193/200: chart_171_advanced_bar
    Success (Cost: $5.76)
Extracting 194/200: chart_024_complex_line
    Success (Cost: $5.79)
Extracting 195/200: chart_186_medium_line
    Success (Cost: $5.82)
Extracting 196/200: chart_033_medium_line
    Success (Cost: $5.85)
Extracting 197/200: chart_055_complex_line
    Success (Cost: $5.88)
Extracting 198/200: chart_060_advanced_bar
    Success (Cost: $5.91)
Extracting 199/200: chart_068_medium_scatter
    Success (Cost: $5.94)
Extracting 200/200: chart_187_complex_scatter
    Success (Cost: $5.97)
    Progress: 200/203 ($6.00 spent)
 PHASE 1 COMPLETE: 200 extractions, $6.00 spent

 PHASE 2: PRIORITY PERTURBATIONS
Processing 600 priority perturbations...
Extracting perturbation 1/600: chart_179_advanced_bar_rotation_low.png
    Success
Extracting perturbation 2/600: chart_156_complex_bar_rotation_high.png
    Success
Extracting perturbation 3/600: chart_062_advanced_line_legend_corruption_medium.png
    Succes

Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart. Here's a template you can use to fill in the data:

```json
{
  "chart_tit...


    Success
Extracting perturbation 94/600: chart_156_complex_bar_brightness_shift_low.png
    Success
Extracting perturbation 95/600: chart_098_complex_bar_legend_corruption_medium.png
    Success
Extracting perturbation 96/600: chart_121_complex_scatter_gaussian_blur_medium.png
    Success
Extracting perturbation 97/600: chart_179_advanced_bar_rotation_high.png
    Success
Extracting perturbation 98/600: chart_189_medium_pie_brightness_shift_low.png
    Success
Extracting perturbation 99/600: chart_076_medium_area_brightness_shift_high.png
    Success
Extracting perturbation 100/600: chart_162_complex_bar_legend_corruption_medium.png
    Success
    Progress: 300 total extractions, $9.00 spent, $31.00 remaining
Extracting perturbation 101/600: chart_196_advanced_scatter_rotation_high.png
    Success
Extracting perturbation 102/600: chart_093_complex_area_rotation_low.png
    Success
Extracting perturbation 103/600: chart_119_advanced_pie_random_blocks_medium.png
    Success
Extractin

Could not parse JSON from response: I'm unable to extract data from the image directly. However, I can guide you on how to approach this task:

1. **Chart Title**: Identify the title at the top of the chart.
2. **Chart Type**: Determine...


    Success
Extracting perturbation 133/600: chart_108_medium_bar_legend_corruption_medium.png
    Success
Extracting perturbation 134/600: chart_174_complex_line_legend_corruption_medium.png
    Success
Extracting perturbation 135/600: chart_128_complex_bar_legend_corruption_medium.png
    Success
Extracting perturbation 136/600: chart_003_medium_bar_random_blocks_medium.png
    Success
Extracting perturbation 137/600: chart_014_complex_bar_rotation_medium.png
    Success
Extracting perturbation 138/600: chart_099_medium_line_random_blocks_medium.png
    Success
Extracting perturbation 139/600: chart_134_advanced_scatter_brightness_shift_low.png
    Success
Extracting perturbation 140/600: chart_013_advanced_bar_random_blocks_medium.png
    Success
Extracting perturbation 141/600: chart_003_medium_bar_brightness_shift_medium.png
    Success
Extracting perturbation 142/600: chart_167_advanced_area_rotation_high.png
    Success
Extracting perturbation 143/600: chart_153_advanced_pie_gau

Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart. Here's a general approach:

1. **Identify the Chart Type**: This is a grou...


    Success
Extracting perturbation 174/600: chart_074_complex_bar_brightness_shift_low.png
    Success
Extracting perturbation 175/600: chart_035_medium_line_brightness_shift_low.png
    Success
    Progress: 375 total extractions, $11.25 spent, $28.75 remaining
Extracting perturbation 176/600: chart_190_advanced_line_rotation_medium.png
    Success
Extracting perturbation 177/600: chart_062_advanced_line_brightness_shift_medium.png
    Success
Extracting perturbation 178/600: chart_031_complex_bar_legend_corruption_medium.png
    Success
Extracting perturbation 179/600: chart_070_medium_bar_rotation_medium.png
    Success
Extracting perturbation 180/600: chart_008_complex_pie_gaussian_blur_high.png
    Success
Extracting perturbation 181/600: chart_003_medium_bar_gaussian_blur_low.png
    Success
Extracting perturbation 182/600: chart_142_complex_line_brightness_shift_medium.png
    Success
Extracting perturbation 183/600: chart_064_complex_pie_brightness_shift_high.png
    Success
E

Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart:

1. **Chart Title**: "Correlation Analysis - (Scientific Research)"
2. **C...


    Success
Extracting perturbation 254/600: chart_126_complex_line_brightness_shift_medium.png
    Success
Extracting perturbation 255/600: chart_167_advanced_area_rotation_medium.png
    Success
Extracting perturbation 256/600: chart_044_medium_bar_brightness_shift_medium.png
    Success
Extracting perturbation 257/600: chart_169_medium_scatter_legend_corruption_medium.png
    Success
Extracting perturbation 258/600: chart_157_medium_line_brightness_shift_medium.png
    Success
Extracting perturbation 259/600: chart_143_medium_pie_brightness_shift_medium.png
    Success
Extracting perturbation 260/600: chart_190_advanced_line_brightness_shift_medium.png
    Success
Extracting perturbation 261/600: chart_104_advanced_line_gaussian_blur_high.png
    Success
Extracting perturbation 262/600: chart_066_advanced_bar_gaussian_blur_medium.png
    Success
Extracting perturbation 263/600: chart_064_complex_pie_brightness_shift_low.png
    Success
Extracting perturbation 264/600: chart_102_medi

Could not parse JSON from response: I'm unable to extract data from the image directly. However, I can help guide you on how to manually extract the data or provide assistance with data analysis if you have the data in a different forma...


    Success
Extracting perturbation 307/600: chart_196_advanced_scatter_gaussian_blur_medium.png
    Success
Extracting perturbation 308/600: chart_145_medium_bar_gaussian_blur_medium.png
    Success
Extracting perturbation 309/600: chart_104_advanced_line_rotation_high.png
    Success
Extracting perturbation 310/600: chart_088_complex_bar_random_blocks_medium.png
    Success
Extracting perturbation 311/600: chart_017_complex_scatter_legend_corruption_medium.png
    Success
Extracting perturbation 312/600: chart_070_medium_bar_gaussian_blur_medium.png
    Success
Extracting perturbation 313/600: chart_172_medium_bar_rotation_medium.png
    Success
Extracting perturbation 314/600: chart_129_complex_bar_legend_corruption_medium.png
    Success
Extracting perturbation 315/600: chart_147_complex_pie_gaussian_blur_high.png
    Success
Extracting perturbation 316/600: chart_063_medium_line_legend_corruption_medium.png
    Success
Extracting perturbation 317/600: chart_035_medium_line_gaussia

API attempt 1 failed for data\perturbations\chart_071_advanced_bar_brightness_shift_low.png: Request timed out.
API attempt 2 failed for data\perturbations\chart_071_advanced_bar_brightness_shift_low.png: Request timed out.
API attempt 3 failed for data\perturbations\chart_071_advanced_bar_brightness_shift_low.png: Request timed out.
Extraction failed for data\perturbations\chart_071_advanced_bar_brightness_shift_low.png: Request timed out.


    Failed
Extracting perturbation 353/600: chart_158_medium_bar_legend_corruption_medium.png
    Success
Extracting perturbation 354/600: chart_074_complex_bar_rotation_high.png
    Success
Extracting perturbation 355/600: chart_077_complex_area_rotation_medium.png
    Success
Extracting perturbation 356/600: chart_165_advanced_pie_legend_corruption_medium.png
    Success
Extracting perturbation 357/600: chart_071_advanced_bar_gaussian_blur_low.png
    Success
Extracting perturbation 358/600: chart_035_medium_line_brightness_shift_medium.png
    Success
Extracting perturbation 359/600: chart_162_complex_bar_gaussian_blur_low.png
    Success
Extracting perturbation 360/600: chart_071_advanced_bar_gaussian_blur_high.png
    Success
Extracting perturbation 361/600: chart_083_complex_bar_brightness_shift_medium.png
    Success
Extracting perturbation 362/600: chart_032_advanced_scatter_gaussian_blur_medium.png
    Success
Extracting perturbation 363/600: chart_109_advanced_bar_gaussian_bl

Extraction failed for data\perturbations\chart_058_complex_bar_rotation_low.png: Image encoding failed: image file is truncated


Extracting perturbation 415/600: chart_058_complex_bar_rotation_low.png
    Failed
Extracting perturbation 416/600: chart_109_advanced_bar_rotation_low.png
    Success
Extracting perturbation 417/600: chart_008_complex_pie_rotation_high.png
    Success
Extracting perturbation 418/600: chart_131_complex_bar_gaussian_blur_medium.png
    Success
Extracting perturbation 419/600: chart_064_complex_pie_random_blocks_medium.png
    Success
Extracting perturbation 420/600: chart_183_complex_bar_gaussian_blur_medium.png
    Success
Extracting perturbation 421/600: chart_119_advanced_pie_brightness_shift_medium.png
    Success
Extracting perturbation 422/600: chart_010_advanced_pie_brightness_shift_medium.png
    Success
Extracting perturbation 423/600: chart_162_complex_bar_brightness_shift_low.png
    Success
Extracting perturbation 424/600: chart_196_advanced_scatter_gaussian_blur_low.png
    Success
Extracting perturbation 425/600: chart_134_advanced_scatter_rotation_high.png
    Success
   

Could not parse JSON from response: I'm unable to process the image directly. Please provide the data in text form or describe the chart, and I'll help you format it into a JSON object....


    Success
Extracting perturbation 430/600: chart_195_advanced_pie_legend_corruption_medium.png
    Success
Extracting perturbation 431/600: chart_113_complex_bar_brightness_shift_high.png
    Success
Extracting perturbation 432/600: chart_134_advanced_scatter_brightness_shift_high.png
    Success
Extracting perturbation 433/600: chart_034_medium_line_gaussian_blur_medium.png
    Success
Extracting perturbation 434/600: chart_087_complex_scatter_legend_corruption_medium.png
    Success
Extracting perturbation 435/600: chart_013_advanced_bar_gaussian_blur_low.png
    Success
Extracting perturbation 436/600: chart_154_medium_area_legend_corruption_medium.png
    Success
Extracting perturbation 437/600: chart_155_advanced_pie_rotation_medium.png
    Success
Extracting perturbation 438/600: chart_134_advanced_scatter_gaussian_blur_high.png
    Success
Extracting perturbation 439/600: chart_180_advanced_bar_random_blocks_medium.png
    Success
Extracting perturbation 440/600: chart_061_com

Could not parse JSON from response: I'm unable to extract data from the image provided....


    Success
Extracting perturbation 465/600: chart_153_advanced_pie_brightness_shift_high.png
    Success
Extracting perturbation 466/600: chart_140_medium_scatter_random_blocks_medium.png
    Success
Extracting perturbation 467/600: chart_158_medium_bar_gaussian_blur_high.png
    Success
Extracting perturbation 468/600: chart_030_medium_bar_gaussian_blur_medium.png
    Success
Extracting perturbation 469/600: chart_031_complex_bar_brightness_shift_medium.png
    Success
Extracting perturbation 470/600: chart_036_advanced_bar_random_blocks_medium.png
    Success
Extracting perturbation 471/600: chart_189_medium_pie_gaussian_blur_high.png
    Success
Extracting perturbation 472/600: chart_117_medium_line_brightness_shift_medium.png
    Success
Extracting perturbation 473/600: chart_104_advanced_line_gaussian_blur_medium.png
    Success
Extracting perturbation 474/600: chart_129_complex_bar_gaussian_blur_medium.png
    Success
Extracting perturbation 475/600: chart_120_complex_pie_bright

Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart:

1. **Chart Title**: Look at the top of the chart for the title.
2. **Char...


    Success
Extracting perturbation 491/600: chart_023_medium_scatter_rotation_medium.png
    Success
Extracting perturbation 492/600: chart_076_medium_area_brightness_shift_low.png
    Success
Extracting perturbation 493/600: chart_122_advanced_scatter_gaussian_blur_low.png
    Success
Extracting perturbation 494/600: chart_071_advanced_bar_rotation_low.png
    Success
Extracting perturbation 495/600: chart_120_complex_pie_legend_corruption_medium.png
    Success
Extracting perturbation 496/600: chart_160_complex_bar_legend_corruption_medium.png
    Success
Extracting perturbation 497/600: chart_051_advanced_area_legend_corruption_medium.png
    Success
Extracting perturbation 498/600: chart_050_medium_pie_random_blocks_medium.png
    Success
Extracting perturbation 499/600: chart_183_complex_bar_legend_corruption_medium.png
    Success
Extracting perturbation 500/600: chart_076_medium_area_rotation_low.png
    Success
    Progress: 698 total extractions, $20.94 spent, $19.06 remainin

API attempt 1 failed for data\perturbations\chart_109_advanced_bar_rotation_medium.png: Request timed out.


    Success
Extracting perturbation 508/600: chart_030_medium_bar_legend_corruption_medium.png
    Success
Extracting perturbation 509/600: chart_067_advanced_scatter_rotation_medium.png
    Success
Extracting perturbation 510/600: chart_113_complex_bar_brightness_shift_low.png
    Success
Extracting perturbation 511/600: chart_082_complex_line_brightness_shift_high.png


Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart. Here's a general approach:

1. **Identify the Chart Type**: This is a line...
Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart. Here's a template you can use to fill in the data:

```json
{
  "chart_tit...


    Success
Extracting perturbation 512/600: chart_003_medium_bar_rotation_high.png
    Success
Extracting perturbation 513/600: chart_158_medium_bar_brightness_shift_high.png
    Success
Extracting perturbation 514/600: chart_080_medium_pie_random_blocks_medium.png
    Success
Extracting perturbation 515/600: chart_092_advanced_scatter_legend_corruption_medium.png
    Success
Extracting perturbation 516/600: chart_093_complex_area_gaussian_blur_low.png
    Success
Extracting perturbation 517/600: chart_158_medium_bar_gaussian_blur_low.png
    Success
Extracting perturbation 518/600: chart_190_advanced_line_gaussian_blur_medium.png
    Success
Extracting perturbation 519/600: chart_166_medium_bar_gaussian_blur_medium.png
    Success
Extracting perturbation 520/600: chart_087_complex_scatter_brightness_shift_medium.png
    Success
Extracting perturbation 521/600: chart_192_complex_line_gaussian_blur_high.png
    Success
Extracting perturbation 522/600: chart_080_medium_pie_rotation_medi

Could not parse JSON from response: I'm unable to extract data from the image directly. However, I can guide you on how to manually extract the data using an OCR tool or by visually inspecting the image. Here's a general approach:

1. *...


    Success
Extracting perturbation 538/600: chart_173_advanced_line_random_blocks_medium.png
    Success
Extracting perturbation 539/600: chart_111_medium_line_gaussian_blur_medium.png
    Success
Extracting perturbation 540/600: chart_050_medium_pie_rotation_high.png
    Success
Extracting perturbation 541/600: chart_042_medium_bar_gaussian_blur_high.png
    Success
Extracting perturbation 542/600: chart_003_medium_bar_brightness_shift_low.png
    Success
Extracting perturbation 543/600: chart_051_advanced_area_brightness_shift_medium.png
    Success
Extracting perturbation 544/600: chart_142_complex_line_gaussian_blur_medium.png


Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart. Here's a general approach:

1. **Identify the Chart Type**: This is a line...


    Success
Extracting perturbation 545/600: chart_102_medium_line_rotation_high.png
    Success
Extracting perturbation 546/600: chart_190_advanced_line_random_blocks_medium.png
    Success
Extracting perturbation 547/600: chart_001_complex_bar_legend_corruption_medium.png
    Success
Extracting perturbation 548/600: chart_167_advanced_area_gaussian_blur_medium.png
    Success
Extracting perturbation 549/600: chart_011_medium_scatter_brightness_shift_high.png
    Success
Extracting perturbation 550/600: chart_032_advanced_scatter_legend_corruption_medium.png
    Success
    Progress: 748 total extractions, $22.44 spent, $17.56 remaining
Extracting perturbation 551/600: chart_043_medium_pie_brightness_shift_high.png
    Success
Extracting perturbation 552/600: chart_030_medium_bar_brightness_shift_low.png
    Success
Extracting perturbation 553/600: chart_142_complex_line_legend_corruption_medium.png


Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart. Here's a template you can use to fill in the data:

```json
{
  "chart_tit...


    Success
Extracting perturbation 554/600: chart_045_complex_bar_rotation_low.png
    Success
Extracting perturbation 555/600: chart_144_complex_bar_brightness_shift_medium.png
    Success
Extracting perturbation 556/600: chart_036_advanced_bar_rotation_medium.png
    Success
Extracting perturbation 557/600: chart_013_advanced_bar_rotation_low.png
    Success
Extracting perturbation 558/600: chart_008_complex_pie_random_blocks_medium.png
    Success
Extracting perturbation 559/600: chart_056_medium_bar_rotation_medium.png
    Success
Extracting perturbation 560/600: chart_078_complex_bar_rotation_medium.png
    Success
Extracting perturbation 561/600: chart_052_advanced_area_gaussian_blur_medium.png
    Success
Extracting perturbation 562/600: chart_162_complex_bar_rotation_high.png
    Success
Extracting perturbation 563/600: chart_179_advanced_bar_brightness_shift_high.png
    Success
Extracting perturbation 564/600: chart_154_medium_area_brightness_shift_medium.png
    Success
Ext

Could not parse JSON from response: I'm unable to process the image directly. However, if you provide the data points and labels, I can help format them into the JSON object you need....


    Success
Extracting perturbation 578/600: chart_010_advanced_pie_brightness_shift_low.png
    Success
Extracting perturbation 579/600: chart_016_medium_area_random_blocks_medium.png
    Success
Extracting perturbation 580/600: chart_088_complex_bar_gaussian_blur_medium.png
    Success
Extracting perturbation 581/600: chart_153_advanced_pie_brightness_shift_medium.png
    Success
Extracting perturbation 582/600: chart_007_complex_line_rotation_low.png
    Success
Extracting perturbation 583/600: chart_034_medium_line_rotation_high.png
    Success
Extracting perturbation 584/600: chart_121_complex_scatter_brightness_shift_medium.png
    Success
Extracting perturbation 585/600: chart_081_complex_scatter_gaussian_blur_medium.png
    Success
Extracting perturbation 586/600: chart_088_complex_bar_legend_corruption_medium.png
    Success
Extracting perturbation 587/600: chart_087_complex_scatter_gaussian_blur_high.png
    Success
Extracting perturbation 588/600: chart_050_medium_pie_gaussi

API attempt 1 failed for data\perturbations\chart_109_advanced_bar_gaussian_blur_low.png: Request timed out.


    Success
Extracting intensity variant 5/280: chart_109_advanced_bar_rotation_low.png


API attempt 1 failed for data\perturbations\chart_109_advanced_bar_rotation_low.png: Request timed out.


    Success
Extracting intensity variant 6/280: chart_147_complex_pie_gaussian_blur_high.png
    Success
Extracting intensity variant 7/280: chart_045_complex_bar_brightness_shift_high.png
    Success
Extracting intensity variant 8/280: chart_156_complex_bar_rotation_high.png
    Success
Extracting intensity variant 9/280: chart_034_medium_line_gaussian_blur_high.png
    Success
Extracting intensity variant 10/280: chart_192_complex_line_brightness_shift_low.png
    Success
Extracting intensity variant 11/280: chart_093_complex_area_rotation_low.png
    Success
Extracting intensity variant 12/280: chart_156_complex_bar_brightness_shift_low.png
    Success
Extracting intensity variant 13/280: chart_007_complex_line_gaussian_blur_low.png
    Success
Extracting intensity variant 14/280: chart_083_complex_bar_gaussian_blur_high.png
    Success
Extracting intensity variant 15/280: chart_162_complex_bar_brightness_shift_low.png
    Success
Extracting intensity variant 16/280: chart_196_advan

API attempt 1 failed for data\perturbations\chart_109_advanced_bar_rotation_high.png: Request timed out.
API attempt 2 failed for data\perturbations\chart_109_advanced_bar_rotation_high.png: Request timed out.


    Success
Extracting intensity variant 45/280: chart_195_advanced_pie_rotation_high.png
    Success
Extracting intensity variant 46/280: chart_007_complex_line_brightness_shift_low.png
    Success
Extracting intensity variant 47/280: chart_008_complex_pie_brightness_shift_high.png
    Success
Extracting intensity variant 48/280: chart_134_advanced_scatter_gaussian_blur_low.png
    Success
Extracting intensity variant 49/280: chart_071_advanced_bar_rotation_high.png
    Success
Extracting intensity variant 50/280: chart_109_advanced_bar_brightness_shift_high.png
    Success
Extracting intensity variant 51/280: chart_074_complex_bar_brightness_shift_low.png
    Success
Extracting intensity variant 52/280: chart_134_advanced_scatter_rotation_high.png
    Success
Extracting intensity variant 53/280: chart_159_complex_line_rotation_high.png
    Success
Extracting intensity variant 54/280: chart_129_complex_bar_gaussian_blur_low.png
    Success
Extracting intensity variant 55/280: chart_03

Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart. Here's a template you can use to fill in the data:

```json
{
  "chart_tit...


    Success
Extracting intensity variant 159/280: chart_011_medium_scatter_gaussian_blur_high.png
    Success
Extracting intensity variant 160/280: chart_083_complex_bar_gaussian_blur_low.png
    Success
Extracting intensity variant 161/280: chart_192_complex_line_gaussian_blur_low.png
    Success
Extracting intensity variant 162/280: chart_156_complex_bar_gaussian_blur_high.png
    Success
Extracting intensity variant 163/280: chart_043_medium_pie_brightness_shift_low.png
    Success
Extracting intensity variant 164/280: chart_050_medium_pie_gaussian_blur_low.png
    Success
Extracting intensity variant 165/280: chart_082_complex_line_gaussian_blur_low.png
    Success
Extracting intensity variant 166/280: chart_158_medium_bar_rotation_low.png
    Success
Extracting intensity variant 167/280: chart_147_complex_pie_brightness_shift_low.png
    Success
Extracting intensity variant 168/280: chart_074_complex_bar_brightness_shift_high.png
    Success
Extracting intensity variant 169/280: c

API attempt 1 failed for data\perturbations\chart_071_advanced_bar_brightness_shift_high.png: Request timed out.
API attempt 2 failed for data\perturbations\chart_071_advanced_bar_brightness_shift_high.png: Request timed out.


    Success
Extracting intensity variant 265/280: chart_158_medium_bar_gaussian_blur_low.png
    Success
Extracting intensity variant 266/280: chart_113_complex_bar_brightness_shift_high.png
    Success
Extracting intensity variant 267/280: chart_007_complex_line_rotation_high.png
    Success
Extracting intensity variant 268/280: chart_179_advanced_bar_brightness_shift_high.png
    Success
Extracting intensity variant 269/280: chart_156_complex_bar_brightness_shift_high.png
    Success
Extracting intensity variant 270/280: chart_158_medium_bar_gaussian_blur_high.png
    Success
Extracting intensity variant 271/280: chart_036_advanced_bar_brightness_shift_low.png
    Success
Extracting intensity variant 272/280: chart_003_medium_bar_brightness_shift_high.png
    Success
Extracting intensity variant 273/280: chart_064_complex_pie_gaussian_blur_low.png
    Success
Extracting intensity variant 274/280: chart_093_complex_area_brightness_shift_high.png


Could not parse JSON from response: I'm unable to extract data from images directly. However, I can guide you on how to manually extract the data from the chart. Here's a template you can use:

```json
{
  "chart_title": "Correlation An...


    Success
Extracting intensity variant 275/280: chart_087_complex_scatter_rotation_high.png
    Success
Extracting intensity variant 276/280: chart_059_advanced_bar_brightness_shift_high.png


API attempt 1 failed for data\perturbations\chart_059_advanced_bar_brightness_shift_high.png: Request timed out.


    Success
Extracting intensity variant 277/280: chart_195_advanced_pie_brightness_shift_low.png
    Success
Extracting intensity variant 278/280: chart_008_complex_pie_rotation_low.png
    Success
Extracting intensity variant 279/280: chart_156_complex_bar_gaussian_blur_low.png
    Success
Extracting intensity variant 280/280: chart_067_advanced_scatter_gaussian_blur_high.png
    Success
 PHASE 3 COMPLETE: 280 intensity variants extracted


### SECTION 4: EXTRACTION ANALYSIS

In [6]:
print("\n SECTION 4: EXTRACTION ANALYSIS")

def analyze_extraction_results(results, extractor):
    """Comprehensive analysis of extraction results"""
    
    final_stats = extractor.get_extraction_statistics()
    
    print(" EXTRACTION EXECUTION SUMMARY:")
    print("-" * 60)
    print(f"Total Attempts: {final_stats['total_attempts']}")
    print(f"Successful Extractions: {final_stats['successful_extractions']}")
    print(f"Failed Extractions: {final_stats['failed_extractions']}")
    print(f"Success Rate: {final_stats['success_rate']:.1f}%")
    print(f"Total Cost: ${final_stats['total_cost']:.2f}")
    print(f"Remaining Budget: ${final_stats['remaining_budget']:.2f}")
    print(f"Average Response Time: {final_stats['avg_response_time']:.2f}s")
    
    # Analyze extraction types
    type_breakdown = {'original': 0, 'perturbation': 0}
    perturbation_breakdown = {}
    intensity_breakdown = {}
    
    for result_key, result_data in results.items():
        extraction_type = result_data['type']
        type_breakdown[extraction_type] += 1
        
        if extraction_type == 'perturbation':
            pert_type = result_data.get('perturbation_type', 'unknown')
            intensity = result_data.get('intensity', 'unknown')
            
            perturbation_breakdown[pert_type] = perturbation_breakdown.get(pert_type, 0) + 1
            intensity_breakdown[intensity] = intensity_breakdown.get(intensity, 0) + 1
    
    print(f"\n EXTRACTION TYPE BREAKDOWN:")
    for ext_type, count in type_breakdown.items():
        print(f"  {ext_type.title()}: {count} extractions")
    
    if perturbation_breakdown:
        print(f"\n PERTURBATION TYPE BREAKDOWN:")
        for pert_type, count in sorted(perturbation_breakdown.items()):
            print(f"  {pert_type}: {count} extractions")
    
    if intensity_breakdown:
        print(f"\n INTENSITY BREAKDOWN:")
        for intensity, count in sorted(intensity_breakdown.items()):
            print(f"  {intensity}: {count} extractions")
    
    # Analyze extraction quality
    confidence_distribution = {'high': 0, 'medium': 0, 'low': 0, 'unknown': 0}
    validation_issues = {'valid': 0, 'warnings': 0, 'errors': 0}
    
    for result_data in results.values():
        extracted_data = result_data.get('extracted_data', {})
        
        # Confidence analysis
        confidence = extracted_data.get('extraction_confidence', 'unknown')
        confidence_distribution[confidence] = confidence_distribution.get(confidence, 0) + 1
        
        # Validation analysis
        validation = extracted_data.get('_validation', {})
        if validation.get('is_valid', False):
            if validation.get('warnings', []):
                validation_issues['warnings'] += 1
            else:
                validation_issues['valid'] += 1
        else:
            validation_issues['errors'] += 1
    
    print(f"\n EXTRACTION QUALITY ANALYSIS:")
    print(f"Confidence Distribution:")
    for conf_level, count in confidence_distribution.items():
        percentage = (count / len(results)) * 100 if results else 0
        print(f"  {conf_level.title()}: {count} ({percentage:.1f}%)")
    
    print(f"Validation Results:")
    for val_type, count in validation_issues.items():
        percentage = (count / len(results)) * 100 if results else 0
        print(f"  {val_type.title()}: {count} ({percentage:.1f}%)")
    
    # Error analysis
    if extractor.error_log:
        print(f"\n ERROR ANALYSIS:")
        error_types = {}
        for error in extractor.error_log:
            error_type = error.get('error_type', 'Unknown')
            error_types[error_type] = error_types.get(error_type, 0) + 1
        
        for error_type, count in sorted(error_types.items()):
            print(f"  {error_type}: {count} occurrences")
    
    return {
        'execution_stats': final_stats,
        'type_breakdown': type_breakdown,
        'perturbation_breakdown': perturbation_breakdown,
        'intensity_breakdown': intensity_breakdown,
        'confidence_distribution': confidence_distribution,
        'validation_issues': validation_issues,
        'total_extractions': len(results)
    }

analysis_results = analyze_extraction_results(all_results, extraction_engine)


 SECTION 4: EXTRACTION ANALYSIS
 EXTRACTION EXECUTION SUMMARY:
------------------------------------------------------------
Total Attempts: 1108
Successful Extractions: 1078
Failed Extractions: 2
Success Rate: 97.3%
Total Cost: $32.34
Remaining Budget: $7.66
Average Response Time: 13.30s

 EXTRACTION TYPE BREAKDOWN:
  Original: 200 extractions
  Perturbation: 698 extractions

 PERTURBATION TYPE BREAKDOWN:
  blocks: 84 extractions
  blur: 180 extractions
  corruption: 83 extractions
  rotation: 167 extractions
  shift: 184 extractions

 INTENSITY BREAKDOWN:
  high: 140 extractions
  low: 140 extractions
  medium: 418 extractions

 EXTRACTION QUALITY ANALYSIS:
Confidence Distribution:
  High: 543 (60.5%)
  Medium: 320 (35.6%)
  Low: 35 (3.9%)
  Unknown: 0 (0.0%)
Validation Results:
  Valid: 867 (96.5%)
  Errors: 0 (0.0%)

 ERROR ANALYSIS:
  APITimeoutError: 1 occurrences
  Exception: 1 occurrences


### SECTION 5: DATA PREPARATION FOR ANALYSIS

In [7]:
print("\n SECTION 5: DATA PREPARATION FOR ANALYSIS")

def prepare_analysis_dataset(extraction_results):
    """Prepare comprehensive dataset for statistical analysis"""
    
    analysis_dataset = []
    
    for result_key, result_data in extraction_results.items():
        extracted_data = result_data.get('extracted_data', {})
        
        # Basic extraction info
        record = {
            'extraction_id': result_key,
            'extraction_type': result_data['type'],
            'chart_title': extracted_data.get('chart_title', ''),
            'chart_type': extracted_data.get('chart_type', ''),
            'extraction_confidence': extracted_data.get('extraction_confidence', 'unknown'),
            'data_completeness': extracted_data.get('data_completeness', 'unknown'),
            'extracted_data_points': len(extracted_data.get('data', [])),
            'extraction_timestamp': extracted_data.get('_extraction_metadata', {}).get('extraction_timestamp', ''),
            'response_time': extracted_data.get('_extraction_metadata', {}).get('response_time', 0)
        }
        
        # Perturbation-specific info
        if result_data['type'] == 'perturbation':
            record.update({
                'original_chart_id': result_data.get('original_chart_id', ''),
                'perturbation_type': result_data.get('perturbation_type', ''),
                'perturbation_intensity': result_data.get('intensity', ''),
            })
        else:
            record.update({
                'original_chart_id': result_key,
                'perturbation_type': 'none',
                'perturbation_intensity': 'none',
            })
        
        # Validation info
        validation = extracted_data.get('_validation', {})
        record.update({
            'validation_is_valid': validation.get('is_valid', False),
            'validation_warnings': len(validation.get('warnings', [])),
            'validation_errors': len(validation.get('errors', []))
        })
        
        # Ground truth comparison (if available)
        ground_truth = result_data.get('ground_truth', {})
        if ground_truth:
            record['ground_truth_data_points'] = len(ground_truth) if isinstance(ground_truth, dict) else 0
        else:
            record['ground_truth_data_points'] = 0
        
        analysis_dataset.append(record)
    
    return pd.DataFrame(analysis_dataset)

# Create analysis dataset
analysis_df = prepare_analysis_dataset(all_results)

# Save analysis dataset
analysis_df.to_csv('data/analysis_cache/extraction_analysis_dataset.csv', index=False)
print(f" Analysis dataset saved: {len(analysis_df)} records")

# Display sample of analysis dataset
print(f"\n ANALYSIS DATASET PREVIEW:")
print(analysis_df.head().to_string())


 SECTION 5: DATA PREPARATION FOR ANALYSIS
 Analysis dataset saved: 898 records

 ANALYSIS DATASET PREVIEW:
0  chart_179_advanced_bar        original          Quality Control Metrics - (Operational Metrics)  grouped_bar                  high          complete                     13  2025-06-21T15:26:18.969524      11.286501  chart_179_advanced_bar              none                   none                 True                    0                  0                         0
1   chart_035_medium_line        original             Market Share Analysis - (Business Analytics)         line                medium          complete                      9  2025-06-21T15:26:26.853917       6.879863   chart_035_medium_line              none                   none                 True                    0                  0                         0
2   chart_058_complex_bar        original  Experimental Results Comparison - (Scientific Research)  grouped_bar                  high          complete 

### SECTION 6: EXECUTION SUMMARY AND NEXT STEPS

In [8]:

print("\nSECTION 6: EXECUTION SUMMARY")

# Create comprehensive summary for next phase
execution_summary = {
    'extraction_execution_complete': True,
    'total_extractions': analysis_results['total_extractions'],
    'successful_extractions': analysis_results['execution_stats']['successful_extractions'],
    'success_rate': analysis_results['execution_stats']['success_rate'],
    'total_cost': analysis_results['execution_stats']['total_cost'],
    'remaining_budget': analysis_results['execution_stats']['remaining_budget'],
    'extraction_breakdown': {
        'original_charts': analysis_results['type_breakdown'].get('original', 0),
        'perturbations': analysis_results['type_breakdown'].get('perturbation', 0)
    },
    'perturbation_types_tested': list(analysis_results['perturbation_breakdown'].keys()),
    'quality_metrics': {
        'high_confidence_extractions': analysis_results['confidence_distribution'].get('high', 0),
        'valid_extractions': analysis_results['validation_issues'].get('valid', 0),
        'avg_response_time': analysis_results['execution_stats']['avg_response_time']
    },
    'ready_for_analysis': analysis_results['total_extractions'] >= 50,
    'next_notebook': '05_Multi_Metric_Analysis.ipynb'
}

# Save execution summary
with open('data/analysis_cache/extraction_summary.json', 'w') as f:
    json.dump(execution_summary, f, indent=2)

print("Execution summary saved for next phase")

# Save complete extraction results
with open('data/analysis_cache/complete_extraction_results.json', 'w') as f:
    # Convert to JSON-serializable format
    serializable_results = {}
    for key, value in all_results.items():
        serializable_results[key] = {
            'type': value['type'],
            'file_path': value['file_path']
        }
        if 'original_chart_id' in value:
            serializable_results[key]['original_chart_id'] = value['original_chart_id']
        if 'perturbation_type' in value:
            serializable_results[key]['perturbation_type'] = value['perturbation_type']
        if 'intensity' in value:
            serializable_results[key]['intensity'] = value['intensity']
    
    json.dump(serializable_results, f, indent=2)

print("Complete extraction results index saved")

# Final status assessment
if execution_summary['ready_for_analysis']:
    status = "SUCCESS"
    status_msg = "Ready for comprehensive analysis"
else:
    status = "PARTIAL_SUCCESS"
    status_msg = "Limited data - may need additional extractions"

print(f"\n" + "=" * 80)
print(f" GPT-4O EXTRACTION PIPELINE COMPLETE!")
print(f" Status: {status}")
print("=" * 80)

print(f" Total Extractions: {execution_summary['total_extractions']}")
print(f" Success Rate: {execution_summary['success_rate']:.1f}%")
print(f" Total Cost: ${execution_summary['total_cost']:.2f}")
print(f" Remaining Budget: ${execution_summary['remaining_budget']:.2f}")
print(f" Average Response Time: {execution_summary['quality_metrics']['avg_response_time']:.2f}s")

print(f"\n EXTRACTION BREAKDOWN:")
print(f"   Original Charts: {execution_summary['extraction_breakdown']['original_charts']}")
print(f"   Perturbations: {execution_summary['extraction_breakdown']['perturbations']}")

print(f"\n QUALITY METRICS:")
print(f"   High Confidence: {execution_summary['quality_metrics']['high_confidence_extractions']}")
print(f"   Valid Extractions: {execution_summary['quality_metrics']['valid_extractions']}")

print(f"\n PERTURBATION TYPES TESTED: {len(execution_summary['perturbation_types_tested'])}")
for pert_type in execution_summary['perturbation_types_tested']:
    print(f"   • {pert_type}")

print(f"\n STATUS: {status_msg}")
if execution_summary['ready_for_analysis']:
    print(" READY FOR NEXT PHASE: Multi-Metric Analysis")
else:
    print(" Consider additional extractions if budget allows")

print("=" * 80)

# Log final status
logger.info(f"Extraction pipeline completed")
logger.info(f"Total extractions: {execution_summary['total_extractions']}")
logger.info(f"Success rate: {execution_summary['success_rate']:.1f}%")
logger.info(f"Total cost: ${execution_summary['total_cost']:.2f}")
logger.info(f"Status: {status}")


SECTION 6: EXECUTION SUMMARY
Execution summary saved for next phase
Complete extraction results index saved

 GPT-4O EXTRACTION PIPELINE COMPLETE!
 Status: SUCCESS
 Total Extractions: 898
 Success Rate: 97.3%
 Total Cost: $32.34
 Remaining Budget: $7.66
 Average Response Time: 13.30s

 EXTRACTION BREAKDOWN:
   Original Charts: 200
   Perturbations: 698

 QUALITY METRICS:
   High Confidence: 543
   Valid Extractions: 867

 PERTURBATION TYPES TESTED: 5
   • rotation
   • corruption
   • shift
   • blur
   • blocks

 STATUS: Ready for comprehensive analysis
 READY FOR NEXT PHASE: Multi-Metric Analysis
