### GPT-4o Robustness Analysis: SAMPLE TEST VERSION
### Test extraction pipeline before full execution

In [1]:
# Sanity check: confirm the OpenAI API key can be read from the environment
# (or a local .env file) before any paid API call is made.
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# Only ever echo a well-known, non-secret prefix. Slicing the key itself
# (api_key[:7]) would write secret characters into the saved notebook output
# for any key whose public prefix is shorter than seven characters.
if not api_key:
    key_prefix = 'None'
else:
    key_prefix = next(
        (p for p in ('sk-proj', 'sk-') if api_key.startswith(p)),
        '<unrecognized>',
    )

print(f"API Key found: {'Yes' if api_key else 'No'}")
print(f"Key starts with: {key_prefix}...")
print(f"Key length: {len(api_key) if api_key else 0}")

# Expected output for a project-scoped key:
# API Key found: Yes
# Key starts with: sk-proj...
# Key length: 164 (or similar)

API Key found: Yes
Key starts with: sk-proj...
Key length: 164


In [2]:
import os
import json
import base64
import pandas as pd
import numpy as np
from datetime import datetime
import time
import logging
from pathlib import Path
import openai
from dotenv import load_dotenv
import random
from PIL import Image

# Opening banner for the sample run: a rule, two title lines, a rule.
# Emitted with a single print call; the resulting output is unchanged.
print("\n".join([
    "=" * 80,
    " GPT-4O EXTRACTION PIPELINE - SAMPLE TEST",
    " Testing System Before Full Execution ($0.15 cost)",
    "=" * 80,
]))

 GPT-4O EXTRACTION PIPELINE - SAMPLE TEST
 Testing System Before Full Execution ($0.15 cost)


### SECTION 1: TEST CONFIGURATION

In [3]:

print("\n SECTION 1: TEST CONFIGURATION")

# Load environment variables (including any defined in a local .env file).
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

if not OPENAI_API_KEY or OPENAI_API_KEY == "your-openai-api-key-here":
    print(" OpenAI API key not configured!")
    print(" Please add your API key to .env file")
    # Raise instead of calling exit(1): `exit` is an interactive-shell helper
    # and, inside a Jupyter kernel, does not simply abort the cell with an
    # error. An exception stops "Run All" here and leaves the kernel usable.
    raise RuntimeError(
        "OpenAI API key not configured; add OPENAI_API_KEY to the .env file"
    )

print(" API key loaded")

# Initialize OpenAI client
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Test configuration - SMALL SCALE
TEST_CONFIG = {
    "test_charts": 3,           # Only 3 original charts
    "test_perturbations": 2,    # Only 2 perturbations
    "total_test_cost": 0.15,    # Max $0.15 cost
    "cost_per_extraction": 0.03,
    "max_test_extractions": 5   # Safety limit
}

print(f" TEST PARAMETERS:")
print(f"   Test Charts: {TEST_CONFIG['test_charts']}")
print(f"   Test Perturbations: {TEST_CONFIG['test_perturbations']}")
print(f"   Max Cost: ${TEST_CONFIG['total_test_cost']:.2f}")
print(f"   Max Extractions: {TEST_CONFIG['max_test_extractions']}")


 SECTION 1: TEST CONFIGURATION
 API key loaded
 TEST PARAMETERS:
   Test Charts: 3
   Test Perturbations: 2
   Max Cost: $0.15
   Max Extractions: 5


### SECTION 2: SIMPLIFIED EXTRACTION ENGINE

In [4]:
print("\n SECTION 2: SIMPLIFIED TEST EXTRACTION ENGINE")

class TestGPT4VisionExtractor:
    """Simplified GPT-4o chart-data extractor for smoke-testing the pipeline.

    Sends one chart image per call to the chat-completions endpoint, parses the
    JSON object out of the reply, and keeps running counters (attempts,
    successes, failures, estimated cost, response times) in ``self.test_stats``.

    Args:
        client: Object exposing ``chat.completions.create(...)``; the notebook
            passes an ``openai.OpenAI`` instance.
        cost_per_extraction: Flat cost estimate, in dollars, added to
            ``total_cost`` for every request that the API answers.
            Defaults to 0.03.
    """

    def __init__(self, client, cost_per_extraction=0.03):
        self.client = client
        self.cost_per_extraction = cost_per_extraction
        self.test_stats = {
            'attempts': 0,
            'successes': 0,
            'failures': 0,
            'total_cost': 0.0,
            'response_times': []
        }

        # Simple but effective prompt
        self.test_prompt = """
Extract data from this chart image. Return ONLY a JSON object:

{
  "chart_title": "title from chart",
  "chart_type": "bar/pie/line/scatter/area",
  "data": [
    {"category": "Cat1", "value": 123},
    {"category": "Cat2", "value": 456}
  ],
  "confidence": "high/medium/low"
}

Extract ALL visible data points with exact values.
"""

    @staticmethod
    def _parse_json_object(content):
        """Return the JSON object embedded in ``content``.

        The text between the first ``{`` and the last ``}`` is parsed when both
        are present (this tolerates Markdown fences or prose around the JSON);
        otherwise the whole string is parsed.

        Raises:
            ValueError: if ``content`` is empty/None, is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError`` subclass), or
                decodes to something other than a JSON object.
        """
        if not content:
            raise ValueError("empty response")

        start = content.find('{')
        end = content.rfind('}')
        candidate = content[start:end + 1] if start != -1 and end != -1 else content

        parsed = json.loads(candidate)
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
        return parsed

    def test_extract(self, image_path):
        """Run one extraction request and return the parsed JSON object.

        Args:
            image_path: Path (``str`` or ``pathlib.Path``) of the chart image.

        Returns:
            dict: the decoded JSON object with an added ``_test_metadata``
            entry (response time, timestamp, raw response length), or ``None``
            when the request fails or the reply cannot be parsed.
        """
        start_time = time.time()
        self.test_stats['attempts'] += 1

        try:
            # Accept plain strings as well as Path objects.
            image_path = Path(image_path)
            print(f"  🔍 Testing extraction: {image_path.name}")

            # Encode image
            with open(image_path, "rb") as img_file:
                base64_image = base64.b64encode(img_file.read()).decode('utf-8')

            # API call
            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": self.test_prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{base64_image}",
                                    "detail": "high"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=1000,
                temperature=0.1
            )
            content = response.choices[0].message.content

        except Exception as e:
            # Best-effort by design: a failed request must not abort the batch.
            print(f"     API call failed: {e}")
            self.test_stats['failures'] += 1
            return None

        # The request was answered, so it counts against the budget even if
        # the reply turns out to be unparseable.
        self.test_stats['total_cost'] += self.cost_per_extraction

        try:
            extracted_data = self._parse_json_object(content)
        except ValueError as e:
            print(f"     JSON parsing failed: {e}")
            print(f"    Raw response: {(content or '')[:200]}...")
            self.test_stats['failures'] += 1
            return None

        # Measure once so the metadata, the statistics and the log line agree.
        elapsed = time.time() - start_time

        # Add test metadata
        extracted_data['_test_metadata'] = {
            'response_time': elapsed,
            'timestamp': datetime.now().isoformat(),
            'raw_response_length': len(content)
        }

        self.test_stats['successes'] += 1
        self.test_stats['response_times'].append(elapsed)

        print(f"     Success ({elapsed:.1f}s)")
        return extracted_data

    def get_test_stats(self):
        """Return a snapshot of the counters plus derived metrics.

        Returns:
            dict with ``attempts``, ``successes``, ``failures``,
            ``success_rate`` (percent of attempts; 0 when nothing was
            attempted), ``total_cost`` and ``avg_response_time`` (mean over
            successful extractions; 0 when there were none).
        """
        response_times = self.test_stats['response_times']
        return {
            'attempts': self.test_stats['attempts'],
            'successes': self.test_stats['successes'],
            'failures': self.test_stats['failures'],
            # max(1, ...) avoids division by zero before the first attempt.
            'success_rate': (self.test_stats['successes'] / max(1, self.test_stats['attempts'])) * 100,
            'total_cost': self.test_stats['total_cost'],
            'avg_response_time': np.mean(response_times) if response_times else 0
        }



 SECTION 2: SIMPLIFIED TEST EXTRACTION ENGINE


### SECTION 3: SAMPLE TEST EXECUTION

In [5]:
print("\n SECTION 3: SAMPLE TEST EXECUTION")

def _run_extraction_batch(extractor, chart_paths, result_type, test_results):
    """Extract each chart in ``chart_paths`` and store successes in ``test_results``.

    Stops early once the extractor's accumulated cost reaches
    ``TEST_CONFIG['total_test_cost']`` or its attempt count reaches
    ``TEST_CONFIG['max_test_extractions']``.
    """
    for i, chart_path in enumerate(chart_paths):
        stats = extractor.get_test_stats()
        if stats['total_cost'] >= TEST_CONFIG['total_test_cost']:
            print(" Budget limit reached")
            break
        # Hard cap on requests, independent of how cost is being estimated.
        if stats['attempts'] >= TEST_CONFIG['max_test_extractions']:
            print(" Extraction limit reached")
            break

        print(f"Test {i+1}: {chart_path.name}")
        result = extractor.test_extract(chart_path)

        if result:
            test_results[chart_path.stem] = {
                'type': result_type,
                'result': result,
                'file': str(chart_path)
            }

            # Show sample of extracted data ("data" may be missing or null)
            data_points = result.get('data') or []
            print(f"     Extracted {len(data_points)} data points")
            if data_points:
                print(f"     Sample: {data_points[0]}")

        time.sleep(1)  # Rate limiting


def run_sample_test(seed=None):
    """Run a small-scale extraction test before the full extraction.

    Picks up to ``TEST_CONFIG['test_charts']`` images from ``data/raw_charts``
    and up to ``TEST_CONFIG['test_perturbations']`` from ``data/perturbations``
    at random and sends each one through ``TestGPT4VisionExtractor``.

    Args:
        seed: Optional seed for the chart selection. With the default ``None``
            the selection differs from run to run; pass an integer to make the
            sample reproducible.

    Returns:
        tuple: ``(test_results, extractor)`` where ``test_results`` maps each
        successfully extracted chart's file stem to a dict with ``type``,
        ``result`` and ``file``. When no original charts are found the results
        dict is empty and the extractor has made no attempts, so the caller's
        two-value unpacking still works.
    """
    print(" Finding test charts...")

    # Sorted so that a given seed selects the same files regardless of the
    # order in which the filesystem happens to list them.
    original_charts = sorted(Path('data/raw_charts').glob('*.png'))
    perturbation_charts = sorted(Path('data/perturbations').glob('*.png'))

    # Initialize test extractor
    extractor = TestGPT4VisionExtractor(client)
    test_results = {}

    if not original_charts:
        print(" No original charts found! Run chart generation first.")
        return test_results, extractor

    print(f" Found {len(original_charts)} original charts")
    print(f" Found {len(perturbation_charts)} perturbations")

    # Select test samples
    rng = random.Random(seed)
    test_originals = rng.sample(
        original_charts, min(TEST_CONFIG['test_charts'], len(original_charts)))
    test_perturbations = rng.sample(
        perturbation_charts, min(TEST_CONFIG['test_perturbations'], len(perturbation_charts)))

    print("\n SELECTED FOR TESTING:")
    print(f"   Original charts: {len(test_originals)}")
    print(f"   Perturbations: {len(test_perturbations)}")

    print("\n STARTING SAMPLE EXTRACTION TEST...")
    print(f" Budget limit: ${TEST_CONFIG['total_test_cost']:.2f}")

    # Test original charts
    print("\n TESTING ORIGINAL CHARTS:")
    _run_extraction_batch(extractor, test_originals, 'original', test_results)

    # Test perturbations
    print("\n TESTING PERTURBATIONS:")
    _run_extraction_batch(extractor, test_perturbations, 'perturbation', test_results)

    return test_results, extractor

# Run the sample test
test_results, test_extractor = run_sample_test()



 SECTION 3: SAMPLE TEST EXECUTION
 Finding test charts...
 Found 203 original charts
 Found 1650 perturbations

 SELECTED FOR TESTING:
   Original charts: 3
   Perturbations: 2

 STARTING SAMPLE EXTRACTION TEST...
 Budget limit: $0.15

 TESTING ORIGINAL CHARTS:
Test 1: chart_038_medium_line.png
  🔍 Testing extraction: chart_038_medium_line.png
     Success (5.8s)
     Extracted 7 data points
     Sample: {'category': 'Category 1', 'value': 95}
Test 2: chart_160_complex_bar.png
  🔍 Testing extraction: chart_160_complex_bar.png
     Success (14.9s)
     Extracted 32 data points
     Sample: {'category': 'Category 1', 'value': 92.4}
Test 3: chart_150_medium_bar.png
  🔍 Testing extraction: chart_150_medium_bar.png
     Success (5.1s)
     Extracted 10 data points
     Sample: {'category': 'Category 1', 'value': 60}

 TESTING PERTURBATIONS:
Test 1: chart_194_medium_line_grayscale_conversion_medium.png
  🔍 Testing extraction: chart_194_medium_line_grayscale_conversion_medium.png
     Succes

### SECTION 4: TEST ANALYSIS

In [6]:
print("\n SECTION 4: TEST ANALYSIS")

def analyze_test_results(results, extractor):
    """Print a report on the sample run and return the extractor statistics.

    Args:
        results: Mapping of sample name to an entry whose ``'result'`` value is
            the extracted dict for that sample.
        extractor: Object providing ``get_test_stats()``.

    Returns:
        The dict returned by ``extractor.get_test_stats()``.
    """
    stats = extractor.get_test_stats()

    # Headline numbers for the whole run.
    print("\n".join([
        " SAMPLE TEST RESULTS:",
        "-" * 50,
        f"Total Attempts: {stats['attempts']}",
        f"Successful Extractions: {stats['successes']}",
        f"Failed Extractions: {stats['failures']}",
        f"Success Rate: {stats['success_rate']:.1f}%",
        f"Total Cost: ${stats['total_cost']:.2f}",
        f"Average Response Time: {stats['avg_response_time']:.2f}s",
    ]))

    # Nothing was extracted, so there is no quality section to report.
    if not results:
        return stats

    print("\n EXTRACTION QUALITY ANALYSIS:")

    confidence_tally = {}
    type_tally = {}
    point_counts = []

    for sample_name, entry in results.items():
        payload = entry['result']

        # Tallies fall back to 'unknown'; the per-sample lines fall back to 'N/A'.
        confidence_label = payload.get('confidence', 'unknown')
        confidence_tally[confidence_label] = confidence_tally.get(confidence_label, 0) + 1

        points = payload.get('data', [])
        point_counts.append(len(points))

        type_label = payload.get('chart_type', 'unknown')
        type_tally[type_label] = type_tally.get(type_label, 0) + 1

        # Per-sample detail
        detail_lines = [
            f"\n SAMPLE: {sample_name}",
            f"   Title: {payload.get('chart_title', 'N/A')}",
            f"   Type: {payload.get('chart_type', 'N/A')}",
            f"   Confidence: {payload.get('confidence', 'N/A')}",
            f"   Data Points: {len(points)}",
        ]
        if points:
            detail_lines.append(f"   Sample Data: {points[0]}")
        print("\n".join(detail_lines))

    # Aggregates across all samples
    print("\n".join([
        "\n QUALITY METRICS:",
        f"   Confidence Distribution: {confidence_tally}",
        f"   Chart Types Detected: {type_tally}",
        f"   Avg Data Points: {np.mean(point_counts):.1f}",
        f"   Data Point Range: {min(point_counts)}-{max(point_counts)}",
    ]))

    return stats

test_analysis = analyze_test_results(test_results, test_extractor)



 SECTION 4: TEST ANALYSIS
 SAMPLE TEST RESULTS:
--------------------------------------------------
Total Attempts: 5
Successful Extractions: 5
Failed Extractions: 0
Success Rate: 100.0%
Total Cost: $0.15
Average Response Time: 8.04s

 EXTRACTION QUALITY ANALYSIS:

 SAMPLE: chart_038_medium_line
   Title: Market Share Analysis - (Business Analytics)
   Type: line
   Confidence: high
   Data Points: 7
   Sample Data: {'category': 'Category 1', 'value': 95}

 SAMPLE: chart_160_complex_bar
   Title: Supply Chain Performance - (Operational Metrics)
   Type: bar
   Confidence: high
   Data Points: 32
   Sample Data: {'category': 'Category 1', 'value': 92.4}

 SAMPLE: chart_150_medium_bar
   Title: Sector Comparison - (Financial Analysis)
   Type: bar
   Confidence: medium
   Data Points: 10
   Sample Data: {'category': 'Category 1', 'value': 60}

 SAMPLE: chart_194_medium_line_grayscale_conversion_medium
   Title: Supply Chain Performance - (Operational Metrics)
   Type: line
   Confidence:

### SECTION 5: RECOMMENDATIONS

In [7]:
print("\n SECTION 5: RECOMMENDATIONS")

def provide_recommendations(stats, results, estimated_full_extractions=1000,
                            cost_per_extraction=0.03, budget_limit=45.0):
    """Print go/no-go guidance for the full extraction based on the sample run.

    Args:
        stats: Dict with at least ``success_rate`` (percent) and
            ``avg_response_time`` (seconds).
        results: Mapping of sample name to an entry whose ``'result'`` dict may
            carry a ``confidence`` label.
        estimated_full_extractions: Number of extractions planned for the full
            run, used for the cost projection. Defaults to 1000.
        cost_per_extraction: Estimated cost in dollars of one extraction.
            Defaults to 0.03.
        budget_limit: Budget in dollars the projected cost is compared with.
            Defaults to 45.0.

    Returns:
        bool: ``True`` when the success rate is at least 75%, else ``False``.
    """
    print(" RECOMMENDATIONS FOR FULL EXTRACTION:")
    print("-" * 60)

    # Success rate assessment: (minimum rate, headline, advice, proceed?)
    success_rate = stats['success_rate']
    tiers = [
        (90, " SUCCESS RATE: EXCELLENT (≥90%)", "   Recommendation: Proceed with full extraction", True),
        (75, " SUCCESS RATE: GOOD (≥75%)", "   Recommendation: Proceed with caution", True),
        (50, " SUCCESS RATE: MODERATE (≥50%)", "   Recommendation: Consider debugging first", False),
    ]
    for minimum, headline, advice, proceed in tiers:
        if success_rate >= minimum:
            proceed_recommendation = proceed
            break
    else:
        headline = " SUCCESS RATE: LOW (<50%)"
        advice = "   Recommendation: Debug issues before proceeding"
        proceed_recommendation = False
    print(headline)
    print(advice)

    # Cost projection (only worth showing when the run is recommended)
    if proceed_recommendation:
        projected_cost = estimated_full_extractions * cost_per_extraction
        expected_successful_extractions = int(estimated_full_extractions * success_rate / 100)

        print("\n FULL EXTRACTION PROJECTIONS:")
        print(f"   Planned Extractions: {estimated_full_extractions}")
        print(f"   Projected Cost: ${projected_cost:.2f}")
        print(f"   Expected Success Rate: {success_rate:.1f}%")
        print(f"   Expected Successful Extractions: {expected_successful_extractions}")

        if projected_cost <= budget_limit:
            print(" BUDGET: Within limits")
        else:
            print(" BUDGET: May exceed limits")

    # Response time assessment
    avg_time = stats['avg_response_time']
    if avg_time <= 5:
        speed = "FAST"
    elif avg_time <= 10:
        speed = "GOOD"
    else:
        speed = "SLOW"
    print(f"\n RESPONSE TIME: {speed} ({avg_time:.1f}s average)")

    # Quality assessment
    if results:
        # The label is model-generated text, so compare it without regard to
        # letter case or surrounding whitespace.
        high_confidence = sum(
            1 for r in results.values()
            if str(r['result'].get('confidence', '')).strip().lower() == 'high'
        )
        confidence_rate = (high_confidence / len(results)) * 100

        print("\n EXTRACTION QUALITY:")
        print(f"   High Confidence Rate: {confidence_rate:.1f}%")

        if confidence_rate >= 70:
            print(" QUALITY: HIGH - GPT-4o is confident in extractions")
        elif confidence_rate >= 50:
            print(" QUALITY: MODERATE - Acceptable for research")
        else:
            print(" QUALITY: LOW - Consider chart quality improvements")

    return proceed_recommendation

proceed_with_full = provide_recommendations(test_analysis, test_results)



 SECTION 5: RECOMMENDATIONS
 RECOMMENDATIONS FOR FULL EXTRACTION:
------------------------------------------------------------
 SUCCESS RATE: EXCELLENT (≥90%)
   Recommendation: Proceed with full extraction

 FULL EXTRACTION PROJECTIONS:
   Planned Extractions: 1000
   Projected Cost: $30.00
   Expected Success Rate: 100.0%
   Expected Successful Extractions: 1000
 BUDGET: Within limits

 RESPONSE TIME: GOOD (8.0s average)

 EXTRACTION QUALITY:
   High Confidence Rate: 80.0%
 QUALITY: HIGH - GPT-4o is confident in extractions


### SECTION 6: NEXT STEPS

In [8]:
print("\n SECTION 6: NEXT STEPS")

# Fields shared by both outcomes; the branch below fills in the verdict.
test_summary = {
    'test_successful': bool(proceed_with_full),
    'test_stats': test_analysis,
}

if proceed_with_full:
    print(" TEST SUCCESSFUL - READY FOR FULL EXTRACTION!")
    print("-" * 60)
    print(" System is working correctly")
    print(" API integration functional")
    print(" JSON parsing working")
    print(" Quality looks good")
    print(" Budget projections reasonable")

    print("\n TO PROCEED WITH FULL EXTRACTION:")
    print("1.  Run the complete Notebook 4 (not the test version)")
    print("2.  Monitor costs during execution")
    print("3.  Expected results: ~800-1000 successful extractions")
    print("4.  Expected time: 2-4 hours")
    print("5.  Expected cost: $30-40")

    test_summary['recommendation'] = 'PROCEED'
    # NOTE(review): 36.00 is a hard-coded figure; it is not derived from the
    # projection printed in the recommendations section - confirm the intended value.
    test_summary['projected_full_cost'] = 36.00

else:
    print(" TEST REVEALED ISSUES - DEBUG NEEDED")
    print("-" * 60)
    print(" Success rate too low for reliable results")
    print(" Recommended actions:")
    print("   1. Check API key and credits")
    print("   2. Verify chart image quality")
    print("   3. Test with different charts")
    print("   4. Check internet connection")
    print("   5. Review error messages above")

    test_summary['recommendation'] = 'DEBUG_FIRST'

test_summary['test_timestamp'] = datetime.now().isoformat()

# Save test summary. Create the cache directory first so that a fresh checkout
# does not fail here after the API calls have already been paid for.
summary_path = Path('data/analysis_cache/extraction_test_summary.json')
summary_path.parent.mkdir(parents=True, exist_ok=True)
with open(summary_path, 'w') as f:
    json.dump(test_summary, f, indent=2)

print(f"\n Test summary saved to: {summary_path.as_posix()}")

print("\n" + "=" * 80)
print(" SAMPLE TEST COMPLETE!")
if proceed_with_full:
    print(" READY FOR FULL EXTRACTION PIPELINE!")
else:
    print("🔧 DEBUG NEEDED BEFORE FULL EXTRACTION")
print("=" * 80)


 SECTION 6: NEXT STEPS
 TEST SUCCESSFUL - READY FOR FULL EXTRACTION!
------------------------------------------------------------
 System is working correctly
 API integration functional
 JSON parsing working
 Quality looks good
 Budget projections reasonable

 TO PROCEED WITH FULL EXTRACTION:
1.  Run the complete Notebook 4 (not the test version)
2.  Monitor costs during execution
3.  Expected results: ~800-1000 successful extractions
4.  Expected time: 2-4 hours
5.  Expected cost: $30-40

 Test summary saved to: data/analysis_cache/extraction_test_summary.json

 SAMPLE TEST COMPLETE!
 READY FOR FULL EXTRACTION PIPELINE!
