### GPT-4o Robustness Analysis: Environment Setup and Configuration

In [1]:
import os
import json
import sys
from pathlib import Path
import logging
from datetime import datetime

print("=" * 80)
print("GPT-4o Vision robustness analysis")
print("Chart Data Extraction Under Real-World Perturbations")
print("=" * 80)

GPT-4o Vision robustness analysis
Chart Data Extraction Under Real-World Perturbations


### SECTION 1: ENVIRONMENT VALIDATION

In [2]:

print("\n SECTION 1: ENVIRONMENT VALIDATION")

def check_python_version():
    """Ensure Python 3.10+"""
    version = sys.version_info
    if version.major == 3 and version.minor >= 10:
        print(f" Python {version.major}.{version.minor}.{version.micro} - Compatible")
        return True
    else:
        print(f" Python {version.major}.{version.minor} - Need 3.10+")
        return False

def check_required_packages():
    """Check all required packages are installed"""
    required_packages = {
        'openai': '1.0.0+',
        'pillow': '10.0.0+', 
        'matplotlib': '3.7.0+',
        'pandas': '2.0.0+',
        'numpy': '1.24.0+',
        'seaborn': '0.12.0+',
        'scipy': '1.10.0+',
        'scikit-learn': '1.3.0+',
        'python-dotenv': '1.0.0+'
    }
    
    missing_packages = []
    
    for package, min_version in required_packages.items():
        try:
            if package == 'pillow':
                import PIL
                print(f" {package} (PIL) - Installed")
            elif package == 'python-dotenv':
                import dotenv
                print(f" {package} - Installed")
            elif package == 'scikit-learn':
                import sklearn
                print(f" {package} - Installed")
            else:
                __import__(package)
                print(f" {package} - Installed")
        except ImportError:
            print(f" {package} - Missing")
            missing_packages.append(package)
    
    if missing_packages:
        print(f"\n Install missing packages:")
        print(f"pip install {' '.join(missing_packages)}")
        return False
    
    return True

# Run environment checks
python_ok = check_python_version()
packages_ok = check_required_packages()

if not (python_ok and packages_ok):
    print("\nEnvironment setup incomplete. Please fix issues above.")
    sys.exit(1)

print("\n Environment validation complete!")


 SECTION 1: ENVIRONMENT VALIDATION
 Python 3.12.7 - Compatible
 openai - Installed
 pillow (PIL) - Installed
 matplotlib - Installed
 pandas - Installed
 numpy - Installed
 seaborn - Installed
 scipy - Installed
 scikit-learn - Installed
 python-dotenv - Installed

 Environment validation complete!


### SECTION 2: PROJECT STRUCTURE CREATION

In [3]:
print("\n SECTION 2: PROJECT STRUCTURE CREATION")

def create_research_structure():
    """Create comprehensive directory structure for research project"""
    
    directories = [
        # Data directories
        "data/raw_charts",
        "data/perturbations", 
        "data/extractions",
        "data/ground_truth",
        "data/analysis_cache",
        
        # Results directories
        "results/figures",
        "results/tables", 
        "results/statistical_output",
        "results/validation_reports",
        
        # Documentation
        "docs/methodology",
        "docs/literature_notes",
        "docs/meeting_notes",
        
        # Backup and logs
        "logs",
        "backups",
        
        # Notebook outputs
        "notebook_outputs/charts",
        "notebook_outputs/analysis",
    ]
    
    created_dirs = []
    
    for directory in directories:
        path = Path(directory)
        if not path.exists():
            path.mkdir(parents=True, exist_ok=True)
            created_dirs.append(directory)
            print(f" Created: {directory}")
        else:
            print(f" Exists: {directory}")
    
    if created_dirs:
        print(f"\n Created {len(created_dirs)} new directories")
    else:
        print(f"\n All directories already exist")

create_research_structure()


 SECTION 2: PROJECT STRUCTURE CREATION
 Created: data/raw_charts
 Created: data/perturbations
 Created: data/extractions
 Created: data/ground_truth
 Created: data/analysis_cache
 Created: results/figures
 Created: results/tables
 Created: results/statistical_output
 Created: results/validation_reports
 Created: docs/methodology
 Created: docs/literature_notes
 Created: docs/meeting_notes
 Created: logs
 Created: backups
 Created: notebook_outputs/charts
 Created: notebook_outputs/analysis

 Created 16 new directories


### SECTION 3: CONFIGURATION SETUP

In [7]:
print("\n SECTION 3: CONFIGURATION SETUP")

# Research configuration
RESEARCH_CONFIG = {
    "project": {
        "title": "GPT-4o Vision Robustness Analysis in Chart Data Extraction",
        "author": "[Your Name]", 
        "institution": "[Your University]",
        "supervisor": "[Professor Name]",
        "start_date": datetime.now().isoformat(),
        "version": "1.0.0"
    },
    
    "experimental_design": {
        "total_charts": 200,
        "chart_complexity": ["medium", "complex", "advanced"],
        "perturbation_categories": 12,
        "intensity_levels": 3,
        "target_extractions": 1200,
        "budget_limit": 45.00
    },
    
    "quality_standards": {
        "chart_resolution": 300,  # DPI
        "image_format": "PNG",
        "color_space": "RGB",
        "min_data_points": 6,
        "max_data_points": 15
    },
    
    "evaluation_metrics": [
        "exact_match_accuracy",
        "partial_match_accuracy", 
        "category_recognition_rate",
        "value_precision_score",
        "structural_understanding",
        "confidence_correlation",
        "error_pattern_analysis",
        "composite_robustness_index"
    ]
}

# Save configuration
with open('research_config.json', 'w') as f:
    json.dump(RESEARCH_CONFIG, f, indent=2)

print(" Research configuration saved to 'research_config.json'")

# Display key parameters
print("\n KEY RESEARCH PARAMETERS:")
for category, params in RESEARCH_CONFIG.items():
    if category != "project":
        print(f"\n{category.upper().replace('_', ' ')}:")
        if isinstance(params, dict):
            for key, value in params.items():
                print(f"  ‚Ä¢ {key.replace('_', ' ').title()}: {value}")
        elif isinstance(params, list):
            for item in params:
                print(f"  ‚Ä¢ {item}")


 SECTION 3: CONFIGURATION SETUP
 Research configuration saved to 'research_config.json'

 KEY RESEARCH PARAMETERS:

EXPERIMENTAL DESIGN:
  ‚Ä¢ Total Charts: 200
  ‚Ä¢ Chart Complexity: ['medium', 'complex', 'advanced']
  ‚Ä¢ Perturbation Categories: 12
  ‚Ä¢ Intensity Levels: 3
  ‚Ä¢ Target Extractions: 1200
  ‚Ä¢ Budget Limit: 45.0

QUALITY STANDARDS:
  ‚Ä¢ Chart Resolution: 300
  ‚Ä¢ Image Format: PNG
  ‚Ä¢ Color Space: RGB
  ‚Ä¢ Min Data Points: 6
  ‚Ä¢ Max Data Points: 15

EVALUATION METRICS:
  ‚Ä¢ exact_match_accuracy
  ‚Ä¢ partial_match_accuracy
  ‚Ä¢ category_recognition_rate
  ‚Ä¢ value_precision_score
  ‚Ä¢ structural_understanding
  ‚Ä¢ confidence_correlation
  ‚Ä¢ error_pattern_analysis
  ‚Ä¢ composite_robustness_index


### SECTION 4: API CONFIGURATION  

In [None]:
print("\n SECTION 4: API CONFIGURATION")

def setup_api_configuration():
    """Setup and validate OpenAI API configuration"""
    
    # Check for .env file
    if not Path('.env').exists():
        print("  .env file not found")
        
        # Create template
        env_template = """# OpenAI API Configuration for GPT-4o Vision Research
OPENAI_API_KEY=your-openai-api-key-here

# Get your API key from: https://platform.openai.com/api-keys
# Estimated budget for complete study: $5.00

# Research Notes:
# - Model: gpt-4o (GPT-4 Omni with vision capabilities)
# - Cost per image: ~$0.02-0.03 (high detail mode)
# - Expected total extractions: 150+
"""
        
        with open('.env.template', 'w') as f:
            f.write(env_template)
        
        print(" Created .env.template file")
        print(" Please:")
        print("   1. Copy .env.template to .env")
        print("   2. Add your OpenAI API key")
        print("   3. Re-run this notebook")
        return False
    
    # Load environment variables
    from dotenv import load_dotenv
    load_dotenv()
    
    api_key = os.getenv('OPENAI_API_KEY')
    
    if not api_key or api_key == 'your-openai-api-key-here':
        print(" OpenAI API key not configured")
        print(" Please add your API key to .env file")
        return False
    
    # Test API connection
    try:
        import openai
        client = openai.OpenAI(api_key=api_key)
        
        # Test with simple request
        models = client.models.list()
        print(f" API connection successful")
        print(f" API key: {api_key[:8]}...")
        
        # Check for GPT-4o availability
        available_models = [model.id for model in models.data]
        if 'gpt-4o' in available_models:
            print(" GPT-4o model available")
        else:
            print("  GPT-4o not found in available models")
            print(f"Available models: {[m for m in available_models if 'gpt-4' in m]}")
        
        return True
        
    except Exception as e:
        print(f" API connection failed: {e}")
        return False

api_configured = setup_api_configuration()



üîë SECTION 4: API CONFIGURATION
‚ö†Ô∏è  .env file not found
 Created .env.template file
üîß Please:
   1. Copy .env.template to .env
   2. Add your OpenAI API key
   3. Re-run this notebook


### SECTION 5: LOGGING SETUP

In [8]:
print("\n SECTION 5: LOGGING SETUP")

def setup_research_logging():
    """Setup comprehensive logging for research tracking"""
    
    # Create logs directory if not exists
    Path('logs').mkdir(exist_ok=True)
    
    # Setup logging configuration
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    
    # Main research log
    research_logger = logging.getLogger('research')
    research_logger.setLevel(logging.INFO)
    
    # File handler
    file_handler = logging.FileHandler('logs/research_progress.log')
    file_handler.setFormatter(logging.Formatter(log_format))
    
    # Console handler  
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(levelname)s - %(message)s'))
    
    research_logger.addHandler(file_handler)
    research_logger.addHandler(console_handler)
    
    # Log initial setup
    research_logger.info("=" * 50)
    research_logger.info("RESEARCH PROJECT INITIALIZED")
    research_logger.info("=" * 50)
    research_logger.info(f"Project: {RESEARCH_CONFIG['project']['title']}")
    research_logger.info(f"Start Date: {RESEARCH_CONFIG['project']['start_date']}")
    research_logger.info(f"Target Extractions: {RESEARCH_CONFIG['experimental_design']['target_extractions']}")
    research_logger.info(f"Budget Limit: ${RESEARCH_CONFIG['experimental_design']['budget_limit']}")
    
    print(" Research logging configured")
    print(" Logs will be saved to: logs/research_progress.log")
    
    return research_logger

logger = setup_research_logging()

INFO - RESEARCH PROJECT INITIALIZED
INFO - Project: GPT-4o Vision Robustness Analysis in Chart Data Extraction
INFO - Start Date: 2025-06-21T13:11:18.508893
INFO - Target Extractions: 1200
INFO - Budget Limit: $45.0



 SECTION 5: LOGGING SETUP
 Research logging configured
 Logs will be saved to: logs/research_progress.log


### SECTION 6: VALIDATION AND SUMMARY

In [9]:
print("\n SECTION 6: SETUP VALIDATION")

def validate_complete_setup():
    """Final validation of complete setup"""
    
    setup_status = {
        "Python Environment": python_ok,
        "Required Packages": packages_ok, 
        "Project Structure": True,  # Created successfully
        "Research Configuration": True,  # Saved successfully
        "API Configuration": api_configured,
        "Logging System": True  # Setup successfully
    }
    
    print("\n SETUP STATUS SUMMARY:")
    print("-" * 40)
    
    all_good = True
    for component, status in setup_status.items():
        status_icon = "All good" if status else "Not good"
        print(f"{status_icon} {component}")
        if not status:
            all_good = False
    
    print("-" * 40)
    
    if all_good:
        print(" SETUP COMPLETE! Ready for research implementation.")
        logger.info("Complete environment setup successful")
        
        print("\n NEXT STEPS:")
        print("1. Run Notebook 02: Advanced Chart Generation")
        print("2. Review generated charts for complexity")
        print("3. Proceed with perturbation framework")
        
        return True
    else:
        print("  Setup incomplete. Please resolve issues above.")
        logger.warning("Setup validation failed - issues need resolution")
        return False

setup_complete = validate_complete_setup()




 SECTION 6: SETUP VALIDATION

 SETUP STATUS SUMMARY:
----------------------------------------
All good Python Environment
All good Required Packages
All good Project Structure
All good Research Configuration
Not good API Configuration
All good Logging System
----------------------------------------
  Setup incomplete. Please resolve issues above.


### SECTION 7: RESEARCH ROADMAP

In [10]:
print("\n SECTION 7: RESEARCH ROADMAP")

roadmap = {
    "Phase 1 - Foundation": [
        " Environment setup and validation",
        " Advanced chart generation (medium-complex)",
        " Comprehensive perturbation framework",
        " Quality validation and sampling"
    ],
    
    "Phase 2 - Data Collection": [
        " GPT-4o extraction pipeline implementation",
        " Multi-metric evaluation framework", 
        " Budget-optimized strategic sampling",
        " Real-time progress monitoring"
    ],
    
    "Phase 3 - Analysis": [
        " Statistical analysis across 8 metrics",
        " Robustness pattern identification",
        " Performance degradation modeling",
        " Error pattern categorization"
    ],
    
    "Phase 4 - Results": [
        " Publication-quality visualizations",
        " Comprehensive results tables",
        " Statistical significance reporting",
        " Dissertation integration"
    ]
}

for phase, tasks in roadmap.items():
    print(f"\n{phase}:")
    for task in tasks:
        print(f"  {task}")

print(f"\n ESTIMATED TIMELINE: 4-5 days for complete implementation")
print(f" BUDGET ALLOCATION: ${RESEARCH_CONFIG['experimental_design']['budget_limit']}")
print(f" TARGET OUTCOME: {len(RESEARCH_CONFIG['evaluation_metrics'])} evaluation metrics")

print("\n" + "=" * 80)
print(" ENVIRONMENT SETUP COMPLETE - READY FOR ADVANCED RESEARCH!")
print("=" * 80)

# Save setup summary
setup_summary = {
    "setup_timestamp": datetime.now().isoformat(),
    "setup_status": setup_complete,
    "environment_validation": {
        "python_version": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
        "packages_installed": packages_ok,
        "api_configured": api_configured
    },
    "next_notebook": "02_Advanced_Chart_Generation.ipynb" if setup_complete else "Fix setup issues first"
}

with open('logs/setup_summary.json', 'w') as f:
    json.dump(setup_summary, f, indent=2)

print(f"\n Setup summary saved to: logs/setup_summary.json")


 SECTION 7: RESEARCH ROADMAP

Phase 1 - Foundation:
   Environment setup and validation
   Advanced chart generation (medium-complex)
   Comprehensive perturbation framework
   Quality validation and sampling

Phase 2 - Data Collection:
   GPT-4o extraction pipeline implementation
   Multi-metric evaluation framework
   Budget-optimized strategic sampling
   Real-time progress monitoring

Phase 3 - Analysis:
   Statistical analysis across 8 metrics
   Robustness pattern identification
   Performance degradation modeling
   Error pattern categorization

Phase 4 - Results:
   Publication-quality visualizations
   Comprehensive results tables
   Statistical significance reporting
   Dissertation integration

 ESTIMATED TIMELINE: 4-5 days for complete implementation
 BUDGET ALLOCATION: $45.0
 TARGET OUTCOME: 8 evaluation metrics

 ENVIRONMENT SETUP COMPLETE - READY FOR ADVANCED RESEARCH!

 Setup summary saved to: logs/setup_summary.json
