### GPT-4o Robustness Analysis: Comprehensive Perturbation Framework

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image, ImageFilter, ImageEnhance, ImageDraw, ImageOps, ImageFont
import json
import random
from pathlib import Path
import logging
import cv2
from scipy import ndimage
from skimage import transform, util
import warnings
from PIL import ImageFont  # Fix import
import warnings; warnings.filterwarnings('ignore') 
# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

print("=" * 80)
print(" COMPREHENSIVE PERTURBATION FRAMEWORK")
print(" Systematic Chart Degradation for Robustness Testing")
print("=" * 80)

# Setup logging
logger = logging.getLogger('research')


 COMPREHENSIVE PERTURBATION FRAMEWORK
 Systematic Chart Degradation for Robustness Testing


### SECTION 1: PERTURBATION CONFIGURATION

In [2]:
print("\n SECTION 1: PERTURBATION CONFIGURATION")

# Load chart generation summary
try:
    with open('data/analysis_cache/chart_generation_summary.json', 'r') as f:
        chart_summary = json.load(f)
    print(f"Loaded chart summary: {chart_summary['charts_generated']} charts available")
except FileNotFoundError:
    print(" Chart generation summary not found - using defaults")
    chart_summary = {'charts_generated': 200, 'ready_for_perturbations': True}

# Comprehensive perturbation taxonomy
PERTURBATION_CONFIG = {
    "visual_noise": {
        "gaussian_blur": {
            "description": "Gaussian blur simulation (scanning/camera blur)",
            "intensities": {"low": 0.5, "medium": 1.0, "high": 2.0},
            "real_world_source": "Poor camera focus, motion blur during scanning"
        },
        "motion_blur": {
            "description": "Motion blur from camera/scanner movement",
            "intensities": {"low": 3, "medium": 5, "high": 9},
            "real_world_source": "Handheld photography, scanner vibration"
        },
        "salt_pepper_noise": {
            "description": "Random pixel noise from digital artifacts",
            "intensities": {"low": 0.005, "medium": 0.015, "high": 0.030},
            "real_world_source": "Digital compression, sensor noise"
        },
        "jpeg_artifacts": {
            "description": "JPEG compression artifacts",
            "intensities": {"low": 70, "medium": 40, "high": 20},
            "real_world_source": "Heavy image compression, web images"
        }
    },
    
    "geometric_transforms": {
        "rotation": {
            "description": "Document rotation (skewed scanning)",
            "intensities": {"low": 2, "medium": 5, "high": 10},
            "real_world_source": "Crooked document placement, tilted camera"
        },
        "scaling": {
            "description": "Image scaling artifacts",
            "intensities": {"low": 0.9, "medium": 0.8, "high": 1.2},
            "real_world_source": "Different zoom levels, resolution changes"
        },
        "perspective_distortion": {
            "description": "Perspective distortion from angled photography",
            "intensities": {"low": 0.1, "medium": 0.2, "high": 0.3},
            "real_world_source": "Photography at angles, document curvature"
        }
    },
    
    "color_lighting": {
        "brightness_shift": {
            "description": "Overall brightness changes",
            "intensities": {"low": 0.8, "medium": 0.6, "high": 1.4},
            "real_world_source": "Poor lighting conditions, over/under exposure"
        },
        "contrast_change": {
            "description": "Contrast modifications",
            "intensities": {"low": 0.7, "medium": 0.5, "high": 1.5},
            "real_world_source": "Poor lighting, display calibration issues"
        },
        "color_shift": {
            "description": "Color temperature changes",
            "intensities": {"low": 1.1, "medium": 1.3, "high": 1.6},
            "real_world_source": "Different lighting (fluorescent, LED, tungsten)"
        },
        "grayscale_conversion": {
            "description": "Color to grayscale conversion",
            "intensities": {"low": "partial", "medium": "full", "high": "full"},
            "real_world_source": "Black & white printing, monochrome displays"
        }
    },
    
    "occlusion_effects": {
        "random_blocks": {
            "description": "Random rectangular occlusions",
            "intensities": {"low": 0.05, "medium": 0.10, "high": 0.20},
            "real_world_source": "Stickers, tape, partial covering"
        },
        "text_overlay": {
            "description": "Text overlaid on chart",
            "intensities": {"low": 1, "medium": 3, "high": 6},
            "real_world_source": "Watermarks, annotations, stamps"
        },
        "partial_crop": {
            "description": "Chart edges cropped off",
            "intensities": {"low": 0.05, "medium": 0.10, "high": 0.15},
            "real_world_source": "Poor framing, document trimming"
        }
    },
    
    "chart_specific": {
        "legend_corruption": {
            "description": "Legend area specifically degraded",
            "intensities": {"low": "blur", "medium": "partial_remove", "high": "full_remove"},
            "real_world_source": "Legend outside scan area, damaged documents"
        },
        "axis_degradation": {
            "description": "Axis labels and ticks corrupted",
            "intensities": {"low": 1.0, "medium": 2.0, "high": 3.0},
            "real_world_source": "Edge artifacts, poor print quality"
        },
        "data_point_occlusion": {
            "description": "Individual data points obscured",
            "intensities": {"low": 1, "medium": 3, "high": 6},
            "real_world_source": "Ink smudges, physical damage"
        }
    }
}

print(" PERTURBATION TAXONOMY LOADED:")
for category, perturbations in PERTURBATION_CONFIG.items():
    print(f"   {category.replace('_', ' ').title()}: {len(perturbations)} types")

total_perturbations = sum(len(p) for p in PERTURBATION_CONFIG.values())
print(f" Total perturbation types: {total_perturbations}")
print(f" With 3 intensity levels: {total_perturbations * 3} variants per chart")



 SECTION 1: PERTURBATION CONFIGURATION
Loaded chart summary: 200 charts available
 PERTURBATION TAXONOMY LOADED:
   Visual Noise: 4 types
   Geometric Transforms: 3 types
   Color Lighting: 4 types
   Occlusion Effects: 3 types
   Chart Specific: 3 types
 Total perturbation types: 17
 With 3 intensity levels: 51 variants per chart


### SECTION 2: ADVANCED PERTURBATION ENGINE

In [3]:
print("\n SECTION 2: ADVANCED PERTURBATION ENGINE")

class AdvancedPerturbationEngine:
    """Professional perturbation system for systematic robustness testing"""
    
    def __init__(self):
        self.applied_perturbations = []
        self.failure_log = []
        
    def apply_visual_noise(self, image, perturbation_type, intensity_level):
        """Apply visual noise perturbations"""
        
        intensity = PERTURBATION_CONFIG["visual_noise"][perturbation_type]["intensities"][intensity_level]
        
        if perturbation_type == "gaussian_blur":
            return image.filter(ImageFilter.GaussianBlur(radius=intensity))
            
        elif perturbation_type == "motion_blur":
            # Create motion blur kernel
            kernel_size = int(intensity)
            kernel = np.zeros((kernel_size, kernel_size))
            kernel[int((kernel_size-1)/2), :] = np.ones(kernel_size)
            kernel = kernel / kernel_size
            
            # Convert PIL to OpenCV format
            img_array = np.array(image)
            blurred = cv2.filter2D(img_array, -1, kernel)
            return Image.fromarray(blurred.astype(np.uint8))
            
        elif perturbation_type == "salt_pepper_noise":
            img_array = np.array(image)
            noise = np.random.random(img_array.shape[:2])
            
            # Salt noise (white pixels)
            img_array[noise < intensity/2] = 255
            # Pepper noise (black pixels)  
            img_array[noise > 1 - intensity/2] = 0
            
            return Image.fromarray(img_array.astype(np.uint8))
            
        elif perturbation_type == "jpeg_artifacts":
            from io import BytesIO
            
            # Convert to RGB if needed
            if image.mode != 'RGB':
                image = image.convert('RGB')
                
            # Apply JPEG compression
            buffer = BytesIO()
            image.save(buffer, format='JPEG', quality=int(intensity))
            buffer.seek(0)
            return Image.open(buffer)
            
        return image
    
    def apply_geometric_transform(self, image, perturbation_type, intensity_level):
        """Apply geometric transformations"""
        
        intensity = PERTURBATION_CONFIG["geometric_transforms"][perturbation_type]["intensities"][intensity_level]
        
        if perturbation_type == "rotation":
            # Random rotation direction
            angle = intensity * random.choice([-1, 1])
            return image.rotate(angle, expand=True, fillcolor='white')
            
        elif perturbation_type == "scaling":
            new_size = (int(image.size[0] * intensity), int(image.size[1] * intensity))
            scaled = image.resize(new_size, Image.Resampling.LANCZOS)
            
            # Resize back to original size to create scaling artifacts
            return scaled.resize(image.size, Image.Resampling.LANCZOS)
            
        elif perturbation_type == "perspective_distortion":
            # Convert to numpy array
            img_array = np.array(image)
            h, w = img_array.shape[:2]
            
            # Create perspective transformation
            distortion = intensity
            src_points = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
            dst_points = np.float32([
                [distortion * w, distortion * h],
                [w - distortion * w, distortion * h], 
                [w - distortion * w, h - distortion * h],
                [distortion * w, h - distortion * h]
            ])
            
            matrix = cv2.getPerspectiveTransform(src_points, dst_points)
            warped = cv2.warpPerspective(img_array, matrix, (w, h), 
                                       borderMode=cv2.BORDER_CONSTANT, 
                                       borderValue=(255, 255, 255))
            
            return Image.fromarray(warped.astype(np.uint8))
            
        return image
    
    def apply_color_lighting(self, image, perturbation_type, intensity_level):
        """Apply color and lighting changes"""
        
        intensity = PERTURBATION_CONFIG["color_lighting"][perturbation_type]["intensities"][intensity_level]
        
        if perturbation_type == "brightness_shift":
            enhancer = ImageEnhance.Brightness(image)
            return enhancer.enhance(intensity)
            
        elif perturbation_type == "contrast_change":
            enhancer = ImageEnhance.Contrast(image)
            return enhancer.enhance(intensity)
            
        elif perturbation_type == "color_shift":
            enhancer = ImageEnhance.Color(image)
            return enhancer.enhance(intensity)
            
        elif perturbation_type == "grayscale_conversion":
            if intensity_level == "low":
                # Partial desaturation
                enhancer = ImageEnhance.Color(image)
                return enhancer.enhance(0.3)
            else:
                # Full grayscale
                gray = image.convert('L')
                return gray.convert('RGB')
                
        return image
    
    def apply_occlusion_effects(self, image, perturbation_type, intensity_level):
        """Apply occlusion and masking effects"""
        
        intensity = PERTURBATION_CONFIG["occlusion_effects"][perturbation_type]["intensities"][intensity_level]
        
        if perturbation_type == "random_blocks":
            draw = ImageDraw.Draw(image)
            width, height = image.size
            
            # Calculate coverage area
            total_area = width * height
            target_coverage = intensity * total_area
            covered_area = 0
            
            while covered_area < target_coverage:
                # Random block size and position
                block_width = random.randint(20, min(100, width//4))
                block_height = random.randint(20, min(100, height//4))
                x = random.randint(0, width - block_width)
                y = random.randint(0, height - block_height)
                
                # Draw black rectangle
                draw.rectangle([x, y, x + block_width, y + block_height], fill='black')
                covered_area += block_width * block_height
                
        elif perturbation_type == "text_overlay":
            draw = ImageDraw.Draw(image)
            width, height = image.size
            
            # Add text overlays
            overlay_texts = ["CONFIDENTIAL", "DRAFT", "INTERNAL", "COPY", "SAMPLE", "WATERMARK"]
            
            for i in range(int(intensity)):
                text = random.choice(overlay_texts)
                x = random.randint(0, width - 100)
                y = random.randint(0, height - 30)
                
                # Semi-transparent overlay effect
                overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
                overlay_draw = ImageDraw.Draw(overlay)
                
                try:
                    # Try to use a font, fall back to default if not available
                    font = ImageFont.truetype("arial.ttf", 24)
                except:
                    font = ImageFont.load_default()
                
                overlay_draw.text((x, y), text, fill=(128, 128, 128, 128), font=font)
                image = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
                
        elif perturbation_type == "partial_crop":
            width, height = image.size
            crop_size = int(min(width, height) * intensity)
            
            # Random crop from edges
            crop_side = random.choice(['top', 'bottom', 'left', 'right'])
            
            if crop_side == 'top':
                image = image.crop((0, crop_size, width, height))
            elif crop_side == 'bottom':
                image = image.crop((0, 0, width, height - crop_size))
            elif crop_side == 'left':
                image = image.crop((crop_size, 0, width, height))
            else:  # right
                image = image.crop((0, 0, width - crop_size, height))
                
            # Resize back to original size
            image = image.resize((width, height), Image.Resampling.LANCZOS)
            
        return image
    
    def apply_chart_specific(self, image, perturbation_type, intensity_level):
        """Apply chart-specific perturbations"""
        
        intensity = PERTURBATION_CONFIG["chart_specific"][perturbation_type]["intensities"][intensity_level]
        
        if perturbation_type == "legend_corruption":
            width, height = image.size
            
            # Typical legend locations (bottom-right, top-right, right side)
            legend_areas = [
                (width * 0.7, height * 0.8, width, height),        # Bottom-right
                (width * 0.7, 0, width, height * 0.3),             # Top-right  
                (width * 0.8, height * 0.2, width, height * 0.8)   # Right side
            ]
            
            legend_area = random.choice(legend_areas)
            
            if intensity_level == "low":
                # Blur legend area
                legend_region = image.crop(legend_area)
                blurred_legend = legend_region.filter(ImageFilter.GaussianBlur(radius=2))
                image.paste(blurred_legend, legend_area[:2])
                
            elif intensity_level == "medium":
                # Partially remove legend
                draw = ImageDraw.Draw(image)
                draw.rectangle(legend_area, fill=(240, 240, 240))
                
            else:  # high
                # Fully remove legend
                draw = ImageDraw.Draw(image)
                draw.rectangle(legend_area, fill='white')
                
        elif perturbation_type == "axis_degradation":
            # Blur axis areas (bottom and left edges)
            width, height = image.size
    
            # Bottom axis area - FIX: Convert float to int
            bottom_area = (0, int(height * 0.85), width, height)
            bottom_region = image.crop(bottom_area)
            blurred_bottom = bottom_region.filter(ImageFilter.GaussianBlur(radius=float(intensity)))
            image.paste(blurred_bottom, bottom_area[:2])

            # Left axis area - FIX: Convert float to int
            left_area = (0, 0, int(width * 0.15), height)
            left_region = image.crop(left_area)
            blurred_left = left_region.filter(ImageFilter.GaussianBlur(radius=float(intensity)))
            image.paste(blurred_left, left_area[:2])
            
        elif perturbation_type == "data_point_occlusion":
            draw = ImageDraw.Draw(image)
            width, height = image.size
            
            # Add small occlusions in data area (center of chart)
            data_area = {
                'x_min': width * 0.1,
                'x_max': width * 0.9, 
                'y_min': height * 0.1,
                'y_max': height * 0.8
            }
            
            for i in range(int(intensity)):
                x = random.randint(int(data_area['x_min']), int(data_area['x_max']) - 20)
                y = random.randint(int(data_area['y_min']), int(data_area['y_max']) - 20)
                
                # Small circular occlusion
                draw.ellipse([x, y, x + 15, y + 15], fill='black')
                
        return image
    
    def apply_perturbation(self, image, category, perturbation_type, intensity_level):
        """Apply a specific perturbation to an image"""
        
        try:
            # Ensure image is in RGB mode
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            # Apply perturbation based on category
            if category == "visual_noise":
                result = self.apply_visual_noise(image.copy(), perturbation_type, intensity_level)
            elif category == "geometric_transforms":
                result = self.apply_geometric_transform(image.copy(), perturbation_type, intensity_level)
            elif category == "color_lighting":
                result = self.apply_color_lighting(image.copy(), perturbation_type, intensity_level)
            elif category == "occlusion_effects":
                result = self.apply_occlusion_effects(image.copy(), perturbation_type, intensity_level)
            elif category == "chart_specific":
                result = self.apply_chart_specific(image.copy(), perturbation_type, intensity_level)
            else:
                result = image
                
            # Log successful application
            self.applied_perturbations.append({
                'category': category,
                'type': perturbation_type,
                'intensity': intensity_level,
                'status': 'success'
            })
            
            return result
            
        except Exception as e:
            # Log failure
            self.failure_log.append({
                'category': category,
                'type': perturbation_type,
                'intensity': intensity_level,
                'error': str(e)
            })
            
            logger.error(f"Perturbation failed: {category}/{perturbation_type}/{intensity_level} - {e}")
            return image


 SECTION 2: ADVANCED PERTURBATION ENGINE


### SECTION 3: STRATEGIC PERTURBATION EXECUTION

In [5]:
print("\n SECTION 3: STRATEGIC PERTURBATION EXECUTION")

def execute_strategic_perturbations():
    """Execute perturbations with budget-conscious strategy"""
    
    # Load available charts
    chart_files = list(Path('data/raw_charts').glob('*.png'))
    if not chart_files:
        print(" No charts found in data/raw_charts/")
        return
    
    print(f" Found {len(chart_files)} charts to process")
    
    # Initialize perturbation engine
    engine = AdvancedPerturbationEngine()
    
    # Strategic selection for budget efficiency
    # Use exactly 200 charts for perturbation
    selected_charts = chart_files[:200]
    print(f" Processing {len(selected_charts)} charts for perturbation")

    
    # Priority perturbations (most important for robustness testing)
    priority_perturbations = [
        ("visual_noise", "gaussian_blur"),
        ("geometric_transforms", "rotation"), 
        ("color_lighting", "brightness_shift"),
        ("color_lighting", "grayscale_conversion"),
        ("occlusion_effects", "random_blocks"),
        ("chart_specific", "legend_corruption")
    ]
    
    # # Secondary perturbations (if budget allows)
    # secondary_perturbations = [
    #     ("visual_noise", "motion_blur"),
    #     ("visual_noise", "salt_pepper_noise"),
    #     ("geometric_transforms", "scaling"),
    #     ("color_lighting", "contrast_change"),
    #     ("occlusion_effects", "text_overlay"),
    #     ("chart_specific", "axis_degradation")
    # ]
    
    total_operations = 0
    successful_operations = 0
    
    print("\n APPLYING PRIORITY PERTURBATIONS...")
    
    # Apply priority perturbations to all selected charts
    for i, chart_file in enumerate(selected_charts):
        try:
            original_image = Image.open(chart_file)
            chart_id = chart_file.stem
            
            # Apply each priority perturbation at medium intensity
            for category, pert_type in priority_perturbations:
                output_filename = f"{chart_id}_{pert_type}_medium.png"
                output_path = f"data/perturbations/{output_filename}"
                
                # Skip if already exists
                if Path(output_path).exists():
                    continue
                
                perturbed_image = engine.apply_perturbation(
                    original_image, category, pert_type, "medium"
                )
                
                # Save result
                perturbed_image.save(output_path, format='PNG', quality=95)
                
                total_operations += 1
                successful_operations += 1
                
            # Progress update
            if (i + 1) % 25 == 0:
                print(f" Processed {i + 1}/{len(selected_charts)} charts...")
                
        except Exception as e:
            print(f" Failed to process {chart_file}: {e}")
            continue
    
    print(f"\n PRIORITY PERTURBATIONS COMPLETE")
    print(f" Processed: {len(selected_charts)} charts")
    print(f" Generated: {successful_operations} perturbations")
    
    # # Apply secondary perturbations to subset (budget permitting)
    # secondary_chart_count = min(75, len(selected_charts))
    # secondary_charts = selected_charts[:secondary_chart_count]
    
    # print(f"\n APPLYING SECONDARY PERTURBATIONS TO {secondary_chart_count} CHARTS...")
    
    # for i, chart_file in enumerate(secondary_charts):
    #     try:
    #         original_image = Image.open(chart_file)
    #         chart_id = chart_file.stem
            
    #         # Apply secondary perturbations
    #         for category, pert_type in secondary_perturbations:
    #             output_filename = f"{chart_id}_{pert_type}_medium.png"
    #             output_path = f"data/perturbations/{output_filename}"
                
    #             # Skip if already exists
    #             if Path(output_path).exists():
    #                 continue
                
    #             perturbed_image = engine.apply_perturbation(
    #                 original_image, category, pert_type, "medium"
    #             )
                
    #             perturbed_image.save(output_path, format='PNG', quality=95)
    #             successful_operations += 1
                
    #         if (i + 1) % 15 == 0:
    #             print(f" Secondary processing: {i + 1}/{secondary_chart_count} charts...")
                
    #     except Exception as e:
    #         continue
    
    # # Generate intensity variations for top charts
    # intensity_chart_count = min(50, len(selected_charts))
    # intensity_charts = selected_charts[:intensity_chart_count]
    
    # print(f"\n GENERATING INTENSITY VARIATIONS FOR {intensity_chart_count} CHARTS...")
    
    # # Key perturbations for intensity analysis
    # intensity_perturbations = [
    #     ("visual_noise", "gaussian_blur"),
    #     ("geometric_transforms", "rotation"),
    #     ("color_lighting", "brightness_shift")
    # ]
    
    # for i, chart_file in enumerate(intensity_charts):
    #     try:
    #         original_image = Image.open(chart_file)
    #         chart_id = chart_file.stem
            
    #         for category, pert_type in intensity_perturbations:
    #             for intensity in ["low", "high"]:  # We already have medium
    #                 output_filename = f"{chart_id}_{pert_type}_{intensity}.png"
    #                 output_path = f"data/perturbations/{output_filename}"
                    
    #                 if Path(output_path).exists():
    #                     continue
                    
    #                 perturbed_image = engine.apply_perturbation(
    #                     original_image, category, pert_type, intensity
    #                 )
                    
    #                 perturbed_image.save(output_path, format='PNG', quality=95)
    #                 successful_operations += 1
                    
    #     except Exception as e:
    #         continue
    
    return {
        'total_charts_processed': len(selected_charts),
        'total_perturbations_generated': successful_operations,
        'priority_perturbations': len(priority_perturbations),
        'successful_applications': len(engine.applied_perturbations),
        'failed_applications': len(engine.failure_log),
        'perturbation_engine': engine
    }

# Execute the perturbation strategy
print(" Starting strategic perturbation execution...")
execution_results = execute_strategic_perturbations()



 SECTION 3: STRATEGIC PERTURBATION EXECUTION
 Starting strategic perturbation execution...
 Found 200 charts to process
 Processing 200 charts for perturbation

 APPLYING PRIORITY PERTURBATIONS...
 Processed 25/200 charts...
 Processed 50/200 charts...
 Processed 75/200 charts...
 Processed 100/200 charts...
 Processed 125/200 charts...
 Processed 150/200 charts...
 Processed 175/200 charts...
 Processed 200/200 charts...

 PRIORITY PERTURBATIONS COMPLETE
 Processed: 200 charts
 Generated: 1200 perturbations


### SECTION 4: QUALITY ASSESSMENT

In [6]:
print("\n SECTION 4: PERTURBATION QUALITY ASSESSMENT")

def assess_perturbation_quality(execution_results):
    """Assess the quality and coverage of generated perturbations"""
    
    print(" PERTURBATION EXECUTION SUMMARY:")
    print("-" * 50)
    
    print(f" Charts Processed: {execution_results['total_charts_processed']}")
    print(f" Perturbations Generated: {execution_results['total_perturbations_generated']}")
    print(f" Successful Operations: {execution_results['successful_applications']}")
    print(f" Failed Operations: {execution_results['failed_applications']}")
    
    if execution_results['failed_applications'] > 0:
        failure_rate = (execution_results['failed_applications'] / 
                       (execution_results['successful_applications'] + execution_results['failed_applications'])) * 100
        print(f" Failure Rate: {failure_rate:.2f}%")
    
    # Analyze perturbation distribution
    perturbation_files = list(Path('data/perturbations').glob('*.png'))
    print(f"\n Total Perturbation Files: {len(perturbation_files)}")
    
    # Categorize by perturbation type
    perturbation_stats = {}
    intensity_stats = {}
    
    for file_path in perturbation_files:
        filename = file_path.stem
        
        # Extract perturbation info from filename
        parts = filename.split('_')
        if len(parts) >= 3:
            pert_type = parts[-2]
            intensity = parts[-1]
            
            perturbation_stats[pert_type] = perturbation_stats.get(pert_type, 0) + 1
            intensity_stats[intensity] = intensity_stats.get(intensity, 0) + 1
    
    print(f"\n PERTURBATION TYPE DISTRIBUTION:")
    for pert_type, count in sorted(perturbation_stats.items()):
        print(f"  {pert_type}: {count} files")
    
    print(f"\n INTENSITY DISTRIBUTION:")
    for intensity, count in sorted(intensity_stats.items()):
        print(f"  {intensity}: {count} files")
    
    # Estimate extraction potential
    estimated_extractions = min(1000, len(perturbation_files))
    estimated_cost = estimated_extractions * 0.03
    
    print(f"\n EXTRACTION PLANNING:")
    print(f" Available for Extraction: {len(perturbation_files)} perturbations")
    print(f" Recommended Sample Size: {estimated_extractions}")
    print(f" Estimated Cost: ${estimated_cost:.2f}")
    
    return {
        'total_perturbation_files': len(perturbation_files),
        'perturbation_distribution': perturbation_stats,
        'intensity_distribution': intensity_stats,
        'recommended_sample_size': estimated_extractions,
        'estimated_extraction_cost': estimated_cost
    }

quality_assessment = assess_perturbation_quality(execution_results)




 SECTION 4: PERTURBATION QUALITY ASSESSMENT
 PERTURBATION EXECUTION SUMMARY:
--------------------------------------------------
 Charts Processed: 200
 Perturbations Generated: 1200
 Successful Operations: 1200
 Failed Operations: 0

 Total Perturbation Files: 1200

 PERTURBATION TYPE DISTRIBUTION:
  blocks: 200 files
  blur: 200 files
  conversion: 200 files
  corruption: 200 files
  rotation: 200 files
  shift: 200 files

 INTENSITY DISTRIBUTION:
  medium: 1200 files

 EXTRACTION PLANNING:
 Available for Extraction: 1200 perturbations
 Recommended Sample Size: 1000
 Estimated Cost: $30.00


### SECTION 5: PREPARATION FOR EXTRACTION

In [7]:
print("\n SECTION 5: PREPARATION FOR EXTRACTION PHASE")

# Create comprehensive summary for next phase
extraction_preparation = {
    'perturbation_generation_complete': True,
    'total_charts_available': execution_results['total_charts_processed'],
    'total_perturbations_available': quality_assessment['total_perturbation_files'],
    'perturbation_types': list(quality_assessment['perturbation_distribution'].keys()),
    'intensity_levels': list(quality_assessment['intensity_distribution'].keys()),
    'recommended_extraction_strategy': {
        'phase_1_originals': min(200, execution_results['total_charts_processed']),
        'phase_2_priority_perturbations': min(600, quality_assessment['total_perturbation_files']),
        'phase_3_intensity_analysis': min(300, quality_assessment['total_perturbation_files']),
        'total_planned_extractions': min(1100, 200 + quality_assessment['total_perturbation_files'])
    },
    'budget_planning': {
        'estimated_total_cost': quality_assessment['estimated_extraction_cost'],
        'cost_per_extraction': 0.03,
        'budget_available': 45.00,
        'cost_efficiency': quality_assessment['estimated_extraction_cost'] <= 45.00
    },
    'next_notebook': '04_GPT4_Extraction_Pipeline.ipynb'
}

# Save preparation summary
with open('data/analysis_cache/perturbation_summary.json', 'w') as f:
    json.dump(extraction_preparation, f, indent=2)

print(" Extraction preparation summary saved")

# Create sample showcase
def showcase_perturbation_samples():
    """Display sample perturbations for visual verification"""
    
    print("\n PERTURBATION SAMPLE SHOWCASE:")
    print("=" * 60)
    
    # Find sample files for each major perturbation type
    sample_types = ['gaussian_blur', 'rotation', 'brightness_shift', 'random_blocks', 'legend_corruption']
    
    for pert_type in sample_types:
        matching_files = list(Path('data/perturbations').glob(f'*{pert_type}*.png'))
        
        if matching_files:
            sample_file = matching_files[0]
            print(f" {pert_type.replace('_', ' ').title()}: {sample_file.name}")
            
            # Check file size and dimensions
            try:
                with Image.open(sample_file) as img:
                    print(f"   Size: {img.size[0]}x{img.size[1]} pixels")
                    print(f"   Mode: {img.mode}")
                    file_size = sample_file.stat().st_size / 1024  # KB
                    print(f"   File Size: {file_size:.1f} KB")
            except Exception as e:
                print(f"    Could not analyze: {e}")
        else:
            print(f" {pert_type}: No samples found")
        
        print()

showcase_perturbation_samples()

# Final validation and readiness check
def validate_readiness_for_extraction():
    """Final validation before proceeding to extraction phase"""
    
    print("\n READINESS VALIDATION FOR EXTRACTION PHASE:")
    print("-" * 60)
    
    checks = {
        'original_charts_available': len(list(Path('data/raw_charts').glob('*.png'))) > 0,
        'perturbations_generated': len(list(Path('data/perturbations').glob('*.png'))) > 0,
        'config_files_present': Path('research_config.json').exists(),
        'directories_created': all([
            Path('data/extractions').exists(),
            Path('data/analysis_cache').exists(),
            Path('results').exists()
        ]),
        'budget_sufficient': extraction_preparation['budget_planning']['cost_efficiency']
    }
    
    all_ready = True
    for check_name, status in checks.items():
        status_icon = "good" if status else "not good"
        print(f"{status_icon} {check_name.replace('_', ' ').title()}: {'Ready' if status else 'Not Ready'}")
        if not status:
            all_ready = False
    
    print("-" * 60)
    
    if all_ready:
        print(" ALL SYSTEMS READY FOR EXTRACTION PHASE!")
        print(" Proceed to Notebook 4: GPT-4o Extraction Pipeline")
        
        # Provide clear next steps
        print(f"\n EXTRACTION PHASE PREVIEW:")
        print(f"    Charts to Extract: {extraction_preparation['recommended_extraction_strategy']['phase_1_originals']}")
        print(f"    Perturbations to Extract: {extraction_preparation['recommended_extraction_strategy']['phase_2_priority_perturbations']}")
        print(f"    Estimated Cost: ${extraction_preparation['budget_planning']['estimated_total_cost']:.2f}")
        print(f"    Estimated Time: 2-4 hours")
        
    else:
        print(" ISSUES DETECTED - Please resolve before proceeding")
        
    return all_ready

readiness_status = validate_readiness_for_extraction()


 SECTION 5: PREPARATION FOR EXTRACTION PHASE
 Extraction preparation summary saved

 PERTURBATION SAMPLE SHOWCASE:
 Gaussian Blur: chart_001_complex_bar_gaussian_blur_medium.png
   Size: 4167x2670 pixels
   Mode: RGB
   File Size: 393.5 KB

 Rotation: chart_001_complex_bar_rotation_medium.png
   Size: 4385x3024 pixels
   Mode: RGB
   File Size: 384.8 KB

 Brightness Shift: chart_001_complex_bar_brightness_shift_medium.png
   Size: 4167x2670 pixels
   Mode: RGB
   File Size: 208.7 KB

 Random Blocks: chart_001_complex_bar_random_blocks_medium.png
   Size: 4167x2670 pixels
   Mode: RGB
   File Size: 214.2 KB

 Legend Corruption: chart_001_complex_bar_legend_corruption_medium.png
   Size: 4167x2670 pixels
   Mode: RGB
   File Size: 215.1 KB


 READINESS VALIDATION FOR EXTRACTION PHASE:
------------------------------------------------------------
good Original Charts Available: Ready
good Perturbations Generated: Ready
good Config Files Present: Ready
good Directories Created: Ready
good 

### SECTION 6: EXECUTION SUMMARY AND LOGGING

In [8]:
print("\n SECTION 6: EXECUTION SUMMARY")

# Create comprehensive execution log
execution_log = {
    'notebook': '03_Perturbation_Framework',
    'execution_timestamp': pd.Timestamp.now().isoformat(),
    'status': 'SUCCESS' if readiness_status else 'ISSUES_DETECTED',
    'charts_processed': execution_results['total_charts_processed'],
    'perturbations_generated': execution_results['total_perturbations_generated'],
    'perturbation_types_applied': len(set(p['type'] for p in execution_results['perturbation_engine'].applied_perturbations)),
    'failure_rate': len(execution_results['perturbation_engine'].failure_log) / max(1, len(execution_results['perturbation_engine'].applied_perturbations)) * 100,
    'ready_for_extraction': readiness_status,
    'next_phase_cost_estimate': extraction_preparation['budget_planning']['estimated_total_cost']
}

# Save execution log
with open('logs/perturbation_execution_log.json', 'w') as f:
    json.dump(execution_log, f, indent=2)

print(" Execution log saved to logs/perturbation_execution_log.json")

# Print final summary
print("\n" + "=" * 80)
print(" PERTURBATION FRAMEWORK EXECUTION COMPLETE!")
print("=" * 80)

if readiness_status:
    print(" STATUS: SUCCESSFUL")
    print(f" Generated {execution_results['total_perturbations_generated']} perturbations")
    print(f" {len(extraction_preparation['perturbation_types'])} perturbation types applied")
    print(f" Extraction cost estimate: ${extraction_preparation['budget_planning']['estimated_total_cost']:.2f}")
    print(" READY FOR EXTRACTION PHASE!")
else:
    print(" STATUS: ISSUES DETECTED")
    print(" Please review validation errors above")

print("=" * 80)

# Log to research logger
logger.info(f"Perturbation framework execution completed")
logger.info(f"Charts processed: {execution_results['total_charts_processed']}")
logger.info(f"Perturbations generated: {execution_results['total_perturbations_generated']}")
logger.info(f"Status: {execution_log['status']}")
logger.info(f"Ready for extraction: {readiness_status}")


 SECTION 6: EXECUTION SUMMARY
 Execution log saved to logs/perturbation_execution_log.json

 PERTURBATION FRAMEWORK EXECUTION COMPLETE!
 STATUS: SUCCESSFUL
 Generated 1200 perturbations
 6 perturbation types applied
 Extraction cost estimate: $30.00
 READY FOR EXTRACTION PHASE!
