# Color-Based Pixel Cluster Detection

Implements a detector for specific color clusters (Grey: `#b2b2b2`, Red: `#b81f34`) in images. It includes:
- Centralized configuration for parameters.
- Image loading and display.
- An exact HEX/RGB color matching algorithm.
- Connected component analysis to identify clusters.
- Filtering clusters by minimum area.
- Comprehensive visualizations of intermediate steps and final results.
- A simple hyperparameter tuning example for `min_cluster_area`.

In [None]:
import cv2
import numpy as np
import os
import glob
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.notebook import tqdm
from skimage import measure # For connected component analysis
from skimage.color import label2rgb
from PIL import ImageColor # Pillow for HEX to RGB and potentially image loading

# Target colors (display name -> HEX string). Detection is an exact match on
# these values, so they must equal the pixel colors byte for byte.
TARGET_COLORS_HEX = {
    'Grey': '#b2b2b2',
    'Red': '#b81f34',
}

# Convert HEX to RGB tuples
TARGET_COLORS_RGB = {name: ImageColor.getrgb(hex_val) for name, hex_val in TARGET_COLORS_HEX.items()}
print(f"Target RGB Colors: {TARGET_COLORS_RGB}")

# Image paths (relative to the notebook location)
BASE_IMAGE_DIR = 'Stellenbilder-no-augs/train/'

# Attempt to find a sample image, or use a placeholder.
# glob.glob() yields matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to make the chosen sample image reproducible between runs.
image_files_in_dir = sorted(glob.glob(os.path.join(BASE_IMAGE_DIR, '*.jpg')))
if image_files_in_dir:
    SAMPLE_IMAGE_NAME = os.path.basename(image_files_in_dir[0])
else:
    SAMPLE_IMAGE_NAME = 'YOUR_SAMPLE_IMAGE.jpg'  # Placeholder if no images found
    print(f"Warning: No JPG images found in {BASE_IMAGE_DIR}. Please set SAMPLE_IMAGE_NAME manually.")

SAMPLE_IMAGE_PATH = os.path.join(BASE_IMAGE_DIR, SAMPLE_IMAGE_NAME)
print(f"Using sample image: {SAMPLE_IMAGE_PATH}")

# Hyperparameters for tuning
HYPERPARAMS = {
    # Candidate minimum cluster areas (in pixels) swept by the tuning cell.
    'min_cluster_area_range': [200, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250, 2500],
}

# Default minimum area (in pixels) a cluster must reach to count as a detection.
DEFAULT_MIN_AREA = 199

## 2. Helper Functions

In [None]:
def load_image_rgb(image_path):
    """Read an image file with OpenCV and return it in RGB channel order.

    Returns None (after printing a diagnostic) when the path does not exist,
    when OpenCV cannot decode the file, or when reading/converting raises.
    """
    if os.path.exists(image_path):
        try:
            bgr_pixels = cv2.imread(image_path)
            if bgr_pixels is not None:
                # OpenCV decodes to BGR; the rest of the notebook works in RGB.
                return cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
            print(f"Error: OpenCV could not read image at {image_path}")
        except Exception as e:
            print(f"Exception while loading image {image_path}: {e}")
    else:
        print(f"Error: Image not found at {image_path}")
    return None

def display_image(image, title="", cmap=None, figsize=(8, 6)):
    """Show a single image in its own Matplotlib figure with the axes hidden.

    Prints a notice and does nothing else when `image` is None.
    """
    if image is not None:
        plt.figure(figsize=figsize)
        plt.imshow(image, cmap=cmap)
        plt.title(title)
        plt.axis('off')
        plt.show()
    else:
        print("Cannot display None image.")

def display_images_side_by_side(images, titles, figsize=(15, 7), cmap=None):
    """Show several images in one row, one subplot per image.

    `images` and `titles` must be non-empty and of equal length; otherwise a
    message is printed and nothing is drawn. A None entry in `images` leaves
    its subplot blank. `cmap` is applied only to 2-D (single-channel) arrays.
    """
    inputs_ok = bool(images) and bool(titles) and len(images) == len(titles)
    if not inputs_ok:
        print("Invalid input for display_images_side_by_side")
        return

    n_panels = len(images)
    fig, axes = plt.subplots(1, n_panels, figsize=figsize)
    if n_panels == 1:
        # plt.subplots returns a bare Axes for a single panel; wrap it so the
        # loop below can treat both cases the same way.
        axes = [axes]

    for ax, img, caption in zip(axes, images, titles):
        if img is not None:
            panel_cmap = cmap if len(img.shape) == 2 else None
            ax.imshow(img, cmap=panel_cmap)
            ax.set_title(caption)
        ax.axis('off')
    plt.tight_layout()
    plt.show()

def detect_exact_color_clusters(image_rgb, target_rgb_color, min_area=100):
    """Find connected clusters of pixels that exactly equal `target_rgb_color`.

    Args:
        image_rgb: image array whose last axis holds the color channels, or None.
        target_rgb_color: color to match; compared element-wise against each pixel.
        min_area: smallest region area (as reported by regionprops) to keep.

    Returns:
        A list of dicts with keys 'label', 'bbox' (min_row, min_col, max_row,
        max_col), 'area' and 'centroid' (row, col) -- one per kept cluster.
        The list is empty when the image is None or no pixel matches.
    """
    if image_rgb is None:
        return []

    # A pixel matches only if every channel equals the target value.
    exact_match = np.all(image_rgb == target_rgb_color, axis=-1)
    if not exact_match.any():
        return []

    # connectivity=2 also joins diagonal neighbours into the same cluster.
    label_image, n_components = measure.label(exact_match, connectivity=2, background=0, return_num=True)
    if n_components == 0:
        return []

    return [
        {
            'label': props.label,
            'bbox': props.bbox,  # (min_row, min_col, max_row, max_col)
            'area': props.area,
            'centroid': props.centroid,  # (row, col)
        }
        for props in measure.regionprops(label_image)
        if props.area >= min_area
    ]

def draw_detections_on_image(image_rgb, detections, color_name_label, box_color_rgb=(0, 255, 0)):
    """Return a copy of `image_rgb` with a box and a text label per detection.

    Args:
        image_rgb: RGB image array to annotate (left unmodified), or None.
        detections: iterable of dicts carrying 'bbox' as
            (min_row, min_col, max_row, max_col) and 'area'.
        color_name_label: text placed in front of the area in each label.
        box_color_rgb: RGB color used for both the rectangle and the text.

    Returns:
        The annotated copy, or None when `image_rgb` is None.
    """
    if image_rgb is None:
        return None
    output_image = image_rgb.copy()

    # OpenCV drawing calls write the color tuple into the array channel by
    # channel without any color-space conversion. The array here is RGB (and
    # is later shown with Matplotlib, which expects RGB), so the color must be
    # passed through unchanged; reversing it to BGR would swap red and blue.
    box_color = tuple(box_color_rgb)

    for det in detections:
        min_r, min_c, max_r, max_c = det['bbox']
        area = det['area']

        # skimage bounding boxes are half-open (max_row/max_col are exclusive)
        # while cv2.rectangle treats both corners as inclusive.
        cv2.rectangle(output_image, (min_c, min_r), (max_c - 1, max_r - 1), box_color, 2)

        label_text = f"{color_name_label} (Area: {area})"

        # Put the text above the box, or inside it when the box touches the
        # top edge and there is no room above.
        text_y_pos = min_r - 10 if min_r - 10 > 10 else min_r + 20
        cv2.putText(output_image, label_text, (min_c, text_y_pos),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 1, cv2.LINE_AA)

    return output_image

## 3. Load and Display a Sample Image

In [None]:
# Load the configured sample image once; the following cells reuse
# `sample_image_rgb` and skip themselves when it is None.
sample_image_rgb = load_image_rgb(SAMPLE_IMAGE_PATH)

if sample_image_rgb is None:
    print(f"Failed to load sample image. Please check the path: {SAMPLE_IMAGE_PATH}")
else:
    display_image(sample_image_rgb, title=f"Sample Image: {SAMPLE_IMAGE_NAME}")

## 4. How the Color Detector Works

The process involves:
1.  **Binary Mask Creation:** Identify all pixels that exactly match the target RGB color.
2.  **Connected Component Labeling:** Group adjacent matching pixels into clusters (regions).
3.  **Region Property Analysis & Filtering:** Calculate properties (like area) for each cluster and filter out small, potentially noisy clusters.

In [None]:
# Step-by-step walkthrough of the detector on the sample image for one color.
if sample_image_rgb is None:
    print("Sample image not loaded, skipping deep dive.")
else:
    target_color_name = 'Grey' # Or 'Red'
    target_rgb_value = TARGET_COLORS_RGB[target_color_name]
    print(f"--- Analyzing for color: {target_color_name} ({target_rgb_value}) ---")

    # Step 1: boolean mask of pixels whose channels all equal the target color.
    binary_mask_for_target = np.all(sample_image_rgb == target_rgb_value, axis=-1)
    display_image(
        binary_mask_for_target.astype(np.uint8) * 255,
        title=f"1. Binary Mask for {target_color_name} ({target_rgb_value})",
        cmap='Pastel1',
    )

    # Step 2: group matching pixels into connected components.
    if not np.any(binary_mask_for_target):
        print(f"No pixels found for {target_color_name}. Skipping labeling and filtering visualization.")
        labeled_mask = None
        num_labels = 0
    else:
        labeled_mask, num_labels = measure.label(binary_mask_for_target, connectivity=2, background=0, return_num=True)
        print(f"Found {num_labels} initial connected components (clusters) for {target_color_name}.")

        # Overlay each component in its own color on top of the sample image.
        colored_labeled_mask = label2rgb(labeled_mask, image=sample_image_rgb, bg_label=0, kind='overlay')
        display_image(
            colored_labeled_mask,
            title=f"2. Labeled Components for {target_color_name} (Before Area Filtering)",
        )

    # Step 3: measure every component and keep only the large enough ones.
    if labeled_mask is None or num_labels <= 0:
        print(f"No components to filter for {target_color_name}.")
    else:
        regions = measure.regionprops(labeled_mask)
        print(f"--- Region Properties (before filtering by area={DEFAULT_MIN_AREA}) ---")
        # Only the first few regions are listed to keep the output short.
        for region in regions[:5]:
            print(f"  Region {region.label}: Area={region.area}, BBox={region.bbox}")
        if len(regions) > 5:
            print("  ... and more regions.")

        filtered_detections = []
        for region in regions:
            if region.area < DEFAULT_MIN_AREA:
                continue
            filtered_detections.append({
                'label': region.label,
                'bbox': region.bbox,
                'area': region.area,
                'centroid': region.centroid,
            })

        print(f"Found {len(filtered_detections)} clusters for {target_color_name} after filtering by min_area >= {DEFAULT_MIN_AREA} pixels.")

        # Draw the surviving clusters on a copy of the sample image.
        image_with_filtered_detections = draw_detections_on_image(
            sample_image_rgb,
            filtered_detections,
            target_color_name,
            box_color_rgb=(255,0,0),
        )
        if image_with_filtered_detections is not None:
            display_image(
                image_with_filtered_detections,
                title=f"3. {target_color_name} Detections (min_area={DEFAULT_MIN_AREA})",
            )

## 5. Applying Detector and Visualizing Results for All Target Colors

In [None]:
# Run the detector for every target color on the sample image, draw all
# detections on one shared copy, then show the raw per-color masks.
if sample_image_rgb is None:
    print("Sample image not loaded, skipping detection visualization.")
else:
    image_to_draw_on = sample_image_rgb.copy()
    all_detections_combined = []

    # Box color handed to draw_detections_on_image for each target color.
    bbox_draw_colors = {'Grey': (0, 0, 255), 'Red': (0, 255, 0)}

    for color_name, target_rgb in TARGET_COLORS_RGB.items():
        print(f"Detecting '{color_name}' clusters (RGB: {target_rgb}, Min Area: {DEFAULT_MIN_AREA})...")
        detections = detect_exact_color_clusters(sample_image_rgb, target_rgb, min_area=DEFAULT_MIN_AREA)
        print(f"Found {len(detections)} '{color_name}' clusters.")
        all_detections_combined += detections

        # Colors without an entry above fall back to (255, 255, 0).
        draw_color_for_bbox = bbox_draw_colors.get(color_name, (255,255,0))
        image_to_draw_on = draw_detections_on_image(
            image_to_draw_on, detections, color_name, draw_color_for_bbox
        )

    if image_to_draw_on is not None:
        display_image(image_to_draw_on, title=f"All Detected Clusters (min_area={DEFAULT_MIN_AREA})")

    # Exact-match masks (0/255) for each color, before any area filtering.
    masks_to_show = []
    mask_titles = []
    for color_name, target_rgb in TARGET_COLORS_RGB.items():
        mask = np.all(sample_image_rgb == target_rgb, axis=-1).astype(np.uint8) * 255
        masks_to_show.append(mask)
        mask_titles.append(f"Mask for {color_name}")

    if masks_to_show:
        display_images_side_by_side(masks_to_show, mask_titles, cmap='gray')

## 6. Hyperparameter Tuning: Minimum Cluster Area (`min_cluster_area`)

The `min_cluster_area` parameter helps filter out small, potentially irrelevant pixel groups.

In [None]:
# Sweep the minimum-area threshold on the sample image and plot how many
# clusters of each color survive at every threshold.
if sample_image_rgb is None:
    print("Sample image not loaded, skipping hyperparameter tuning.")
else:
    min_area_values = HYPERPARAMS['min_cluster_area_range']

    # One list of counts per color, aligned with min_area_values.
    cluster_counts_by_area = {color_name: [] for color_name in TARGET_COLORS_RGB}

    for area_thresh in min_area_values:
        print(f"Testing min_area = {area_thresh} pixels...")
        for color_name, target_rgb in TARGET_COLORS_RGB.items():
            detections = detect_exact_color_clusters(sample_image_rgb, target_rgb, min_area=area_thresh)
            cluster_counts_by_area[color_name].append(len(detections))

    plt.figure(figsize=(12, 7))

    # Value labels sit 2% of the largest count above their marker; fall back
    # to 1 when every count is zero so the offset stays well defined.
    if any(any(v) for v in cluster_counts_by_area.values()):
        max_val_for_offset = max(max(v) for v in cluster_counts_by_area.values())
    else:
        max_val_for_offset = 1

    for color_name, counts in cluster_counts_by_area.items():
        plt.plot(min_area_values, counts, marker='o', linestyle='-', label=f'{color_name} Clusters')
        for x_pos, count_val in zip(min_area_values, counts):
            plt.text(
                x_pos,
                count_val + (0.02 * max_val_for_offset),
                str(count_val),
                ha='center',
                va='bottom',
                fontsize=9,
            )

    plt.xlabel("Minimum Cluster Area (pixels)")
    plt.ylabel("Number of Detected Clusters")
    plt.title("Effect of Minimum Cluster Area on Detections (Sample Image)")
    plt.legend()
    plt.grid(True, which="both", ls="--", c='0.7')
    plt.xticks(min_area_values)
    plt.tight_layout()
    plt.show()

    print("--- Summary of Cluster Counts ---")
    for color_name in TARGET_COLORS_RGB:
        print(f"Color: {color_name}")
        for area_val, n_clusters in zip(min_area_values, cluster_counts_by_area[color_name]):
            print(f"  Min Area: {area_val:4d} pixels -> Clusters: {n_clusters}")

## 7. Batch Processing: Applying to Multiple Images

Apply the detector with a chosen `min_cluster_area` (e.g., selected based on the tuning plot above) to a few more images from the dataset to see how it performs more broadly.

In [None]:
# Run the detector with DEFAULT_MIN_AREA on up to five images from
# BASE_IMAGE_DIR and display each annotated result.
print(f"Using CHOSEN_MIN_AREA = {DEFAULT_MIN_AREA} for batch processing.")

if not os.path.exists(BASE_IMAGE_DIR):
    print(f"Image directory {BASE_IMAGE_DIR} not found. Skipping batch processing.")
else:
    all_image_files = glob.glob(os.path.join(BASE_IMAGE_DIR, '*.jpg'))
    num_images_to_process = min(5, len(all_image_files))

    if num_images_to_process > 0:
        selected_image_files = all_image_files[:num_images_to_process]
        print(f"Processing {num_images_to_process} images...")

        for img_path in selected_image_files:
            print(f"--- Processing: {os.path.basename(img_path)} ---")
            current_image_rgb = load_image_rgb(img_path)

            if current_image_rgb is None:
                print(f"Skipping {os.path.basename(img_path)} due to loading error.")
                continue

            # All colors are drawn onto the same copy of the current image.
            image_with_all_detections = current_image_rgb.copy()

            # Box color handed to draw_detections_on_image for each target color.
            bbox_draw_colors_batch = {'Grey': (0, 0, 255), 'Red': (0, 255, 0)}

            any_detections_on_image = False
            for color_name, target_rgb in TARGET_COLORS_RGB.items():
                detections = detect_exact_color_clusters(
                    current_image_rgb, target_rgb, min_area=DEFAULT_MIN_AREA
                )
                print(f"  Found {len(detections)} '{color_name}' clusters.")
                any_detections_on_image |= bool(detections)

                draw_color = bbox_draw_colors_batch.get(color_name, (255,255,0))
                image_with_all_detections = draw_detections_on_image(
                    image_with_all_detections, detections, color_name, draw_color
                )

            if image_with_all_detections is not None:
                display_title = f"Detections on {os.path.basename(img_path)} (min_area={DEFAULT_MIN_AREA})"
                if not any_detections_on_image:
                    display_title += " - No clusters found"
                display_image(image_with_all_detections, title=display_title, figsize=(10,8))
    else:
        print(f"No JPG images found in {BASE_IMAGE_DIR} for batch processing.")

## 8. Evaluation: Comparing Detector Results with Ground Truth from COCO Annotations

In [None]:
# Function to load COCO annotations and extract object counts per image
def load_coco_annotations(annotation_file):
    """Map each annotated image's file name to its number of COCO annotations.

    Only images referenced by at least one annotation appear in the result;
    annotations whose image_id has no entry under 'images' are ignored.

    Returns:
        dict of file name -> annotation count, or None (after printing a
        message) if the file is missing or reading/parsing it fails.
    """
    if not os.path.exists(annotation_file):
        print(f"Annotation file not found: {annotation_file}")
        return None

    try:
        with open(annotation_file, 'r') as handle:
            coco = json.load(handle)

        # image id -> file name lookup
        names_by_id = {entry['id']: entry['file_name'] for entry in coco['images']}

        # number of annotations seen per image id
        per_image_counts = defaultdict(int)
        for ann in coco['annotations']:
            per_image_counts[ann['image_id']] += 1

        return {
            names_by_id[image_id]: count
            for image_id, count in per_image_counts.items()
            if image_id in names_by_id
        }

    except Exception as e:
        print(f"Error loading COCO annotations: {e}")
        return None

# Root directory of the dataset; each split lives in its own sub-directory.
DATASET_PATH = 'Stellenbilder-no-augs'  # 'Stellenbilder-augmented' for the augmented dataset

# Per-split mapping of file name -> annotation count (None if loading failed).
splits = ['train', 'valid', 'test']
annotations_by_split = {}

for split in splits:
    annotation_file = os.path.join(DATASET_PATH, split, '_annotations.coco.json')
    print(f"Loading annotations from {annotation_file}...")
    split_annotations = load_coco_annotations(annotation_file)
    annotations_by_split[split] = split_annotations
    # None and an empty mapping are both reported as a failure.
    if not split_annotations:
        print(f"Failed to load annotations for {split} split")
    else:
        print(f"Loaded {len(split_annotations)} image annotations for {split} split")

In [None]:
# Compare the detector's per-image cluster count with the number of COCO
# annotations for the same image.
# NOTE: despite the `test_*` variable names, the split evaluated here is 'train'.
test_split = 'train'
test_images_dir = os.path.join(DATASET_PATH, test_split)
test_annotations = annotations_by_split[test_split]

if not test_annotations:
    print("Test annotations not available. Cannot proceed with evaluation.")
else:
    # Function to run detector on an image and return detection counts
    def evaluate_image_with_detector(image_path, min_area=DEFAULT_MIN_AREA):
        """Run the detector on an image and return counts per target color.

        Returns a dict with one entry per color name plus 'total', or None
        when the image cannot be loaded.
        """
        image_rgb = load_image_rgb(image_path)
        if image_rgb is None:
            return None

        # Initialize detection results
        detection_results = {}
        total_detections = 0

        # Run detector for each target color
        for color_name, target_rgb in TARGET_COLORS_RGB.items():
            detections = detect_exact_color_clusters(image_rgb, target_rgb, min_area=min_area)
            detection_results[color_name] = len(detections)
            total_detections += len(detections)

        detection_results['total'] = total_detections
        return detection_results

    # Prepare data structure for evaluation results
    evaluation_results = []

    # Keep only annotated images whose file actually exists on disk.
    test_image_files = []
    for filename in test_annotations.keys():
        file_path = os.path.join(test_images_dir, filename)
        if os.path.exists(file_path):
            test_image_files.append((filename, file_path))

    print(f"Found {len(test_image_files)} test images with annotations")

    # Run evaluation on test images
    print(f"Running evaluation with min_area={DEFAULT_MIN_AREA}...")
    for filename, file_path in tqdm(test_image_files, desc="Evaluating images"):
        # Get ground truth count
        gt_count = test_annotations.get(filename, 0)

        # Run detector; images that fail to load are left out of the results.
        detection_results = evaluate_image_with_detector(file_path)

        if detection_results is not None:
            # Store results
            result = {
                'filename': filename,
                'ground_truth_count': gt_count,
                'detected_total': detection_results['total']
            }
            # Add individual color counts
            for color_name in TARGET_COLORS_RGB.keys():
                result[f'detected_{color_name}'] = detection_results.get(color_name, 0)

            evaluation_results.append(result)

    eval_df = pd.DataFrame(evaluation_results)

    if eval_df.empty:
        # A DataFrame built from an empty list has no columns, so the column
        # lookups below would raise KeyError; report and stop instead.
        print("\nNo images could be evaluated; skipping evaluation summary.")
    else:
        # Display summary statistics
        print("\nEvaluation Summary:")
        print(f"Number of evaluated images: {len(eval_df)}")
        print(f"Average ground truth objects per image: {eval_df['ground_truth_count'].mean():.2f}")
        print(f"Average detected objects per image: {eval_df['detected_total'].mean():.2f}")

        # Calculate detection difference (error): signed, absolute, and
        # relative to the ground-truth count.
        eval_df['detection_diff'] = eval_df['detected_total'] - eval_df['ground_truth_count']
        eval_df['detection_error'] = eval_df['detection_diff'].abs()
        eval_df['detection_error_pct'] = (eval_df['detection_error'] / eval_df['ground_truth_count']) * 100
        eval_df['detection_error_pct'] = eval_df['detection_error_pct'].fillna(0)

        # Display more statistics
        print(f"Average absolute detection error: {eval_df['detection_error'].mean():.2f} objects")
        print(f"Median absolute detection error: {eval_df['detection_error'].median():.2f} objects")
        print(f"Average detection error percentage: {eval_df['detection_error_pct'].mean():.2f}%")

        # Display the first few results
        print("\nSample evaluation results:")
        print(eval_df[['filename', 'ground_truth_count', 'detected_total', 'detection_diff']].head(10))

In [None]:
# Visualize the evaluation results.
# The guard skips plotting when the evaluation cell did not create `eval_df`
# (e.g. annotations were unavailable) or when it produced no rows.
if 'eval_df' in locals() and not eval_df.empty:
    plt.figure(figsize=(15, 10))
    plt.subplot(2, 2, 1)

    # 1. Scatter plot of ground truth vs detected
    plt.scatter(eval_df['ground_truth_count'], eval_df['detected_total'], alpha=0.6)
    # Add perfect prediction line (y = x) spanning the largest count on either axis
    max_count = max(eval_df['ground_truth_count'].max(), eval_df['detected_total'].max())
    plt.plot([0, max_count], [0, max_count], 'r--', label='Perfect Detection')
    plt.xlabel('Ground Truth Object Count')
    plt.ylabel('Detected Object Count')
    plt.title('Ground Truth vs Detected Objects')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()

    # 2. Histogram of the signed detection difference (detected - ground truth)
    plt.subplot(2, 2, 2)
    sns.histplot(eval_df['detection_diff'], kde=True)
    plt.axvline(x=0, color='r', linestyle='--', label='Perfect Detection')
    plt.xlabel('Detection Difference (Detected - Ground Truth)')
    plt.ylabel('Frequency')
    plt.title('Detection Error Distribution')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()

    # 3. Bar chart of error by ground truth count
    plt.subplot(2, 2, 3)
    # Group by ground truth count and get mean absolute error per group
    error_by_count = eval_df.groupby('ground_truth_count')['detection_error'].mean().reset_index()
    # Sort by ground truth count for better visualization
    error_by_count = error_by_count.sort_values('ground_truth_count')
    plt.bar(error_by_count['ground_truth_count'], error_by_count['detection_error'])
    plt.xlabel('Ground Truth Object Count')
    plt.ylabel('Average Absolute Error')
    plt.title('Average Detection Error by Ground Truth Count')
    plt.grid(True, linestyle='--', alpha=0.7)

    # 4. Pie chart of over-detection vs under-detection (number of images in each case)
    plt.subplot(2, 2, 4)
    over_detection = (eval_df['detection_diff'] > 0).sum()
    under_detection = (eval_df['detection_diff'] < 0).sum()
    exact_detection = (eval_df['detection_diff'] == 0).sum()

    labels = ['Over-detected', 'Under-detected', 'Exact Match']
    sizes = [over_detection, under_detection, exact_detection]
    colors = ['#ff9999', '#66b3ff', '#99ff99']
    explode = (0.1, 0.1, 0.1)  # explode all slices for better visibility

    plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90)
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
    plt.title('Distribution of Detection Results')

    plt.tight_layout()
    plt.show()

    # Another useful visualization: Detection error by image
    plt.figure(figsize=(15, 6))

    # Sort images by absolute error (largest first) for better visualization
    sorted_df = eval_df.sort_values('detection_error', ascending=False)
    top_n = min(20, len(sorted_df))  # Show top 20 or less if fewer images

    plt.bar(range(top_n), sorted_df['detection_error'].head(top_n), color='skyblue')
    plt.xticks(range(top_n), sorted_df['filename'].head(top_n), rotation=90)
    plt.xlabel('Image Filename')
    plt.ylabel('Absolute Detection Error')
    plt.title(f'Top {top_n} Images with Highest Detection Error')
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    # One more visualization: accuracy as a function of error tolerance, i.e.
    # the share of images whose absolute count error is within each tolerance.
    # (This is a cumulative tolerance curve, not an ROC curve.)
    plt.figure(figsize=(10, 6))

    # Create thresholds for error tolerance (in objects)
    error_thresholds = [0, 1, 2, 3, 4, 5, 10]
    accuracy_rates = []

    for threshold in error_thresholds:
        # Count images where error is less than or equal to threshold
        accurate_count = (eval_df['detection_error'] <= threshold).sum()
        accuracy_rate = accurate_count / len(eval_df) * 100
        accuracy_rates.append(accuracy_rate)

    plt.plot(error_thresholds, accuracy_rates, marker='o', linestyle='-', linewidth=2)
    plt.xlabel('Error Tolerance Threshold')
    plt.ylabel('Accuracy Rate (%)')
    plt.title('Accuracy Rate by Error Tolerance')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()
else:
    print("No evaluation results available for visualization")

In [None]:
# Define a function to evaluate min_area parameter
def evaluate_min_area_parameter(test_images, test_annotations, min_area_values):
    """Evaluate different min_area values against ground truth.

    Args:
        test_images: iterable of (filename, file_path) pairs to evaluate.
        test_annotations: mapping of filename -> ground-truth object count;
            a missing filename counts as 0.
        min_area_values: iterable of min_area thresholds to try.

    Returns:
        A list with one dict per threshold holding 'min_area',
        'total_gt_objects', 'total_detected_objects', 'total_diff'
        (detected - ground truth), 'total_abs_error' and 'avg_abs_error'.
        All totals cover only the images the detector could actually process;
        'avg_abs_error' is inf when none could be processed.
    """
    results = []

    for min_area in tqdm(min_area_values, desc="Evaluating min_area values"):
        print(f"\nEvaluating min_area = {min_area}...")
        total_gt_objects = 0
        total_detected_objects = 0
        total_abs_error = 0
        num_evaluated = 0

        for filename, file_path in test_images:
            # Run detector with current min_area
            detection_results = evaluate_image_with_detector(file_path, min_area=min_area)

            if detection_results is None:
                # The image could not be processed. Leave it out of every
                # total: counting its ground truth without a matching
                # detection count would bias total_diff, and counting it in
                # the denominator would understate the average error.
                continue

            gt_count = test_annotations.get(filename, 0)
            detected_count = detection_results['total']

            total_gt_objects += gt_count
            total_detected_objects += detected_count
            total_abs_error += abs(detected_count - gt_count)
            num_evaluated += 1

        # Calculate metrics over the images that were actually evaluated
        avg_error = total_abs_error / num_evaluated if num_evaluated > 0 else float('inf')
        total_diff = total_detected_objects - total_gt_objects

        # Store results
        results.append({
            'min_area': min_area,
            'total_gt_objects': total_gt_objects,
            'total_detected_objects': total_detected_objects,
            'total_diff': total_diff,
            'total_abs_error': total_abs_error,
            'avg_abs_error': avg_error
        })

        print(f"  Total GT Objects: {total_gt_objects}")
        print(f"  Total Detected Objects: {total_detected_objects}")
        print(f"  Average Absolute Error: {avg_error:.2f}")

    return results

# Tune min_area against the ground truth on a small subset of images to save time.
run_parameter_tuning = True  # Set to False to skip the parameter tuning

# Tuning also needs the image list and annotations prepared by the evaluation cell.
if run_parameter_tuning and 'test_image_files' in locals() and test_annotations:
    # Define min_area values to test
    min_area_values_to_test = [400, 500, 750, 1000, 1500, 2000, 2500]

    # Select a subset of images for faster evaluation
    max_images_for_tuning = 10  # Adjust based on your time constraints
    test_subset = test_image_files[:max_images_for_tuning]

    print(f"Running parameter tuning on {len(test_subset)} images...")
    tuning_results = evaluate_min_area_parameter(test_subset, test_annotations, min_area_values_to_test)

    # Convert to DataFrame (one row per min_area value)
    tuning_df = pd.DataFrame(tuning_results)

    # Visualize the tuning results
    plt.figure(figsize=(15, 10))

    # 1. Plot absolute error vs min_area
    plt.subplot(2, 2, 1)
    plt.plot(tuning_df['min_area'], tuning_df['avg_abs_error'], marker='o', linestyle='-')
    plt.xlabel('Minimum Area Threshold')
    plt.ylabel('Average Absolute Error')
    plt.title('Average Error vs. Min Area Threshold')
    plt.grid(True, linestyle='--', alpha=0.7)

    # 2. Plot total objects detected vs min_area
    plt.subplot(2, 2, 2)
    plt.plot(tuning_df['min_area'], tuning_df['total_detected_objects'], marker='o', linestyle='-', label='Detected')
    # The ground-truth reference line is taken from the first row.
    plt.axhline(y=tuning_df['total_gt_objects'].iloc[0], color='r', linestyle='--', label='Ground Truth')
    plt.xlabel('Minimum Area Threshold')
    plt.ylabel('Total Objects Count')
    plt.title('Total Objects Detected vs. Min Area Threshold')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)

    # 3. Bar chart of error by min_area (thresholds as categories, so bars are evenly spaced)
    plt.subplot(2, 2, 3)
    plt.bar(tuning_df['min_area'].astype(str), tuning_df['total_abs_error'])
    plt.xlabel('Minimum Area Threshold')
    plt.ylabel('Total Absolute Error')
    plt.title('Total Error vs. Min Area Threshold')
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)

    # 4. Line chart of total_diff by min_area (negative means under-detection, positive means over-detection)
    plt.subplot(2, 2, 4)
    plt.plot(tuning_df['min_area'], tuning_df['total_diff'], marker='o', linestyle='-')
    plt.axhline(y=0, color='r', linestyle='--', label='Perfect Match')
    plt.xlabel('Minimum Area Threshold')
    plt.ylabel('Total Difference (Detected - GT)')
    plt.title('Detection Bias vs. Min Area Threshold')
    # Without this call the 'Perfect Match' label on the reference line is never shown.
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()

    # Find the best min_area value based on average absolute error
    best_min_area = tuning_df.loc[tuning_df['avg_abs_error'].idxmin(), 'min_area']
    print(f"\nBest min_area value based on evaluation: {best_min_area}")
    print(f"Corresponding average absolute error: {tuning_df['avg_abs_error'].min():.2f} objects")
else:
    print("Parameter tuning skipped. Set run_parameter_tuning=True to enable.")