In [1]:
# Imports
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import cv2
import random
import os
from collections import Counter
from pathlib import Path
from dotenv import load_dotenv
from roboflow import Roboflow

# Load environment variables from the .env file, then read the Roboflow key
# (None when ROBOFLOW_API_KEY is not set)
load_dotenv()
roboflow_api_key = os.environ.get("ROBOFLOW_API_KEY")

In [None]:
# Download the dataset and the annotations
# NOTE(review): roboflow_api_key is None when ROBOFLOW_API_KEY is unset --
# confirm how Roboflow() reports a missing key.
rf = Roboflow(api_key=roboflow_api_key)
project = rf.workspace("goldbach-neo-testspace").project("stellenbilder")
version = project.version(4) # dataset versions: 3 = augmented, 4 = original
dataset = version.download("coco-segmentation")      
# NOTE(review): the loading cell reads from the hard-coded folder
# "Stellenbilder-no-augs"; confirm the download actually lands there
# (presumably `dataset.location` holds the real path -- TODO verify).

# Dataset Analysis for Stellenbilder Instance Segmentation

## Dataset Overview

The dataset is organized into three directories, one per split:
- **train**: Training images and annotations
- **valid**: Validation images and annotations
- **test**: Test images and annotations

Each directory follows the COCO format with annotations in `_annotations.coco.json`.

In [None]:
# Function to load COCO annotations
def load_coco_annotations(annotation_file):
    """Read a COCO annotation file and return its parsed JSON content.

    Args:
        annotation_file: Path (str or path-like) of the JSON file to read.

    Returns:
        The decoded JSON document (a dict for COCO annotation files).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII file or category names load the
    # same way on every platform instead of depending on the locale default.
    with open(annotation_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load the annotation file of every split from the dataset folder
base_dir = Path("Stellenbilder-no-augs")
test_annotations, train_annotations, valid_annotations = (
    load_coco_annotations(base_dir / split_name / "_annotations.coco.json")
    for split_name in ("test", "train", "valid")
)

## Dataset Statistics

In [None]:
# Function to get dataset statistics
def get_dataset_stats(annotations):
    """Summarise a COCO-style annotation dict.

    Args:
        annotations: Dict with 'images', 'annotations' and 'categories' lists.

    Returns:
        Dict with the image count, the annotation count, the number of
        instances per category name (only categories that occur) and the
        average number of annotations per image (0 when there are no images).
    """
    image_total = len(annotations['images'])
    annotation_total = len(annotations['annotations'])

    # Map category ids to their display names
    name_by_id = {}
    for entry in annotations['categories']:
        name_by_id[entry['id']] = entry['name']

    # Tally instances by id, then translate the ids to names
    per_id = Counter(record['category_id'] for record in annotations['annotations'])
    per_name = {}
    for category_id, occurrences in per_id.items():
        per_name[name_by_id[category_id]] = occurrences

    # Guard the average against an empty split
    if image_total > 0:
        mean_per_image = annotation_total / image_total
    else:
        mean_per_image = 0

    return dict(
        num_images=image_total,
        num_annotations=annotation_total,
        category_stats=per_name,
        avg_annotations_per_image=mean_per_image,
    )

# Compute the statistics of every split (train, valid, test in that order)
train_stats, valid_stats, test_stats = map(
    get_dataset_stats,
    (train_annotations, valid_annotations, test_annotations),
)

# Report the totals per split first, then the per-image averages
print(
    f"Training set: {train_stats['num_images']} images, {train_stats['num_annotations']} annotations",
    f"Validation set: {valid_stats['num_images']} images, {valid_stats['num_annotations']} annotations",
    f"Test set: {test_stats['num_images']} images, {test_stats['num_annotations']} annotations",
    "\nAverage annotations per image:",
    f"Training: {train_stats['avg_annotations_per_image']:.2f}",
    f"Validation: {valid_stats['avg_annotations_per_image']:.2f}",
    f"Test: {test_stats['avg_annotations_per_image']:.2f}",
    sep="\n",
)

In [None]:
# Visualize category distribution
def plot_category_distribution(train_stats, valid_stats, test_stats):
    """Draw a grouped bar chart of instance counts per category and split.

    Args:
        train_stats: Stats dict of the training split; only its
            'category_stats' mapping (category name -> count) is read.
        valid_stats: Same, for the validation split.
        test_stats: Same, for the test split.

    The x axis lists every category that occurs in at least one split (in
    first-seen order: train, then validation, then test). A split that lacks
    a category is drawn with a count of 0.
    """
    splits = (('Train', train_stats), ('Validation', valid_stats), ('Test', test_stats))

    # Use the union of all splits: taking the categories from the training
    # split alone would hide categories that only occur in validation/test.
    # dict.fromkeys removes duplicates while keeping first-seen order.
    categories = list(dict.fromkeys(
        cat for _, stats in splits for cat in stats['category_stats']
    ))

    x = np.arange(len(categories))
    width = 0.25

    fig, ax = plt.subplots(figsize=(12, 6))
    # Shift the three bar groups so they sit side by side around each tick
    for offset, (label, stats) in zip((-width, 0, width), splits):
        counts = [stats['category_stats'].get(cat, 0) for cat in categories]
        ax.bar(x + offset, counts, width, label=label)

    ax.set_xlabel('Categories')
    ax.set_ylabel('Number of Instances')
    ax.set_title('Category Distribution across Dataset Splits')
    ax.set_xticks(x)
    ax.set_xticklabels(categories, rotation=45, ha='right')
    ax.legend()

    plt.tight_layout()
    plt.show()

# Compare the per-category instance counts of the train/validation/test splits
plot_category_distribution(train_stats, valid_stats, test_stats)

## Sample Images with Annotations

In [None]:
# Function to visualize images with annotations
def visualize_samples(annotations, img_dir, num_samples=3):
    """Show randomly chosen images next to a copy with their annotations drawn.

    For every sampled image a two-panel figure is displayed: the original
    image on the left and, on the right, a copy with filled segmentation
    polygons, bounding boxes and category labels.

    Args:
        annotations: COCO-style dict with 'images', 'annotations' and
            'categories' lists.
        img_dir: Directory containing the files named by each image's
            'file_name'.
        num_samples: Maximum number of images to show.

    Raises:
        FileNotFoundError: If OpenCV cannot read a sampled image file.
    """
    categories = {cat['id']: cat['name'] for cat in annotations['categories']}
    images = annotations['images']
    sampled_images = random.sample(images, min(num_samples, len(images)))

    # Group annotations by the image id they reference. Matching on the
    # image's own 'id' (not its position in the list) stays correct even when
    # ids are not a plain 0..n-1 sequence.
    anns_by_image_id = {}
    for ann in annotations['annotations']:
        anns_by_image_id.setdefault(ann['image_id'], []).append(ann)

    # One colour per category, computed once for all figures
    colors = plt.cm.rainbow(np.linspace(0, 1, len(categories)))

    for img_info in sampled_images:
        img_path = os.path.join(img_dir, img_info['file_name'])

        # cv2.imread signals an unreadable file by returning None; fail with a
        # clear message instead of a cryptic cvtColor error.
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        img_anns = anns_by_image_id.get(img_info['id'], [])

        # Left panel: untouched image
        plt.figure(figsize=(12, 8))
        plt.subplot(1, 2, 1)
        plt.title(f"Original Image: {img_info['file_name']}")
        plt.imshow(image)
        plt.axis('off')

        # Draw on a copy so the left panel stays clean
        seg_img = image.copy()

        for ann in img_anns:
            category = categories[ann['category_id']]
            color = (colors[ann['category_id'] % len(colors)] * 255).astype(np.uint8)[:3]

            # OpenCV expects plain Python ints for colours
            color_tuple = (int(color[0]), int(color[1]), int(color[2]))

            # Only polygon lists can be filled here; other encodings (e.g. a
            # dict-based mask) are skipped and still get their box and label.
            segmentation = ann.get('segmentation', [])
            if isinstance(segmentation, list):
                for seg in segmentation:
                    # Flat [x1, y1, x2, y2, ...] -> (N, 2) int32 points for fillPoly
                    points = np.array(seg).reshape(-1, 2).astype(np.int32)
                    cv2.fillPoly(seg_img, [points], color_tuple)

            # Bounding box is [x, y, width, height]
            x, y, w, h = map(int, ann['bbox'])

            # Outline the box and write the category name just above it
            cv2.rectangle(seg_img, (x, y), (x + w, y + h), color_tuple, 2)
            cv2.putText(seg_img, category, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color_tuple, 2)

        # Right panel: annotated copy
        plt.subplot(1, 2, 2)
        plt.title("Instance Segmentation")
        plt.imshow(seg_img)
        plt.axis('off')

        plt.tight_layout()
        plt.show()

# Show two random annotated samples from every split
for heading, split_annotations, split_folder in (
    ("Training samples:", train_annotations, "train"),
    ("Validation samples:", valid_annotations, "valid"),
    ("Test samples:", test_annotations, "test"),
):
    print(heading)
    visualize_samples(split_annotations, base_dir / split_folder, num_samples=2)

## Analysis of Object Sizes

In [None]:
# Function to analyze object sizes
def analyze_object_sizes(annotations):
    """Box-plot the annotation areas per category and summarise them.

    Args:
        annotations: COCO-style dict with 'annotations' (each carrying
            'category_id' and 'area') and 'categories'.

    Returns:
        Dict mapping each category name that has at least one annotation to
        its 'min', 'max', 'mean', 'median' and 'count' of areas.
    """
    name_by_id = {}
    for entry in annotations['categories']:
        name_by_id[entry['id']] = entry['name']

    # Create one bucket per category up front so the plot follows category order
    areas_by_name = {}
    for name in name_by_id.values():
        areas_by_name[name] = []

    for record in annotations['annotations']:
        areas_by_name[name_by_id[record['category_id']]].append(record['area'])

    # Categories without any instance are left out of plot and summary
    populated = [(name, values) for name, values in areas_by_name.items() if values]
    labels = [name for name, _ in populated]
    data = [values for _, values in populated]

    # One box per populated category
    plt.figure(figsize=(14, 8))
    plt.boxplot(data, tick_labels=labels)
    plt.title('Distribution of Object Areas by Category')
    plt.ylabel('Area (pixels²)')
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()

    # Summary numbers for the same categories
    return {
        name: {
            'min': min(values),
            'max': max(values),
            'mean': np.mean(values),
            'median': np.median(values),
            'count': len(values),
        }
        for name, values in populated
    }

# Analyze object sizes for all splits combined
def _merge_coco_splits(*splits):
    """Concatenate the images/annotations of several COCO dicts with unique ids.

    The splits are loaded from separate annotation files, so their image and
    annotation ids can overlap. Copies with fresh ids are returned, which
    keeps every annotation's 'image_id' pointing at its own image; the input
    dicts are left untouched.

    Returns:
        Tuple (images, annotations) of the merged, re-numbered copies.
    """
    merged_images = []
    merged_annotations = []
    for split in splits:
        # Old image id -> new image id, valid only within this split
        new_image_ids = {}
        for img in split['images']:
            new_id = len(merged_images)
            new_image_ids[img.get('id')] = new_id
            merged_images.append({**img, 'id': new_id})
        for ann in split['annotations']:
            merged_annotations.append({
                **ann,
                'id': len(merged_annotations),
                # None marks an annotation whose image is not listed in its split
                'image_id': new_image_ids.get(ann.get('image_id')),
            })
    return merged_images, merged_annotations


merged_images, merged_annotations = _merge_coco_splits(
    train_annotations, valid_annotations, test_annotations
)
all_annotations = {
    'images': merged_images,
    'annotations': merged_annotations,
    # The category list is taken from the training split only
    'categories': train_annotations['categories']
}

size_stats = analyze_object_sizes(all_annotations)

# Print statistics
print("Object Size Statistics by Category:")
for cat, stat in size_stats.items():
    print(f"{cat}: {stat['count']} instances, Mean area: {stat['mean']:.1f} px², Median: {stat['median']:.1f} px²")

## Color Analysis

In [None]:
# Function to analyze color distributions in images
def analyze_colors(annotations, img_dir, num_samples=3):
    """Show RGB histograms and print per-channel statistics for random images.

    Args:
        annotations: COCO-style dict; only its 'images' list is used.
        img_dir: Directory containing the files named by each image's
            'file_name'.
        num_samples: Maximum number of images to analyse.

    Raises:
        FileNotFoundError: If OpenCV cannot read a sampled image file.
    """
    images = annotations['images']
    sampled_images = random.sample(images, min(num_samples, len(images)))

    # Channel order matches the RGB image produced below
    colors = ('r', 'g', 'b')
    channel_names = ('Red', 'Green', 'Blue')

    for img_info in sampled_images:
        img_path = os.path.join(img_dir, img_info['file_name'])

        # cv2.imread signals an unreadable file by returning None; fail with a
        # clear message instead of a cryptic cvtColor error.
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Setup figure: the image plus one histogram panel per channel
        plt.figure(figsize=(16, 6))

        plt.subplot(1, 4, 1)
        plt.title("Original Image")
        plt.imshow(image)
        plt.axis('off')

        for i, color in enumerate(colors):
            plt.subplot(1, 4, i+2)
            # 256 bins over the value range [0, 256) for channel i
            histogram = cv2.calcHist([image], [i], None, [256], [0, 256])
            plt.title(f'{channel_names[i]} Channel')
            plt.xlim([0, 256])
            plt.plot(histogram, color=color)
            plt.grid(alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Per-channel summary statistics over all pixels
        print(f"\nColor statistics for {img_info['file_name']}:")
        for channel_idx, channel_name in enumerate(channel_names):
            channel_values = image[:,:,channel_idx].flatten()
            print(f"{channel_name}: Mean = {np.mean(channel_values):.1f}, Std = {np.std(channel_values):.1f}, Min = {np.min(channel_values)}, Max = {np.max(channel_values)}")

# Show channel histograms and colour statistics for up to two randomly
# chosen images of the test split
print("Color Analysis of Sample Images:")
analyze_colors(test_annotations, base_dir / "test", num_samples=2)

## Object Aspect Ratio Analysis

In [None]:
# Function to analyze object aspect ratios
def analyze_aspect_ratios(annotations):
    """Box-plot the bounding-box aspect ratios per category and summarise them.

    Args:
        annotations: COCO-style dict with 'annotations' (each carrying
            'category_id' and a four-element 'bbox') and 'categories'.

    Returns:
        Dict mapping each category name that has at least one usable box to
        the 'min', 'max', 'mean', 'median' and 'count' of its width/height
        ratios. Boxes whose height is not positive are ignored.
    """
    name_by_id = {}
    for entry in annotations['categories']:
        name_by_id[entry['id']] = entry['name']

    # Create one bucket per category up front so the plot follows category order
    ratios_by_name = {}
    for name in name_by_id.values():
        ratios_by_name[name] = []

    for record in annotations['annotations']:
        name = name_by_id[record['category_id']]
        _, _, box_w, box_h = record['bbox']
        # Skip boxes without a positive height to avoid dividing by zero
        if not box_h > 0:
            continue
        ratios_by_name[name].append(box_w / box_h)

    # Categories without any usable box are left out of plot and summary
    populated = [(name, values) for name, values in ratios_by_name.items() if values]
    labels = [name for name, _ in populated]
    data = [values for _, values in populated]

    # One box per populated category, with a reference line for square boxes
    plt.figure(figsize=(14, 8))
    plt.boxplot(data, tick_labels=labels)
    plt.axhline(y=1.0, color='r', linestyle='--', alpha=0.7, label='Square (w=h)')
    plt.title('Distribution of Object Aspect Ratios by Category')
    plt.ylabel('Aspect Ratio (width/height)')
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend()

    plt.tight_layout()
    plt.show()

    # Summary numbers for the same categories
    return {
        name: {
            'min': min(values),
            'max': max(values),
            'mean': np.mean(values),
            'median': np.median(values),
            'count': len(values),
        }
        for name, values in populated
    }

# Analyze aspect ratios for all splits combined (plots the box plot and
# returns the per-category summary)
aspect_ratio_stats = analyze_aspect_ratios(all_annotations)

# Print mean and median width/height ratio for every category with data
print("Object Aspect Ratio Statistics by Category:")
for cat, stat in aspect_ratio_stats.items():
    print(f"{cat}: Mean ratio: {stat['mean']:.2f}, Median ratio: {stat['median']:.2f}")

## Spatial Distribution Analysis

In [None]:
# Function to analyze spatial distribution of objects
def _draw_unit_frame(ax):
    """Outline the normalized image area (0-1 on both axes) on ``ax``."""
    for edge in (0, 1):
        ax.axhline(y=edge, color='black', linestyle='-', alpha=0.3)
        ax.axvline(x=edge, color='black', linestyle='-', alpha=0.3)


def analyze_spatial_distribution(annotations, top_n=3):
    """Plot where bounding-box centers fall inside the (normalized) image.

    A 2D histogram of all box centers is shown first, followed by one scatter
    plot for each of the ``top_n`` categories with the most instances.

    Args:
        annotations: COCO-style dict with 'images' (each carrying 'width' and
            'height'), 'annotations' (each carrying 'category_id' and 'bbox')
            and 'categories'.
        top_n: Number of most frequent categories that get their own scatter
            plot.

    Each center is normalized by the size of the image its annotation
    references through 'image_id'. When the image ids are not unique, or an
    annotation's image cannot be found, the size of the first image is used
    instead. Nothing is plotted when there are no images.
    """
    categories = {cat['id']: cat['name'] for cat in annotations['categories']}

    images = annotations['images']
    if not images:
        # No image size is available to normalize against
        print("No images found - skipping spatial distribution analysis.")
        return

    # Size of the first image: the fallback whenever a per-image lookup is
    # impossible or unreliable
    fallback_size = (images[0]['width'], images[0]['height'])

    # Per-image sizes are only trustworthy when every image id is unique;
    # otherwise an id could resolve to the wrong image.
    size_by_image_id = {}
    if len({img.get('id') for img in images}) == len(images):
        size_by_image_id = {
            img.get('id'): (img['width'], img['height'])
            for img in images
            if img.get('width') and img.get('height')
        }

    # Extract center points of all bounding boxes
    all_centers = []
    category_centers = {cat_name: [] for cat_name in categories.values()}

    for ann in annotations['annotations']:
        category = categories[ann['category_id']]
        x, y, width, height = ann['bbox']
        img_width, img_height = size_by_image_id.get(ann.get('image_id'), fallback_size)

        # Center in normalized coordinates (0-1)
        center = ((x + width/2) / img_width, (y + height/2) / img_height)

        all_centers.append(center)
        category_centers[category].append(center)

    # Heatmap-like overview of all centers
    plt.figure(figsize=(10, 10))

    all_x = [c[0] for c in all_centers]
    all_y = [c[1] for c in all_centers]

    plt.hist2d(all_x, all_y, bins=20, cmap='hot')
    plt.colorbar(label='Number of objects')

    plt.title('Spatial Distribution of All Objects')
    plt.xlabel('Normalized X Position')
    plt.ylabel('Normalized Y Position')

    _draw_unit_frame(plt.gca())

    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.gca().invert_yaxis()  # Invert Y axis to match image coordinates

    plt.tight_layout()
    plt.show()

    # Scatter plots for the categories with the most instances
    top_categories = sorted(
        [(cat, len(centers)) for cat, centers in category_centers.items() if centers],
        key=lambda item: item[1],
        reverse=True,
    )[:top_n]

    if top_categories:
        fig, axes = plt.subplots(1, len(top_categories), figsize=(15, 5))
        # plt.subplots returns a single Axes (not a sequence) for one panel
        if len(top_categories) == 1:
            axes = [axes]

        for i, (category, _) in enumerate(top_categories):
            centers = category_centers[category]
            xs = [c[0] for c in centers]
            ys = [c[1] for c in centers]

            axes[i].scatter(xs, ys, alpha=0.5)
            axes[i].set_title(f'Distribution: {category}')
            axes[i].set_xlabel('Normalized X Position')
            axes[i].set_ylabel('Normalized Y Position')
            axes[i].set_xlim(0, 1)
            axes[i].set_ylim(0, 1)
            axes[i].invert_yaxis()  # Invert Y axis to match image coordinates

            _draw_unit_frame(axes[i])

        plt.tight_layout()
        plt.show()

# Plot where object centers fall within the image, using all splits combined
analyze_spatial_distribution(all_annotations)