# PASCAL VOC 2007 数据集探索

本笔记本探索PASCAL VOC 2007数据集，特别关注目标检测和两种分割任务(语义分割和实例分割)的数据。

In [1]:
# Import necessary libraries
import os
import numpy as np
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from PIL import Image
import random
import cv2
from collections import Counter
import pandas as pd
from tqdm import tqdm  # Using regular tqdm instead of tqdm.notebook

# Set display parameters: default figure size (width, height) used by the plots below
plt.rcParams['figure.figsize'] = (12, 8)
# IPython magic (not plain Python): show Matplotlib figures inline in the notebook output
%matplotlib inline

In [2]:
# Set VOC2007 dataset path.
# The VOC2007_ROOT environment variable, when set, overrides the default below so
# the notebook can run on another machine without editing this cell.
VOC_ROOT = os.environ.get('VOC2007_ROOT', '/Users/weimu/vibe/data/VOCdevkit/VOC2007')

# Check if path exists (warn only; the cell keeps going either way)
if not os.path.exists(VOC_ROOT):
    print(f"Warning: Path {VOC_ROOT} does not exist, please verify the correct VOC dataset path")
else:
    print(f"VOC dataset path: {VOC_ROOT}")

# Define dataset subdirectories (all resolved relative to VOC_ROOT)
IMAGE_DIR = os.path.join(VOC_ROOT, 'JPEGImages')                      # <id>.jpg images
ANNOTATION_DIR = os.path.join(VOC_ROOT, 'Annotations')                # <id>.xml annotations
SEGMENTATION_DIR = os.path.join(VOC_ROOT, 'SegmentationClass')        # <id>.png semantic masks
SEGMENTATION_INSTANCE_DIR = os.path.join(VOC_ROOT, 'SegmentationObject')  # <id>.png instance masks
IMAGESETS_DIR = os.path.join(VOC_ROOT, 'ImageSets')                   # split lists (Main/<split>.txt)

VOC dataset path: /Users/weimu/vibe/data/VOCdevkit/VOC2007


In [6]:
# Check dataset structure
def check_dataset_structure():
    """Report, for each expected dataset subdirectory, whether it exists and how many entries it holds."""
    expected = (
        ('Image directory', IMAGE_DIR),
        ('Annotation directory', ANNOTATION_DIR),
        ('Semantic segmentation directory', SEGMENTATION_DIR),
        ('Instance segmentation directory', SEGMENTATION_INSTANCE_DIR),
        ('Image sets directory', IMAGESETS_DIR),
    )

    for label, location in expected:
        if not os.path.exists(location):
            print(f"{label} ({location}): Does not exist")
            continue
        # The count covers every directory entry returned by os.listdir
        entry_count = len(os.listdir(location))
        print(f"{label} ({location}): Exists - Contains {entry_count} files")

check_dataset_structure()


def _read_split_ids(split_name):
    """Return the image IDs listed in ImageSets/Main/<split_name>.txt.

    Blank lines are skipped so a trailing empty line does not turn into an
    empty-string ID. Returns None when the split file does not exist.
    """
    split_path = os.path.join(IMAGESETS_DIR, 'Main', f'{split_name}.txt')
    if not os.path.exists(split_path):
        return None
    with open(split_path, 'r') as f:
        return [line.strip() for line in f if line.strip()]


# Get training, validation and test set image IDs.
# A list stays empty (and nothing is printed) when its split file is missing.
train_ids = []
val_ids = []
test_ids = []

loaded_ids = _read_split_ids('train')
if loaded_ids is not None:
    train_ids = loaded_ids
    print(f"Training set contains {len(train_ids)} images")

loaded_ids = _read_split_ids('val')
if loaded_ids is not None:
    val_ids = loaded_ids
    print(f"Validation set contains {len(val_ids)} images")

loaded_ids = _read_split_ids('test')
if loaded_ids is not None:
    test_ids = loaded_ids
    print(f"Testing set contains {len(test_ids)} images")

Image directory (/Users/weimu/vibe/data/VOCdevkit/VOC2007/JPEGImages): Exists - Contains 9963 files
Annotation directory (/Users/weimu/vibe/data/VOCdevkit/VOC2007/Annotations): Exists - Contains 9963 files
Semantic segmentation directory (/Users/weimu/vibe/data/VOCdevkit/VOC2007/SegmentationClass): Exists - Contains 632 files
Instance segmentation directory (/Users/weimu/vibe/data/VOCdevkit/VOC2007/SegmentationObject): Exists - Contains 632 files
Image sets directory (/Users/weimu/vibe/data/VOCdevkit/VOC2007/ImageSets): Exists - Contains 4 files
Training set contains 2501 images
Validation set contains 2510 images
Testing set contains 4952 images


In [None]:
# Define function to parse XML annotation files
def parse_annotation(annotation_path):
    """Read one VOC annotation XML and return the image size and its objects.

    Args:
        annotation_path: Location of the XML file, handed to ``ET.parse``.

    Returns:
        A dict with ``'width'`` and ``'height'`` (ints read from the ``size``
        element) and ``'objects'``: one dict per ``object`` element directly
        under the root, holding ``'name'``, ``'difficult'`` (0 when the tag is
        absent) and ``'bbox'`` as ``[xmin, ymin, xmax, ymax]`` ints.
    """
    root = ET.parse(annotation_path).getroot()

    # Image dimensions
    size_node = root.find('size')
    img_width = int(size_node.find('width').text)
    img_height = int(size_node.find('height').text)

    def _coord(box_node, tag):
        # Go through float first so values written like "48.0" are accepted
        return int(float(box_node.find(tag).text))

    parsed_objects = []
    for obj_node in root.findall('object'):
        label = obj_node.find('name').text

        difficult_node = obj_node.find('difficult')
        if difficult_node is None:
            difficult_flag = 0
        else:
            difficult_flag = int(difficult_node.text)

        box_node = obj_node.find('bndbox')
        corners = [_coord(box_node, tag) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]

        parsed_objects.append({'name': label, 'difficult': difficult_flag, 'bbox': corners})

    return {'width': img_width, 'height': img_height, 'objects': parsed_objects}

# Test parsing functionality on the first training image (skipped when no training IDs were loaded)
if train_ids:
    sample_id = train_ids[0]
    sample_annotation_path = os.path.join(ANNOTATION_DIR, f"{sample_id}.xml")
    # Nothing is printed when the annotation file for this ID is missing
    if os.path.exists(sample_annotation_path):
        sample_annotation = parse_annotation(sample_annotation_path)
        print(f"Sample image {sample_id} annotation:")
        print(f"Size: {sample_annotation['width']} x {sample_annotation['height']}")
        print(f"Number of objects: {len(sample_annotation['objects'])}")
        # Objects are numbered from 1 in the printout
        for i, obj in enumerate(sample_annotation['objects']):
            print(f"  Object {i+1}: {obj['name']}, Difficulty: {obj['difficult']}, Bounding box: {obj['bbox']}")

In [None]:
# Analyze class distribution in the training set
def analyze_class_distribution(image_ids):
    """Count annotated objects per class across the given images.

    Images whose annotation XML does not exist are skipped.

    Returns:
        A tuple ``(class_counter, difficult_counter, total_objects)``: objects
        per class name, objects flagged difficult per class name, and the
        overall number of objects seen.
    """
    per_class = Counter()
    per_class_difficult = Counter()
    object_total = 0

    for image_id in tqdm(image_ids, desc="Analyzing class distribution"):
        xml_path = os.path.join(ANNOTATION_DIR, f"{image_id}.xml")
        if not os.path.exists(xml_path):
            continue

        found = parse_annotation(xml_path)['objects']
        per_class.update(entry['name'] for entry in found)
        per_class_difficult.update(entry['name'] for entry in found if entry['difficult'])
        object_total += len(found)

    return per_class, per_class_difficult, object_total

# Class distribution of the training set: table plus annotated bar chart
if train_ids:
    print("Analyzing training set class distribution...")
    train_class_counter, train_difficult_counter, train_total_objects = analyze_class_distribution(train_ids)

    # One table row per class, most frequent class first
    class_data = [
        {
            'Class': class_name,
            'Total Count': count,
            'Normal Samples': count - train_difficult_counter[class_name],
            'Difficult Samples': train_difficult_counter[class_name],
            'Percentage (%)': round(count / train_total_objects * 100, 2),
        }
        for class_name, count in train_class_counter.most_common()
    ]

    class_df = pd.DataFrame(class_data)
    display(class_df)

    # Bar chart of per-class totals
    plt.figure(figsize=(14, 8))
    plt.bar(class_df['Class'], class_df['Total Count'], color='skyblue')
    plt.title('VOC2007 Training Set Class Distribution', fontsize=16)
    plt.xlabel('Class', fontsize=14)
    plt.ylabel('Sample Count', fontsize=14)
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    # Write each total slightly above its bar
    for bar_index, bar_height in enumerate(class_df['Total Count']):
        plt.text(bar_index, bar_height + 50, str(bar_height), ha='center')
    plt.tight_layout()
    plt.show()

In [None]:
# Analyze bounding box size distribution
def analyze_bbox_sizes(image_ids):
    """Collect bounding-box areas and aspect ratios for the given images.

    Images whose annotation XML does not exist are skipped.

    Returns:
        A tuple ``(bbox_areas, bbox_aspect_ratios, class_bbox_areas)``: the area
        of every box, the width/height ratio of every box (0 when the height is
        not positive), and a dict mapping class name to the areas of its boxes.
    """
    all_areas = []
    all_ratios = []
    areas_by_class = {}

    for image_id in tqdm(image_ids, desc="Analyzing bounding box sizes"):
        xml_path = os.path.join(ANNOTATION_DIR, f"{image_id}.xml")
        if not os.path.exists(xml_path):
            continue

        for entry in parse_annotation(xml_path)['objects']:
            xmin, ymin, xmax, ymax = entry['bbox']
            box_w = xmax - xmin
            box_h = ymax - ymin
            box_area = box_w * box_h

            # Guard against division by zero for degenerate boxes
            if box_h > 0:
                ratio = box_w / box_h
            else:
                ratio = 0

            all_areas.append(box_area)
            all_ratios.append(ratio)
            areas_by_class.setdefault(entry['name'], []).append(box_area)

    return all_areas, all_ratios, areas_by_class

# Analyze bounding box sizes: overall histograms plus a per-class area boxplot
if train_ids:
    print("Analyzing training set bounding box size distribution...")
    bbox_areas, bbox_aspect_ratios, class_bbox_areas = analyze_bbox_sizes(train_ids)

    plt.figure(figsize=(18, 6))

    # Plot bounding box area distribution
    plt.subplot(1, 2, 1)
    plt.hist(bbox_areas, bins=50, alpha=0.7, color='blue')
    plt.title('Bounding Box Area Distribution', fontsize=14)
    plt.xlabel('Area (pixels)', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.grid(alpha=0.3)

    # Plot bounding box aspect ratio distribution (ratios above 5 fall outside the plotted range)
    plt.subplot(1, 2, 2)
    plt.hist(bbox_aspect_ratios, bins=50, range=(0, 5), alpha=0.7, color='green')
    plt.title('Bounding Box Aspect Ratio Distribution', fontsize=14)
    plt.xlabel('Aspect Ratio (width/height)', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Plot boxplot of bounding box areas by class (skipped when no boxes were found)
    class_names = list(class_bbox_areas)
    boxplot_data = [class_bbox_areas[cls] for cls in class_names]
    if boxplot_data:
        plt.figure(figsize=(15, 8))
        plt.boxplot(boxplot_data)
        # Boxes sit at x = 1..N by default. Label them through xticks instead of
        # boxplot's `labels` keyword, which Matplotlib 3.9 deprecated in favour
        # of `tick_labels`; xticks works on both older and newer versions.
        plt.xticks(range(1, len(class_names) + 1), class_names, rotation=45)
        plt.title('Bounding Box Area Distribution by Class', fontsize=16)
        plt.xlabel('Class', fontsize=14)
        plt.ylabel('Area (pixels)', fontsize=14)
        plt.yscale('log')  # Use log scale for better visualization
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

In [None]:
# Visualize images with bounding boxes
def visualize_samples(image_ids, num_samples=5):
    """Display randomly chosen images with their annotated bounding boxes.

    Args:
        image_ids: Sequence of image IDs to sample from.
        num_samples: Upper bound on the number of images shown; fewer are shown
            when ``image_ids`` is shorter.
    """
    # Select random samples (never more than are available, never a negative count)
    sample_count = max(0, min(num_samples, len(image_ids)))
    sample_ids = random.sample(image_ids, sample_count)
    if not sample_ids:
        print("No images available to visualize.")
        return

    # VOC dataset class colors, given as RGB tuples
    colors = {
        'aeroplane': (255, 0, 0),      # Red
        'bicycle': (0, 255, 0),        # Green
        'bird': (0, 0, 255),           # Blue
        'boat': (255, 255, 0),         # Yellow
        'bottle': (255, 0, 255),       # Magenta
        'bus': (0, 255, 255),          # Cyan
        'car': (128, 0, 0),            # Dark red
        'cat': (0, 128, 0),            # Dark green
        'chair': (0, 0, 128),          # Dark blue
        'cow': (128, 128, 0),          # Olive
        'diningtable': (128, 0, 128),  # Purple
        'dog': (0, 128, 128),          # Teal
        'horse': (192, 0, 0),          # Brown-red
        'motorbike': (0, 192, 0),      # Forest green
        'person': (0, 0, 192),         # Navy blue
        'pottedplant': (192, 192, 0),  # Dark yellow
        'sheep': (192, 0, 192),        # Violet
        'sofa': (0, 192, 192),         # Light teal
        'train': (64, 64, 64),         # Dark gray
        'tvmonitor': (192, 192, 192)   # Silver
    }

    # One row per image actually sampled. squeeze=False keeps `axes` two-dimensional
    # even for a single row, so it can be indexed the same way in every case.
    _, axes = plt.subplots(len(sample_ids), 1, figsize=(12, 6 * len(sample_ids)), squeeze=False)
    axes = axes[:, 0]

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.7
    font_thickness = 2

    for ax, img_id in zip(axes, sample_ids):
        img_path = os.path.join(IMAGE_DIR, f"{img_id}.jpg")
        annotation_path = os.path.join(ANNOTATION_DIR, f"{img_id}.xml")

        if not (os.path.exists(img_path) and os.path.exists(annotation_path)):
            ax.text(0.5, 0.5, f"Image or annotation not found: {img_id}", ha='center', va='center')
            ax.axis('off')
            continue

        # cv2.imread returns None instead of raising when the file cannot be decoded
        img = cv2.imread(img_path)
        if img is None:
            ax.text(0.5, 0.5, f"Image could not be read: {img_id}", ha='center', va='center')
            ax.axis('off')
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Parse annotation
        annotation = parse_annotation(annotation_path)

        # Draw bounding boxes
        for obj in annotation['objects']:
            class_name = obj['name']
            xmin, ymin, xmax, ymax = obj['bbox']

            # The image is already RGB here, so the RGB tuples are used as-is;
            # unknown classes fall back to gray.
            color = colors.get(class_name, (128, 128, 128))

            cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 2)

            # Class label on a filled background strip
            label = f"{class_name}"
            (text_width, text_height), _ = cv2.getTextSize(label, font, font_scale, font_thickness)

            # Put the strip above the box; when that would leave the canvas
            # (box touching the top edge), put it just inside the box instead.
            strip_top = ymin - text_height - 10
            if strip_top < 0:
                strip_top = ymin
            strip_bottom = strip_top + text_height + 10

            cv2.rectangle(img, (xmin, strip_top), (xmin + text_width + 10, strip_bottom), color, -1)
            cv2.putText(img, label, (xmin + 5, strip_bottom - 5), font,
                        font_scale, (255, 255, 255), font_thickness)

        # Show image
        ax.imshow(img)
        ax.set_title(f"Image ID: {img_id} ({annotation['width']}x{annotation['height']})")
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Visualize random samples from training set (skipped when no training IDs were loaded)
if train_ids:
    print("Visualizing random training samples...")
    # Samples are drawn with the `random` module, so the images differ between runs
    visualize_samples(train_ids, num_samples=5)

In [None]:
# Analyze segmentation masks (if available)
def visualize_segmentation(image_ids, num_samples=3):
    """Show images side by side with their semantic and instance segmentation masks.

    Args:
        image_ids: Sequence of image IDs to choose from.
        num_samples: Upper bound on the number of images shown.

    Only IDs that have both a semantic and an instance mask file are eligible.
    Prints a message and returns when no such ID exists.
    """
    def _mask_paths(img_id):
        # (semantic mask path, instance mask path) for one image ID
        return (os.path.join(SEGMENTATION_DIR, f"{img_id}.png"),
                os.path.join(SEGMENTATION_INSTANCE_DIR, f"{img_id}.png"))

    def _load_rgb(path):
        # Convert to RGB so palette-indexed masks are shown in their stored
        # palette colours instead of raw index values pushed through imshow's
        # default colormap (VOC masks are presumably palette PNGs; RGB input is
        # unaffected). The context manager closes the file handle promptly.
        with Image.open(path) as im:
            return np.array(im.convert('RGB'))

    # Check every candidate first. Probing only a small random subset often
    # finds nothing, because only a fraction of the images have masks.
    eligible_ids = [
        img_id for img_id in image_ids
        if all(os.path.exists(path) for path in _mask_paths(img_id))
    ]
    if not eligible_ids:
        print("Could not find samples with both semantic and instance segmentation masks.")
        return

    sample_count = max(0, min(num_samples, len(eligible_ids)))
    sample_ids = random.sample(eligible_ids, sample_count)

    # Create visualization for each sample
    for img_id in sample_ids:
        img_path = os.path.join(IMAGE_DIR, f"{img_id}.jpg")
        seg_path, inst_path = _mask_paths(img_id)

        # Read image and masks
        img = _load_rgb(img_path)
        seg_mask = _load_rgb(seg_path)
        inst_mask = _load_rgb(inst_path)

        # One row: original, semantic mask, instance mask
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        panels = (
            (img, f"Original Image: {img_id}"),
            (seg_mask, "Semantic Segmentation Mask"),
            (inst_mask, "Instance Segmentation Mask"),
        )
        for ax, (pixels, title) in zip(axes, panels):
            ax.imshow(pixels)
            ax.set_title(title, fontsize=14)
            ax.axis('off')

        plt.tight_layout()
        plt.show()

# Visualize segmentation masks for training images (skipped when no training IDs were loaded)
if train_ids:
    print("Visualizing segmentation mask examples...")
    # Uses the function's default num_samples
    visualize_segmentation(train_ids)

In [None]:
# Analyze number of objects per image
def analyze_objects_per_image(image_ids):
    """Count objects per image, overall and broken down by class.

    Images whose annotation XML does not exist are skipped.

    Returns:
        A tuple ``(objects_per_image, class_count_per_image)``: the object
        count of each annotated image, and a dict mapping class name to a list
        with one entry per image containing that class (the number of objects
        of that class in the image).
    """
    counts_per_image = []
    per_class_counts = {}

    for image_id in tqdm(image_ids, desc="Analyzing objects per image"):
        xml_path = os.path.join(ANNOTATION_DIR, f"{image_id}.xml")
        if not os.path.exists(xml_path):
            continue

        found = parse_annotation(xml_path)['objects']
        counts_per_image.append(len(found))

        # Tally this image's objects by class, then file each tally under its class
        in_image = Counter(entry['name'] for entry in found)
        for cls_name, n_in_image in in_image.items():
            per_class_counts.setdefault(cls_name, []).append(n_in_image)

    return counts_per_image, per_class_counts

# Analyze objects per image: histogram of object counts, then single- vs multi-class pie chart
if train_ids:
    print("Analyzing objects per image in training set...")
    objects_per_image, class_count_per_image = analyze_objects_per_image(train_ids)

    avg_objects = sum(objects_per_image) / len(objects_per_image) if objects_per_image else 0

    # Plot distribution of objects per image (max() would fail on an empty list)
    if objects_per_image:
        max_objects = max(objects_per_image)

        plt.figure(figsize=(12, 6))
        plt.hist(objects_per_image, bins=range(0, max_objects + 2), alpha=0.7, color='blue')
        plt.title('Objects per Image Distribution', fontsize=16)
        plt.xlabel('Number of Objects', fontsize=14)
        plt.ylabel('Number of Images', fontsize=14)
        plt.grid(alpha=0.3)
        plt.xticks(range(0, max_objects + 1))

        # Mark the mean with a dashed vertical line and a label next to it
        plt.axvline(x=avg_objects, color='red', linestyle='--')
        plt.text(avg_objects + 0.1, plt.ylim()[1] * 0.9, f'Average: {avg_objects:.2f}', color='red')

        plt.tight_layout()
        plt.show()

    # Count distinct classes per image. class_count_per_image is keyed by class
    # and cannot say which classes share an image, so the annotations are
    # walked again here.
    classes_per_image = []
    for img_id in tqdm(train_ids, desc="Counting classes per image"):
        annotation_path = os.path.join(ANNOTATION_DIR, f"{img_id}.xml")
        if os.path.exists(annotation_path):
            annotation = parse_annotation(annotation_path)
            classes_per_image.append(len({obj['name'] for obj in annotation['objects']}))

    multi_class_images = sum(1 for n_classes in classes_per_image if n_classes > 1)
    single_class_images = sum(1 for n_classes in classes_per_image if n_classes == 1)

    # Plot pie chart showing single vs multi-class image ratio (needs at least one image)
    if single_class_images + multi_class_images > 0:
        plt.figure(figsize=(8, 8))
        plt.pie([single_class_images, multi_class_images],
                labels=['Single-class Images', 'Multi-class Images'],
                autopct='%1.1f%%',
                colors=['lightblue', 'lightgreen'],
                startangle=90,
                explode=(0, 0.1))
        plt.title('Single vs Multi-class Images Ratio', fontsize=16)
        plt.axis('equal')
        plt.tight_layout()
        plt.show()

In [None]:
# Summarize dataset features
def summarize_dataset(train_ids, val_ids=None):
    """Print image, object and class statistics for the training and validation splits.

    Args:
        train_ids: Image IDs of the training split; its section is skipped when empty.
        val_ids: Image IDs of the validation split; its section is skipped when
            empty or None.

    Averages are taken over the number of IDs given, whether or not each ID
    has an annotation file.
    """
    def _collect(image_ids, progress_label):
        # Gather every annotated object and the set of class names for one split
        gathered, class_names = [], set()
        for image_id in tqdm(image_ids, desc=progress_label):
            xml_path = os.path.join(ANNOTATION_DIR, f"{image_id}.xml")
            if not os.path.exists(xml_path):
                continue
            found = parse_annotation(xml_path)['objects']
            gathered.extend(found)
            class_names.update(entry['name'] for entry in found)
        return gathered, class_names

    def _report(header, image_total, gathered, class_names):
        # Print the statistics lines shared by both splits
        print(header)
        print(f"- Number of images: {image_total}")
        print(f"- Total objects: {len(gathered)}")
        print(f"- Number of classes: {len(class_names)}")
        print(f"- Average objects per image: {len(gathered) / image_total:.2f}")

    banner = "=" * 50
    print(banner)
    print("VOC2007 Dataset Summary")
    print(banner)

    # Training set statistics (the class list is printed for this split only)
    if train_ids:
        train_objects, train_classes = _collect(train_ids, "Summarizing training set")
        _report("\nTraining Set:", len(train_ids), train_objects, train_classes)
        print(f"- Class list: {', '.join(sorted(train_classes))}")

    # Validation set statistics
    if val_ids:
        val_objects, val_classes = _collect(val_ids, "Summarizing validation set")
        _report("\nValidation Set:", len(val_ids), val_objects, val_classes)

    # Fixed commentary; these lines are not derived from the statistics above
    print("\nNotes:")
    for note in (
        "- VOC2007 dataset is relatively balanced, but 'person' class typically has higher proportion",
        "- Dataset includes samples of varying difficulty, with 'difficult=1' samples potentially ignored in evaluation",
        "- Most common object size range is medium-sized, but there's significant size variation",
        "- Some images contain objects from multiple classes, adding to detection complexity",
    ):
        print(note)
    print(banner)

# Generate dataset summary for the training and validation splits (skipped when no training IDs were loaded)
if train_ids:
    summarize_dataset(train_ids, val_ids)