# Caltech-101 Dataset: Exploratory Data Analysis

This notebook performs exploratory data analysis on the Caltech-101 dataset, including:
- Dataset loading and statistics
- Class distribution visualization
- Sample image visualizations
- Data augmentation demonstrations
- Train/validation/test split analysis
- Image size and aspect ratio analysis

In [None]:
import random
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from PIL import Image
from torchvision import transforms

# Put the parent directory on the import path so the `src` package resolves
sys.path.insert(0, '..')

from src.data.dataset import (
    Caltech101Dataset,
    get_dataset_statistics,
    create_dataloaders,
    get_transforms
)

%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

## 1. Load Dataset and Basic Statistics

In [None]:
# Path to the Caltech-101 image directory, relative to this notebook
DATA_DIR = '../data/101_ObjectCategories'

# Compute the summary statistics once; later cells reuse `stats`
print("Computing dataset statistics...")
stats = get_dataset_statistics(DATA_DIR)

# Report the headline numbers, one "caption: value" line per statistic
print("\nDataset Statistics:")
for caption, key in (
    ("Total images", 'num_images'),
    ("Number of classes", 'num_classes'),
    ("Mean RGB", 'mean'),
    ("Std RGB", 'std'),
):
    print(f"  {caption}: {stats[key]}")

## 2. Class Distribution Analysis

In [None]:
# Class names and their image counts, paired by position.
# NOTE(review): this assumes stats['class_distribution'] iterates in the same
# order as stats['class_names'] -- confirm against get_dataset_statistics.
class_names = stats['class_names']
class_counts = list(stats['class_distribution'].values())

# Rank the classes from most to least frequent
sorted_indices = np.argsort(class_counts)[::-1]
sorted_classes = [class_names[rank] for rank in sorted_indices]
sorted_counts = [class_counts[rank] for rank in sorted_indices]
mean_count = np.mean(sorted_counts)

# Bar chart of per-class counts with the mean drawn as a dashed reference line
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(range(len(sorted_classes)), sorted_counts, color='steelblue', alpha=0.7)
ax.axhline(y=mean_count, color='red', linestyle='--',
           linewidth=2, label=f'Mean: {mean_count:.1f}')
ax.set_xlabel('Class (sorted by frequency)', fontsize=12)
ax.set_ylabel('Number of Images', fontsize=12)
ax.set_title('Caltech-101 Class Distribution', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Text summary; the first/last entries of the sorted lists are the extremes
print(
    "\nClass Distribution Summary:",
    f"  Mean samples per class: {mean_count:.2f}",
    f"  Median samples per class: {np.median(sorted_counts):.2f}",
    f"  Min samples: {np.min(sorted_counts)} ({sorted_classes[-1]})",
    f"  Max samples: {np.max(sorted_counts)} ({sorted_classes[0]})",
    sep="\n",
)

## 3. Sample Images from Different Classes

In [None]:
# Display-only dataset: the transform just resizes and converts to a tensor
# (no normalization), so the tensors can be passed to imshow directly.
dataset = Caltech101Dataset(
    root_dir=DATA_DIR,
    image_size=128,
    transform=transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor()
    ])
)

# Grid layout: one row per class, one column per sampled image
num_classes_to_show = 10
num_samples_per_class = 5

# Never ask for more distinct classes than exist; np.random.choice with
# replace=False raises if the requested size exceeds the population.
num_rows = min(num_classes_to_show, len(class_names))
random_classes = np.random.choice(len(class_names), num_rows, replace=False)

# squeeze=False keeps `axes` two-dimensional even for a single row or column,
# so axes[i, j] indexing below is always valid.
fig, axes = plt.subplots(num_rows, num_samples_per_class,
                         figsize=(15, 3 * num_rows),
                         squeeze=False)

# Turn off every panel up front so panels that stay empty (classes with fewer
# than num_samples_per_class images) do not render as blank framed axes.
for ax in axes.ravel():
    ax.axis('off')

for i, class_idx in enumerate(random_classes):
    # Dataset positions of all images belonging to this class
    class_indices = [idx for idx, label in enumerate(dataset.labels) if label == class_idx]

    # Draw up to num_samples_per_class distinct images from the class
    sample_indices = np.random.choice(class_indices,
                                      min(num_samples_per_class, len(class_indices)),
                                      replace=False)

    for j, idx in enumerate(sample_indices):
        image, label = dataset[idx]

        # Channels-first tensor -> channels-last array, as imshow expects
        img_display = image.permute(1, 2, 0).numpy()
        axes[i, j].imshow(img_display)

    # Row label. axis('off') suppresses axis labels, so a ylabel would never be
    # drawn; a text artist positioned in axes coordinates is still rendered.
    axes[i, 0].text(-0.05, 0.5, class_names[class_idx],
                    transform=axes[i, 0].transAxes,
                    fontsize=10, ha='right', va='center')

plt.suptitle('Sample Images from Random Classes', fontsize=16, fontweight='bold', y=1.0)
plt.tight_layout()
plt.show()

## 4. Data Augmentation Demonstration

In [None]:
# Training transform with augmentation enabled; the second transform returned
# by get_transforms is not needed in this cell.
train_transform, _ = get_transforms(image_size=128, augmentation=True)

# Pick one random image; `image` is the un-augmented tensor from `dataset`
sample_idx = np.random.choice(len(dataset))
image, label = dataset[sample_idx]

# Channels-first tensor -> channels-last array for imshow
original_image = image.permute(1, 2, 0).numpy()

# The eight outer cells of the 3x3 grid hold augmented versions; the centre
# cell (1, 1) is reserved for the original image.
positions = [(0,0), (0,1), (0,2), (1,0), (1,2), (2,0), (2,1), (2,2)]
num_augmentations = len(positions)
fig, axes = plt.subplots(3, 3, figsize=(12, 12))

# Show original in center
axes[1, 1].imshow(original_image)
axes[1, 1].set_title('Original', fontweight='bold')
axes[1, 1].axis('off')

# Re-read the source file as a PIL image, since the transform pipeline is
# applied to a PIL image here. The context manager closes the file handle;
# convert() returns an independent in-memory copy that outlives it.
with Image.open(dataset.image_paths[sample_idx]) as source_image:
    pil_image = source_image.convert('RGB')

# Per-channel statistics used to undo normalization for display. Built once,
# outside the loop, and shaped (3, 1, 1) to broadcast over height and width.
# NOTE(review): assumes train_transform normalizes with these ImageNet
# mean/std values -- confirm against get_transforms.
mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

# Each call to train_transform draws fresh random augmentation parameters
for k, (i, j) in enumerate(positions):
    augmented = train_transform(pil_image)

    # Denormalize, then clamp into [0, 1] so imshow accepts the float image
    augmented = torch.clamp(augmented * std + mean, 0, 1)

    axes[i, j].imshow(augmented.permute(1, 2, 0).numpy())
    axes[i, j].set_title(f'Augmentation {k+1}')
    axes[i, j].axis('off')

plt.suptitle(f'Data Augmentation Examples\nClass: {class_names[label]}',
            fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 5. Train/Val/Test Split Analysis

In [None]:
# Build the three loaders only to inspect how the data is partitioned; the
# fourth value returned by create_dataloaders is not needed here.
train_loader, val_loader, test_loader, _ = create_dataloaders(
    data_dir=DATA_DIR,
    image_size=128,
    batch_size=32,
    augmentation=False
)

# Number of samples behind each loader
train_size, val_size, test_size = (
    len(loader.dataset) for loader in (train_loader, val_loader, test_loader)
)
sizes = [train_size, val_size, test_size]
total_size = sum(sizes)

# Shared styling for both panels
labels = ['Train', 'Validation', 'Test']
colors = ['#66b3ff', '#ff9999', '#99ff99']
explode = (0.05,) * 3

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: proportions as a pie chart
ax1.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
ax1.set_title('Dataset Split Proportions', fontweight='bold')

# Right panel: absolute sizes as a bar chart
ax2.bar(labels, sizes, color=colors, alpha=0.7)
ax2.set_ylabel('Number of Images', fontsize=12)
ax2.set_title('Dataset Split Sizes', fontweight='bold')
ax2.grid(True, alpha=0.3)

# Write each count just above its bar (offset of 50 in data units)
for bar_position, count in enumerate(sizes):
    ax2.text(bar_position, count + 50, str(count),
             ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Text summary of the same numbers
print("\nDataset Split:")
for split_name, split_size in (("Train", train_size), ("Val", val_size), ("Test", test_size)):
    print(f"  {split_name}: {split_size} ({100*split_size/total_size:.1f}%)")
print(f"  Total: {total_size}")

## 6. Image Size Analysis

In [None]:
def plot_distribution(ax, values, color, xlabel, title, mean_precision=1):
    """Draw a 30-bin histogram of `values` on `ax` with a dashed line at the mean.

    Args:
        ax: Matplotlib axes to draw on.
        values: Sequence of numbers to histogram.
        color: Fill colour of the bars.
        xlabel: Label for the x-axis.
        title: Panel title.
        mean_precision: Number of decimals used for the mean in the legend.
    """
    mean_value = np.mean(values)
    ax.hist(values, bins=30, color=color, alpha=0.7, edgecolor='black')
    ax.set_xlabel(xlabel, fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)
    ax.set_title(title, fontweight='bold')
    ax.axvline(mean_value, color='red', linestyle='--', linewidth=2,
               label=f'Mean: {mean_value:.{mean_precision}f}')
    ax.legend()
    ax.grid(True, alpha=0.3)


# Measure original (un-resized) dimensions on a random subset of the dataset
sample_size = 500
sample_indices = random.sample(range(len(dataset)), min(sample_size, len(dataset)))

widths = []
heights = []
aspects = []

for idx in sample_indices:
    # Only the size is needed; the context manager releases the file handle
    # right away instead of leaving hundreds of files open.
    with Image.open(dataset.image_paths[idx]) as img:
        width, height = img.size
    widths.append(width)
    heights.append(height)
    aspects.append(width / height)

# 2x2 summary figure: three histograms plus a width-vs-height scatter
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

plot_distribution(axes[0, 0], widths, 'steelblue',
                  'Width (pixels)', 'Image Width Distribution')
plot_distribution(axes[0, 1], heights, 'coral',
                  'Height (pixels)', 'Image Height Distribution')
# Aspect ratios are near 1, so show the mean with two decimals
plot_distribution(axes[1, 0], aspects, 'green',
                  'Aspect Ratio (width/height)', 'Aspect Ratio Distribution',
                  mean_precision=2)

# Scatter plot
axes[1, 1].scatter(widths, heights, alpha=0.5, s=10)
axes[1, 1].set_xlabel('Width (pixels)', fontsize=11)
axes[1, 1].set_ylabel('Height (pixels)', fontsize=11)
axes[1, 1].set_title('Image Dimensions Scatter', fontweight='bold')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Report the number of images actually sampled, which is smaller than
# sample_size when the dataset holds fewer images than requested.
print(f"\nImage Size Statistics (from {len(sample_indices)} samples):")
print(f"  Width - Mean: {np.mean(widths):.1f}, Std: {np.std(widths):.1f}")
print(f"  Height - Mean: {np.mean(heights):.1f}, Std: {np.std(heights):.1f}")
print(f"  Aspect Ratio - Mean: {np.mean(aspects):.2f}, Std: {np.std(aspects):.2f}")

## Summary

This exploratory analysis reveals:
- The Caltech-101 dataset contains 101 object categories with a varying number of samples per class
- Images have different sizes and aspect ratios, requiring resizing for model input
- Data augmentation can significantly increase training data diversity
- The dataset is split into 70% training, 15% validation, and 15% test sets (stratified)