# Cats vs Dogs Classification - Exploratory Data Analysis

This notebook provides exploratory data analysis for the Cats vs Dogs classification dataset.

In [None]:
# Import required libraries
import os
import sys
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from collections import Counter
import random

# Locate the project root. When the notebook is started from a directory named
# exactly 'notebooks', the root is its parent; otherwise the current working
# directory is assumed to be the root. Comparing the directory *name* avoids
# the false positives of a substring test on the whole path
# (e.g. '/home/me/notebooks-archive/project').
cwd = Path.cwd()
project_root = cwd.parent if cwd.name == 'notebooks' else cwd

# Make project modules importable. Skip the insert when the entry is already
# present so re-running this cell does not pile up duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")

## 1. Dataset Overview

In [None]:
# Locations of the raw dataset and of the processed dataset
RAW_DATA_DIR = project_root.joinpath('data', 'raw')
PROCESSED_DATA_DIR = project_root.joinpath('data', 'processed')

# Report which of the two directories are present on disk
for label, directory in (("Raw", RAW_DATA_DIR), ("Processed", PROCESSED_DATA_DIR)):
    print(f"{label} data exists: {directory.exists()}")

In [None]:
def count_images(data_dir: Path) -> dict:
    """Count image files in each class sub-directory of ``data_dir``.

    Args:
        data_dir: Directory whose immediate sub-directories are treated as
            classes.

    Returns:
        Mapping of sub-directory name to the number of regular files directly
        inside it whose extension is ``.jpg``, ``.jpeg`` or ``.png`` (matched
        case-insensitively). Keys are inserted in sorted name order. An empty
        dict is returned when ``data_dir`` does not exist.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    counts = {}

    if not data_dir.exists():
        return counts

    # iterdir() yields entries in arbitrary, filesystem-dependent order; sort
    # so that tables and plots built from this dict are stable across runs.
    for class_dir in sorted(data_dir.iterdir()):
        if not class_dir.is_dir():
            continue
        # Require is_file() so a directory that happens to be named like an
        # image (e.g. 'thumbs.jpg/') is not counted as one.
        counts[class_dir.name] = sum(
            1 for entry in class_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() in image_suffixes
        )

    return counts

# Per-class and overall image totals for the raw dataset
if not RAW_DATA_DIR.exists():
    print("Raw data not found. Please download the dataset first.")
else:
    raw_counts = count_images(RAW_DATA_DIR)
    print("Raw data counts:")
    for label in raw_counts:
        print(f"  {label}: {raw_counts[label]} images")
    grand_total = sum(raw_counts.values())
    print(f"  Total: {grand_total} images")

## 2. Sample Images Visualization

In [None]:
def visualize_samples(data_dir: Path, n_samples: int = 8):
    """Show a grid of randomly chosen images, one row per class directory.

    Args:
        data_dir: Directory whose immediate sub-directories are the classes.
        n_samples: Number of columns, i.e. the maximum number of images drawn
            per class. Classes with fewer images leave trailing cells blank.

    Returns:
        None. A message is printed instead of a figure when ``data_dir`` does
        not exist or contains no class sub-directories.
    """
    if not data_dir.exists():
        print("Data directory not found.")
        return

    # Sorted so the row order does not depend on filesystem enumeration order.
    classes = sorted(d for d in data_dir.iterdir() if d.is_dir())
    if not classes:
        # plt.subplots() rejects a grid with zero rows, so bail out early.
        print("No class directories found.")
        return

    image_suffixes = {'.jpg', '.jpeg', '.png'}

    # squeeze=False keeps `axes` two-dimensional even for a single class or a
    # single column, so axes[i, j] is always a valid index.
    fig, axes = plt.subplots(len(classes), n_samples,
                             figsize=(16, 4 * len(classes)), squeeze=False)

    # Hide every frame up front so cells left without an image stay blank.
    for ax in axes.flat:
        ax.axis('off')

    for i, class_dir in enumerate(classes):
        # Case-insensitive extension match (same rule as count_images), sorted
        # so that a seeded `random` yields the same selection on every run.
        images = sorted(f for f in class_dir.iterdir()
                        if f.is_file() and f.suffix.lower() in image_suffixes)
        samples = random.sample(images, min(n_samples, len(images)))

        # Label the row on its first cell, even when the class has no images.
        axes[i, 0].set_title(class_dir.name, fontsize=14, fontweight='bold')

        for j, img_path in enumerate(samples):
            # Context manager releases the file handle once the image is drawn.
            with Image.open(img_path) as img:
                axes[i, j].imshow(img)

    plt.tight_layout()
    plt.show()

# Preview up to six random images per class from the raw dataset
if RAW_DATA_DIR.exists():
    print("Sample images from raw dataset:")
    visualize_samples(data_dir=RAW_DATA_DIR, n_samples=6)

## 3. Image Size Distribution

In [None]:
def analyze_image_sizes(data_dir: Path, max_samples: int = 500):
    """Collect the pixel dimensions of a random sample of images.

    Args:
        data_dir: Directory searched recursively for ``.jpg``, ``.jpeg`` and
            ``.png`` files (extension matched case-insensitively).
        max_samples: Upper bound on the number of images opened.

    Returns:
        ``(widths, heights)`` as two NumPy arrays of equal length, one entry
        per image that could be opened. Both arrays are empty when no readable
        image is found. ``(None, None)`` is returned when ``data_dir`` does not
        exist.
    """
    if not data_dir.exists():
        print("Data directory not found.")
        return None, None

    image_suffixes = {'.jpg', '.jpeg', '.png'}
    # Same case-insensitive rule as count_images; sorted so that a seeded
    # `random` draws the same sample on every run.
    image_files = sorted(
        p for p in data_dir.rglob('*')
        if p.suffix.lower() in image_suffixes and p.is_file()
    )
    samples = random.sample(image_files, min(max_samples, len(image_files)))

    widths = []
    heights = []
    skipped = 0

    for img_path in samples:
        try:
            with Image.open(img_path) as img:
                width, height = img.size
        except Exception:
            # Best effort: a file that cannot be opened must not abort the
            # analysis, but it is counted so the omission is visible.
            skipped += 1
            continue
        widths.append(width)
        heights.append(height)

    if skipped:
        print(f"Skipped {skipped} unreadable image(s).")

    return np.array(widths), np.array(heights)

# Plot width, height and aspect-ratio histograms for a sample of raw images
if RAW_DATA_DIR.exists():
    widths, heights = analyze_image_sizes(RAW_DATA_DIR)

    if widths is not None and len(widths) == 0:
        # analyze_image_sizes returns empty arrays when nothing could be read;
        # min()/max() on an empty array raise, so skip the analysis instead.
        print("No readable images found; skipping size analysis.")
    elif widths is not None:
        aspect_ratios = widths / heights

        # One tuple per panel:
        # (data, x-label, title, bar colour, reference-line x, reference label)
        # A colour of None leaves matplotlib's default bar colour in place.
        panels = [
            (widths, 'Width (pixels)', 'Image Width Distribution', None,
             np.mean(widths), f'Mean: {np.mean(widths):.0f}'),
            (heights, 'Height (pixels)', 'Image Height Distribution', 'orange',
             np.mean(heights), f'Mean: {np.mean(heights):.0f}'),
            (aspect_ratios, 'Aspect Ratio (W/H)', 'Aspect Ratio Distribution', 'green',
             1.0, 'Square'),
        ]

        fig, axes = plt.subplots(1, 3, figsize=(15, 4))

        for ax, (values, xlabel, title, color, ref_x, ref_label) in zip(axes, panels):
            ax.hist(values, bins=30, edgecolor='black', alpha=0.7, color=color)
            ax.set_xlabel(xlabel)
            ax.set_ylabel('Count')
            ax.set_title(title)
            # Dashed red marker: the mean for width/height, 1.0 (square) for ratio
            ax.axvline(ref_x, color='red', linestyle='--', label=ref_label)
            ax.legend()

        plt.tight_layout()
        plt.show()

        print("\nImage Size Statistics:")
        print(f"  Width - Min: {widths.min()}, Max: {widths.max()}, Mean: {widths.mean():.0f}")
        print(f"  Height - Min: {heights.min()}, Max: {heights.max()}, Mean: {heights.mean():.0f}")

## 4. Data Preprocessing Demonstration

In [None]:
from src.data.preprocess import load_and_resize_image, get_train_transforms, get_val_transforms
import albumentations as A

# Demonstrate preprocessing
def show_preprocessing_pipeline(image_path: str):
    """Display an image next to its resized version.

    The left panel shows the file as opened by PIL, titled with its original
    size. The right panel shows the result of
    ``load_and_resize_image(image_path)``, or stays blank when that helper
    returns ``None``.

    Args:
        image_path: Path of the image file to display.
    """
    # The context manager releases the file handle once the original has been
    # drawn; previously the handle was left open.
    with Image.open(image_path) as original:
        # Load and resize
        resized = load_and_resize_image(image_path)

        fig, axes = plt.subplots(1, 2, figsize=(10, 5))

        axes[0].imshow(original)
        axes[0].set_title(f'Original ({original.size[0]}x{original.size[1]})')
        axes[0].axis('off')

    if resized is not None:
        axes[1].imshow(resized)
        # NOTE(review): reads shape[1] / shape[0] as width / height, i.e. it
        # assumes load_and_resize_image returns an array laid out as
        # (height, width, ...) — confirm against src.data.preprocess.
        axes[1].set_title(f'Resized ({resized.shape[1]}x{resized.shape[0]})')
    axes[1].axis('off')

    plt.tight_layout()
    plt.show()

# Find a sample image
if RAW_DATA_DIR.exists():
    # rglob() is lazy: take only its first match instead of materialising the
    # full recursive listing just to discard all but one entry.
    first_jpg = next(RAW_DATA_DIR.rglob('*.jpg'), None)
    sample_images = [first_jpg] if first_jpg is not None else []
    if sample_images:
        show_preprocessing_pipeline(str(sample_images[0]))

## 5. Data Augmentation Examples

In [None]:
def show_augmentations(image_path: str, n_augmentations: int = 6):
    """Display an image alongside randomly augmented variants of it.

    The figure is a grid four panels wide: the first panel is the image
    returned by ``load_and_resize_image``, followed by ``n_augmentations``
    independently augmented copies. Rows are added as needed, and panels left
    over in the last row are blank.

    Args:
        image_path: Path of the image file to augment.
        n_augmentations: Number of augmented variants to draw; negative values
            are treated as zero.

    Returns:
        None. A message is printed and nothing is drawn when the image cannot
        be loaded.
    """
    # Load and resize image
    image = load_and_resize_image(image_path)

    if image is None:
        print("Could not load image")
        return

    # Get augmentation transforms (without normalization for visualization)
    # NOTE(review): recent albumentations releases appear to deprecate
    # GaussNoise's `var_limit` and ShiftScaleRotate — confirm against the
    # version this project pins.
    augment = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, p=0.5),
        A.GaussNoise(var_limit=(10.0, 50.0), p=0.3),
        A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.3),
    ])

    # Size the grid from the request instead of a fixed 2x4 layout, so values
    # above 7 are no longer silently truncated. The default of 6 still yields
    # a 2x4 grid with a (16, 8) figure.
    n_cols = 4
    n_panels = 1 + max(0, n_augmentations)
    n_rows = (n_panels + n_cols - 1) // n_cols  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    # Hide all frames first so unused trailing panels do not show empty axes.
    for ax in axes:
        ax.axis('off')

    axes[0].imshow(image)
    axes[0].set_title('Original', fontsize=12)

    for i in range(1, n_panels):
        # Each call re-samples the random transforms
        augmented = augment(image=image)['image']
        axes[i].imshow(augmented)
        axes[i].set_title(f'Augmented {i}', fontsize=12)

    plt.suptitle('Data Augmentation Examples', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

# Show augmentation examples
if RAW_DATA_DIR.exists():
    # rglob() is lazy: take only its first match instead of materialising the
    # full recursive listing just to discard all but one entry.
    first_jpg = next(RAW_DATA_DIR.rglob('*.jpg'), None)
    sample_images = [first_jpg] if first_jpg is not None else []
    if sample_images:
        show_augmentations(str(sample_images[0]))

## 6. Train/Val/Test Split Verification

In [None]:
# Check processed data splits
if PROCESSED_DATA_DIR.exists():
    splits = ['train', 'val', 'test']
    split_counts = {}

    # Per-class image counts for every split directory that exists
    for split in splits:
        split_dir = PROCESSED_DATA_DIR / split
        if split_dir.exists():
            split_counts[split] = count_images(split_dir)

    # Visualize splits
    if split_counts:
        # Union of classes over all splits, so a class missing from the first
        # split is still plotted (with a zero bar where it is absent).
        classes = sorted({c for per_class in split_counts.values() for c in per_class})
        present_splits = [s for s in splits if s in split_counts]
        x = np.arange(len(classes))
        width = 0.25

        fig, ax = plt.subplots(figsize=(10, 6))

        # Offset bars by position among the splits that exist, so a missing
        # split does not leave a gap inside each group.
        for i, split in enumerate(present_splits):
            counts = [split_counts[split].get(c, 0) for c in classes]
            ax.bar(x + i*width, counts, width, label=split.capitalize())

        ax.set_xlabel('Class')
        ax.set_ylabel('Number of Images')
        ax.set_title('Train/Val/Test Split Distribution')
        # Centre each tick under its group of bars; with all three splits
        # present this equals x + width.
        ax.set_xticks(x + width * (len(present_splits) - 1) / 2)
        ax.set_xticklabels(classes)
        ax.legend()

        plt.tight_layout()
        plt.show()

        # Print statistics
        print("\nSplit Statistics:")
        for split in present_splits:
            total = sum(split_counts[split].values())
            print(f"  {split.capitalize()}: {total} images")
    else:
        print("No train/val/test split directories found in processed data.")
else:
    print("Processed data not found. Run preprocessing first.")

## 7. Summary

This notebook demonstrated:
1. Dataset structure and class distribution
2. Sample image visualization
3. Image size analysis
4. Preprocessing pipeline (resizing to 224x224)
5. Data augmentation techniques
6. Train/val/test split verification

Next steps:
- Run `python src/data/preprocess.py` to preprocess the dataset if you have not already done so (Section 6 above needs the processed train/val/test splits)
- Train the model with `python src/training/train.py`