# Data Exploration and Analysis
## AASD 4014 Final Project - Group 6
### Pascal VOC 2007 Person/Dog Detection Dataset

**Team Members:**
- Athul Mathai (101520716) - Data Engineer
- Anjana Jayakumar (101567844) - ML Engineer  
- Anu Sunny (101578581) - DevOps & Deployment
- Devikaa Dinesh (101568031) - Report Writer
- Saranya Shaji (101569858) - Software Engineer
- Syed Mohamed Shakeel Syed Nizar Imam (101518452) - QA Engineer
- Tri Thanh Alan Inder Kumar (101413004) - Project Manager
- Ishika Fatwani (101494093) - UX Designer & Visualization Specialist

In [None]:
import sys
sys.path.append('/app/src')

import os
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image
import pandas as pd
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Import our custom utilities
from utils import plot_class_distribution, ensure_dir, load_json

# Set plotting style
# The matplotlib style sheet is applied first and the seaborn palette second,
# so the palette is configured after the style sheet.
# NOTE(review): presumably the order matters because applying a style sheet
# resets the colour cycle, and the 'seaborn-v0_8' name likely needs a recent
# matplotlib release — confirm against the deployed versions.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation message printed once every statement in this cell has run.
print("Libraries imported successfully!")

## 1. Dataset Overview

In [None]:
# Root of the prepared dataset plus its image and label sub-directories.
data_dir = Path('/app/data')
images_dir, labels_dir = data_dir / 'images', data_dir / 'labels'

# The statistics file is optional: report its absence and fall back to an
# empty mapping, otherwise load it and pretty-print the contents.
stats_file = data_dir / 'dataset_stats.json'
if not stats_file.exists():
    print("Dataset statistics not found. Run dataset preparation first.")
    stats = {}
else:
    stats = load_json(str(stats_file))
    print("Dataset Statistics:")
    print(json.dumps(stats, indent=2))

## 2. Class Distribution Analysis

In [None]:
def analyze_class_distribution(labels_root=None):
    """Count images and per-class instances in the train and validation splits.

    Every ``*.txt`` file under ``<labels_root>/<split>`` counts as one image.
    Every non-blank line in such a file counts as one annotation whose first
    whitespace-separated token is the integer class id (0 = person, 1 = dog).
    Annotations with any other class id are ignored.

    Args:
        labels_root: Directory that contains the ``train`` and ``val`` label
            folders. When omitted, the notebook-level ``labels_dir`` is used.

    Returns:
        dict: ``{'train': {...}, 'val': {...}}`` where each inner dict holds
        the integer counts ``'person'``, ``'dog'`` and ``'images'``. A split
        whose directory does not exist keeps all-zero counts (a message is
        printed for it).

    Raises:
        ValueError: If the first token of a non-blank line is not an integer.
    """
    root = labels_dir if labels_root is None else Path(labels_root)

    # Single source of truth for the class-id -> counter-key mapping.
    id_to_name = {0: 'person', 1: 'dog'}

    distribution = {
        split: {'person': 0, 'dog': 0, 'images': 0}
        for split in ('train', 'val')
    }

    for split, counts in distribution.items():
        split_labels_dir = root / split

        if not split_labels_dir.exists():
            print(f"Labels directory not found: {split_labels_dir}")
            continue

        label_files = list(split_labels_dir.glob('*.txt'))
        counts['images'] = len(label_files)

        for label_file in label_files:
            with open(label_file, 'r') as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        continue  # blank or whitespace-only line
                    name = id_to_name.get(int(fields[0]))
                    if name is not None:
                        counts[name] += 1

    return distribution

# Run the class-distribution analysis once the training labels exist and
# print a per-split report; otherwise leave `dist` empty for the later cells.
if not (labels_dir / 'train').exists():
    print("Dataset not prepared yet. Run dataset preparation first.")
    dist = {}
else:
    dist = analyze_class_distribution()
    print("Class Distribution Analysis:")
    for split in ('train', 'val'):
        print(f"\n{split.upper()} SET:")
        print(f"  Images: {dist[split]['images']}")
        print(f"  Person instances: {dist[split]['person']}")
        print(f"  Dog instances: {dist[split]['dog']}")
        total_instances = dist[split]['person'] + dist[split]['dog']
        if total_instances <= 0:
            # No annotations in this split: ratios would be undefined.
            continue
        print(f"  Person ratio: {dist[split]['person']/total_instances:.2%}")
        print(f"  Dog ratio: {dist[split]['dog']/total_instances:.2%}")

In [None]:
# Plot the class distribution, unless no distribution data was collected.
if not dist:
    print("No distribution data to plot.")
else:
    plot_class_distribution(dist, '/app/results/plots/class_distribution.png')

## 3. Sample Image Analysis

In [None]:
def display_sample_images(num_samples=6):
    """Display training images with their YOLO annotations drawn on top.

    Up to ``num_samples`` ``*.jpg`` files from ``images_dir / 'train'`` are
    loaded in sorted filename order. For each one, the boxes listed in the
    same-named ``.txt`` file under ``labels_dir / 'train'`` are drawn. The
    grid is saved to ``/app/results/plots/sample_images.png`` and shown.

    Args:
        num_samples: Maximum number of images to display. The grid uses three
            columns and as many rows as the selected images require.

    Returns:
        None. A message is printed and the function returns early when the
        training image directory is missing or contains no ``*.jpg`` files.
    """
    train_images_dir = images_dir / 'train'
    train_labels_dir = labels_dir / 'train'

    if not train_images_dir.exists():
        print("Training images not found. Run dataset preparation first.")
        return

    # Sorted so the same samples are shown on every run (glob order is arbitrary).
    image_files = sorted(train_images_dir.glob('*.jpg'))[:num_samples]

    if not image_files:
        print("No image files found.")
        return

    # Size the grid from the images actually selected instead of a fixed 2x3.
    n_cols = 3
    n_rows = -(-len(image_files) // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    class_names = ['person', 'dog']
    colors = [(255, 0, 0), (0, 255, 0)]  # Red for person, Green for dog

    for i, (ax, img_path) in enumerate(zip(axes, image_files)):
        ax.axis('off')

        image = cv2.imread(str(img_path))
        if image is None:
            # cv2.imread reports an unreadable file by returning None.
            ax.set_title(f'Sample {i+1}: {img_path.name} (unreadable)')
            continue

        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label_path = train_labels_dir / (img_path.stem + '.txt')
        _draw_yolo_boxes(image_rgb, label_path, class_names, colors)

        ax.imshow(image_rgb)
        ax.set_title(f'Sample {i+1}: {img_path.name}')

    # Hide grid cells that have no image so they do not render as empty frames.
    for ax in axes[len(image_files):]:
        ax.axis('off')

    # Make sure the destination exists; savefig does not create directories.
    output_path = Path('/app/results/plots/sample_images.png')
    output_path.parent.mkdir(parents=True, exist_ok=True)

    plt.tight_layout()
    plt.savefig(str(output_path), dpi=300, bbox_inches='tight')
    plt.show()


def _draw_yolo_boxes(image_rgb, label_path, class_names, colors):
    """Draw every usable YOLO box from ``label_path`` onto ``image_rgb`` in place.

    Lines with fewer than five fields, or with a class id outside
    ``class_names``, are skipped. A missing label file draws nothing.
    """
    if not label_path.exists():
        return

    h, w = image_rgb.shape[:2]

    with open(label_path, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 5:
                continue  # blank or truncated annotation line

            class_id = int(parts[0])
            if not 0 <= class_id < len(class_names):
                continue  # class id with no name/colour entry

            x_center, y_center, width, height = map(float, parts[1:5])

            # Convert normalized YOLO centre/size values to pixel corners.
            x1 = int((x_center - width/2) * w)
            y1 = int((y_center - height/2) * h)
            x2 = int((x_center + width/2) * w)
            y2 = int((y_center + height/2) * h)

            color = colors[class_id]
            cv2.rectangle(image_rgb, (x1, y1), (x2, y2), color, 2)

            # Keep the caption baseline inside the image when the box starts
            # near the top edge.
            text_y = max(y1 - 10, 20)
            cv2.putText(image_rgb, class_names[class_id], (x1, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)


# Display sample images
display_sample_images()

## 4. Image Size Analysis

In [None]:
def analyze_image_sizes(sample_size=100):
    """Plot and summarise the pixel dimensions of a sample of training images.

    The first ``sample_size`` ``*.jpg`` files (sorted by name) under
    ``images_dir / 'train'`` are opened to read their width and height.
    Histograms of width, height and aspect ratio plus a width-vs-height
    scatter plot are saved to ``/app/results/plots/image_size_analysis.png``
    and shown. Summary statistics are then printed.

    Args:
        sample_size: Maximum number of images to inspect.

    Returns:
        None. A message is printed and the function returns early when the
        training image directory is missing or no image could be read.
    """
    train_images_dir = images_dir / 'train'

    if not train_images_dir.exists():
        print("Training images not found.")
        return

    # Sorted so the sample is the same on every run (glob order is arbitrary).
    image_files = sorted(train_images_dir.glob('*.jpg'))[:sample_size]

    widths, heights, ratios = [], [], []

    for img_path in image_files:
        try:
            with Image.open(img_path) as img:
                w, h = img.size
        except OSError as exc:
            # One corrupt file should not abort the whole analysis.
            print(f"Skipping unreadable image {img_path.name}: {exc}")
            continue
        widths.append(w)
        heights.append(h)
        ratios.append(w/h)

    if not widths:
        print("No images to analyze.")
        return

    # Create visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # (axis, values, colour, title, x-label) for each histogram panel.
    histograms = [
        (axes[0, 0], widths, 'skyblue', 'Image Width Distribution', 'Width (pixels)'),
        (axes[0, 1], heights, 'lightcoral', 'Image Height Distribution', 'Height (pixels)'),
        (axes[1, 0], ratios, 'lightgreen', 'Aspect Ratio Distribution', 'Width/Height Ratio'),
    ]
    for ax, values, color, title, xlabel in histograms:
        ax.hist(values, bins=20, alpha=0.7, color=color)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')

    # Scatter plot of width vs height
    axes[1, 1].scatter(widths, heights, alpha=0.6, color='purple')
    axes[1, 1].set_title('Width vs Height')
    axes[1, 1].set_xlabel('Width (pixels)')
    axes[1, 1].set_ylabel('Height (pixels)')

    # Make sure the destination exists; savefig does not create directories.
    output_path = Path('/app/results/plots/image_size_analysis.png')
    output_path.parent.mkdir(parents=True, exist_ok=True)

    plt.tight_layout()
    plt.savefig(str(output_path), dpi=300, bbox_inches='tight')
    plt.show()

    # Print statistics
    print(f"\nImage Size Statistics (n={len(widths)}):")
    print(f"Width - Mean: {np.mean(widths):.1f}, Std: {np.std(widths):.1f}, Range: {min(widths)}-{max(widths)}")
    print(f"Height - Mean: {np.mean(heights):.1f}, Std: {np.std(heights):.1f}, Range: {min(heights)}-{max(heights)}")
    print(f"Aspect Ratio - Mean: {np.mean(ratios):.2f}, Std: {np.std(ratios):.2f}")


# Analyze image sizes
analyze_image_sizes()

## 5. Bounding Box Analysis

In [None]:
def analyze_bounding_boxes():
    """Plot and summarise normalized bounding-box sizes per class.

    Every ``*.txt`` file under ``labels_dir / 'train'`` is read. Each
    annotation line contributes its width, height and area (width * height),
    taken from the 4th and 5th fields, to the class named by its first field
    (0 = person, 1 = dog). Lines with fewer than five fields or a class id
    outside that range are skipped. One row of three histograms per class is
    saved to ``/app/results/plots/bbox_analysis.png`` and shown. Per-class
    statistics are then printed.

    Returns:
        None. A message is printed and the function returns early when the
        training labels directory is missing.
    """
    train_labels_dir = labels_dir / 'train'

    if not train_labels_dir.exists():
        print("Training labels not found.")
        return

    class_names = ['person', 'dog']
    bbox_data = {name: {'widths': [], 'heights': [], 'areas': []}
                 for name in class_names}

    for label_file in train_labels_dir.glob('*.txt'):
        with open(label_file, 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 5:
                    continue  # blank or truncated annotation line

                class_id = int(parts[0])
                # Reject negative ids too: they would otherwise index
                # class_names from the end.
                if not 0 <= class_id < len(class_names):
                    continue

                _, _, width, height = map(float, parts[1:5])
                entry = bbox_data[class_names[class_id]]
                entry['widths'].append(width)
                entry['heights'].append(height)
                entry['areas'].append(width * height)

    # (data key, display label) for the three panels / statistics per class.
    metrics = [('widths', 'Width'), ('heights', 'Height'), ('areas', 'Area')]

    # Create visualizations
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    colors = ['skyblue', 'lightcoral']

    for i, class_name in enumerate(class_names):
        if not bbox_data[class_name]['widths']:
            # No boxes for this class: hide its row instead of leaving
            # empty framed axes in the figure.
            for ax in axes[i]:
                ax.axis('off')
            continue

        for ax, (key, label) in zip(axes[i], metrics):
            ax.hist(bbox_data[class_name][key], bins=20,
                    alpha=0.7, color=colors[i], label=class_name)
            ax.set_title(f'{class_name.capitalize()} - Bbox {label} Distribution')
            ax.set_xlabel(f'Normalized {label}')
            ax.set_ylabel('Frequency')

    # Make sure the destination exists; savefig does not create directories.
    output_path = Path('/app/results/plots/bbox_analysis.png')
    output_path.parent.mkdir(parents=True, exist_ok=True)

    plt.tight_layout()
    plt.savefig(str(output_path), dpi=300, bbox_inches='tight')
    plt.show()

    # Print statistics
    for class_name in class_names:
        if not bbox_data[class_name]['widths']:
            continue
        print(f"\n{class_name.upper()} Bounding Box Statistics:")
        print(f"  Count: {len(bbox_data[class_name]['widths'])}")
        for key, label in metrics:
            values = bbox_data[class_name][key]
            print(f"  {label} - Mean: {np.mean(values):.3f}, "
                  f"Std: {np.std(values):.3f}")


# Analyze bounding boxes
analyze_bounding_boxes()

## 6. Dataset Summary and Recommendations

In [None]:
# Header banner for the summary report.
print("\n" + "="*60, "DATASET ANALYSIS SUMMARY", "="*60, sep="\n")

# Totals are only available when the class-distribution analysis produced data.
if dist:
    total_train_images, total_val_images = dist['train']['images'], dist['val']['images']
    total_person = dist['train']['person'] + dist['val']['person']
    total_dog = dist['train']['dog'] + dist['val']['dog']

    print(
        f"Total Images: {total_train_images + total_val_images}",
        f"  Training: {total_train_images}",
        f"  Validation: {total_val_images}",
        f"\nTotal Instances: {total_person + total_dog}",
        f"  Person: {total_person}",
        f"  Dog: {total_dog}",
        sep="\n",
    )

    # Class shares are only defined when at least one instance was counted.
    if total_person + total_dog > 0:
        print(
            f"\nClass Balance:",
            f"  Person: {total_person/(total_person + total_dog):.1%}",
            f"  Dog: {total_dog/(total_person + total_dog):.1%}",
            sep="\n",
        )

# Fixed training recommendations followed by the closing banner.
print(
    "\nRECOMMENDATIONS FOR TRAINING:",
    "-" * 40,
    "1. Image Size: Use 512x512 for training (good balance of detail and speed)",
    "2. Data Augmentation: Apply horizontal flips, mosaic, and color jittering",
    "3. Class Balance: Monitor for potential class imbalance during training",
    "4. Batch Size: Start with 16 and adjust based on available memory",
    "5. Transfer Learning: Use YOLOv5s pretrained weights for faster convergence",
    "\n" + "="*60,
    sep="\n",
)

## 7. Save Analysis Results

In [None]:
# Assemble the summary record: run timestamp, dataset label, the class
# distribution (empty when none was computed) and the recommendations.
analysis_summary = {
    "timestamp": pd.Timestamp.now().isoformat(),
    "dataset": "Pascal VOC 2007 (person/dog subset)",
    "analysis_type": "Exploratory Data Analysis",
    "distribution": dist or {},
    "recommendations": [
        "Use 512x512 input resolution",
        "Apply data augmentation",
        "Monitor class balance",
        "Use transfer learning with YOLOv5s"
    ]
}

# Write the record as indented JSON into the analysis results directory.
ensure_dir('/app/results/analysis')
with open('/app/results/analysis/eda_summary.json', 'w') as f:
    f.write(json.dumps(analysis_summary, indent=2))

print(
    "Analysis complete! Results saved to /app/results/analysis/",
    "Generated plots saved to /app/results/plots/",
    sep="\n",
)