In [None]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter
import pandas as pd
from typing import Dict, List, Tuple

# Add project root to path
sys.path.append('..')
from src.config.config import Config
from src.config.model_configs import ModelConfigs
from src.data.dataset_loader import OxfordPetDatasetLoader
from src.data.preprocessing import DataPreprocessor
from src.data.augmentation import DataAugmentor

# One Config instance supplies every tunable used by this setup cell.
# NOTE(review): assumes Config exposes RANDOM_SEED, FIGURE_SIZE and DPI, as the
# attribute names referenced by the original cell imply - confirm in src/config.
config = Config()

print("TensorFlow version:", tf.__version__)
# The device query lives under tf.config; `tf.src` does not exist.
print("GPUs available:", tf.config.list_physical_devices('GPU'))

# Set random seeds for reproducibility (TensorFlow and NumPy are seeded separately)
tf.random.set_seed(config.RANDOM_SEED)
np.random.seed(config.RANDOM_SEED)

# Configure matplotlib
plt.style.use('default')
plt.rcParams['figure.figsize'] = config.FIGURE_SIZE
plt.rcParams['figure.dpi'] = config.DPI

In [None]:
class DatasetExplorer:
    """Dataset exploration and analysis utilities.

    Builds the project's loader, preprocessor and augmentor from a single
    ``Config`` instance and layers analysis / plotting helpers on top of them.

    ``load_and_analyze_dataset`` has to run before the other methods: it sets
    ``train_ds``, ``val_ds``, ``test_ds`` and ``dataset_info``, which several
    of them read. All settings (plots directory, batch size) are taken from
    ``self.config``.
    """

    def __init__(self):
        self.config = Config()
        self.loader = OxfordPetDatasetLoader(self.config)
        self.preprocessor = DataPreprocessor(self.config)
        self.augmentor = DataAugmentor(self.config)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _save_figure(self, filename: str) -> None:
        """Save the current matplotlib figure as ``filename`` in the plots directory."""
        plots_dir = self.config.PLOTS_DIR
        # savefig does not create missing directories, so make sure it exists.
        os.makedirs(plots_dir, exist_ok=True)
        plt.savefig(os.path.join(plots_dir, filename), dpi=300, bbox_inches='tight')

    @staticmethod
    def _is_cat_breed(class_name: str) -> bool:
        """Return True when ``class_name`` starts with an upper-case letter.

        NOTE(review): this notebook treats capitalised class names as cat
        breeds and everything else as dog breeds - confirm that convention
        holds for the ``class_names`` supplied by the loader.
        """
        return class_name[:1].isupper()

    @staticmethod
    def _summarize(properties: Dict) -> Dict:
        """Compute mean/std/min/max/median for each list of values in ``properties``."""
        return {
            name: {
                'mean': np.mean(values),
                'std': np.std(values),
                'min': np.min(values),
                'max': np.max(values),
                'median': np.median(values),
            }
            for name, values in properties.items()
        }

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------
    def load_and_analyze_dataset(self):
        """Load the dataset splits and print a short summary.

        Stores the three splits and the loader's info dict on the instance.

        Returns:
            Tuple ``(train_ds, val_ds, test_ds)`` as returned by the loader.
        """
        print("=== Loading Oxford-IIIT Pet Dataset ===")

        self.train_ds, self.val_ds, self.test_ds = self.loader.load_dataset()
        self.dataset_info = self.loader.get_dataset_info()

        print("Dataset loaded successfully!")
        print(f"Number of classes: {self.dataset_info['total_classes']}")
        print(f"Class names: {self.dataset_info['class_names'][:10]}...")  # Show first 10

        return self.train_ds, self.val_ds, self.test_ds

    # ------------------------------------------------------------------
    # Class distribution
    # ------------------------------------------------------------------
    def analyze_class_distribution(self, dataset: tf.data.Dataset, split_name: str):
        """Count samples per class in ``dataset`` and print summary statistics.

        Args:
            dataset: Iterable of samples; each sample's ``'label'`` must offer
                ``.numpy()`` returning a scalar class index.
            split_name: Human-readable split name used in the printed header.

        Returns:
            DataFrame with columns ``class_id``, ``class_name``, ``count`` and
            ``breed_type``, sorted by ``count`` descending. It is empty (but
            keeps those columns) when the dataset yields no samples.
        """
        print(f"\n=== Analyzing {split_name} Class Distribution ===")

        # int() converts the scalar from .numpy() into a plain Python int so it
        # is a well-behaved Counter key and list index.
        labels = [int(sample['label'].numpy()) for sample in dataset]
        label_counts = Counter(labels)
        class_names = self.dataset_info['class_names']

        # Passing `columns` keeps the schema stable even when there are no rows,
        # so the sort below (and downstream merges) work on an empty split.
        df = pd.DataFrame(
            [
                {
                    'class_id': class_id,
                    'class_name': class_names[class_id],
                    'count': count,
                    'breed_type': 'Cat' if self._is_cat_breed(class_names[class_id]) else 'Dog',
                }
                for class_id, count in label_counts.items()
            ],
            columns=['class_id', 'class_name', 'count', 'breed_type'],
        )
        df = df.sort_values('count', ascending=False)

        print(f"Total samples: {len(labels)}")
        print(f"Classes represented: {len(label_counts)}")
        if label_counts:
            per_class = list(label_counts.values())
            print(f"Average samples per class: {np.mean(per_class):.1f}")
            print(f"Min samples per class: {min(per_class)}")
            print(f"Max samples per class: {max(per_class)}")
        else:
            print("No samples found; per-class statistics skipped.")

        # Breed totals are computed over every known class, not only the
        # classes that occur in this split.
        cat_count = sum(1 for name in class_names if self._is_cat_breed(name))
        dog_count = len(class_names) - cat_count
        print(f"Cat breeds: {cat_count}, Dog breeds: {dog_count}")

        return df

    def visualize_class_distribution(self, train_df: pd.DataFrame, val_df: pd.DataFrame):
        """Plot a 2x2 overview of the class distribution and save it.

        Args:
            train_df: Output of ``analyze_class_distribution`` for training.
            val_df: Output of ``analyze_class_distribution`` for validation.

        The figure is written to ``class_distribution.png`` in the plots
        directory and then shown.
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # 1. Top 15 classes by count (training)
        top_classes = train_df.head(15)
        positions = range(len(top_classes))
        ax = axes[0, 0]
        ax.barh(positions, top_classes['count'])
        ax.set_yticks(positions)
        ax.set_yticklabels(top_classes['class_name'], fontsize=8)
        ax.set_xlabel('Number of Samples')
        ax.set_title('Top 15 Classes by Sample Count (Training)')
        ax.invert_yaxis()  # largest class at the top

        # 2. Cat vs Dog distribution
        breed_counts = train_df.groupby('breed_type')['count'].sum()
        ax = axes[0, 1]
        ax.pie(breed_counts.values, labels=breed_counts.index, autopct='%1.1f%%')
        ax.set_title('Cat vs Dog Distribution (Training)')

        # 3. Distribution histogram
        ax = axes[1, 0]
        ax.hist(train_df['count'], bins=20, alpha=0.7, label='Training')
        ax.hist(val_df['count'], bins=20, alpha=0.7, label='Validation')
        ax.set_xlabel('Samples per Class')
        ax.set_ylabel('Number of Classes')
        ax.set_title('Distribution of Samples per Class')
        ax.legend()

        # 4. Train vs Val comparison for top classes. Columns shared by both
        # frames get the _train / _val suffixes after the merge.
        merged_df = train_df.merge(val_df, on='class_name', suffixes=('_train', '_val'))
        top_10_merged = merged_df.head(10)

        x = np.arange(len(top_10_merged))
        width = 0.35

        ax = axes[1, 1]
        ax.bar(x - width / 2, top_10_merged['count_train'], width, label='Train', alpha=0.8)
        ax.bar(x + width / 2, top_10_merged['count_val'], width, label='Validation', alpha=0.8)
        ax.set_xlabel('Classes')
        ax.set_ylabel('Number of Samples')
        ax.set_title('Train vs Validation Split (Top 10 Classes)')
        ax.set_xticks(x)
        ax.set_xticklabels(top_10_merged['class_name'], rotation=45, ha='right', fontsize=8)
        ax.legend()

        plt.tight_layout()
        self._save_figure('class_distribution.png')
        plt.show()

    # ------------------------------------------------------------------
    # Image properties
    # ------------------------------------------------------------------
    def analyze_image_properties(self, dataset: tf.data.Dataset, split_name: str, num_samples: int = 1000):
        """Collect size statistics for up to ``num_samples`` images.

        Args:
            dataset: Dataset offering ``take(n)``; each sample's ``'image'``
                must have a 3-element ``shape`` unpacked as
                (height, width, channels).
            split_name: Human-readable split name used in the printed header.
            num_samples: Maximum number of images to inspect.

        Returns:
            Tuple ``(image_properties, stats)``. ``image_properties`` maps
            ``widths``, ``heights``, ``aspect_ratios``, ``areas`` and
            ``channels`` to per-image lists; ``stats`` maps each of those names
            to its mean/std/min/max/median. ``stats`` is empty when no image
            was read.
        """
        print(f"\n=== Analyzing {split_name} Image Properties ===")

        image_properties = {
            'widths': [],
            'heights': [],
            'aspect_ratios': [],
            'areas': [],
            'channels': []
        }

        # take() already caps the iteration at num_samples.
        for sample in dataset.take(num_samples):
            height, width, channels = sample['image'].shape

            image_properties['widths'].append(width)
            image_properties['heights'].append(height)
            image_properties['aspect_ratios'].append(width / height)
            image_properties['areas'].append(width * height)
            image_properties['channels'].append(channels)

        count = len(image_properties['widths'])
        if count == 0:
            # NumPy reductions raise on empty input, so report and bail out.
            print("Analyzed 0 images: no statistics available.")
            return image_properties, {}

        stats = self._summarize(image_properties)

        print(f"Analyzed {count} images:")
        for prop, stat in stats.items():
            if prop != 'channels':  # channel count is left out of the printed summary
                print(f"{prop.capitalize()}: mean={stat['mean']:.1f}, std={stat['std']:.1f}, "
                      f"range=[{stat['min']:.0f}, {stat['max']:.0f}]")

        return image_properties, stats

    def visualize_image_properties(self, image_props: Dict, split_name: str):
        """Plot image size distributions and save the figure.

        Args:
            image_props: Property lists as returned by
                ``analyze_image_properties``.
            split_name: Used in plot titles and, lower-cased, in the file name
                ``image_properties_<split>.png``.
        """
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # 1. Width and Height distribution
        ax = axes[0, 0]
        ax.hist(image_props['widths'], bins=30, alpha=0.7, label='Width')
        ax.hist(image_props['heights'], bins=30, alpha=0.7, label='Height')
        ax.set_xlabel('Pixels')
        ax.set_ylabel('Frequency')
        ax.set_title(f'Image Dimensions Distribution ({split_name})')
        ax.legend()

        # 2. Aspect ratio distribution
        ax = axes[0, 1]
        ax.hist(image_props['aspect_ratios'], bins=30, alpha=0.7)
        ax.set_xlabel('Aspect Ratio (Width/Height)')
        ax.set_ylabel('Frequency')
        ax.set_title(f'Aspect Ratio Distribution ({split_name})')
        ax.axvline(x=1.0, color='red', linestyle='--', label='Square (1:1)')
        ax.legend()

        # 3. Area distribution (in thousands of pixels for readable tick labels)
        areas_k = [area / 1000 for area in image_props['areas']]
        ax = axes[1, 0]
        ax.hist(areas_k, bins=30, alpha=0.7)
        ax.set_xlabel('Area (thousands of pixels)')
        ax.set_ylabel('Frequency')
        ax.set_title(f'Image Area Distribution ({split_name})')

        # 4. Width vs Height scatter with the square-image diagonal
        ax = axes[1, 1]
        ax.scatter(image_props['widths'], image_props['heights'], alpha=0.5)
        ax.set_xlabel('Width (pixels)')
        ax.set_ylabel('Height (pixels)')
        ax.set_title(f'Width vs Height ({split_name})')
        max_dim = max(max(image_props['widths']), max(image_props['heights']))
        ax.plot([0, max_dim], [0, max_dim], 'r--', alpha=0.5, label='Square')
        ax.legend()

        plt.tight_layout()
        self._save_figure(f'image_properties_{split_name.lower()}.png')
        plt.show()

    # ------------------------------------------------------------------
    # Bounding boxes
    # ------------------------------------------------------------------
    def analyze_bounding_boxes(self, dataset: tf.data.Dataset, split_name: str, num_samples: int = 1000):
        """Collect bounding-box statistics for up to ``num_samples`` samples.

        Each sample is run through ``self.preprocessor.preprocess_sample`` and
        its ``'bbox'`` is unpacked as (xmin, ymin, xmax, ymax).
        NOTE(review): the coordinate order and normalisation are assumed from
        the original inline comment - confirm against DataPreprocessor.

        Args:
            dataset: Dataset offering ``take(n)``.
            split_name: Human-readable split name used in the printed header.
            num_samples: Maximum number of samples to inspect.

        Returns:
            Tuple ``(bbox_properties, stats)``. ``bbox_properties`` maps
            ``widths``, ``heights``, ``areas``, ``aspect_ratios``, ``center_x``
            and ``center_y`` to per-sample lists; ``stats`` holds
            mean/std/min/max/median per property and is empty when no sample
            was read.
        """
        print(f"\n=== Analyzing {split_name} Bounding Box Properties ===")

        bbox_properties = {
            'widths': [],
            'heights': [],
            'areas': [],
            'aspect_ratios': [],
            'center_x': [],
            'center_y': []
        }

        # take() already caps the iteration at num_samples.
        for sample in dataset.take(num_samples):
            processed = self.preprocessor.preprocess_sample(sample)
            xmin, ymin, xmax, ymax = processed['bbox'].numpy()

            width = xmax - xmin
            height = ymax - ymin

            bbox_properties['widths'].append(width)
            bbox_properties['heights'].append(height)
            bbox_properties['areas'].append(width * height)
            # A zero-height box has no meaningful ratio; record 0 instead of dividing.
            bbox_properties['aspect_ratios'].append(width / height if height > 0 else 0)
            bbox_properties['center_x'].append((xmin + xmax) / 2)
            bbox_properties['center_y'].append((ymin + ymax) / 2)

        count = len(bbox_properties['widths'])
        if count == 0:
            # NumPy reductions raise on empty input, so report and bail out.
            print("Analyzed 0 bounding boxes: no statistics available.")
            return bbox_properties, {}

        stats = self._summarize(bbox_properties)

        print(f"Analyzed {count} bounding boxes:")
        for prop, stat in stats.items():
            print(f"{prop.capitalize()}: mean={stat['mean']:.3f}, std={stat['std']:.3f}, "
                  f"range=[{stat['min']:.3f}, {stat['max']:.3f}]")

        return bbox_properties, stats

    def visualize_bounding_boxes(self, bbox_props: Dict, split_name: str):
        """Plot bounding-box distributions and save the figure.

        Args:
            bbox_props: Property lists as returned by ``analyze_bounding_boxes``.
            split_name: Used in plot titles and, lower-cased, in the file name
                ``bbox_properties_<split>.png``.
        """
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))

        # 1. Width and Height distribution
        ax = axes[0, 0]
        ax.hist(bbox_props['widths'], bins=30, alpha=0.7, label='Width')
        ax.hist(bbox_props['heights'], bins=30, alpha=0.7, label='Height')
        ax.set_xlabel('Normalized Size')
        ax.set_ylabel('Frequency')
        ax.set_title(f'BBox Dimensions ({split_name})')
        ax.legend()

        # 2. Area distribution
        ax = axes[0, 1]
        ax.hist(bbox_props['areas'], bins=30, alpha=0.7)
        ax.set_xlabel('Normalized Area')
        ax.set_ylabel('Frequency')
        ax.set_title(f'BBox Area Distribution ({split_name})')

        # 3. Aspect ratio distribution
        ax = axes[0, 2]
        ax.hist(bbox_props['aspect_ratios'], bins=30, alpha=0.7)
        ax.set_xlabel('Aspect Ratio (Width/Height)')
        ax.set_ylabel('Frequency')
        ax.set_title(f'BBox Aspect Ratio ({split_name})')
        ax.axvline(x=1.0, color='red', linestyle='--', label='Square')
        ax.legend()

        # 4. Center position heatmap
        ax = axes[1, 0]
        ax.hist2d(bbox_props['center_x'], bbox_props['center_y'], bins=20, cmap='Blues')
        ax.set_xlabel('Center X (normalized)')
        ax.set_ylabel('Center Y (normalized)')
        ax.set_title(f'BBox Center Distribution ({split_name})')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)

        # 5. Width vs Height scatter with the square-box diagonal
        ax = axes[1, 1]
        ax.scatter(bbox_props['widths'], bbox_props['heights'], alpha=0.5)
        ax.set_xlabel('Width (normalized)')
        ax.set_ylabel('Height (normalized)')
        ax.set_title(f'BBox Width vs Height ({split_name})')
        max_dim = max(max(bbox_props['widths']), max(bbox_props['heights']))
        ax.plot([0, max_dim], [0, max_dim], 'r--', alpha=0.5, label='Square')
        ax.legend()

        # 6. Area vs Aspect Ratio
        ax = axes[1, 2]
        ax.scatter(bbox_props['areas'], bbox_props['aspect_ratios'], alpha=0.5)
        ax.set_xlabel('Area (normalized)')
        ax.set_ylabel('Aspect Ratio')
        ax.set_title(f'BBox Area vs Aspect Ratio ({split_name})')

        plt.tight_layout()
        self._save_figure(f'bbox_properties_{split_name.lower()}.png')
        plt.show()

    # ------------------------------------------------------------------
    # Sample visualisation
    # ------------------------------------------------------------------
    def visualize_sample_data(self, dataset: tf.data.Dataset, num_samples: int = 8):
        """Show up to ``num_samples`` preprocessed images with their bounding boxes.

        The grid has at most four columns and as many rows as needed (2x4 for
        the default of 8); cells without an image are hidden. The figure is
        saved as ``sample_data.png`` in the plots directory.

        Args:
            dataset: Dataset offering ``take(n)``.
            num_samples: Maximum number of samples to draw.
        """
        # Local import, hoisted out of the per-sample loop.
        from matplotlib.patches import Rectangle

        print("\n=== Visualizing Sample Data ===")

        # Size the grid from num_samples instead of assuming exactly eight axes.
        cols = max(1, min(4, num_samples))
        rows = max(1, (num_samples + cols - 1) // cols)
        fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 4 * rows), squeeze=False)
        axes = axes.flatten()

        class_names = self.dataset_info['class_names']

        used = 0
        for ax, sample in zip(axes, dataset.take(num_samples)):
            processed = self.preprocessor.preprocess_sample(sample)

            image = processed['image'].numpy()
            label = int(processed['label'].numpy())
            xmin, ymin, xmax, ymax = processed['bbox'].numpy()

            ax.imshow(image)

            # Scale the box to pixel coordinates of the displayed image.
            # NOTE(review): assumes bbox is normalised (xmin, ymin, xmax, ymax),
            # per the original inline comment - confirm against DataPreprocessor.
            h, w = image.shape[:2]
            ax.add_patch(Rectangle(
                (xmin * w, ymin * h), (xmax - xmin) * w, (ymax - ymin) * h,
                linewidth=2, edgecolor='red', facecolor='none'))

            class_name = class_names[label]
            breed_type = "Cat" if self._is_cat_breed(class_name) else "Dog"
            ax.set_title(f'{class_name} ({breed_type})', fontsize=10)
            ax.axis('off')
            used += 1

        # Hide grid cells that received no sample.
        for ax in axes[used:]:
            ax.axis('off')

        plt.tight_layout()
        self._save_figure('sample_data.png')
        plt.show()

    def visualize_segmentation_masks(self, dataset: tf.data.Dataset, num_samples: int = 6):
        """Show up to ``num_samples`` images above their colour-coded masks.

        One column per sample (top row: image, bottom row: mask); unused
        columns are hidden. The figure is saved as ``segmentation_masks.png``
        in the plots directory.

        Args:
            dataset: Dataset offering ``take(n)``.
            num_samples: Maximum number of samples to draw.
        """
        # Local import, kept function-scoped as in the original.
        from matplotlib.patches import Patch

        print("\n=== Visualizing Segmentation Masks ===")

        # One column per requested sample; squeeze=False keeps `axes` 2-D even
        # for a single column.
        cols = max(1, num_samples)
        fig, axes = plt.subplots(2, cols, figsize=(3 * cols, 6), squeeze=False)

        class_names = self.dataset_info['class_names']

        used = 0
        for sample in dataset.take(num_samples):
            processed = self.preprocessor.preprocess_sample(sample)

            image = processed['image'].numpy()
            label = int(processed['label'].numpy())
            seg_mask = processed['segmentation_mask'].numpy()
            # Drop a trailing singleton channel so the mask indexes an (H, W, 3) canvas.
            if seg_mask.ndim == 3 and seg_mask.shape[-1] == 1:
                seg_mask = seg_mask[..., 0]

            axes[0, used].imshow(image)
            axes[0, used].set_title(f'{class_names[label]}', fontsize=10)
            axes[0, used].axis('off')

            # Colour map: 0=background (blue), 1=foreground (red), 2=unknown (green).
            # NOTE(review): the raw Oxford-IIIT trimaps are documented as
            # 1=pet, 2=background, 3=border - verify DataPreprocessor remaps
            # them to the 0/1/2 meaning this legend assumes.
            mask_colored = np.zeros((*seg_mask.shape, 3))
            mask_colored[seg_mask == 0] = [0, 0, 1]    # Background - blue
            mask_colored[seg_mask == 1] = [1, 0, 0]    # Foreground - red
            mask_colored[seg_mask == 2] = [0, 1, 0]    # Unknown - green

            axes[1, used].imshow(mask_colored)
            axes[1, used].set_title('Segmentation Mask', fontsize=10)
            axes[1, used].axis('off')
            used += 1

        # Hide columns that received no sample.
        for col in range(used, cols):
            axes[0, col].axis('off')
            axes[1, col].axis('off')

        legend_elements = [
            Patch(facecolor='blue', label='Background'),
            Patch(facecolor='red', label='Foreground (Pet)'),
            Patch(facecolor='green', label='Unknown')
        ]
        fig.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.5, 0.95), ncol=3)

        plt.tight_layout()
        self._save_figure('segmentation_masks.png')
        plt.show()

    # ------------------------------------------------------------------
    # Pipeline checks and dataset creation
    # ------------------------------------------------------------------
    def test_preprocessing_pipeline(self):
        """Run one training sample through preprocessing and augmentation.

        Prints keys, shapes and values before and after each stage. Requires
        ``load_and_analyze_dataset`` to have populated ``self.train_ds``.
        """
        print("\n=== Testing Preprocessing Pipeline ===")

        # Test with a single sample
        sample = next(iter(self.train_ds.take(1)))
        print("Original sample keys:", list(sample.keys()))
        print("Original image shape:", sample['image'].shape)
        print("Original bbox:", sample['bbox'].numpy())
        print("Original label:", sample['label'].numpy())
        print("Original seg mask shape:", sample['segmentation_mask'].shape)

        # Process sample
        processed = self.preprocessor.preprocess_sample(sample)
        print("\nProcessed sample keys:", list(processed.keys()))
        print("Processed image shape:", processed['image'].shape)
        print("Processed bbox (normalized):", processed['bbox'].numpy())
        print("Processed label:", processed['label'].numpy())
        print("Processed seg mask shape:", processed['segmentation_mask'].shape)
        print("Processed seg mask unique values:", np.unique(processed['segmentation_mask'].numpy()))

        # Test augmentation
        aug_image, aug_bbox, aug_mask = self.augmentor.augment_sample(
            processed['image'], processed['bbox'], processed['segmentation_mask']
        )
        print("\nAugmented image shape:", aug_image.shape)
        print("Augmented bbox:", aug_bbox.numpy())
        print("Augmented mask shape:", aug_mask.shape)

        print("Preprocessing pipeline test completed successfully!")

    def create_processed_datasets(self):
        """Create preprocessed, batched and prefetched datasets for training.

        The training split is preprocessed and augmented; validation and test
        are only preprocessed. Every split is batched with
        ``self.config.BATCH_SIZE`` and prefetched with ``tf.data.AUTOTUNE``.

        Returns:
            Tuple ``(train_processed, val_processed, test_processed)``.
        """
        print("\n=== Creating Processed Datasets ===")

        batch_size = self.config.BATCH_SIZE

        def preprocess_fn(sample):
            return self.preprocessor.preprocess_sample(sample)

        def augment_fn(sample):
            # Preprocess first, then replace image/bbox/mask with augmented versions.
            processed = preprocess_fn(sample)
            image, bbox, mask = self.augmentor.augment_sample(
                processed['image'], processed['bbox'], processed['segmentation_mask']
            )
            processed['image'] = image
            processed['bbox'] = bbox
            processed['segmentation_mask'] = mask
            return processed

        def build(split, map_fn):
            # Map, then batch and prefetch for input-pipeline throughput.
            mapped = split.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
            return mapped.batch(batch_size).prefetch(tf.data.AUTOTUNE)

        train_processed = build(self.train_ds, augment_fn)
        val_processed = build(self.val_ds, preprocess_fn)
        test_processed = build(self.test_ds, preprocess_fn)

        print("Processed datasets created successfully!")
        return train_processed, val_processed, test_processed
  

In [None]:
# Main execution
if __name__ == "__main__":
    # Notebook driver: runs the full exploration workflow top to bottom.
    explorer = DatasetExplorer()

    # Step 1 - load the three splits (also populates explorer.dataset_info).
    train_ds, val_ds, test_ds = explorer.load_and_analyze_dataset()

    # Step 2 - per-class sample counts for train/validation, then the overview plot.
    train_class_df = explorer.analyze_class_distribution(dataset=train_ds, split_name="Training")
    val_class_df = explorer.analyze_class_distribution(dataset=val_ds, split_name="Validation")
    explorer.visualize_class_distribution(train_df=train_class_df, val_df=val_class_df)

    # Step 3 - raw image size statistics for the training split.
    train_img_props, train_img_stats = explorer.analyze_image_properties(
        dataset=train_ds, split_name="Training"
    )
    explorer.visualize_image_properties(image_props=train_img_props, split_name="Training")

    # Step 4 - bounding-box statistics for the training split.
    train_bbox_props, train_bbox_stats = explorer.analyze_bounding_boxes(
        dataset=train_ds, split_name="Training"
    )
    explorer.visualize_bounding_boxes(bbox_props=train_bbox_props, split_name="Training")

    # Step 5 - qualitative look at annotated samples and their masks.
    explorer.visualize_sample_data(dataset=train_ds)
    explorer.visualize_segmentation_masks(dataset=train_ds)

    # Step 6 - smoke-test preprocessing and augmentation on a single sample.
    explorer.test_preprocessing_pipeline()

    # Step 7 - build the batched datasets consumed by the follow-up tasks.
    train_processed, val_processed, test_processed = explorer.create_processed_datasets()

    # Closing summary: one print call, one output line per argument.
    print(
        "\n=== Task 1 Completed Successfully! ===",
        "Next steps:",
        "1. Run Task 2: Object Detection",
        "2. Run Task 3: Semantic Segmentation",
        "3. Run Task 4: Multitask Learning",
        "4. Generate comprehensive report",
        sep="\n",
    )
