# 🧪 Dataset Pipeline Testing

This notebook tests the data loading pipeline for both EuroSAT and BigEarthNet datasets.

**Works on:**
- ✅ Local environment
- ✅ Kaggle environment (auto-detected)


In [None]:
# ============================================================================
# SETUP AND IMPORTS
# ============================================================================

import sys
import os
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import torch
import pandas as pd

# Auto-detect environment: the presence of /kaggle/input is used as the Kaggle marker
IS_KAGGLE = os.path.exists('/kaggle/input')

if IS_KAGGLE:
    print("üåê Running on Kaggle environment")
    # On Kaggle, repo is cloned to /kaggle/working/Final_exam
    project_root = Path('/kaggle/working/Final_exam')
else:
    print("üíª Running on local environment")
    # On local, notebook is in notebooks/, so go up one level
    project_root = Path.cwd().parent

# Put the project root at the front of sys.path so project modules are importable.
# Any copy left by an earlier run of this cell is removed first, so re-running the
# cell keeps exactly one entry instead of stacking duplicates.
project_root_str = str(project_root)
while project_root_str in sys.path:
    sys.path.remove(project_root_str)
sys.path.insert(0, project_root_str)

print(f"üìÇ Project root: {project_root}")
print("‚úì Python path updated")


In [None]:
# Import configurations and modules
from config import (
    EUROSAT_PATH, METADATA_PATH, BIGEARTHNET_FOLDERS, 
    REFERENCE_MAPS_FOLDER, CORINE_TO_EUROSAT, CLASS_NAMES, NUM_CLASSES,
    IS_KAGGLE
)
from src.data import (
    EuroSATDataset, BigEarthNetSegmentationDataset,
    get_classification_train_augmentation, get_val_augmentation,
    get_segmentation_train_augmentation
)
from src.utils.visualization import mask_to_rgb, denormalize_image, COLOR_PALETTE

# Report every configured location together with whether it exists on disk
print("‚úì Imports successful!")
print("\nüìç Paths configuration:")

# (heading, path) pairs; headings carry the blank-line separators of the report
for heading, location in (
    ("  EuroSAT", EUROSAT_PATH),
    ("\n  Metadata", METADATA_PATH),
    ("\n  Reference Maps", REFERENCE_MAPS_FOLDER),
):
    print(f"{heading}: {location}")
    print(f"  Exists: {location.exists()}")

# BigEarthNet data may be spread over several folders; list each one
print(f"\n  BigEarthNet folders: {len(BIGEARTHNET_FOLDERS)} found")
for ben_folder in BIGEARTHNET_FOLDERS:
    print(f"    - {ben_folder} (exists: {ben_folder.exists()})")


## 🌍 Test 1: EuroSAT Dataset


In [None]:
print("=" * 70)
print("TESTING EUROSAT DATASET")
print("=" * 70)

# Augmentation pipelines, both requested with size 64: a light training
# pipeline, and the validation pipeline reused for the val and test splits
train_transform = get_classification_train_augmentation(64, strength='light')
val_transform = get_val_augmentation(64)

# Build the three splits; a failure is reported with a traceback instead of
# stopping the notebook
try:
    train_dataset = EuroSATDataset(EUROSAT_PATH, 'train', train_transform)
    val_dataset = EuroSATDataset(EUROSAT_PATH, 'val', val_transform)
    test_dataset = EuroSATDataset(EUROSAT_PATH, 'test', val_transform)

    for split_label, split_dataset in (
        ("Train", train_dataset),
        ("Val", val_dataset),
        ("Test", test_dataset),
    ):
        print(f"‚úì {split_label} samples: {len(split_dataset):,}")
    print(f"‚úì Classes: {train_dataset.classes}")

    # The return value is not used here; presumably the method prints the
    # distribution itself -- TODO confirm
    print("\nüìä Class distribution:")
    train_dataset.get_class_distribution()

except Exception as exc:
    print(f"‚úó Error loading EuroSAT: {exc}")
    import traceback
    traceback.print_exc()


In [None]:
# Visualize EuroSAT samples in a 2 x 5 grid (at most 10 panels)
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
fig.suptitle('EuroSAT Training Samples', fontsize=16, fontweight='bold')

# Hide the axes of every panel up front so unused slots stay blank
for panel in axes.flat:
    panel.axis('off')

# Never index past the end of the dataset: small subsets may hold fewer
# samples than the grid has panels
num_shown = min(axes.size, len(train_dataset))

for i in range(num_shown):
    sample = train_dataset[i]
    img = sample['image']  # expected: torch.Tensor (3, 64, 64)
    # int() accepts plain ints as well as scalar tensors / numpy scalars,
    # so the class-name lookup works whatever container the label comes in
    label = int(sample['label'])
    class_name = CLASS_NAMES[label]

    # Denormalize - returns numpy array (H, W, 3)
    img_np = denormalize_image(img)

    # axes.flat walks the grid row by row, i.e. the same panel as axes[i // 5, i % 5]
    ax = axes.flat[i]
    ax.imshow(img_np)
    ax.set_title(f'{class_name}', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

# Report the number actually drawn (10 whenever the dataset is large enough)
print(f"‚úì Visualized {num_shown} EuroSAT samples")


## 🛰️ Test 2: BigEarthNet Dataset


In [None]:
print("=" * 70)
print("LOADING BIGEARTHNET METADATA")
print("=" * 70)

# Read the patch metadata and carve out small train/validation subsets.
# Any failure is reported with a traceback rather than aborting the notebook.
try:
    metadata_df = pd.read_parquet(METADATA_PATH)
    print(f"‚úì Loaded metadata: {len(metadata_df):,} patches")
    print(f"  Columns: {list(metadata_df.columns)}")

    # How many patches each split holds
    print("\nüìä Split distribution:")
    print(metadata_df['split'].value_counts())

    # Only the first rows of each split are kept to make the test quick
    TEST_SUBSET_SIZE = 100  # Adjust based on your needs

    is_train_row = metadata_df['split'] == 'train'
    train_df = metadata_df.loc[is_train_row].head(TEST_SUBSET_SIZE)
    is_val_row = metadata_df['split'] == 'validation'
    val_df = metadata_df.loc[is_val_row].head(50)

    print(f"\n‚úì Using {len(train_df)} train samples (subset for testing)")
    print(f"‚úì Using {len(val_df)} val samples (subset for testing)")

    # Peek at the first few training rows
    print("\nüìã Sample metadata:")
    print(train_df.head(3))

except Exception as exc:
    print(f"‚úó Error loading metadata: {exc}")
    import traceback
    traceback.print_exc()


In [None]:
print("=" * 70)
print("CREATING BIGEARTHNET DATASET")
print("=" * 70)

# Augmentation pipelines, both requested with size 120
seg_train_transform = get_segmentation_train_augmentation(120, strength='light')
seg_val_transform = get_val_augmentation(120)


def _build_ben_dataset(split_df, split_transform):
    """Create a BigEarthNetSegmentationDataset for one split.

    Only the metadata frame and the transform differ between splits; every
    other argument is the shared notebook configuration.
    """
    return BigEarthNetSegmentationDataset(
        metadata_df=split_df,
        data_folders=BIGEARTHNET_FOLDERS,
        reference_maps_folder=REFERENCE_MAPS_FOLDER,
        corine_to_eurosat_mapping=CORINE_TO_EUROSAT,
        transform=split_transform,
        num_classes=NUM_CLASSES,
        validate_data=True
    )


# Build train first, then validation; a failure is reported with a traceback
try:
    train_dataset_ben = _build_ben_dataset(train_df, seg_train_transform)
    val_dataset_ben = _build_ben_dataset(val_df, seg_val_transform)

    print(f"‚úì Created BigEarthNet train dataset: {len(train_dataset_ben)} samples")
    print(f"‚úì Created BigEarthNet val dataset: {len(val_dataset_ben)} samples")

except Exception as exc:
    print(f"‚úó Error creating BigEarthNet dataset: {exc}")
    import traceback
    traceback.print_exc()


In [None]:
print("=" * 70)
print("TESTING SAMPLE LOADING")
print("=" * 70)

# Fetch the first few training samples one by one and count how many load
num_test = 5
success = 0

for sample_idx in range(num_test):
    try:
        sample = train_dataset_ben[sample_idx]
        print(f"\n‚úì Sample {sample_idx}: {sample['patch_id']}")
        print(f"  Image: {sample['image'].shape}, Mask: {sample['mask'].shape}")
        # Distinct values present in this sample's mask
        print(f"  Classes: {torch.unique(sample['mask']).tolist()}")
    except Exception as exc:
        # A failing sample is reported but does not stop the remaining checks
        print(f"\n‚úó Sample {sample_idx} failed: {exc}")
    else:
        success += 1

print(f"\nüìä Loaded {success}/{num_test} samples successfully")


In [None]:
# Visualize BigEarthNet samples: 6 samples in a 3 x 4 grid, each sample taking
# two neighbouring panels (image on the left, colour-coded mask on the right)
fig, axes = plt.subplots(3, 4, figsize=(16, 12))
fig.suptitle('BigEarthNet Samples (Image & Mask)', fontsize=16, fontweight='bold')

# Hide the axes of every panel up front. If a sample fails to load, its two
# panels stay blank instead of showing empty framed axes with ticks.
for panel in axes.flat:
    panel.axis('off')

for i in range(6):
    try:
        sample = val_dataset_ben[i]
        img = sample['image']  # expected: torch.Tensor (3, 120, 120)
        mask = sample['mask']  # expected: torch.Tensor (120, 120)

        # Denormalize image - returns numpy array (H, W, 3)
        img_np = denormalize_image(img)

        # Convert mask to RGB (moved to a numpy array first when it is a tensor)
        if isinstance(mask, torch.Tensor):
            mask = mask.cpu().numpy()
        mask_rgb = mask_to_rgb(mask, COLOR_PALETTE)

        # Two samples per row: even samples use columns 0-1, odd samples 2-3
        grid_row, pair_idx = divmod(i, 2)
        first_col = pair_idx * 2

        ax_img = axes[grid_row, first_col]
        ax_img.imshow(img_np)
        ax_img.set_title(f'Sample {i}: Image', fontsize=10)

        ax_mask = axes[grid_row, first_col + 1]
        ax_mask.imshow(mask_rgb)
        ax_mask.set_title(f'Sample {i}: Mask', fontsize=10)

    except Exception as e:
        # Best effort: report the failing sample and carry on with the rest
        print(f"‚úó Failed to visualize sample {i}: {e}")

plt.tight_layout()
plt.show()
print("‚úì Visualization complete")


## 🔄 Test 3: DataLoader


In [None]:
from torch.utils.data import DataLoader

print("=" * 70)
print("TESTING DATALOADER")
print("=" * 70)

# Loader settings gathered in one place
loader_options = dict(
    batch_size=4,
    shuffle=True,
    num_workers=0,  # Use 0 for debugging
    pin_memory=False,
)
train_loader = DataLoader(train_dataset_ben, **loader_options)

# Pull a single batch and describe it; failures are reported with a traceback
try:
    batch = next(iter(train_loader))

    print("‚úì Batch loaded successfully!")
    images = batch['image']
    print(f"  Images: {images.shape}")
    masks = batch['mask']
    print(f"  Masks: {masks.shape}")
    print(f"  Patch IDs: {batch['patch_id']}")

    print("\nüìä Data info:")
    print(f"  Image dtype: {images.dtype}")
    print(f"  Mask dtype: {masks.dtype}")
    print(f"  Image range: [{images.min():.3f}, {images.max():.3f}]")
    print(f"  Mask range: [{masks.min()}, {masks.max()}]")
    print(f"  Unique classes: {torch.unique(masks).tolist()}")

except Exception as exc:
    print(f"‚úó Error: {exc}")
    import traceback
    traceback.print_exc()


## 🎨 Class Color Legend


In [None]:
from matplotlib.patches import Patch

# Stand-alone figure that shows only the legend, so the plot axes are hidden
fig, ax = plt.subplots(figsize=(8, 6))
ax.axis('off')

# One coloured patch per class, labelled "<index>: <name>". Palette entries are
# divided by 255 before being handed to matplotlib (assumes 0-255 colour values
# that support division, e.g. numpy arrays -- TODO confirm).
legend_patches = [
    Patch(color=COLOR_PALETTE[class_idx] / 255.0, label=f'{class_idx}: {class_label}')
    for class_idx, class_label in enumerate(CLASS_NAMES)
]

ax.legend(handles=legend_patches, loc='center', fontsize=12, frameon=True)
ax.set_title('Class Color Legend', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()


## 📋 Summary


In [None]:
print("="*70)
print("DATASET PIPELINE TEST SUMMARY")
print("="*70)

print(f"\nüåê Environment: {'Kaggle' if IS_KAGGLE else 'Local'}")

# Tracks whether every dataset section below could be summarised, so the
# closing message does not claim readiness after a failed load
all_loaded = True

print("\n‚úÖ EuroSAT Dataset:")
try:
    print(f"  - Train: {len(train_dataset):,} samples")
    print(f"  - Val: {len(val_dataset):,} samples")
    print(f"  - Test: {len(test_dataset):,} samples")
    print(f"  - Image size: 64x64")
except Exception as exc:
    # Typically a NameError because the loading cell failed earlier. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through,
    # and the reason is shown instead of being silently discarded.
    all_loaded = False
    print("  ‚ö†Ô∏è  Dataset not loaded")
    print(f"     Reason: {type(exc).__name__}: {exc}")

print("\n‚úÖ BigEarthNet Dataset:")
try:
    print(f"  - Train: {len(train_dataset_ben)} samples (subset)")
    print(f"  - Val: {len(val_dataset_ben)} samples (subset)")
    print(f"  - Image size: 120x120")
    print(f"  - Classes: {NUM_CLASSES}")

    # Samples the training dataset recorded as failed, if any
    failed = train_dataset_ben.get_failed_samples()
    if failed:
        print(f"  ‚ö†Ô∏è  Failed samples: {len(failed)}")
except Exception as exc:
    all_loaded = False
    print("  ‚ö†Ô∏è  Dataset not loaded")
    print(f"     Reason: {type(exc).__name__}: {exc}")

print("\n" + "="*70)
if all_loaded:
    print("‚úÖ All tests completed!")
    print("üöÄ Ready for training!")
else:
    # Do not announce readiness when a dataset could not be summarised
    print("‚ö†Ô∏è  Some datasets were not loaded - fix the errors above before training")
print("="*70)
