# Data Cleaning & Preprocessing
## Solinfitec Solix - PlantVillage Pipeline

**Tasks**:
- Scan for duplicate images
- Detect corrupted images
- Verify stratified split integrity
- Preview augmentation pipelines

In [None]:
# Notebook setup: make the project's `src` package importable and define the
# dataset locations used by the cells below.
import sys
# Put the directory two levels above the working directory first on the import
# path so the `src.*` imports below resolve. This must run before those imports.
# NOTE(review): presumably '../..' is the repository root — this is resolved
# relative to the current working directory, so confirm the notebook is run
# from its own folder.
sys.path.insert(0, '../..')

import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image

from src.data.preprocessing import DataPreprocessor
from src.data.dataset import PlantVillageDataset
from src.features.augmentation import get_train_transforms, get_val_transforms

# Dataset locations, also relative to the working directory.
# DATA_DIR is handed to DataPreprocessor; PLANTVILLAGE_DIR to PlantVillageDataset.
DATA_DIR = Path('../../data/raw')
PLANTVILLAGE_DIR = DATA_DIR / 'PlantVillage'

## 1. Duplicate Scan

In [None]:
# Scan the raw data directory for duplicate images and print a short summary.
# `islice` is imported here so this cell stays self-contained.
from itertools import islice

# NOTE(review): `skip_nested` presumably excludes the nested copy of the
# dataset at PlantVillage/PlantVillage from the scan — confirm against
# DataPreprocessor.
preprocessor = DataPreprocessor(str(DATA_DIR), skip_nested='PlantVillage/PlantVillage')

print('Scanning for duplicates (by MD5 hash)...')
# Mapping whose values are groups of files; per the message above the keys are
# presumably the MD5 hashes shared by each group.
duplicates = preprocessor.scan_duplicates()

if duplicates:
    # One file per group is counted as the original; the rest are duplicates.
    total_dupes = sum(len(v) - 1 for v in duplicates.values())
    print(f'Found {total_dupes} duplicate images in {len(duplicates)} groups')
    # Show first 3 duplicate groups. Only the file lists are needed, so iterate
    # the values lazily instead of materializing every (hash, files) pair.
    for i, files in enumerate(islice(duplicates.values(), 3), start=1):
        print(f'\nGroup {i} ({len(files)} files):')
        for f in files:
            print(f'  {f}')
else:
    print('No duplicates found within top-level class directories.')

## 2. Corrupted Image Detection

In [None]:
# Report images the preprocessor flags as corrupted.
print('Scanning for corrupted images...')
corrupted = preprocessor.detect_corrupted()

if not corrupted:
    print('All images are valid.')
else:
    print(f'Found {len(corrupted)} corrupted images:')
    # Only the first 10 entries are listed to keep the cell output short.
    preview = corrupted[:10]
    for bad_image in preview:
        print(f'  {bad_image}')

## 3. Verify Stratified Splits

In [None]:
# Build the train/val/test datasets, check that no image path appears in more
# than one split, and print per-class counts for every split.
# NOTE(review): the same seed is passed to all three — presumably so each
# split is drawn from the same shuffled partition; confirm in
# PlantVillageDataset.
train_ds = PlantVillageDataset(str(PLANTVILLAGE_DIR), split='train', seed=42)
val_ds = PlantVillageDataset(str(PLANTVILLAGE_DIR), split='val', seed=42)
test_ds = PlantVillageDataset(str(PLANTVILLAGE_DIR), split='test', seed=42)

print(f'Train: {len(train_ds)} | Val: {len(val_ds)} | Test: {len(test_ds)}')
print(f'Total: {len(train_ds) + len(val_ds) + len(test_ds)}')

# Verify no overlap
train_paths = set(train_ds.image_paths)
val_paths = set(val_ds.image_paths)
test_paths = set(test_ds.image_paths)
split_pairs = (
    ('Train/Val', train_paths, val_paths),
    ('Train/Test', train_paths, test_paths),
    ('Val/Test', val_paths, test_paths),
)
for pair_name, left_paths, right_paths in split_pairs:
    # Raise explicitly rather than `assert`: assert statements are removed
    # when Python runs with -O, which would silently skip this leakage check.
    # AssertionError is kept so the failure type and message are unchanged.
    if not left_paths.isdisjoint(right_paths):
        raise AssertionError(f'{pair_name} overlap!')
print('No overlap between splits.')

# Class distribution per split. All three splits are printed so the class
# proportions can be compared, not just the training counts.
for split_name, split_ds in (('Train', train_ds), ('Val', val_ds), ('Test', test_ds)):
    print(f'\n{split_name} class counts:')
    for name, count in sorted(split_ds.get_label_counts().items()):
        print(f'  {name}: {count}')

## 4. Augmentation Preview

In [None]:
# Preview training augmentations on a sample image: the original goes in the
# top-left panel, followed by seven augmented versions in a 2x4 grid.
sample_path = train_ds.image_paths[0]
# The context manager closes the underlying file once the pixels have been
# copied into the array.
with Image.open(sample_path) as sample_image:
    img = np.array(sample_image.convert('RGB'))
transform = get_train_transforms(img_size=224)

# Per-channel mean/std used to undo normalization for display. Built once here
# instead of on every loop iteration.
# NOTE(review): these are the standard ImageNet statistics — confirm they match
# the normalization applied inside get_train_transforms.
norm_mean = np.array([0.485, 0.456, 0.406])
norm_std = np.array([0.229, 0.224, 0.225])

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes[0, 0].imshow(img)
axes[0, 0].set_title('Original')
axes[0, 0].axis('off')

for i in range(1, 8):
    # Map the flat panel index onto the 2x4 grid; index 0 holds the original.
    row, col = divmod(i, 4)
    # The transform is applied afresh each time (presumably with random
    # parameters, so each panel differs).
    augmented = transform(image=img)['image']
    # Denormalize for display. The permute call moves axis 0 to the end
    # (assumes a channels-first torch tensor — confirm) so imshow gets H x W x C.
    display = augmented.permute(1, 2, 0).numpy()
    display = display * norm_std + norm_mean
    # Clamp to [0, 1], the range imshow expects for float RGB data.
    display = np.clip(display, 0, 1)
    axes[row, col].imshow(display)
    axes[row, col].set_title(f'Aug #{i}')
    axes[row, col].axis('off')

plt.suptitle('Training Augmentation Samples', fontsize=14)
plt.tight_layout()
plt.show()