# Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2

# Import your preprocessing utilities
from preprocessing import (
    create_validation_split,
    get_dataset_statistics,
    augment_minority_classes,
    get_augmentation_pipeline_no_tensor,
    load_and_preprocess_image,
    CATEGORIES_MAP
)

# Data Preprocessing for Grain Disease Classification

This notebook documents all preprocessing steps applied to the grain dataset.

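## Preprocessing Steps:
1. Validation Split Creation (with a train/validation count plot per grain)
2. Dataset Statistics Analysis (class balance across the train, val, and test splits)
3. Image Transformation Pipeline (original vs. augmented examples)
4. Final Dataset Summary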

## Create Validation Split

In [None]:
# Run create_validation_split for both grains with the same ratio and seed;
# the returned stats are kept in maize_stats / rice_stats for later cells.
print("Creating validation splits...")
split_options = {"val_ratio": 0.15, "random_state": 42}
maize_stats = create_validation_split("maize", **split_options)
rice_stats = create_validation_split("rice", **split_options)

## Visualize Split Statistics

In [None]:
# Grouped bar chart of train vs. validation image counts, one panel per grain.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
width = 0.35
stats_by_grain = {"maize": maize_stats, "rice": rice_stats}

for ax, (grain, stats) in zip(axes, stats_by_grain.items()):
    if not stats:
        # No split statistics for this grain: its panel is left untouched.
        continue

    # Each stats entry is indexed by category, then by 'train' / 'val'.
    categories = list(stats.keys())
    train_counts = [stats[name]["train"] for name in categories]
    val_counts = [stats[name]["val"] for name in categories]

    # Train bars sit just left of each tick, validation bars just right.
    x = np.arange(len(categories))
    ax.bar(x - width/2, train_counts, width, label="Train", alpha=0.8)
    ax.bar(x + width/2, val_counts, width, label="Validation", alpha=0.8)

    ax.set_title(f'{grain.upper()} - Train/Validation Split')
    ax.set_xlabel("Category")
    ax.set_ylabel("Number of Images")
    ax.set_xticks(x)
    ax.set_xticklabels(categories, rotation=45)
    ax.grid(axis="y", alpha=0.3)
    ax.legend()

plt.tight_layout()
plt.show()

## Check Class Balance

In [None]:
# Print the statistics table and the overall image count for each grain.
for grain in ("maize", "rice"):
    print(f"\n{grain.upper()}:")
    stats = get_dataset_statistics(grain, splits=['train', 'val', 'test'])

    # Missing entries become 0 so that every cell can be cast to int.
    df = pd.DataFrame(stats)
    df = df.fillna(0)
    df = df.astype(int)
    print(df)
    print(f"Total: {df.sum().sum()} images")

## Visualize Augmentation Examples

- Show original vs. augmented images side by side

In [None]:
def show_augmentation_examples(grain_type, category, n_examples=3):
    """Plot training images side by side with an augmented version of each.

    Reads up to ``n_examples`` PNG files from
    ``./dataset/images/<grain_type>/train/<category>``, runs each through the
    pipeline returned by ``get_augmentation_pipeline_no_tensor(split='train',
    img_size=224)`` and shows one row per image: original on the left,
    augmented on the right.

    Args:
        grain_type: Grain sub-directory name, e.g. ``"maize"``.
        category: Class sub-directory inside the train split, e.g. ``"0_NOR"``.
        n_examples: Maximum number of images to display.

    Returns:
        None. The figure is displayed with ``plt.show()``. When no readable
        image is found a message is printed and no figure is created.
    """
    base_path = Path(f"./dataset/images/{grain_type}/train/{category}")
    # glob() order depends on the filesystem; sort so that the same files are
    # shown on every run.
    candidates = sorted(base_path.glob("*.png"))[:max(n_examples, 0)]

    # Load everything first so that the grid gets exactly one row per image
    # that could actually be read.
    originals = []
    for img_path in candidates:
        img_bgr = cv2.imread(str(img_path))
        if img_bgr is None:
            # cv2.imread signals a missing/corrupt file by returning None.
            print(f"Skipping unreadable image: {img_path}")
            continue
        originals.append((img_path, cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)))

    if not originals:
        print(f"No readable PNG images found in {base_path}")
        return

    aug_pipeline = get_augmentation_pipeline_no_tensor(split='train', img_size=224)

    n_rows = len(originals)
    # squeeze=False keeps `axes` two-dimensional even when there is one row,
    # so the axes[row, col] indexing below is always valid.
    fig, axes = plt.subplots(n_rows, 2, figsize=(8, n_rows * 3), squeeze=False)

    for row, (img_path, img_orig) in enumerate(originals):
        # The pipeline is called with image=... and returns a mapping whose
        # 'image' entry is the result; a copy is passed so the original array
        # shown on the left cannot be modified by the pipeline.
        img_aug = aug_pipeline(image=img_orig.copy())['image']

        axes[row, 0].imshow(img_orig)
        axes[row, 0].set_title(f"Original - {img_path.name}")
        axes[row, 0].axis('off')

        axes[row, 1].imshow(img_aug)
        axes[row, 1].set_title("Augmented")
        axes[row, 1].axis('off')

    plt.suptitle(f"{grain_type.upper()} - {category} Augmentation Examples")
    plt.tight_layout()
    plt.show()

# Example: three samples from the maize "0_NOR" training folder
show_augmentation_examples(grain_type="maize", category="0_NOR", n_examples=3)

## Final Dataset Summary

In [None]:
# Per-grain summary table with a row-wise Total column, followed by the
# grand total and the summed train / val / test columns.
print("\n=== FINAL DATASET SUMMARY ===")
for grain in ("maize", "rice"):
    print(f"\n{grain.upper()}:")
    stats = get_dataset_statistics(grain, splits=['train', 'val', 'test'])

    # Missing entries count as 0 images; counts are shown as integers.
    counts = pd.DataFrame(stats).fillna(0).astype(int)
    # Total is the sum across the columns of each row.
    df = counts.assign(Total=counts.sum(axis=1))
    print(df)
    print(f"\nGrand Total: {df['Total'].sum()} images")
    print(f"Train/Val/Test Ratio: {df.loc[:, 'train'].sum()}:{df.loc[:, 'val'].sum()}:{df.loc[:, 'test'].sum()}")

## Preprocessing Summary

### Decisions Made:
1. **Validation Split**: 15% of training data (stratified by class)
2. **Image Size**: 224×224 pixels (standard for CNN architectures)
3. **Normalization**: ImageNet statistics (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
4. **Augmentation**: Applied to training set only
   - Random 90° rotations
   - Horizontal/vertical flips
   - Brightness/contrast adjustments
   - HSV color shifts
   - Gaussian noise

### Rationale:
- **224×224 size**: Compatible with pre-trained models (ResNet, VGG, EfficientNet)
- **ImageNet normalization**: Standard practice for transfer learning
- **Augmentation choices**: Preserve grain characteristics while introducing variation
- **No test augmentation**: Ensures fair evaluation on original data distribution

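### Next Steps:
- Teammates can import the shared functions from `preprocessing.py`
- Use `get_augmentation_pipeline()` in data loaders (this notebook uses `get_augmentation_pipeline_no_tensor()` only to visualize augmented images)
- The validation set is ready for hyperparameter tuning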