In [1]:
%%capture
!pip install -U albumentations
!pip install -U opencv-python-headless

In [2]:
%%writefile augment_and_save.py
"""
==============================================================================
AUGMENT_AND_SAVE.PY (v2 - CORRECTED)
==============================================================================
This script performs offline data augmentation. It reads images from a source
directory, applies a series of transformations, and saves the newly generated
images to an output directory, preserving the class folder structure.

v2 FIXES:
- GaussNoise and CoarseDropout are now called with only `p`, leaving every
  other setting at the library default; this avoids the UserWarning messages
  that version-specific arguments produced.
"""

import os
import argparse
import numpy as np
import albumentations as A
from pathlib import Path
from PIL import Image
from tqdm import tqdm

def get_augmentation_pipeline():
    """
    Build the Albumentations pipeline applied to each source image.

    The transforms are composed in the order listed below, and each one is
    gated by its own probability `p`, so every call to the returned pipeline
    applies a random subset of them.

    Returns:
        A.Compose: the composed pipeline; call it as `pipeline(image=array)`
        and read the result from the `'image'` key.
    """
    print("Defining corrected augmentation pipeline...")

    # Spatial transforms: mirror, rotate, then a combined affine warp.
    geometric_steps = [
        A.HorizontalFlip(p=0.5),
        A.Rotate(limit=30, p=0.7),
        A.Affine(
            scale=(0.85, 1.15),
            translate_percent=(-0.1, 0.1),
            rotate=(-25, 25),
            shear=(-10, 10),
            p=0.6,
        ),
    ]

    # Pixel-level transforms: blur, exposure, noise, sharpening and colour.
    pixel_steps = [
        A.GaussianBlur(blur_limit=(3, 7), p=0.3),
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.7),
        # Only `p` is passed: the noise strength is whatever the installed
        # Albumentations version uses by default.
        A.GaussNoise(p=0.4),
        A.Sharpen(p=0.2),
        A.CLAHE(p=0.3),
        A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.5),
    ]

    # Occlusion: only `p` is passed, so hole count/size are library defaults.
    occlusion_steps = [
        A.CoarseDropout(p=0.3),
    ]

    return A.Compose(geometric_steps + pixel_steps + occlusion_steps)

# File suffixes treated as images; compared case-insensitively so that
# e.g. "IMG_001.JPG" is not silently skipped.
_IMAGE_SUFFIXES = frozenset({".jpg", ".jpeg", ".png"})


def _collect_image_paths(source_path, output_path):
    """Return the sorted image files under source_path, skipping a nested output tree."""
    source_root = source_path.resolve()
    output_root = output_path.resolve()
    # Only filter when the output directory lives strictly inside the source
    # directory; otherwise a rerun would pick up previously generated images
    # as if they were originals.
    output_is_nested = source_root in output_root.parents

    found = []
    for candidate in source_path.rglob("*"):
        if candidate.suffix.lower() not in _IMAGE_SUFFIXES or not candidate.is_file():
            continue
        if output_is_nested and output_root in candidate.resolve().parents:
            continue
        found.append(candidate)
    # Sorted so the processing order is stable from run to run.
    return sorted(found)


def augment_and_save(source_dir, output_dir, num_augmentations):
    """
    Augment every image under `source_dir` and write the results to `output_dir`.

    For each .jpg/.jpeg/.png file (suffix matched case-insensitively) found
    recursively under `source_dir`, the image is written to
    `output_dir/<name of the image's parent folder>/` together with
    `num_augmentations` augmented variants named `<stem>_aug_<k><suffix>`
    (k starting at 1). An already existing copy of the original in the output
    folder is left untouched.

    Args:
        source_dir: Directory containing the original images.
        output_dir: Directory that receives originals and augmented images.
        num_augmentations: Number of augmented variants to create per image.

    Returns:
        None. Progress, warnings and a final summary are printed. A missing
        source directory or a source directory without images is reported and
        the function returns without creating anything.
    """
    source_path = Path(source_dir)
    output_path = Path(output_dir)

    if not source_path.exists():
        print(f"Error: Source directory '{source_path}' does not exist.")
        return

    image_paths = _collect_image_paths(source_path, output_path)
    if not image_paths:
        # Nothing to do: say so instead of reporting a successful run.
        print(f"Warning: No .jpg/.jpeg/.png images found in '{source_path}'. Nothing to do.")
        return

    transform = get_augmentation_pipeline()

    print(f"\nFound {len(image_paths)} original images in '{source_path}'.")
    print(f"Generating {num_augmentations} augmented versions for each image...")

    failed_count = 0
    for image_path in tqdm(image_paths, desc="Augmenting Images"):
        try:
            # The context manager closes the file handle as soon as the pixel
            # data has been copied into the converted RGB image.
            with Image.open(image_path) as opened_image:
                original_image = opened_image.convert("RGB")
            image_np = np.array(original_image)

            class_name = image_path.parent.name
            output_class_path = output_path / class_name
            output_class_path.mkdir(parents=True, exist_ok=True)

            # The original is re-saved through PIL (as RGB), not byte-copied.
            original_dest_path = output_class_path / image_path.name
            if not original_dest_path.exists():
                original_image.save(original_dest_path)

            # Stem and suffix do not change between variants; compute them once.
            original_stem = image_path.stem
            original_suffix = image_path.suffix
            for i in range(num_augmentations):
                augmented = transform(image=image_np)['image']
                new_filename = f"{original_stem}_aug_{i+1}{original_suffix}"
                Image.fromarray(augmented).save(output_class_path / new_filename)

        except Exception as e:
            # Best effort: one unreadable image must not abort the whole run,
            # but the failure is counted so it shows up in the summary.
            failed_count += 1
            print(f"\nWarning: Could not process or augment image {image_path}. Error: {e}")

    if failed_count:
        print(f"\nWarning: {failed_count} of {len(image_paths)} images could not be processed.")

    print("\n✅ Augmentation process complete!")
    print(f"New dataset saved at: '{output_path}'")


if __name__ == "__main__":
    # Command-line entry point: parse the three options and hand them to
    # augment_and_save unchanged.
    cli = argparse.ArgumentParser(description="Offline Image Augmentation Script")
    cli.add_argument(
        "--source-dir",
        type=str,
        required=True,
        help="Path to original images.",
    )
    cli.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Path to save augmented dataset.",
    )
    cli.add_argument(
        "--num-augmentations",
        type=int,
        default=10,
        help="Number of augmented versions per image.",
    )
    options = cli.parse_args()

    augment_and_save(
        source_dir=options.source_dir,
        output_dir=options.output_dir,
        num_augmentations=options.num_augmentations,
    )

Writing augment_and_save.py


In [3]:
# Run augment_and_save.py as a separate process: read the images under the
# Kaggle input dataset and write each original plus 10 augmented variants
# into the working directory, one sub-folder per source parent folder.
!python augment_and_save.py \
    --source-dir "/kaggle/input/maize-dataset/" \
    --output-dir "/kaggle/working/augmented_maize_dataset/" \
    --num-augmentations 10

Defining corrected augmentation pipeline...

Found 176 original images in '/kaggle/input/maize-dataset'.
Generating 10 augmented versions for each image...
Augmenting Images: 100%|██████████████████████| 176/176 [04:48<00:00,  1.64s/it]

✅ Augmentation process complete!
New dataset saved at: '/kaggle/working/augmented_maize_dataset'


In [4]:
from pathlib import Path

# Suffixes the augmentation script can write: it keeps each source file's
# suffix, so counting only "*.jpg" would miss .jpeg/.png or upper-case files.
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png"}


def count_images(folder, recursive=False):
    """Count image files in `folder`.

    Args:
        folder: `Path` of the directory to inspect.
        recursive: When True, also count images in all sub-directories.

    Returns:
        int: number of files whose suffix (case-insensitive) is in IMAGE_SUFFIXES.
    """
    entries = folder.rglob("*") if recursive else folder.glob("*")
    return sum(
        1 for entry in entries
        if entry.is_file() and entry.suffix.lower() in IMAGE_SUFFIXES
    )


output_dataset_path = Path("/kaggle/working/augmented_maize_dataset/")

# Count total images (originals + augmented variants) across all classes
total_new_images = count_images(output_dataset_path, recursive=True)
print(f"Total images in the new dataset: {total_new_images}")

# Count images in a specific class
healthy_path = output_dataset_path / "maize_healthy"
num_healthy = count_images(healthy_path)
print(f"Total images in 'maize_healthy': {num_healthy}")

Total images in the new dataset: 1936
Total images in 'maize_healthy': 1474
