
# Preprocessing — Static Augmented Dataset Builder
Build a balanced, augmented PlantVillage binary dataset (0=healthy, 1=diseased) and save RGB PNGs + CSV labels for TensorFlow and PyTorch.


In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import random
from pathlib import Path
from skimage.transform import resize
import math

# ==========================================
# CONFIG
# ==========================================
SEED = 42
IMG_SIZE = (224, 224)
NPY_DIR = Path("preprocessed_numpy")
NPY_DIR.mkdir(exist_ok=True, parents=True)

random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# ==========================================
# UTILS
# ==========================================

def numpy_resize(img):
    """
    Pure NumPy/scikit-image resize (bilinear)
    img: uint8 numpy array (H, W, 3)
    return: uint8 numpy array (224,224,3)
    """
    img_resized = resize(
        img,
        IMG_SIZE,
        order=1,                 # bilinear
        preserve_range=True,
        anti_aliasing=True
    ).astype(np.uint8)
    return img_resized


def augment_healthy_np(img):
    """
    EXACT healthy augmentation logic but done with NumPy/skimage
    (Equivalent behavior to your tf.image pipeline)
    """
    # Convert to float in 0–1
    img_f = img.astype(np.float32) / 255.0

    # Horizontal flip
    if random.random() < 0.5:
        img_f = img_f[:, ::-1]

    # Vertical flip
    if random.random() < 0.5:
        img_f = img_f[::-1, :]

    # 90/180/270 rotation
    k = random.randint(0, 3)
    img_f = np.rot90(img_f, k)

    # Random saturation
    sat_factor = random.uniform(0.8, 1.25)
    img_f = img_f * sat_factor
    img_f = np.clip(img_f, 0, 1)

    # Random hue shift
    hue_shift = random.uniform(-0.05, 0.05)
    img_f = np.clip(img_f + hue_shift, 0, 1)

    # Random brightness
    bright = random.uniform(-0.12, 0.12)
    img_f = np.clip(img_f + bright, 0, 1)

    # Random contrast
    contrast = random.uniform(0.8, 1.25)
    img_f = np.clip((img_f - 0.5) * contrast + 0.5, 0, 1)

    return (img_f * 255.0).astype(np.uint8)


def augment_diseased_np(img):
    """
    Diseased augmentation with replacement (p=0.5)
    Equivalent to your tf.image logic
    """
    # Replacement probability
    if random.random() < 0.5:
        return img

    img_f = img.astype(np.float32) / 255.0

    if random.random() < 0.5:
        img_f = img_f[:, ::-1]

    k = random.randint(0, 3)
    img_f = np.rot90(img_f, k)

    # Slight contrast
    contrast = random.uniform(0.9, 1.1)
    img_f = np.clip((img_f - 0.5) * contrast + 0.5, 0, 1)

    # Slight brightness
    bright = random.uniform(-0.08, 0.08)
    img_f = np.clip(img_f + bright, 0, 1)

    # Slight hue
    hue_shift = random.uniform(-0.03, 0.03)
    img_f = np.clip(img_f + hue_shift, 0, 1)

    return (img_f * 255).astype(np.uint8)


# ==========================================
# 1) LOAD TFDS (decodes TFRecords internally)
# ==========================================
print("Loading PlantVillage from TFDS...")
ds, info = tfds.load("plant_village", split="train", as_supervised=True, with_info=True)

label_names = info.features["label"].names
num_examples = info.splits["train"].num_examples
print("Total images:", num_examples)

# ==========================================
# 2) Convert TFDS → NumPy + Binary labels + Resize
# ==========================================
print("Converting to NumPy + Resizing...")

all_images = []
all_labels = []

for img_tf, label_tf in ds:
    img_np = img_tf.numpy()  # (H,W,3)
    label_idx = label_tf.numpy()
    label_name = label_names[label_idx]

    # Binary mapping
    binary = 0 if label_name.endswith("healthy") else 1

    # Resize via NumPy/scikit-image
    img_resized = numpy_resize(img_np)

    all_images.append(img_resized)
    all_labels.append(binary)

all_images = np.array(all_images, dtype=np.uint8)
all_labels = np.array(all_labels, dtype=np.int64)

print("Finished resize. Shape:", all_images.shape)

# ==========================================
# 3) Train/Val/Test split (80/10/10)
# ==========================================
indices = np.arange(num_examples)
np.random.shuffle(indices)

train_end = int(0.8 * num_examples)
val_end = int(0.9 * num_examples)

train_idx = indices[:train_end]
val_idx = indices[train_end:val_end]
test_idx = indices[val_end:]

train_images = all_images[train_idx]
train_labels = all_labels[train_idx]

val_images = all_images[val_idx]
val_labels = all_labels[val_idx]

test_images = all_images[test_idx]
test_labels = all_labels[test_idx]

print(f"Splits → Train: {len(train_images)}, Val: {len(val_images)}, Test: {len(test_images)}")

# ==========================================
# 4) Build Augmented Train Dataset (Healthy balancing)
# ==========================================
print("Applying augmentation to train set...")

healthy_idx = np.where(train_labels == 0)[0]
diseased_idx = np.where(train_labels == 1)[0]

H = len(healthy_idx)
D = len(diseased_idx)

print(f"Train Healthy={H}, Diseased={D}")

healthy_multiplier = max(1, math.ceil(D/H) - 1) if H > 0 else 0
print("Healthy multiplier =", healthy_multiplier)

aug_images = []
aug_labels = []

# Add original healthy + augmented healthy
for _ in range(healthy_multiplier + 1):
    for idx in healthy_idx:
        img = train_images[idx]
        if _ == 0:
            aug_images.append(img)
            aug_labels.append(0)
        else:
            aug_images.append(augment_healthy_np(img))
            aug_labels.append(0)

# Add diseased with replacement augmentation
for idx in diseased_idx:
    img = train_images[idx]
    aug_images.append(augment_diseased_np(img))
    aug_labels.append(1)

# Convert to arrays
aug_images = np.array(aug_images, dtype=np.uint8)
aug_labels = np.array(aug_labels, dtype=np.int64)

print("Augmented train size:", len(aug_images))

# ==========================================
# 5) Shuffle final train set
# ==========================================
perm = np.random.permutation(len(aug_images))
aug_images = aug_images[perm]
aug_labels = aug_labels[perm]

# ==========================================
# 6) SAVE NPY FILES
# ==========================================
np.save(NPY_DIR / "train_images.npy", aug_images)
np.save(NPY_DIR / "train_labels.npy", aug_labels)

np.save(NPY_DIR / "val_images.npy", val_images)
np.save(NPY_DIR / "val_labels.npy", val_labels)

np.save(NPY_DIR / "test_images.npy", test_images)
np.save(NPY_DIR / "test_labels.npy", test_labels)

print("Saved all NumPy arrays to:", NPY_DIR)
print("DONE.")


Loading PlantVillage from TFDS...
Total images: 54303
Converting to NumPy + Resizing...


2025-11-28 16:27:16.547528: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Finished resize. Shape: (54303, 224, 224, 3)
Splits → Train: 43442, Val: 5430, Test: 5431
Applying augmentation to train set...
Train Healthy=12095, Diseased=31347
Healthy multiplier = 2
Augmented train size: 67632
Saved all NumPy arrays to: preprocessed_numpy
DONE.
