# Preprocessing Digital Image Data

## Data Selection

### Loading and Resizing
+ Load images from the selected dataset
+ Resize images to a consistent dimension
+ Explain the rationale for choosing the specific dimensions
+ Discuss the trade-offs between image size and computational efficiency.

In [None]:
import os
import cv2
import numpy as np
from PIL import Image, ImageEnhance
import random
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from scipy import ndimage

In [None]:

class ImageDataset:
    """Folder-per-class image dataset with resizing, augmentation and scaling.

    ``root_dir`` must contain one sub-directory per class. Every file inside a
    class directory whose extension is a supported image extension becomes a
    sample labelled with the index of that class (classes are sorted by name).

    Args:
        root_dir: Directory holding one sub-directory per class.
        target_size: Size passed to ``cv2.resize`` (OpenCV order: width, height).
        use_augmentation: When true, ``augment`` is applied to every loaded image.
        normalize: When true, per-channel mean/std standardisation is applied
            after scaling to [0, 1].
        subset_ratio: Fraction of the samples to keep; values >= 1.0 keep all.
        seed: Seed for the random subset selection.

    Raises:
        ValueError: If ``root_dir`` is not a directory or has no class folders.
    """

    def __init__(self, root_dir, target_size=(224, 224),
                 use_augmentation=False, normalize=True,
                 subset_ratio=1.0, seed=42):
        self.root_dir = root_dir
        self.target_size = target_size
        self.use_augmentation = use_augmentation
        self.normalize = normalize

        if not os.path.isdir(root_dir):
            raise ValueError(f"Invalid directory: {root_dir}")

        self.image_paths = []
        self.labels = []
        self.class_to_idx = {}
        self.idx_to_class = {}

        supported_ext = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp')

        classes = sorted(
            d for d in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, d))
        )

        if not classes:
            raise ValueError("No class subdirectories found")

        for idx, cls in enumerate(classes):
            self.class_to_idx[cls] = idx
            self.idx_to_class[idx] = cls

            cls_dir = os.path.join(root_dir, cls)
            # os.listdir order is arbitrary; sorting makes the seeded subset
            # below reproducible across machines and file systems.
            for f in sorted(os.listdir(cls_dir)):
                if f.lower().endswith(supported_ext):
                    self.image_paths.append(os.path.join(cls_dir, f))
                    self.labels.append(idx)

        if subset_ratio < 1.0:
            random.seed(seed)
            total = len(self.image_paths)
            subset_size = int(total * subset_ratio)

            indices = list(range(total))
            random.shuffle(indices)
            selected = indices[:subset_size]

            self.image_paths = [self.image_paths[i] for i in selected]
            self.labels = [self.labels[i] for i in selected]

            # Reported only here: subset_size/total exist only in this branch
            # (printing them unconditionally raised NameError for ratio 1.0).
            print(f"✓ Using subset: {subset_size}/{total} images ({subset_ratio*100:.0f}%)")

        print(f"✓ Dataset loaded: {len(self.image_paths)} images, {len(classes)} classes")

    def augment(self, image):
        """Randomly flip, rotate and colour-jitter a PIL image.

        Each of the three transformations is applied independently with
        probability 0.5. Returns the (possibly transformed) PIL image.
        """
        # Random horizontal flip
        if random.random() < 0.5:
            image = image.transpose(Image.FLIP_LEFT_RIGHT)

        # Random rotation within +/- 15 degrees
        if random.random() < 0.5:
            angle = random.uniform(-15, 15)
            image = image.rotate(angle)

        # Colour jitter: brightness, contrast and saturation, each in [0.8, 1.2]
        if random.random() < 0.5:
            image = ImageEnhance.Brightness(image).enhance(random.uniform(0.8, 1.2))
            image = ImageEnhance.Contrast(image).enhance(random.uniform(0.8, 1.2))
            image = ImageEnhance.Color(image).enhance(random.uniform(0.8, 1.2))

        return image

    def load_image(self, idx):
        """Load sample ``idx`` as a float32 channel-first array plus its label.

        The image is read with OpenCV, converted to RGB, resized to
        ``target_size``, optionally augmented, scaled to [0, 1] and optionally
        standardised per channel.

        Returns:
            ``(image, label)`` where ``image`` has shape (3, height, width).
            If anything fails while loading, the error is printed and an
            all-zero image of the same shape is returned instead.
        """
        try:
            img_path = self.image_paths[idx]

            # OpenCV is used for decoding and resizing.
            image = cv2.imread(img_path)
            if image is None:
                # cv2.imread signals an unreadable file by returning None.
                raise ValueError(f"cannot read image file: {img_path}")
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, self.target_size)

            image = Image.fromarray(image)

            if self.use_augmentation:
                image = self.augment(image)

            image = np.asarray(image, dtype=np.float32) / 255.0  # [0,1]

            if self.normalize:
                # float32 constants keep the result float32, matching the
                # dtype of the fallback image below.
                mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
                std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
                image = (image - mean) / std

            # CHW layout (as PyTorch expects)
            image = np.transpose(image, (2, 0, 1))

            return image, self.labels[idx]

        except Exception as e:
            print(f"Error loading image: {e}")
            # cv2.resize takes (width, height), so a resized image is
            # (height, width, 3); the placeholder must use the same H/W order.
            width, height = self.target_size
            blank = np.zeros((3, height, width), dtype=np.float32)
            return blank, self.labels[idx]

class DataLoader:
    """Minimal batch iterator over a dataset exposing ``image_paths`` and ``load_image``.

    Args:
        dataset: Object with an ``image_paths`` sequence and a
            ``load_image(idx)`` method returning ``(image, label)``.
        batch_size: Number of samples per batch; must be positive.
        shuffle: When true, the sample order is reshuffled at the start of
            every iteration.
        drop_last: When true, a trailing batch smaller than ``batch_size`` is
            skipped. Defaults to False, i.e. the partial batch is yielded.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(self, dataset, batch_size=32, shuffle=True, drop_last=False):
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.indices = np.arange(len(dataset.image_paths))

    def __iter__(self):
        """Yield ``(images, labels)`` batches as stacked NumPy arrays."""
        if self.shuffle:
            np.random.shuffle(self.indices)

        for start in range(0, len(self.indices), self.batch_size):
            batch_idx = self.indices[start:start + self.batch_size]
            if self.drop_last and len(batch_idx) < self.batch_size:
                break

            images, labels = [], []
            for idx in batch_idx:
                img, lbl = self.dataset.load_image(idx)
                images.append(img)
                labels.append(lbl)

            yield np.stack(images), np.array(labels)

    def __len__(self):
        """Return the number of batches ``__iter__`` yields per epoch."""
        full_batches, remainder = divmod(len(self.indices), self.batch_size)
        # The trailing partial batch is yielded unless drop_last is set, so it
        # has to be counted (plain floor division under-reported it).
        if remainder and not self.drop_last:
            return full_batches + 1
        return full_batches

def build_image_pipeline(
    root_dir,
    output_dir=None,
    target_size=(224, 224),
    batch_size=32,
    shuffle=True,
    use_augmentation=False,
    normalize=True,
    subset_ratio=1.0,
    seed=42,
    save_resized=False,
    num_workers=None
):
    """Build the dataset and loader, optionally writing resized copies to disk.

    Steps:
    - create the ``ImageDataset``
    - wrap it in a ``DataLoader``
    - (optional) resize every selected image and save it under
      ``output_dir/<class name>/<original file name>``

    Args:
        root_dir: Dataset root, forwarded to ``ImageDataset``.
        output_dir: Destination for resized images; required when
            ``save_resized`` is true.
        target_size: Size forwarded to ``ImageDataset`` and ``cv2.resize``.
        batch_size: Batch size forwarded to ``DataLoader``.
        shuffle: Shuffle flag forwarded to ``DataLoader``.
        use_augmentation: Forwarded to ``ImageDataset``.
        normalize: Forwarded to ``ImageDataset``.
        subset_ratio: Forwarded to ``ImageDataset``.
        seed: Forwarded to ``ImageDataset``.
        save_resized: When true, resized images are written to ``output_dir``.
        num_workers: Thread count for saving; defaults to
            ``min(16, 2 * cpu_count)``.

    Returns:
        dataset, loader

    Raises:
        ValueError: If ``save_resized`` is true and ``output_dir`` is None.
    """
    # Fail fast: check the arguments before the (potentially slow) dataset scan.
    if save_resized and output_dir is None:
        raise ValueError("output_dir must be provided if save_resized=True")

    # =============================
    # 1. Create the dataset
    # =============================
    dataset = ImageDataset(
        root_dir=root_dir,
        target_size=target_size,
        use_augmentation=use_augmentation,
        normalize=normalize,
        subset_ratio=subset_ratio,
        seed=seed
    )

    # =============================
    # 2. Create the data loader
    # =============================
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle
    )

    print(f"✓ Total images: {len(dataset.image_paths)}")
    print(f"✓ Classes: {len(dataset.class_to_idx)}")
    print(f"✓ Batches per epoch: {len(loader)}")

    # =============================
    # 3. Resize & save (optional)
    # =============================
    if save_resized:
        os.makedirs(output_dir, exist_ok=True)

        if num_workers is None:
            # os.cpu_count() may return None when the count is undetermined.
            num_workers = min(16, (os.cpu_count() or 1) * 2)

        print(f"✓ Saving resized images to: {output_dir}")
        print(f"✓ Using {num_workers} workers")

        # Create each class folder once up front instead of once per image.
        for label in set(dataset.labels):
            os.makedirs(
                os.path.join(output_dir, dataset.idx_to_class[label]),
                exist_ok=True,
            )

        def resize_and_save(img_path, label):
            """Resize one image into its class folder; return True on success."""
            try:
                out_path = os.path.join(
                    output_dir,
                    dataset.idx_to_class[label],
                    os.path.basename(img_path),
                )

                img = cv2.imread(img_path)
                if img is None:
                    return False

                img = cv2.resize(img, target_size)
                return cv2.imwrite(out_path, img)

            except Exception as exc:
                # Best effort: report the failure and keep processing the rest.
                tqdm.write(f"Failed to resize {img_path}: {exc}")
                return False

        tasks = list(zip(dataset.image_paths, dataset.labels))

        success = 0
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            futures = [
                executor.submit(resize_and_save, p, l)
                for p, l in tasks
            ]

            for f in tqdm(as_completed(futures), total=len(futures)):
                if f.result():
                    success += 1

        print(f"✅ Saved {success}/{len(tasks)} images")

    return dataset, loader



# Stand-alone cell: write a resized copy of every image in `dataset` to
# OUTPUT_ROOT/<class name>/<original file name> using a thread pool.
# NOTE(review): `dataset` is not created in this cell; presumably it comes
# from an earlier build_image_pipeline(...) call — confirm before running.
OUTPUT_ROOT = r"C:/Users/GEED/Documents/data-mining-l1/data/images/subset_resized_224"
TARGET_SIZE = (224, 224)
# os.cpu_count() may return None when the CPU count cannot be determined.
NUM_WORKERS = min(16, (os.cpu_count() or 1) * 2)

os.makedirs(OUTPUT_ROOT, exist_ok=True)


def resize_and_save(img_path, label):
    """Resize one image to TARGET_SIZE and save it in its class folder.

    Returns True when the file was written, False when the image could not be
    read, written, or processed.
    """
    try:
        class_name = dataset.idx_to_class[label]
        out_dir = os.path.join(OUTPUT_ROOT, class_name)
        os.makedirs(out_dir, exist_ok=True)

        out_path = os.path.join(out_dir, os.path.basename(img_path))

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None for files it cannot decode.
            return False

        img = cv2.resize(img, TARGET_SIZE)
        return cv2.imwrite(out_path, img)

    except Exception as exc:
        # Best effort: report the failure and keep processing the rest.
        tqdm.write(f"Failed to resize {img_path}: {exc}")
        return False


tasks = list(zip(dataset.image_paths, dataset.labels))

success = 0
with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
    futures = [executor.submit(resize_and_save, p, l) for p, l in tasks]

    # Count successes as the workers finish, with a progress bar.
    for f in tqdm(as_completed(futures), total=len(futures)):
        if f.result():
            success += 1

print(f"✅ Saved {success}/{len(tasks)} images")


✓ Using subset: 7029/70295 images (10%)
✓ Dataset loaded: 7029 images, 38 classes
✓ Total images: 7029
✓ Classes: 38
✓ Batches per epoch: 219
✓ Batch loaded
Images shape: (32, 3, 224, 224)
Labels shape: (32,)


100%|██████████| 7029/7029 [00:06<00:00, 1111.13it/s]

✅ Saved 7029/7029 images





### Grayscale Conversion
+ Convert color images to grayscale where appropriate
+ Compare information retention between color and grayscale representations
+ Discuss when grayscale conversion is beneficial versus detrimental

In [None]:
def rgb_to_grayscale(images):
    """Convert channel-first RGB images to grayscale.

    The result is the weighted sum 0.299 R + 0.587 G + 0.114 B (the ITU-R
    BT.601 luma weights).

    Args:
        images: Array whose third-from-last axis holds the R, G, B channels,
            e.g. a batch (B, 3, H, W) or a single image (3, H, W).

    Returns:
        Array with the channel axis removed: (B, H, W) for a batch,
        (H, W) for a single image.
    """
    # Indexing the channel axis from the end keeps (B, 3, H, W) behaviour
    # unchanged while also handling a single (3, H, W) image correctly.
    r = images[..., 0, :, :]
    g = images[..., 1, :, :]
    b = images[..., 2, :, :]
    return 0.299 * r + 0.587 * g + 0.114 * b

# Smoke test: convert only the first batch yielded by `loader` and print the
# shape of the grayscale result, then stop.
# NOTE(review): `loader` is not defined in this cell; presumably it comes from
# an earlier build_image_pipeline(...) call — confirm.
for images, labels in loader:
    gray_images = rgb_to_grayscale(images)
    print(gray_images.shape)  # (B, 224, 224)
    break

### Normalization
+ Normalization: Apply pixel normalization (e.g., scaling to [0,1] or [-1,1]) and implement standardization (zero mean, unit variance).
+ Compare different normalization techniques and their effects on the data distribution.


In [None]:
# Grayscale normalization: standardize pixel intensities to zero mean and
# unit variance.

def normalize_gray_batchwise(gray, eps=1e-6):
    """Standardise a grayscale batch with one mean/std shared by all images.

    Args:
        gray: Array of shape (B, H, W).
        eps: Small constant added to the standard deviation to avoid
            division by zero.

    Returns:
        Array of shape (B, H, W) with (approximately) zero mean and unit
        standard deviation over the whole batch.
    """
    mean = gray.mean(axis=(0, 1, 2), keepdims=True)
    std = gray.std(axis=(0, 1, 2), keepdims=True)
    return (gray - mean) / (std + eps)


def normalize_gray_per_image(gray, eps=1e-6):
    """Standardise every image of a grayscale batch with its own mean/std.

    Args:
        gray: Array of shape (B, H, W).
        eps: Small constant added to the standard deviation to avoid
            division by zero (e.g. for a constant image).

    Returns:
        Array of shape (B, H, W) in which each image has (approximately)
        zero mean and unit standard deviation.
    """
    # Reduce over the spatial axes only, so statistics are per image.
    mean = gray.mean(axis=(1, 2), keepdims=True)
    std = gray.std(axis=(1, 2), keepdims=True)
    return (gray - mean) / (std + eps)


### Edge Detection (Optional Bonus)
+ Apply edge detection algorithms (Sobel, Prewitt, Canny)
+ Extract edge features from images, and then visualize detected edges and discuss their significance for your
chosen dataset.

In [None]:
# Edge-detection helpers: Sobel, Prewitt and Canny operators applied to each
# grayscale image of a batch.
def sobel_edges_batchwise(gray):
    """Compute the Sobel gradient magnitude of every image in a batch.

    Intended for grayscale input that was normalised batch-wise.

    gray: (B, H, W)
    return: (B, H, W)
    """
    return np.stack([
        np.hypot(ndimage.sobel(image, axis=0), ndimage.sobel(image, axis=1))
        for image in gray
    ])

def sobel_edges_per_image(gray):
    """Compute the Sobel gradient magnitude of every image in a batch.

    Intended for grayscale input that was normalised per image.
    """
    def _magnitude(image):
        # Derivatives along axis 0 and axis 1, combined into one magnitude map.
        grad_axis0 = ndimage.sobel(image, axis=0)
        grad_axis1 = ndimage.sobel(image, axis=1)
        return np.hypot(grad_axis0, grad_axis1)

    return np.stack(list(map(_magnitude, gray)))


def prewitt_edges_batchwise(gray):
    """Compute the Prewitt gradient magnitude of every image in a batch.

    Intended for grayscale input that was normalised batch-wise.
    """
    magnitudes = [
        np.hypot(ndimage.prewitt(image, axis=0), ndimage.prewitt(image, axis=1))
        for image in gray
    ]
    return np.stack(magnitudes)

def prewitt_edges_per_image(gray):
    """Compute the Prewitt gradient magnitude of every image in a batch.

    Intended for grayscale input that was normalised per image.
    """
    def _magnitude(image):
        # Derivatives along axis 0 and axis 1, combined into one magnitude map.
        grad_axis0 = ndimage.prewitt(image, axis=0)
        grad_axis1 = ndimage.prewitt(image, axis=1)
        return np.hypot(grad_axis0, grad_axis1)

    return np.stack([_magnitude(image) for image in gray])


import cv2

def canny_edges_per_image(gray, low=50, high=150):
    """Run Canny edge detection on every image of a grayscale batch.

    Each image is min-max rescaled to [0, 255] and cast to uint8 before
    ``cv2.Canny`` is applied with the ``low``/``high`` hysteresis thresholds.
    Intended for grayscale input that was normalised per image.
    """
    def _as_uint8(image):
        # Canny needs an 8-bit image, so stretch the values to [0, 255] first.
        stretched = cv2.normalize(image, None, 0, 255, cv2.NORM_MINMAX)
        return stretched.astype(np.uint8)

    return np.stack([cv2.Canny(_as_uint8(image), low, high) for image in gray])


import matplotlib.pyplot as plt

def visualize_edges(gray, edges, idx=0, title="Edges"):
    """Show one grayscale image and its edge map side by side.

    ``idx`` selects the sample from both ``gray`` and ``edges``; ``title`` is
    the caption of the edge panel.
    """
    panels = (("Grayscale", gray), (title, edges))

    plt.figure(figsize=(8,4))

    # Left panel: grayscale image; right panel: edge map.
    for position, (caption, source) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.imshow(source[idx], cmap="gray")
        plt.title(caption)
        plt.axis("off")

    plt.show()


Main function for testing and visualizing the pipeline

In [None]:
# ===============================
# MAIN CELL: PIPELINE TEST & VISUALIZATION
# ===============================

def main_pipeline_test(train_root=None, output_root=None):
    """Run the whole preprocessing pipeline on one batch and plot the results.

    Steps:
    - build the dataset and data loader
    - RGB -> grayscale
    - normalise the grayscale batch
    - edge detection (Sobel, Prewitt, Canny)
    - visualise the first sample of the batch

    Args:
        train_root: Dataset root directory; defaults to the original
            hard-coded training folder.
        output_root: Directory for resized images; defaults to the original
            hard-coded output folder. Nothing is written here because
            ``save_resized`` is False.
    """

    # ===============================
    # 1. Build the pipeline
    # ===============================
    # The paths used to be hard-coded; they remain the defaults so a call
    # without arguments behaves as before.
    if train_root is None:
        train_root = r"C:/Users/GEED/Documents/data-mining-l1/data/images/New Plant Diseases Dataset(Augmented)/train"
    if output_root is None:
        output_root = r"C:/Users/GEED/Documents/data-mining-l1/data/images/subset_resized_224"

    dataset, loader = build_image_pipeline(
        root_dir=train_root,
        output_dir=output_root,
        target_size=(224, 224),
        batch_size=8,              # small, for visualisation
        shuffle=True,
        use_augmentation=False,    # off, so the images are easy to inspect
        normalize=True,
        subset_ratio=0.1,
        save_resized=False         # only test the pipeline
    )

    # ===============================
    # 2. Take one batch
    # ===============================
    images, labels = next(iter(loader))
    print("RGB batch shape:", images.shape)   # (B, 3, 224, 224)

    # ===============================
    # 3. RGB -> grayscale
    # ===============================
    gray = rgb_to_grayscale(images)
    print("Grayscale shape:", gray.shape)     # (B, 224, 224)

    # ===============================
    # 4. Normalise grayscale (batch-wise)
    # ===============================
    gray_norm = normalize_gray_batchwise(gray)
    print("Gray normalized:",
          f"mean={gray_norm.mean():.4f}, std={gray_norm.std():.4f}")

    # ===============================
    # 5. Edge detection
    # ===============================
    sobel_edges   = sobel_edges_batchwise(gray_norm)
    prewitt_edges = prewitt_edges_batchwise(gray_norm)
    canny_edges   = canny_edges_per_image(gray_norm)

    print("Sobel edges shape:", sobel_edges.shape)
    print("Prewitt edges shape:", prewitt_edges.shape)
    print("Canny edges shape:", canny_edges.shape)

    # ===============================
    # 6. Visualisation
    # ===============================
    visualize_edges(gray_norm, sobel_edges, idx=0, title="Sobel Edges")
    visualize_edges(gray_norm, prewitt_edges, idx=0, title="Prewitt Edges")
    visualize_edges(gray_norm, canny_edges, idx=0, title="Canny Edges")


# ===============================
# RUN MAIN
# ===============================
main_pipeline_test()


Image (RGB)
 → Resize
 → Normalize
 → Grayscale
 → Gray Normalization
 → Edge Detection
 → Visualization / Feature Extraction