In [32]:
import os
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm
import pandas as pd

In [33]:
# Input and output roots, both relative to the current working directory.
RAW_DIR = Path("data", "raw")
PROCESSED_DIR = Path("data", "processed")

# Edge length in pixels: every image and mask is resized to IMG_SIZE x IMG_SIZE.
IMG_SIZE = 512

# Ground-truth sub-folder name -> suffix appended to the image stem to form
# the mask file name. Insertion order matters: it determines which channel of
# the composite mask each entry is written to.
LESION_TYPES = dict(
    [
        ("Microaneurysms", "_MA"),
        ("Haemorrhages", "_HE"),
        ("Hard Exudates", "_EX"),
        ("Soft Exudates", "_SE"),
        ("Optic Disc", "_OD"),
    ]
)

In [41]:
def create_dirs():
    """Create the output folder tree under ``PROCESSED_DIR``.

    Ensures ``segmentation/images``, ``segmentation/masks``,
    ``classification/images`` and ``classification/labels`` exist. Missing
    parents are created and already-existing folders are left alone, so the
    function can be called repeatedly.
    """
    layout = (
        ("segmentation", ("images", "masks")),
        ("classification", ("images", "labels")),
    )
    for task_name, leaf_names in layout:
        for leaf_name in leaf_names:
            target = PROCESSED_DIR / task_name / leaf_name
            target.mkdir(parents=True, exist_ok=True)

In [42]:
def resize_image(img_path, size, interpolation=None):
    """Read an image from disk and resize it to a ``size`` x ``size`` square.

    The aspect ratio is not preserved: the image is stretched or squeezed to
    fit the square target.

    Args:
        img_path: Location of the image file; converted with ``str()`` before
            being handed to ``cv2.imread``.
        size: Edge length in pixels of the square output.
        interpolation: Optional OpenCV interpolation flag (for example
            ``cv2.INTER_AREA`` or ``cv2.INTER_NEAREST``) forwarded to
            ``cv2.resize``. With the default ``None`` the argument is omitted,
            so OpenCV's own default is used exactly as before.

    Returns:
        The resized image as returned by ``cv2.resize``.

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``img_path``.
    """
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise ValueError(f"Failed to read image: {img_path}")
    if interpolation is None:
        return cv2.resize(img, (size, size))
    return cv2.resize(img, (size, size), interpolation=interpolation)

In [43]:
def _load_binary_mask(mask_path, size):
    """Read one ground-truth mask, resize it to ``size`` x ``size`` and binarise it to 0/255."""
    mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
    if mask is None:
        # cv2.imread reports failure by returning None; fail with a clear
        # message instead of an opaque error from cv2.resize.
        raise ValueError(f"Failed to read mask: {mask_path}")
    mask = cv2.resize(mask, (size, size))
    # NOTE(review): the 127 cut-off assumes annotated pixels decode to a
    # grayscale value above 127 — confirm against the actual .tif encoding.
    _, binary_mask = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)
    return binary_mask


def preprocess_segmentation():
    """Resize the segmentation images and build one multi-channel mask per image.

    For every ``*.jpg`` under the raw "original images" folder:

    * the image is resized to ``IMG_SIZE`` x ``IMG_SIZE`` and written, under
      its original file name, to ``PROCESSED_DIR / "segmentation/images"``;
    * a ``uint8`` array of shape ``(IMG_SIZE, IMG_SIZE, len(LESION_TYPES))`` is
      saved as ``<stem>.npy`` in ``PROCESSED_DIR / "segmentation/masks"``.
      Channel ``i`` holds the thresholded (0/255) mask of the ``i``-th entry
      of ``LESION_TYPES``; a channel stays all-zero when its mask file does
      not exist.

    Outputs are written only after both the image and its mask were built, so
    a broken mask file no longer leaves an image without a matching ``.npy``.
    A failure on one image is reported and the loop moves on to the next one.

    Returns:
        None. Results are written to disk; warnings and errors are printed.
    """
    img_dir = RAW_DIR / "Segmentation/training set/original images"
    gt_root = RAW_DIR / "Segmentation/training set/ground truths"
    out_img_dir = PROCESSED_DIR / "segmentation/images"
    out_mask_dir = PROCESSED_DIR / "segmentation/masks"

    print(f"Processing segmentation data from:\n- Images: {img_dir}\n- Masks: {gt_root}")

    # A missing or misnamed ground-truth folder would otherwise only show up
    # as silently empty mask channels.
    for lesion_type in LESION_TYPES:
        lesion_dir = gt_root / lesion_type
        if not lesion_dir.is_dir():
            print(f"Warning: ground-truth folder not found: {lesion_dir}")

    # Sorted so the processing (and log) order is deterministic.
    img_paths = sorted(img_dir.glob("*.jpg"))
    if not img_paths:
        print(f"Warning: no .jpg images found in {img_dir}")

    failed = []
    for img_path in tqdm(img_paths, desc="Processing Segmentation"):
        try:
            image = resize_image(img_path, IMG_SIZE)

            # One channel per lesion type, in LESION_TYPES order.
            composite_mask = np.zeros((IMG_SIZE, IMG_SIZE, len(LESION_TYPES)), dtype=np.uint8)
            masks_found = 0
            for channel_idx, (lesion_type, suffix) in enumerate(LESION_TYPES.items()):
                mask_path = gt_root / lesion_type / f"{img_path.stem}{suffix}.tif"
                if not mask_path.exists():
                    # No annotation of this type for this image: leave zeros.
                    continue
                composite_mask[:, :, channel_idx] = _load_binary_mask(mask_path, IMG_SIZE)
                masks_found += 1

            if masks_found == 0:
                print(f"Warning: no ground-truth masks found for {img_path.name}; saving an all-zero mask")

            # cv2.imwrite returns False on failure instead of raising.
            out_img_path = out_img_dir / img_path.name
            if not cv2.imwrite(str(out_img_path), image):
                raise OSError(f"Failed to write image: {out_img_path}")

            out_mask_path = out_mask_dir / (img_path.stem + ".npy")
            np.save(out_mask_path, composite_mask)

        except Exception as e:
            print(f"Error processing {img_path.name}: {str(e)}")
            failed.append(img_path.name)
            continue

    if failed:
        print(f"Segmentation: {len(failed)} of {len(img_paths)} images failed")


In [45]:
def preprocess_classification():
    """Resize the disease-grading images and write one label file per image.

    Reads the training-labels CSV and, for each row, loads
    ``<Image name>.jpg`` from the raw "1. Original Images" folder, resizes it
    to ``IMG_SIZE`` x ``IMG_SIZE`` and writes it to
    ``PROCESSED_DIR / "classification/images"``. The row's
    ``Retinopathy grade`` is written as text to ``<Image name>.txt`` in
    ``PROCESSED_DIR / "classification/labels"``.

    A failure on one row is reported and the loop moves on to the next row.

    Returns:
        None. Results are written to disk; errors are printed.

    Raises:
        KeyError: If the CSV lacks the ``Image name`` or ``Retinopathy grade``
            column.
    """
    img_dir = RAW_DIR / "Disease Grading/1. Original Images"
    labels_path = RAW_DIR / "Disease Grading/2. Groundtruths/IDRiD_Disease Grading_Training Labels.csv"
    out_img_dir = PROCESSED_DIR / "classification/images"
    out_label_dir = PROCESSED_DIR / "classification/labels"

    labels_df = pd.read_csv(labels_path)

    # Fail once, with a clear message, rather than on every row.
    missing_cols = [col for col in ("Image name", "Retinopathy grade") if col not in labels_df.columns]
    if missing_cols:
        raise KeyError(
            f"Missing column(s) {missing_cols} in {labels_path}; found {list(labels_df.columns)}"
        )

    failed = []
    for _, row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Processing Classification"):
        # Bound before the try block so the error handler always reports the
        # current row instead of an undefined or stale name.
        image_name = str(row['Image name'])
        img_file = image_name + ".jpg"
        try:
            image = resize_image(img_dir / img_file, IMG_SIZE)

            # cv2.imwrite returns False on failure instead of raising.
            out_img_path = out_img_dir / img_file
            if not cv2.imwrite(str(out_img_path), image):
                raise OSError(f"Failed to write image: {out_img_path}")

            label_file = out_label_dir / (image_name + ".txt")
            with open(label_file, "w") as f:
                f.write(str(row['Retinopathy grade']))
        except Exception as e:
            print(f"Error processing {img_file}: {str(e)}")
            failed.append(img_file)
            continue

    if failed:
        print(f"Classification: {len(failed)} of {len(labels_df)} images failed")

In [47]:
# Run the pipeline end to end: create the output folders first, then the
# segmentation pass, then the classification pass.
for stage in (create_dirs, preprocess_segmentation, preprocess_classification):
    stage()
print("Preprocessing complete.")

Processing segmentation data from:
- Images: data\raw\Segmentation\training set\original images
- Masks: data\raw\Segmentation\training set\ground truths


Processing Segmentation:   0%|          | 0/54 [00:00<?, ?it/s]

Processing Segmentation: 100%|██████████| 54/54 [00:02<00:00, 21.53it/s]
Processing Classification: 100%|██████████| 413/413 [00:18<00:00, 22.66it/s]

Preprocessing complete.



