In [None]:
from google.colab import drive

# Mount Google Drive; the final step of the pipeline copies the finished
# archive into a folder below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ================================================================
# MEMORY-EFFICIENT BALANCED DATASET CREATION (WITH PROGRESS LOGS)
# Retina     = 20k Train + 8k Val
# Non-Retina = 20k Train + 8k Val  (CIFAR + COCO ONLY)
# FULL SHUFFLING ENABLED FOR MAXIMUM DIVERSITY
# ================================================================

import os
import cv2
import numpy as np
import shutil
import random
import requests
from tqdm import tqdm
from zipfile import ZipFile

print("\nüöÄ STARTING BALANCED DATASET GENERATION PIPELINE...\n")

# ================================================================
# 1) DOWNLOAD RETINAL DATASET USING KAGGLEHUB
# ================================================================
print("üìå STEP 1: Installing KaggleHub & Downloading Retinal Dataset")
!pip install -q kagglehub
import kagglehub

print("üì• Downloading retinal dataset via KaggleHub...")
kaggle_path = kagglehub.dataset_download(
    "ascanipek/eyepacs-aptos-messidor-diabetic-retinopathy"
)
print("‚úî Retina dataset downloaded at:", kaggle_path)

# ================================================================
# TARGET SIZES
# ================================================================
# Number of retinal images placed in the train / val splits.
RETINA_TRAIN_TARGET, RETINA_VAL_TARGET = 20_000, 8_000

# Number of non-retinal images placed in the train / val splits
# (same counts as the retinal class, so the two classes stay balanced).
NONRET_TRAIN_TARGET, NONRET_VAL_TARGET = 20_000, 8_000

# ================================================================
# 2) COLLECT RETINAL IMAGES (WITH SHUFFLING)
# ================================================================
print("\nüìå STEP 2: Collecting retinal images...")

# Recursively gather every file below the downloaded dataset directory whose
# (case-insensitive) extension marks it as an image.
retinal_images = []
for root, _dirs, files in os.walk(kaggle_path):
    for f in files:
        if f.lower().endswith((".jpg", ".jpeg", ".png", ".bmp")):
            retinal_images.append(os.path.join(root, f))

print(f"üîç Found {len(retinal_images)} raw retinal images.")

# Shuffle once so that the train/val slices below are random samples.
random.shuffle(retinal_images)

required_retina = RETINA_TRAIN_TARGET + RETINA_VAL_TARGET

# Slicing never raises on a short list, so without this check a shortfall
# would silently produce an undersized (or empty) validation split.
if len(retinal_images) < required_retina:
    print(
        f"WARNING: only {len(retinal_images)} retinal images found but "
        f"{required_retina} are required; the splits will be smaller than "
        "their targets."
    )

retina_train = retinal_images[:RETINA_TRAIN_TARGET]
retina_val   = retinal_images[RETINA_TRAIN_TARGET:required_retina]

# Re-shuffle each split on its own (changes only the order, not the members).
random.shuffle(retina_train)
random.shuffle(retina_val)

print(f"üìä Retinal Train Count: {len(retina_train)}")
print(f"üìä Retinal Val Count:   {len(retina_val)}")

# ================================================================
# 3) CREATE FINAL FOLDER STRUCTURE
# ================================================================
print("\nüìå STEP 3: Creating final dataset folder structure...")

base_dir = "/content/retina_nonretina_dataset"

# Output layout is <base_dir>/<split>/<class>; each folder is addressed later
# through a short lookup key.
paths = {
    key: f"{base_dir}/{split}/{label}"
    for key, split, label in (
        ("train_retinal", "train", "retinal"),
        ("train_nonret", "train", "non-retinal"),
        ("val_retinal", "val", "retinal"),
        ("val_nonret", "val", "non-retinal"),
    )
}

# exist_ok=True lets the cell be re-run without failing on existing folders.
for out_dir in paths.values():
    os.makedirs(out_dir, exist_ok=True)

print("‚úî Final folders created!")

# ================================================================
# COPY RETINAL IMAGES
# ================================================================
def _copy_without_clobbering(sources, dest_dir, desc):
    """Copy every file in *sources* into *dest_dir* without overwriting.

    The source paths were collected from many sub-folders, so two of them can
    share a base name. Copying by base name alone would silently replace the
    earlier file and leave the split smaller than the number of selected
    images. A file whose name is already taken in this run gets a numeric
    suffix inserted before its extension; all other files keep their name.

    Args:
        sources: Iterable of source file paths.
        dest_dir: Existing directory that receives the copies.
        desc: Label shown on the tqdm progress bar.
    """
    taken = set()
    for position, src in enumerate(tqdm(sources, desc=desc)):
        name = os.path.basename(src)
        if name in taken:
            stem, ext = os.path.splitext(name)
            serial = position
            name = f"{stem}_{serial}{ext}"
            # Extremely unlikely, but make sure the generated name is free too.
            while name in taken:
                serial += 1
                name = f"{stem}_{serial}{ext}"
        taken.add(name)
        shutil.copy(src, os.path.join(dest_dir, name))


print("\nüìå STEP 4: Copying retinal TRAIN images...")
_copy_without_clobbering(
    retina_train, paths["train_retinal"], "Copying Retinal Train"
)

print("\nüìå Copying retinal VAL images...")
_copy_without_clobbering(
    retina_val, paths["val_retinal"], "Copying Retinal Val"
)


# ================================================================
# 4) NON-RETINAL IMAGES (CIFAR + COCO ONLY, FULL SHUFFLING)
# ================================================================
print("\nüìå STEP 5: Collecting Non-Retinal Images from CIFAR & COCO...")

# Running list of every non-retinal image path written to disk, and the
# scratch directory those images are written into.
all_nonret = []
nonret_raw = "/content/nonret_raw"
os.makedirs(nonret_raw, exist_ok=True)

# -----------------------------------------------------------
# SOURCE 1: CIFAR-10 & CIFAR-100
# -----------------------------------------------------------
print("\nüì• Loading CIFAR-10 and CIFAR-100...")
from tensorflow.keras.datasets import cifar10, cifar100

# load_data() yields a pair of (images, labels) pairs; only the images of the
# first pair are used here.
cifar10_first, _ = cifar10.load_data()
cifar100_first, _ = cifar100.load_data()
X10, X100 = cifar10_first[0], cifar100_first[0]

# Shuffle each image array in place before it is written to disk.
for image_array in (X10, X100):
    np.random.shuffle(image_array)

def save_cifar(images, folder):
    """Write *images* as JPEG files and record each path in ``all_nonret``.

    Files go to ``<nonret_raw>/<folder>/<folder>_<index>.jpg``. Every image is
    converted from RGB to BGR channel order before it is handed to
    ``cv2.imwrite``.

    Args:
        images: Iterable of image arrays (treated as RGB).
        folder: Sub-folder name below ``nonret_raw``; also the file-name prefix.
    """
    target_dir = f"{nonret_raw}/{folder}"
    os.makedirs(target_dir, exist_ok=True)
    print(f"üñº Saving {folder} images...")
    for index, rgb in enumerate(images):
        jpg_path = f"{target_dir}/{folder}_{index}.jpg"
        cv2.imwrite(jpg_path, cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR))
        all_nonret.append(jpg_path)

# Dump both shuffled CIFAR arrays to disk (50,000 images each).
for cifar_images, cifar_name in ((X10, "cifar10"), (X100, "cifar100")):
    save_cifar(cifar_images, cifar_name)

# -----------------------------------------------------------
# üìå SOURCE 2: COCO 2017 train subset
# -----------------------------------------------------------
print("\nüì• Downloading COCO 2017 train subset (~40,000 images)...")

coco_zip = "/content/train2017.zip"
coco_url = "http://images.cocodataset.org/zips/train2017.zip"

# Stream the archive to disk in 1 MiB chunks so it never has to fit in memory.
# The timeout keeps a stalled connection from hanging the notebook forever.
with requests.get(coco_url, stream=True, timeout=60) as r:
    # Fail fast on an HTTP error instead of saving an error page as the "zip".
    r.raise_for_status()
    total = int(r.headers.get("content-length", 0))
    # total=None (unknown size) makes tqdm show a plain byte counter.
    with open(coco_zip, "wb") as f, tqdm(
        total=total or None, desc="Downloading COCO", unit="B", unit_scale=True
    ) as progress:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
            # The bar only advances when told how many bytes were written;
            # without this call it stays at 0% for the whole download.
            progress.update(len(chunk))

coco_out = f"{nonret_raw}/coco"
os.makedirs(coco_out, exist_ok=True)

with ZipFile(coco_zip, "r") as archive:
    # Only the JPEG members of the archive are candidates.
    jpgs = [member for member in archive.namelist() if member.endswith(".jpg")]

    # Randomise the order so the first 40,000 entries form a random sample.
    random.shuffle(jpgs)

    print("üìÇ Extracting 40,000 COCO images...")
    for member in tqdm(jpgs[:40000], desc="Extracting COCO"):
        archive.extract(member, coco_out)
        # Record the extracted file under coco_out, keeping the member's
        # path inside the archive.
        all_nonret.append(os.path.join(coco_out, member))

# The downloaded archive is not needed once the sample has been extracted.
os.remove(coco_zip)

# -----------------------------------------------------------
# üìå UNIQUE + SHUFFLE
# -----------------------------------------------------------
print("\nüîç Removing duplicate paths...")
# Collapse to unique paths through a set, then back to a list so the result
# can be shuffled and sliced.
unique_nonret = set(all_nonret)
all_nonret = list(unique_nonret)

# Shuffle the combined list so the CIFAR and COCO paths are mixed before
# the train/val split.
random.shuffle(all_nonret)

print(f"üìä UNIQUE non-retinal images collected: {len(all_nonret)}")


# ================================================================
# 5) SAMPLE EXACT TARGET (WITH SHUFFLING)
# ================================================================
# The banner is built from the configured targets rather than hard-coded
# numbers, so it cannot drift from the split sizes actually used below.
print(
    f"\nüìå STEP 6: Sampling EXACT {NONRET_TRAIN_TARGET} Train + "
    f"{NONRET_VAL_TARGET} Val..."
)

# all_nonret is already shuffled, so consecutive slices are random,
# non-overlapping samples.
nonret_split_end = NONRET_TRAIN_TARGET + NONRET_VAL_TARGET
nonret_train = all_nonret[:NONRET_TRAIN_TARGET]
nonret_val   = all_nonret[NONRET_TRAIN_TARGET:nonret_split_end]

# Re-shuffle each split on its own (changes only the order, not the members).
random.shuffle(nonret_train)
random.shuffle(nonret_val)

print(f"‚úî Non-Retinal Train Selected: {len(nonret_train)}")
print(f"‚úî Non-Retinal Val  Selected: {len(nonret_val)}")

# ================================================================
# COPY NON-RETINAL IMAGES
# ================================================================
# Copy each selected split into its final class folder: train first, then val.
for banner, selection, bar_label, dest_key in (
    ("\nüì§ Copying NON-RETINAL TRAIN images...", nonret_train,
     "Copying Non-Retinal Train", "train_nonret"),
    ("\nüì§ Copying NON-RETINAL VAL images...", nonret_val,
     "Copying Non-Retinal Val", "val_nonret"),
):
    print(banner)
    for img in tqdm(selection, desc=bar_label):
        shutil.copy(img, paths[dest_key])

# The raw scratch directory is removed once the selected files are copied.
print("\nüî• Cleaning nonret_raw to save disk...")
shutil.rmtree(nonret_raw)

# ================================================================
# 6) ZIP FINAL DATASET
# ================================================================
print("\nüìå STEP 7: Creating ZIP file (Memory-Safe)...")

# make_archive receives the base name without extension and adds ".zip"
# itself, so the resulting file path is the base name plus that suffix.
archive_base = "/content/retina_nonretina_dataset_balanced2"
final_zip = f"{archive_base}.zip"
shutil.make_archive(archive_base, "zip", base_dir)

print("\nüéâ DONE! ZIP FILE CREATED SUCCESSFULLY!")
print("üì¶ Download ZIP at:", final_zip)

# ================================================================
# 7) UPLOAD TO GOOGLE DRIVE
# ================================================================
print("\nüìå STEP 8: Uploading ZIP to Google Drive...")

# The "upload" is a plain file copy into the Drive folder mounted under
# /content/drive.
drive_dir = "/content/drive/MyDrive"
drive_target = f"{drive_dir}/retina_nonretina_dataset_balanced2.zip"
shutil.copy(final_zip, drive_target)

print("\n‚úî File uploaded to Drive at:", drive_target)
print("\nüöÄ DATASET GENERATION PIPELINE COMPLETE!")



üöÄ STARTING BALANCED DATASET GENERATION PIPELINE...

üìå STEP 1: Installing KaggleHub & Downloading Retinal Dataset
üì• Downloading retinal dataset via KaggleHub...
Downloading from https://www.kaggle.com/api/v1/datasets/download/ascanipek/eyepacs-aptos-messidor-diabetic-retinopathy?dataset_version_number=4...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20.5G/20.5G [03:40<00:00, 99.5MB/s]

Extracting files...





‚úî Retina dataset downloaded at: /root/.cache/kagglehub/datasets/ascanipek/eyepacs-aptos-messidor-diabetic-retinopathy/versions/4

üìå STEP 2: Collecting retinal images...
üîç Found 236170 raw retinal images.
üìä Retinal Train Count: 20000
üìä Retinal Val Count:   8000

üìå STEP 3: Creating final dataset folder structure...
‚úî Final folders created!

üìå STEP 4: Copying retinal TRAIN images...


Copying Retinal Train: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [01:03<00:00, 315.06it/s]



üìå Copying retinal VAL images...


Copying Retinal Val: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8000/8000 [00:23<00:00, 340.89it/s]



üìå STEP 5: Collecting Non-Retinal Images from CIFAR & COCO...

üì• Loading CIFAR-10 and CIFAR-100...
Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m3s[0m 0us/step
Downloading data from https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz
[1m169001437/169001437[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m3s[0m 0us/step
üñº Saving cifar10 images...
üñº Saving cifar100 images...

üì• Downloading COCO 2017 train subset (~40,000 images)...


Downloading COCO:   0%|          | 0.00/19.3G [05:13<?, ?B/s]


üìÇ Extracting 40,000 COCO images...


Extracting COCO: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40000/40000 [02:42<00:00, 246.71it/s]



üîç Removing duplicate paths...
üìä UNIQUE non-retinal images collected: 140000

üìå STEP 6: Sampling EXACT 30k Train + 15k Val...
‚úî Non-Retinal Train Selected: 20000
‚úî Non-Retinal Val  Selected: 8000

üì§ Copying NON-RETINAL TRAIN images...


Copying Non-Retinal Train: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [00:40<00:00, 497.42it/s]



üì§ Copying NON-RETINAL VAL images...


Copying Non-Retinal Val: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8000/8000 [00:13<00:00, 603.77it/s]



üî• Cleaning nonret_raw to save disk...

üìå STEP 7: Creating ZIP file (Memory-Safe)...

üéâ DONE! ZIP FILE CREATED SUCCESSFULLY!
üì¶ Download ZIP at: /content/retina_nonretina_dataset_balanced2.zip

üìå STEP 8: Uploading ZIP to Google Drive...

‚úî File uploaded to Drive at: /content/drive/MyDrive/retina_nonretina_dataset_balanced2.zip

üöÄ DATASET GENERATION PIPELINE COMPLETE!
