In [1]:
import os
import shutil
import random
from tqdm import tqdm


In [2]:
# --- CONFIGURATION ---
# All three paths are relative, so they resolve against the working directory
# the notebook kernel was started in.

# Dataset 1 (Strokes): parent folder of the "Haemorrhagic" and "Ischemic"
# sub-folders that the stroke images are copied from.
STROKE_DATASET_PATH = "../content/Brain_Stroke_MRI/Dataset_MRI_Folder"

# Dataset 2 (Tumors): the archive usually unzips to 'Training' and 'Testing'
# folders; this points at "Training" because it holds the most images.
# NOTE(review): confirm the folder layout against your own unzipped copy.
TUMOR_DATASET_PATH = "../content/Brain_Tumor_Classification/Training"

# Root of the merged dataset; one sub-folder per class is created inside it.
OUTPUT_DIR = "../content/dataset"

In [3]:
def create_dataset_structure():
    """Make sure every target class has its own folder under OUTPUT_DIR.

    Folders that already exist are left as they are, so calling this more
    than once is harmless. A confirmation line is printed at the end.
    """
    for class_name in ("Hemorrhagic", "Ischemic", "Tumor"):
        class_dir = os.path.join(OUTPUT_DIR, class_name)
        os.makedirs(class_dir, exist_ok=True)
    print(f"Created directories in {OUTPUT_DIR}")

def copy_images_recursive(src_dir, dst_class_name, limit=None):
    """Copy every image under ``src_dir`` (including sub-folders) into one class folder.

    The directory tree is flattened: each file is named after its path relative
    to ``src_dir``, with path separators and spaces replaced by underscores.

    Args:
        src_dir: Directory to search. If it is not an existing directory a
            warning is printed and nothing is copied.
        dst_class_name: Name of the class folder under ``OUTPUT_DIR`` that
            receives the copies. It is created if it does not exist yet.
        limit: Maximum number of images to copy. ``None`` (or any other falsy
            value such as ``0``) means "no limit".

    Returns:
        None. Files are written to disk and progress is printed.
    """
    if not os.path.isdir(src_dir):
        print(f"WARNING: Source directory not found: {src_dir}")
        return

    dst_dir = os.path.join(OUTPUT_DIR, dst_class_name)
    # Do not rely on create_dataset_structure() having been run first.
    os.makedirs(dst_dir, exist_ok=True)

    image_extensions = ('.jpg', '.jpeg', '.png', '.dcm', '.nii', '.nii.gz')

    # os.walk yields entries in filesystem order; sorting makes both the copy
    # order and the subset picked by `limit` reproducible across runs/machines.
    image_files = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(src_dir)
        for name in names
        if name.lower().endswith(image_extensions)
    )

    # Optional: limit number of images to balance the dataset if needed
    if limit:
        image_files = image_files[:limit]

    print(f"Copying {len(image_files)} images from {src_dir} to {dst_class_name}...")

    # Flattening is not injective: "a/b.jpg" and "a_b.jpg" (or "a b.jpg") map to
    # the same name. Track names used in this call so a later file cannot
    # silently overwrite an earlier one. Keys are lower-cased because the
    # destination filesystem may be case-insensitive.
    used_names = set()

    for img_path in tqdm(image_files):
        rel_path = os.path.relpath(img_path, src_dir)
        filename = rel_path.replace(os.sep, '_').replace(' ', '_')

        if filename.lower() in used_names:
            stem, ext = os.path.splitext(filename)
            counter = 1
            while f"{stem}_{counter}{ext}".lower() in used_names:
                counter += 1
            filename = f"{stem}_{counter}{ext}"

        used_names.add(filename.lower())
        shutil.copy(img_path, os.path.join(dst_dir, filename))

def copy_images(src_dir, dst_class_name, limit=None):
    """Copy the images directly inside ``src_dir`` into one class folder (non-recursive).

    File names are kept as they are, so a file copied here replaces any file
    of the same name already present in the destination folder.

    Args:
        src_dir: Directory to read. If it is not an existing directory a
            warning is printed and nothing is copied.
        dst_class_name: Name of the class folder under ``OUTPUT_DIR`` that
            receives the copies. It is created if it does not exist yet.
        limit: Maximum number of images to copy. ``None`` (or any other falsy
            value such as ``0``) means "no limit".

    Returns:
        None. Files are written to disk and progress is printed.
    """
    if not os.path.isdir(src_dir):
        print(f"WARNING: Source directory not found: {src_dir}")
        return

    dst_dir = os.path.join(OUTPUT_DIR, dst_class_name)
    # Do not rely on create_dataset_structure() having been run first.
    os.makedirs(dst_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes the subset
    # picked by `limit` reproducible. The isfile check skips sub-directories
    # that happen to carry an image-like suffix, which shutil.copy cannot copy.
    files = sorted(
        f for f in os.listdir(src_dir)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
        and os.path.isfile(os.path.join(src_dir, f))
    )

    # Optional: limit number of images to balance the dataset if needed
    if limit:
        files = files[:limit]

    print(f"Copying {len(files)} images from {src_dir} to {dst_class_name}...")

    for f in tqdm(files):
        shutil.copy(os.path.join(src_dir, f), os.path.join(dst_dir, f))


In [4]:
# 1. Setup: create the per-class output folders.
create_dataset_structure()

# 2. Process Strokes (Dataset 1) - recursively copy from all subdirectories.
# Note: check your unzipped folder names carefully! The source folder is
# spelled 'Haemorrhagic' here, while the output class is spelled 'Hemorrhagic'.
copy_images_recursive(os.path.join(STROKE_DATASET_PATH, "Haemorrhagic"), "Hemorrhagic")
copy_images_recursive(os.path.join(STROKE_DATASET_PATH, "Ischemic"), "Ischemic")

# 3. Process Tumors (Dataset 2)
# Only meningioma images currently feed the 'Tumor' class. Uncomment the
# glioma and pituitary lines to merge all three tumor types into 'Tumor'.
# copy_images(os.path.join(TUMOR_DATASET_PATH, "glioma_tumor"), "Tumor")
copy_images(os.path.join(TUMOR_DATASET_PATH, "meningioma_tumor"), "Tumor")
# copy_images(os.path.join(TUMOR_DATASET_PATH, "pituitary_tumor"), "Tumor")

print("\nDataset generation complete!")
# Report the configured location rather than a hard-coded path, so the
# message stays correct if OUTPUT_DIR is changed.
print(f"Please check the '{OUTPUT_DIR}' folder.")


Created directories in ../content/dataset
Copying 186 images from ../content/Brain_Stroke_MRI/Dataset_MRI_Folder\Haemorrhagic to Hemorrhagic...


  0%|          | 0/186 [00:00<?, ?it/s]

100%|██████████| 186/186 [00:01<00:00, 103.19it/s]


Copying 30 images from ../content/Brain_Stroke_MRI/Dataset_MRI_Folder\Ischemic to Ischemic...


100%|██████████| 30/30 [00:00<00:00, 89.79it/s] 


Copying 822 images from ../content/Brain_Tumor_Classification/Training\meningioma_tumor to Tumor...


100%|██████████| 822/822 [00:07<00:00, 113.49it/s]


Dataset generation complete!
Please check the 'content/dataset' folder.





In [5]:
# Delete all folders in content except 'dataset'.
# WARNING: this is destructive and cannot be undone. It removes the raw
# downloads, so the copy cell above cannot be re-run afterwards without
# restoring them first.
content_dir = "../content"
dataset_dir = os.path.join(content_dir, "dataset")

# Safety check: only throw away the raw data once the merged dataset actually
# contains files. os.walk yields nothing for a missing folder, so this is also
# False when the dataset folder was never created.
dataset_has_files = any(files for _root, _dirs, files in os.walk(dataset_dir))

if not dataset_has_files:
    print(f"Skipping cleanup: no files found in {dataset_dir}")
else:
    for item in os.listdir(content_dir):
        item_path = os.path.join(content_dir, item)
        # os.path.isdir follows symlinks but shutil.rmtree refuses them,
        # so links are left in place instead of aborting the cleanup.
        if os.path.isdir(item_path) and not os.path.islink(item_path) and item != "dataset":
            print(f"Deleting: {item_path}")
            shutil.rmtree(item_path)

    print("Cleanup complete! Only 'dataset' folder remains.")

Deleting: ../content\Brain_Stroke_MRI
Deleting: ../content\Brain_Tumor_Classification
Cleanup complete! Only 'dataset' folder remains.
