To use this notebook, you need a `kaggle.json` file containing your Kaggle username and a Kaggle API key. You can create the API key in your Kaggle account settings. The JSON should look like this:

```json
{
  "username": "YOUR_USERNAME",
  "key": "YOUR_API_KEY"
}
```




In [2]:
# Colab-only step: prompt for a file upload so the Kaggle credentials end up in
# the current working directory (the next cell moves kaggle.json from there).
from google.colab import files
uploaded = files.upload()   # choose kaggle.json

Saving kaggle.json to kaggle.json


In [3]:
# Move the uploaded credentials into ~/.kaggle (presumably where the Kaggle CLI
# looks for them — confirm against the Kaggle API docs).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Restrict the key file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [24]:
!kaggle datasets download -d borhanitrash/alzheimer-mri-disease-classification-dataset
!kaggle datasets download -d masoudnickparvar/brain-tumor-mri-dataset
!pip install -q datasets pillow

Dataset URL: https://www.kaggle.com/datasets/borhanitrash/alzheimer-mri-disease-classification-dataset
License(s): apache-2.0
alzheimer-mri-disease-classification-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Dataset URL: https://www.kaggle.com/datasets/gabrielleyva307/brain-tumor-mri-classification-dataset-2025
License(s): CC0-1.0
brain-tumor-mri-classification-dataset-2025.zip: Skipping, found more recently modified local copy (use --force to force download)
Dataset URL: https://www.kaggle.com/datasets/masoudnickparvar/brain-tumor-mri-dataset
License(s): CC0-1.0
Downloading brain-tumor-mri-dataset.zip to /content
 83% 124M/149M [00:00<00:00, 1.30GB/s]
100% 149M/149M [00:00<00:00, 1.25GB/s]


In [21]:
# Import all necessary libraries
import zipfile
import os
from pathlib import Path
from datasets import load_dataset
from PIL import Image
import io
import shutil

#### Now create the Alzheimer dataset

In [12]:
# Unpack the Alzheimer archive and load its parquet shards as train/test splits.
zip_path = "alzheimer-mri-disease-classification-dataset.zip"
extract_path = "raw_alzheimer"

os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Unzipped to:", extract_path)

raw_root = Path(extract_path)

# Locate the folder that actually holds the parquet shards instead of trusting
# whichever entry iterdir() happens to yield first: that order is arbitrary, and
# an empty folder would surface as a bare StopIteration.
train_shards = sorted(raw_root.rglob("train*.parquet"))
if not train_shards:
    raise FileNotFoundError(f"No train*.parquet files found under {raw_root}")

data_dir = train_shards[0].parent
alz_dir = data_dir.parent

print("Alzheimer root:", alz_dir)
print("Data dir:", data_dir)

# Sort the shard lists so the row order of the loaded splits is reproducible.
train_files = [str(p) for p in sorted(data_dir.glob("train*.parquet"))]
test_files = [str(p) for p in sorted(data_dir.glob("test*.parquet"))]
if not test_files:
    raise FileNotFoundError(f"No test*.parquet files found in {data_dir}")

ds = load_dataset(
    "parquet",
    data_files={"train": train_files, "test": test_files}
)

dataset_train = ds["train"]
dataset_test = ds["test"]

Unzipped to: raw_alzheimer
Alzheimer root: raw_alzheimer/Alzheimer MRI Disease Classification Dataset
Data dir: raw_alzheimer/Alzheimer MRI Disease Classification Dataset/Data


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['image', 'label'],
    num_rows: 5120
})
{'image': [{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x01,\x01,\x00\x00\xff\xdb\x00C\x00\x03\x02\x02\x03\x02\x02\x03\x03\x03\x03\x04\x03\x03\x04\x05\x08\x05\x05\x04\x04\x05\n\x07\x07\x06\x08\x0c\n\x0c\x0c\x0b\n\x0b\x0b\r\x0e\x12\x10\r\x0e\x11\x0e\x0b\x0b\x10\x16\x10\x11\x13\x14\x15\x15\x15\x0c\x0f\x17\x18\x16\x14\x18\x12\x14\x15\x14\xff\xc0\x00\x0b\x08\x00\x80\x00\x80\x01\x01\x11\x00\xff\xc4\x00\x1d\x00\x01\x00\x02\x02\x03\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07\x08\x06\t\x01\x04\x05\x02\x03\xff\xc4\x008\x10\x00\x01\x03\x03\x03\x03\x03\x02\x04\x05\x03\x04\x03\x00\x00\x00\x01\x02\x03\x04\x00\x05\x11\x06\x07!\x121A\x08\x13Q"a\x142\x81\x91\x15#BRq\t\x16\xd1\x17%r\xa1\xb1\xc1\xf0\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd5U)JR\x94\xa5)JR\x94\xa5)JS\xa4\xfc\x1a\xe4!G\xb2I\xfd+\x8c\x13N\xd4\xa6\x08\x1d\x8d)JR\x94\xa5\x00\xc9\xa9\x1bo\xfd?\xeb]\xc9\xb5\xbbt\xb4\xda\x96\x9bCIZ\x95p\x91\xf4\xb3\xf4\x8e@\xc6

In [18]:
# Output layout for the Alzheimer images: data/alzheimer/{yes,no}
base = Path("data") / "alzheimer"
for outcome_dir in ("yes", "no"):
    (base / outcome_dir).mkdir(parents=True, exist_ok=True)


def get_pil_image(image_dict):
    """Decode the encoded image stored under ``image_dict["bytes"]`` into an RGB PIL image."""
    raw_stream = io.BytesIO(image_dict["bytes"])
    decoded = Image.open(raw_stream)
    return decoded.convert("RGB")

def label_to_folder(label: int) -> str:
    """Return the output folder for a label: ``"no"`` for label 2, ``"yes"`` for anything else."""
    # NOTE(review): label 2 is presumably the "non-demented" class — confirm
    # against the dataset's label mapping.
    if label == 2:
        return "no"
    return "yes"

def save_split(ds, split_name: str):
    """Write every example of ``ds`` as a PNG into the yes/no folders under ``base``.

    Each example is expected to expose an ``"image"`` entry (decoded via
    ``get_pil_image``) and a ``"label"`` entry (mapped via ``label_to_folder``).
    Files are named ``<split_name>_<index>.png``; a progress line is printed
    for every 500th index, starting with index 0.
    """
    for idx, record in enumerate(ds):
        picture = get_pil_image(record["image"])
        destination = base / label_to_folder(record["label"]) / f"{split_name}_{idx}.png"
        picture.save(destination)

        if idx % 500 == 0:
            print(f"[{split_name}] saved {idx} images...")

# Export the train split first, then the test split, into the same yes/no folders.
for split_rows, split_label in ((dataset_train, "train"), (dataset_test, "test")):
    save_split(split_rows, split_label)

print("Done!")

{'image': {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x01,\x01,\x00\x00\xff\xdb\x00C\x00\x03\x02\x02\x03\x02\x02\x03\x03\x03\x03\x04\x03\x03\x04\x05\x08\x05\x05\x04\x04\x05\n\x07\x07\x06\x08\x0c\n\x0c\x0c\x0b\n\x0b\x0b\r\x0e\x12\x10\r\x0e\x11\x0e\x0b\x0b\x10\x16\x10\x11\x13\x14\x15\x15\x15\x0c\x0f\x17\x18\x16\x14\x18\x12\x14\x15\x14\xff\xc0\x00\x0b\x08\x00\x80\x00\x80\x01\x01\x11\x00\xff\xc4\x00\x1d\x00\x00\x02\x03\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07\x05\x06\x08\x04\x02\x03\x01\t\xff\xc4\x008\x10\x00\x01\x03\x03\x03\x02\x05\x03\x01\x05\x08\x03\x00\x00\x00\x00\x01\x02\x03\x04\x00\x05\x11\x06\x07\x12!1\x13"AQa\x142q\x81\x08\x15R\x91\xa1#$3Bb\x92\xb1\xd1\x16r\x82\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xfeUQE\x14QE\x14QE\x14QE\x14QE\x14W\xea\x1bS\x8bJ\x12\x92\xa5(\xe0$\x0c\x92\x7f\x157\x07BjK\x9a\x1c\\;\x05\xceRZ\xfb\xd4\xd4G\x14\x13\xf9\xc0\xe9\xd8\xd4l\xfbL\xdbZ\xf8\xcc\x86\xfcU{<\xd9G\xcf\xadu\xda\xb4\x95\xee\xfaH\xb6\xd9\xe7O\xc63\xf4\xd1\x96\xe63\xdb

##### Now unzip the first tumor dataset and create the tumor folder

In [25]:
# Unpack the brain-tumor archive into raw_tumor/.
zip_path = Path("brain-tumor-mri-dataset.zip")
extract_path = Path("raw_tumor")
extract_path.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as archive:
    archive.extractall(extract_path)

print("Extracted to:", extract_path)

# Prepare the binary output layout: data/tumor/{yes,no}
tumor_dir = Path("data/tumor")
for outcome_dir in ("yes", "no"):
    (tumor_dir / outcome_dir).mkdir(parents=True, exist_ok=True)

tumor_dir

Extracted to: raw_tumor


PosixPath('data/tumor')

In [26]:
# Source folders: the two split directories inside the extracted archive.
root = Path("raw_tumor")
train_dir, test_dir = root / "Training", root / "Testing"

# Target folders: data/tumor/{yes,no}
target = Path("data/tumor")
for outcome_dir in ("yes", "no"):
    (target / outcome_dir).mkdir(parents=True, exist_ok=True)

def copy_images(src, dst):
    """Copy every image file found under ``src`` (recursively) into ``dst``.

    Args:
        src: Directory to search; anything accepted by ``Path`` works.
        dst: Destination directory. It is created if it does not exist yet.

    Only regular files with a ``.jpg``, ``.jpeg`` or ``.png`` suffix
    (case-insensitive) are copied. The directory structure below ``src`` is
    flattened, so files that share a name overwrite each other in ``dst``.
    Prints the number of copy operations performed; returns ``None``.
    """
    src, dst = Path(src), Path(dst)
    # Create the destination here so the helper does not depend on an earlier
    # cell having prepared it.
    dst.mkdir(parents=True, exist_ok=True)
    count = 0
    # sorted() makes the copy order (and thus the winner of a name clash)
    # deterministic across runs and platforms.
    for img in sorted(src.rglob("*")):
        # is_file() skips directories whose name happens to end in an image suffix.
        if img.is_file() and img.suffix.lower() in {".jpg", ".jpeg", ".png"}:
            shutil.copy(img, dst / img.name)
            count += 1
    print(f"Copied {count} → {dst}")

# Classes: the three tumor folders are merged into "yes", "notumor" becomes "no".
tumor_classes = ["glioma", "meningioma", "pituitary"]
no_tumor_class = "notumor"

# Training split first, then testing; within each split the tumor classes are
# copied before the no-tumor class.
for split_dir in (train_dir, test_dir):
    for cls in tumor_classes:
        copy_images(split_dir / cls, target / "yes")
    copy_images(split_dir / no_tumor_class, target / "no")


Copied 1321 → data/tumor/yes
Copied 1339 → data/tumor/yes
Copied 1457 → data/tumor/yes
Copied 1595 → data/tumor/no
Copied 300 → data/tumor/yes
Copied 306 → data/tumor/yes
Copied 300 → data/tumor/yes
Copied 405 → data/tumor/no
