<a href="https://colab.research.google.com/github/tamara-kostova/MSc_Thesis_Neuroimaging/blob/master/data_loading_medgemma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate metadata and labels for Medgemma 1.5 evaluation
class, subclass, modality, sequence, plane

In [4]:
import os

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
BASE_DIR = "/content/drive/MyDrive/MSc_Thesis_Neuroimaging"
RAW_DIR = f"{BASE_DIR}/data/raw"
PROC_DIR = f"{BASE_DIR}/data/processed"
SPLIT_DIR = f"{BASE_DIR}/data/split"


In [2]:
def tree(path, level=2):
    for root, dirs, files in os.walk(path):
        depth = root.replace(path, "").count(os.sep)
        if depth > level:
            continue
        indent = " " * 4 * depth
        print(f"{indent}{os.path.basename(root)}/")
        for f in files[:5]:
            print(f"{indent}    {f}")


In [7]:
tree(RAW_DIR, level=3)

raw/
    brain-tumor-detection.zip
    multiple-sclerosis.zip
    Br35H-Mask-RCNN/
        annotations_all.json
        TEST/
            y702.jpg
            y705.jpg
            y701.jpg
            y704.jpg
            annotations_test.json
        TRAIN/
            y0.jpg
            annotations_train.json
            y1.jpg
            y100.jpg
            y10.jpg
        VAL/
            annotations_val.json
            y500.jpg
            y501.jpg
            y502.jpg
            y503.jpg
    no/
        no65.jpg
        no650.jpg
        no651.jpg
        no652.jpg
        no653.jpg
    pred/
        pred0.jpg
        pred1.jpg
        pred10.jpg
        pred11.jpg
        pred12.jpg
    yes/
        y1448.jpg
        y1449.jpg
        y145.jpg
        y1450.jpg
        y1451.jpg
    Glioma (Astrocitoma, Ganglioglioma, Glioblastoma, Oligodendroglioma, Ependimoma) T1/
        glioma (1).webp
        glioma (1).jpeg
        glioma (1).jpg
        glioma (10).jpg
        glioma 

In [8]:
tree(PROC_DIR, level=3)

processed/
    MRI/
        tumor_binary/
            tumor/
                y1448.jpg
                y1449.jpg
                y145.jpg
                y1450.jpg
                y1451.jpg
            normal/
                no1444.jpg
                no1445.jpg
                no1446.jpg
                no1447.jpg
                no1448.jpg
        tumor_multiclass/
            Carcinoma/
                108._big_gallery.jpeg
                100._big_gallery.jpeg
                111._big_gallery.jpeg
                107._big_gallery.jpeg
                110._big_gallery.jpeg
            Schwannoma/
                schwannoma (113).jpeg
                schwannoma (12).jpg
                schwannoma (19).jpeg
                schwannoma (102).jpeg
                schwannoma (13).jpeg
            Germinoma/
                2022-06-10 13_55_53-Pineal germinoma _ Radiology Case _ Radiopaedia.org.jpg
                63_big_gallery.jpeg
                07ae29130ba9a7cf94585ffbf35076_big_gall

In [9]:
tree(SPLIT_DIR, level=3)

split/
    MRI_tumor_binary_norm/
        split_stats.csv
        train/
            normal/
                no493.jpg
                no1385.jpg
                no678.jpg
                no184.jpg
                no624.jpg
            tumor/
                y493.jpg
                y139.jpg
                y678.jpg
                y185.jpg
                y624.jpg
        val/
            normal/
                no209.jpg
                no1227.jpg
                no1214.jpg
                no1216.jpg
                no484.jpg
            tumor/
                y209.jpg
                y1232.jpg
                y122.jpg
                y1221.jpg
                y484.jpg
        test/
            normal/
                no431.jpg
                no224.jpg
                no25.jpg
                no1068.jpg
                no720.jpg
            tumor/
                y431.jpg
                y224.jpg
                y25.jpg
                y1074.jpg
                y720.jpg
    MRI_tumo

In [23]:
import os
import csv
from pathlib import Path

def infer_metadata(relpath: str):
    parts = relpath.split(os.sep)
    path_str = relpath.lower()
    modality = "unknown"
    sequence = "unknown"
    plane = "unknown"
    main_class = "unknown"
    subtype = "unknown"

    if "br35h-mask-rcnn" in path_str or "pred" in path_str:
        return None
    if "brain_stroke_ct_dataset" in path_str or "aisd" in path_str:
        modality = "CT"
        if "aisd" in path_str:
           subtype = "Ischemic"
    else:
        modality = "MRI"

    if "figshare" in path_str:
        sequence = "T1C"
        main_class = "Tumor"
        if "glioma" in path_str:
            subtype = "Glioma"
        elif "meningioma" in path_str:
            subtype = "Meningioma"
        elif "pituitary" in path_str:
            subtype = "Pituitary"

    elif " t1c+" in path_str or " t1c" in path_str:
        sequence = "T1C"
    elif " t1/" in path_str or " t1 " in path_str:
        sequence = "T1"
    elif " t2" in path_str:
        sequence = "T2"
    elif "ms/" in path_str:
        sequence = "FLAIR"
    elif "brain_stroke_ct_dataset" in path_str or "aisd" in path_str:
        sequence = "NCCT"
    else:
        sequence = "unknown"


    if "axial_crop" in path_str:
        plane = "Axial"
    elif "saggital_crop" in path_str:
        plane = "Sagittal"
    elif "figshare" in path_str:
        plane="unknown"
    else:
        plane = "Axial"

    if "ms/" in path_str:
        if " ms " in path_str or "/ms_" in path_str or "/ms/" in path_str:
            main_class, subtype = "MS", "MS"
        else:
            main_class, subtype = "Normal", "Control"
    elif "brain_stroke_ct_dataset" in path_str or "aisd" in path_str:
        if "bleeding" in path_str:
            main_class, subtype = "Stroke", "Hemorrhagic"
        elif "ischemia" in path_str:
            main_class, subtype = "Stroke", "Ischemic"
        elif "normal" in path_str:
            main_class, subtype = "Normal", "Normal"
        else:
            main_class, subtype = "Stroke", "unknown"
    elif any(k in path_str for k in ["outros","glioma", "meningioma", "schwannoma",
                                     "neurocitoma", "carcinoma", "germinoma",
                                     "granuloma", "tuberculoma", "papiloma",
                                     "meduloblastoma", "astrocitoma", "ependimoma", "ganglioglioma", "oligodendroglioma", "glioblastoma"]):
        main_class = "Tumor"
        if any (l in path_str for l in ["astrocitoma", "carcinoma", "ependimoma", "ganglioglioma", "tuberculoma", "oligodendroglioma", "meduloblastoma", "granuloma", "glioblastoma", "germinoma"]):
            plane="unknown"
        for t in ["outros", "glioma","meningioma","schwannoma","neurocitoma",
                  "carcinoma","germinoma","granuloma","tuberculoma",
                  "papiloma","meduloblastoma"]:
            if t in path_str:
                subtype = t.capitalize()
                break
        else:
            subtype = "Other"
    elif "normal" in path_str:
        main_class, subtype = "Normal", "Normal"
    elif "yes" in path_str:
        main_class, subtype = "Tumor", "unknown"
    elif "no/no" in path_str:
        main_class, subtype = "Normal", "Normal"

    return modality, sequence, plane, main_class, subtype


rows = []
for root, dirs, files in os.walk(RAW_DIR):
    for f in files:
        if not f.lower().endswith((".png",".jpg",".jpeg",".webp",".bmp",".dcm")):
            continue
        full = Path(root) / f
        rel = full.relative_to(RAW_DIR).as_posix()
        meta = infer_metadata(rel)
        if meta is None:
            continue
        modality, sequence, plane, main_class, subtype = meta
        if main_class:
          rows.append({
              "relpath": rel,
              "modality": modality,
              "sequence": sequence,
              "plane": plane,
              "main_class": main_class,
              "subtype": subtype,
          })

with open(f"{RAW_DIR}/raw_metadata.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
    writer.writeheader()
    writer.writerows(rows)

print(f"Wrote {len(rows)} rows to {RAW_DIR}/raw_metadata.csv")


Wrote 56686 rows to /content/drive/MyDrive/MSc_Thesis_Neuroimaging/data/raw/raw_metadata.csv


In [26]:
import os
import json
import csv
from pathlib import Path
from collections import defaultdict
import re

def _normalize_stem(filename: str) -> str:
    stem = Path(filename).stem
    return re.sub(r'_\d+$', '', stem)

def load_raw_metadata(raw_metadata_csv):
    """
    Load raw_metadata.csv into two dicts:
      - by exact filename
      - by normalized stem (to handle _1, _2 variants)
    """
    metadata_by_filename = defaultdict(list)
    metadata_by_norm_stem = defaultdict(list)

    with open(raw_metadata_csv, "r") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Key by filename
            fname = Path(row["relpath"]).name
            metadata_by_filename[fname].append(row)

            # Also key by normalized stem (extension-insensitive)
            norm = _normalize_stem(fname)
            metadata_by_norm_stem[norm].append(row)

    return metadata_by_filename, metadata_by_norm_stem


def find_best_metadata_row(filename, class_name, metadata_by_filename,
    metadata_by_norm_stem):
    """
    Find the best metadata row for an image.

    """
    candidates = metadata_by_filename.get(filename, [])

    if candidates:
        for row in candidates:
            if row["subtype"].lower() == class_name.lower():
                return row
        return candidates[0]
    norm = _normalize_stem(filename)
    candidates = metadata_by_norm_stem.get(norm, [])

    if candidates:
        for row in candidates:
            if row["subtype"].lower() == class_name.lower():
                return row
        return candidates[0]

    # Nothing found
    return None


def map_split_to_metadata(split_root, raw_metadata_csv, output_dir):
    """
    For each split subdirectory (MRI_tumor_binary_norm, MRI_tumor_multiclass_norm, etc.),
    create *_labels.json files for train/val/test.
    """
    split_root = Path(split_root)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    metadata_by_filename, metadata_by_norm_stem = load_raw_metadata(raw_metadata_csv)

    # Find all split subdirectories
    split_subdirs = [d for d in split_root.iterdir() if d.is_dir()]

    total_mapped = 0
    total_skipped = 0

    for split_subdir in sorted(split_subdirs):
        split_name = split_subdir.name
        print(f"\n{'='*70}")
        print(f"Processing: {split_name}")
        print(f"{'='*70}")

        # Process train, val, test splits
        for split_type in ["train", "val", "test"]:
            split_path = split_subdir / split_type
            if not split_path.exists():
                print(f"  ⊘ {split_type}/ not found")
                continue

            labels = {}
            skipped = 0

            # Walk through class folders
            for class_name in sorted(os.listdir(split_path)):
                class_dir = split_path / class_name
                if not class_dir.is_dir():
                    continue

                # For each image in the class
                for fname in os.listdir(class_dir):
                    if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
                        continue

                    img_path = (class_dir / fname).as_posix()

                    # Find metadata
                    row = find_best_metadata_row(fname, class_name, metadata_by_filename, metadata_by_norm_stem)

                    if row:
                        labels[img_path] = {
                            "main_class": row["main_class"],
                            "subtype": row["subtype"],
                            "modality": row["modality"],
                            "sequence": row["sequence"],
                            "plane": row["plane"],
                        }
                        total_mapped += 1
                    else:
                        skipped += 1
                        total_skipped += 1

            # Save labels JSON
            labels_json = output_dir / f"{split_name}_{split_type}_labels.json"
            with open(labels_json, "w") as f:
                json.dump(labels, f, indent=2)

            print(
                f"  ✓ {split_type:5} → {len(labels):5} images mapped "
                f"({skipped} skipped) → {labels_json.name}"
            )

    print(f"\n{'='*70}")
    print(f"Summary:")
    print(f"{'='*70}")
    print(f"✓ Total mapped:  {total_mapped}")
    print(f"✗ Total skipped: {total_skipped}")
    print(f"✓ Output dir:    {output_dir}")


def main():
    raw_dir = RAW_DIR
    split_root = SPLIT_DIR

    # Find raw_metadata.csv
    raw_metadata_csv = f"{RAW_DIR}/raw_metadata.csv"

    # Map metadata to splits
    output_dir = f"{SPLIT_DIR}/labels"
    map_split_to_metadata(split_root, raw_metadata_csv, output_dir)


if __name__ == "__main__":
    main()


Processing: CT_stroke_binary_norm
  ✓ train →  4654 images mapped (0 skipped) → CT_stroke_binary_norm_train_labels.json
  ✓ val   →   997 images mapped (0 skipped) → CT_stroke_binary_norm_val_labels.json
  ✓ test  →   999 images mapped (0 skipped) → CT_stroke_binary_norm_test_labels.json

Processing: MRI_ms_norm
  ✓ train →  2731 images mapped (0 skipped) → MRI_ms_norm_train_labels.json
  ✓ val   →   586 images mapped (0 skipped) → MRI_ms_norm_val_labels.json
  ✓ test  →   586 images mapped (0 skipped) → MRI_ms_norm_test_labels.json

Processing: MRI_tumor_binary_norm
  ✓ train →  2100 images mapped (0 skipped) → MRI_tumor_binary_norm_train_labels.json
  ✓ val   →   450 images mapped (0 skipped) → MRI_tumor_binary_norm_val_labels.json
  ✓ test  →   450 images mapped (0 skipped) → MRI_tumor_binary_norm_test_labels.json

Processing: MRI_tumor_multiclass_norm
  ✓ train →  5556 images mapped (0 skipped) → MRI_tumor_multiclass_norm_train_labels.json
  ✓ val   →  1194 images mapped (0 skippe

In [27]:
from pathlib import Path
import json
from collections import Counter

SPLIT_DIR = Path("/content/drive/MyDrive/MSc_Thesis_Neuroimaging/data/split")
LABELS_DIR = SPLIT_DIR / "labels"

def inspect_labels(dataset_name, split_type):
    f = LABELS_DIR / f"{dataset_name}_{split_type}_labels.json"
    with open(f) as fp:
        labels = json.load(fp)
    print(f"{dataset_name} {split_type}: {len(labels)} images")

    main = Counter()
    subtype = Counter()
    seq = Counter()
    plane = Counter()

    for meta in labels.values():
        main[meta["main_class"]] += 1
        subtype[meta["subtype"]] += 1
        seq[meta["sequence"]] += 1
        plane[meta["plane"]] += 1

    print("  main_class:", dict(main))
    print("  subtype   :", dict(list(subtype.items())[:5]), "...")
    print("  sequence  :", dict(seq))
    print("  plane     :", dict(plane))
    print()

inspect_labels("MRI_tumor_multiclass_norm", "test")
inspect_labels("CT_stroke_binary_norm", "test")
inspect_labels("MRI_ms_norm", "test")


MRI_tumor_multiclass_norm test: 1197 images
  main_class: {'Tumor': 1034, 'Normal': 163}
  subtype   : {'Carcinoma': 42, 'Germinoma': 14, 'Glioma': 236, 'Granuloma': 13, 'Meduloblastoma': 20} ...
  sequence  : {'T1C': 275, 'T1': 752, 'T2': 170}
  plane     : {'unknown': 342, 'Axial': 855}

CT_stroke_binary_norm test: 999 images
  main_class: {'Normal': 665, 'Stroke': 334}
  subtype   : {'Normal': 665, 'Ischemic': 173, 'Hemorrhagic': 161} ...
  sequence  : {'NCCT': 999}
  plane     : {'Axial': 999}

MRI_ms_norm test: 586 images
  main_class: {'Normal': 586}
  subtype   : {'Control': 586} ...
  sequence  : {'FLAIR': 586}
  plane     : {'Axial': 385, 'Sagittal': 201}

