In [1]:
import pandas as pd

# Load the ground-truth CSV; edit the path if the file lives elsewhere.
df = pd.read_csv("ISIC_2020_Training_GroundTruth.csv")

# View 1: frequency of each value in the string label column.
label_counts = df["benign_malignant"].value_counts()
print(label_counts)

# View 2: per-class tallies from the numeric `target` column
# (0 counted as benign, 1 as malignant), kept in named variables.
benign = df["target"].eq(0).sum()
malignant = df["target"].eq(1).sum()
print(f"Benign: {benign}\nMalignant: {malignant}")

benign_malignant
benign       32542
malignant      584
Name: count, dtype: int64
Benign: 32542
Malignant: 584


In [2]:
import pandas as pd
from pathlib import Path

# ---------- CONFIG ----------
SOURCE_CSV  = Path("ISIC_2020_Training_GroundTruth.csv")  # adjust path as needed
OUTPUT_CSV  = Path("subset_400mal_600ben.csv")
N_MALIGNANT = 400     # malignant rows to draw
N_BENIGN    = 600     # benign rows to draw
RNG_SEED    = 2025    # fixed seed so the same rows are drawn on every run
# ----------------------------

df = pd.read_csv(SOURCE_CSV)

# Identify class-label column and build one boolean mask per class.
# The string column is preferred; the numeric `target` column is the fallback.
if "benign_malignant" in df.columns:          # common in ISIC ground-truth files
    label_col = "benign_malignant"
    labels_lower = df[label_col].str.lower()   # case-insensitive comparison
    mal_mask  = labels_lower == "malignant"
    ben_mask  = labels_lower == "benign"
elif "target" in df.columns:                   # 1 = malignant, 0 = benign
    label_col = "target"
    mal_mask  = df[label_col] == 1
    ben_mask  = df[label_col] == 0
else:
    raise ValueError("No recognised class label column.")


def _draw_rows(mask, n_rows, class_name):
    """Sample `n_rows` rows of `df` selected by `mask`, without replacement.

    Raises:
        ValueError: if fewer than `n_rows` rows match `mask`. Checked up front
            so the message names the class and the counts involved.
    """
    available = int(mask.sum())
    if available < n_rows:
        raise ValueError(
            f"Requested {n_rows} {class_name} rows, "
            f"but only {available} are available in {SOURCE_CSV}."
        )
    return df[mask].sample(n=n_rows, random_state=RNG_SEED, replace=False)


# Sample N_MALIGNANT malignant + N_BENIGN benign rows
malignant_df = _draw_rows(mal_mask, N_MALIGNANT, "malignant")
benign_df    = _draw_rows(ben_mask, N_BENIGN, "benign")

subset_df = (
    pd.concat([malignant_df, benign_df])
      .sample(frac=1, random_state=RNG_SEED)        # shuffle rows
      .reset_index(drop=True)
)

# Post-condition: the subset has exactly the requested number of rows.
assert len(subset_df) == N_MALIGNANT + N_BENIGN

subset_df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved {len(subset_df)} rows to {OUTPUT_CSV}")

Saved 1000 rows to subset_400mal_600ben.csv


In [4]:
import pandas as pd

# Re-load the sampled subset and show how many rows carry each label.
df = pd.read_csv("subset_400mal_600ben.csv")  # change path if needed

subset_label_counts = df["benign_malignant"].value_counts()
print(subset_label_counts)

benign_malignant
benign       600
malignant    400
Name: count, dtype: int64


In [7]:
import pandas as pd
from pathlib import Path
from PIL import Image
import shutil

# ── CONFIG ────────────────────────────────────────────────────────────
CSV_PATH         = Path("subset_400mal_600ben.csv")     # ← 1 000-row csv
SRC_IMG_DIR      = Path("JPEG")      # ← original images
DST_DIR          = Path("400mal_600ben_224")                  # ← output folder
FILENAME_COL     = "image_name"                         # change if needed
LABEL_COL        = "benign_malignant"                   # or "target"
RENAME_WITH_LABEL = False                                # prepend label_
IMG_SIZE         = 224                                  # EfficientNet-B0
# ───────────────────────────────────────────────────────────────────────

# ---------- setup ----------
df = pd.read_csv(CSV_PATH)

# map 0/1 → strings if necessary
# Test for a numeric dtype explicitly instead of `dtype != object`: text columns
# are not guaranteed to be `object` (pandas' dedicated string dtype is not), and
# mapping string labels through the 0/1 dict would turn every label into NaN.
if pd.api.types.is_numeric_dtype(df[LABEL_COL]):
    df[LABEL_COL] = df[LABEL_COL].map({0: "benign", 1: "malignant"})

DST_DIR.mkdir(parents=True, exist_ok=True)

# ---------- copy + resize ----------
# Walk only the two columns that are needed; this avoids building a full
# Series per row the way `iterrows()` does.
missing = 0
for image_name, label in zip(df[FILENAME_COL], df[LABEL_COL]):
    src_path = SRC_IMG_DIR / f"{image_name}.jpg"   # tweak ext if .png
    if not src_path.exists():
        # Count and skip rows whose image file is absent; reported below.
        missing += 1
        continue

    # choose destination file name
    fname = src_path.name
    if RENAME_WITH_LABEL:
        fname = f"{label}_{fname}"
    dst_path = DST_DIR / fname

    # open, resize, save
    with Image.open(src_path) as im:
        im = im.convert("RGB")                    # force 3-channel RGB before saving as JPEG
        # Resized to exactly IMG_SIZE × IMG_SIZE; aspect ratio is not preserved.
        im = im.resize((IMG_SIZE, IMG_SIZE),
                       Image.Resampling.LANCZOS)  # PIL ≥ 9.1
        im.save(dst_path, "JPEG", quality=95)

print(f"✓ Copied & resized {len(df)-missing} images to {DST_DIR.resolve()}")
if missing:
    print(f"⚠️  {missing} files listed in CSV were missing.")

✓ Copied & resized 1000 images to /Users/adrian/Desktop/Final Project/400mal_600ben_224
