In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

import h5py
from io import BytesIO
from PIL import Image

from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

In [2]:
# Cross-validation settings: random seed for the shuffled splitter and number of folds.
SEED, FOLDS = 2022, 5

# Name of the CSV written at the end of the notebook, and the directory the inputs are read from.
OUTPUT_FILENAME = "folds.csv"
INPUT_PATH = Path("/kaggle/input/isic-2024-challenge/")

In [3]:
# Names of the metadata columns the notebook reads: image id, label, and patient (grouping key).
id_column, target_column, group_column = "isic_id", "target", "patient_id"

# Names of the fold-assignment columns this notebook adds, one per splitting strategy.
gkf_fold_column, sgkf_fold_column = "gkf_fold", "sgkf_fold"

# Values of the "attribution" column that get flagged with is_aus = 1.
AUSTRALIAN_ATTRIBUTIONS = [
    "ACEMID MIA",
    "Frazer Institute, The University of Queensland, Dermatology Research Centre",
]

# Read the training metadata table and report its (rows, columns) shape.
metadata_csv_path = INPUT_PATH / "train-metadata.csv"
train_metadata = pd.read_csv(metadata_csv_path, low_memory=False)
print(f"Train data size: {train_metadata.shape}")

# Open the image archive read-only. The handle is left open; only the
# commented-out preview cells further down read images from it.
image_archive_path = INPUT_PATH / "train-image.hdf5"
train_images = h5py.File(image_archive_path, mode="r")

# Preview the first rows of the metadata.
train_metadata.head()

Train data size: (401059, 55)


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


In [4]:
# train_metadata_2020 = pd.read_csv("/kaggle/input/isic-2020-challenge/train-metadata.csv", low_memory=False);print(f"Train data size: {train_metadata_2020.shape}")
# train_images_2020 = h5py.File("/kaggle/input/isic-2020-challenge/train-image.hdf5", mode="r")

# some_id = train_metadata_2020.loc[train_metadata_2020["diagnosis"] == "melanoma"].sample(n=1)[id_column].item()
# img = Image.open(BytesIO(train_images_2020[some_id][()]))
# print(f"{some_id}: Label: {train_metadata_2020.loc[train_metadata_2020[id_column] == some_id, 'diagnosis'].item()}")
# display(img)

# some_id = train_metadata.loc[train_metadata[target_column] == 1].sample(n=1)[id_column].item()
# img = Image.open(BytesIO(train_images[some_id][()]))
# print(f"{some_id}: Label: {train_metadata.loc[train_metadata[id_column] == some_id, target_column].item()}")
# display(img)

In [5]:
# Image ids that are excluded from every fold (their fold value is set to -1 below).
# Background in the competition discussion threads:
# https://www.kaggle.com/competitions/isic-2024-challenge/discussion/528168#2960144
# https://www.kaggle.com/competitions/isic-2024-challenge/discussion/521145#2935689
drop_ids = [
    "ISIC_2346081",
    "ISIC_0573025",
    "ISIC_1443812",
    "ISIC_5374420",
]

# for drop_id in drop_ids:
#     img = Image.open(BytesIO(train_images[drop_id][()]))
#     print(f"{drop_id}: Label: {train_metadata.loc[train_metadata[id_column] == drop_id, target_column].item()}")
#     display(img)

In [6]:
# Give every row a validation-fold number in 1..FOLDS, grouping by patient so
# that all images of one patient share a fold.
gkf = GroupKFold(n_splits=FOLDS)
gkf_folds = np.zeros(len(train_metadata), dtype=int)
gkf_splits = gkf.split(
    train_metadata,
    y=train_metadata[target_column],
    groups=train_metadata[group_column],
)
for fold_number, (_, holdout_rows) in enumerate(gkf_splits, start=1):
    # Only the validation indices matter here: they define the row's fold.
    gkf_folds[holdout_rows] = fold_number
train_metadata[gkf_fold_column] = gkf_folds

# Excluded images receive the sentinel fold -1. This is applied after the
# split, so those rows were still present while the folds were built.
gkf_dropped = train_metadata[id_column].isin(drop_ids)
train_metadata.loc[gkf_dropped, gkf_fold_column] = -1

# Per-fold summary (positive rate and patient count), skipping the sentinel.
for fold in np.unique(train_metadata[gkf_fold_column]):
    if fold >= 0:
        in_fold = train_metadata[gkf_fold_column] == fold
        fold_targets = train_metadata.loc[in_fold, target_column]
        fold_patients = train_metadata.loc[in_fold, group_column]
        print(f"Fold: {fold}")
        print(f"Target mean: {fold_targets.mean()}")
        print(f"Unique patients: {fold_patients.nunique()}\n")

Fold: 1
Target mean: 0.0009599321814147156
Unique patients: 206

Fold: 2
Target mean: 0.000972435202154318
Unique patients: 209

Fold: 3
Target mean: 0.0009973694381069929
Unique patients: 208

Fold: 4
Target mean: 0.0009849268785298408
Unique patients: 209

Fold: 5
Target mean: 0.0009724473257698541
Unique patients: 210



In [7]:
# Number of rows in each GroupKFold fold (-1 marks the excluded images).
gkf_fold_sizes = train_metadata[gkf_fold_column].value_counts()
gkf_fold_sizes

gkf_fold
 1    80214
 2    80211
 3    80211
 5    80210
 4    80209
-1        4
Name: count, dtype: int64

In [8]:
# Second fold assignment: patient-grouped folds that are also stratified on
# the target, shuffled with a fixed seed so the result is repeatable.
sgkf = StratifiedGroupKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
sgkf_folds = np.zeros(len(train_metadata), dtype=int)
sgkf_splits = sgkf.split(
    train_metadata,
    y=train_metadata[target_column],
    groups=train_metadata[group_column],
)
for fold_number, (_, holdout_rows) in enumerate(sgkf_splits, start=1):
    # Folds are numbered from 1; a row's fold is the split it validates in.
    sgkf_folds[holdout_rows] = fold_number
train_metadata[sgkf_fold_column] = sgkf_folds

# Mark the excluded images with the sentinel fold -1 (applied after the split).
sgkf_dropped = train_metadata[id_column].isin(drop_ids)
train_metadata.loc[sgkf_dropped, sgkf_fold_column] = -1

# Per-fold summary (positive rate and patient count), skipping the sentinel.
for fold in np.unique(train_metadata[sgkf_fold_column]):
    if fold >= 0:
        in_fold = train_metadata[sgkf_fold_column] == fold
        fold_targets = train_metadata.loc[in_fold, target_column]
        fold_patients = train_metadata.loc[in_fold, group_column]
        print(f"Fold: {fold}")
        print(f"Target mean: {fold_targets.mean()}")
        print(f"Unique patients: {fold_patients.nunique()}\n")

Fold: 1
Target mean: 0.0010228705919739416
Unique patients: 207

Fold: 2
Target mean: 0.0009833744165311795
Unique patients: 209

Fold: 3
Target mean: 0.0009668187794879728
Unique patients: 209

Fold: 4
Target mean: 0.0012134568176746263
Unique patients: 209

Fold: 5
Target mean: 0.0006950013365410318
Unique patients: 208



In [9]:
# Number of rows in each StratifiedGroupKFold fold (-1 marks the excluded images).
sgkf_fold_sizes = train_metadata[sgkf_fold_column].value_counts()
sgkf_fold_sizes

sgkf_fold
 1    100697
 3     77574
 2     76268
 5     74820
 4     71696
-1         4
Name: count, dtype: int64

In [10]:
# Flag each row with 1 when its "attribution" value is one of
# AUSTRALIAN_ATTRIBUTIONS, otherwise 0.
from_australia = train_metadata["attribution"].isin(AUSTRALIAN_ATTRIBUTIONS)
train_metadata["is_aus"] = from_australia.astype(int)

# Columns that make up the exported folds file, in output order.
columns_to_write = [
    id_column,
    group_column,
    gkf_fold_column,
    sgkf_fold_column,
    "is_aus",
]

# Preview the first ten rows of what will be written.
train_metadata[columns_to_write].head(10)

Unnamed: 0,isic_id,patient_id,gkf_fold,sgkf_fold,is_aus
0,ISIC_0015670,IP_1235828,4,4,0
1,ISIC_0015845,IP_8170065,1,2,0
2,ISIC_0015864,IP_6724798,5,3,0
3,ISIC_0015902,IP_4111386,2,3,1
4,ISIC_0024200,IP_8313778,1,1,0
5,ISIC_0035502,IP_3026693,5,1,0
6,ISIC_0051648,IP_0218255,1,5,1
7,ISIC_0051665,IP_7734648,2,5,0
8,ISIC_0051710,IP_1307115,2,3,0
9,ISIC_0051758,IP_2180091,5,3,0


In [11]:
# Write the selected id / patient / fold / is_aus columns to OUTPUT_FILENAME,
# omitting the DataFrame index.
folds_table = train_metadata[columns_to_write]
folds_table.to_csv(OUTPUT_FILENAME, index=False)