In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

import h5py
from io import BytesIO
from PIL import Image

from sklearn.model_selection import GroupKFold

In [2]:
# Notebook-wide configuration; everything a reader might want to tune lives here.

# Random seed. NOTE(review): no visible cell references SEED — confirm whether
# it is still needed or was meant to drive a shuffled splitter.
SEED: int = 2022

# Number of cross-validation folds to create.
FOLDS: int = 5

# Directory holding the competition input files.
INPUT_PATH: Path = Path("/kaggle/input/isic-2024-challenge/")

# Name of the CSV file the fold assignment is written to.
OUTPUT_FILENAME: str = "folds.csv"

In [3]:
# Names of the metadata columns the rest of the notebook refers to.
id_column, target_column = "isic_id", "target"
group_column, fold_column = "patient_id", "fold"

# Read the full training metadata table. low_memory=False asks pandas to parse
# the file in one pass rather than in chunks (avoids mixed-dtype guesses).
metadata_path = INPUT_PATH / "train-metadata.csv"
train_metadata = pd.read_csv(metadata_path, low_memory=False)

# Report the table dimensions, then preview the first rows as the cell output.
print(f"Train data size: {train_metadata.shape}")
train_metadata.head()

Train data size: (401059, 55)


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


In [4]:
# Assign each row a validation fold number in 1..FOLDS, splitting by patient so
# that all rows sharing a patient_id land in the same fold. 0 is the initial
# "not yet assigned" value and must not survive the loop.
folds = np.zeros(len(train_metadata), dtype=int)
gkf = GroupKFold(n_splits=FOLDS)
# NOTE(review): y is passed for API symmetry; GroupKFold balances folds by
# group sizes and presumably does not stratify on the target — confirm.
splits = gkf.split(
    train_metadata,
    y=train_metadata[target_column],
    groups=train_metadata[group_column],
)
for fold_number, (_, val_idx) in enumerate(splits, start=1):
    # Only the validation indices matter: each row is validation in one split.
    folds[val_idx] = fold_number
train_metadata[fold_column] = folds

# Invariants of a grouped split: every row is assigned, and no patient is
# spread over more than one fold (that would leak patients across folds).
assert (folds > 0).all(), "some rows were not assigned to a fold"
assert (
    train_metadata.groupby(group_column)[fold_column].nunique().eq(1).all()
), "a patient is split across multiple folds"

# Per-fold sanity summary: positive rate and number of distinct patients.
for fold, fold_rows in train_metadata.groupby(fold_column):
    print(f"Fold: {fold}")
    print(f"Target mean: {fold_rows[target_column].mean()}")
    print(f"Unique patients: {fold_rows[group_column].nunique()}\n")

Fold: 1
Target mean: 0.0009599321814147156
Unique patients: 206

Fold: 2
Target mean: 0.0009724230788410712
Unique patients: 209

Fold: 3
Target mean: 0.0009973694381069929
Unique patients: 208

Fold: 4
Target mean: 0.0009973694381069929
Unique patients: 209

Fold: 5
Target mean: 0.000972435202154318
Unique patients: 210



In [5]:
# Number of rows per fold, to confirm the folds are of similar size.
fold_sizes = train_metadata[fold_column].value_counts()
fold_sizes

fold
1    80214
2    80212
4    80211
5    80211
3    80211
Name: count, dtype: int64

In [6]:
# Columns exported to the folds file: image id, patient id and assigned fold.
columns_to_write = [id_column, group_column, fold_column]

# Preview the first ten rows of what will be written.
train_metadata.loc[:, columns_to_write].head(10)

Unnamed: 0,isic_id,patient_id,fold
0,ISIC_0015670,IP_1235828,4
1,ISIC_0015845,IP_8170065,1
2,ISIC_0015864,IP_6724798,5
3,ISIC_0015902,IP_4111386,2
4,ISIC_0024200,IP_8313778,1
5,ISIC_0035502,IP_3026693,5
6,ISIC_0051648,IP_0218255,1
7,ISIC_0051665,IP_7734648,2
8,ISIC_0051710,IP_1307115,2
9,ISIC_0051758,IP_2180091,5


In [7]:
# Persist the fold assignment; index=False keeps the row index out of the CSV.
folds_to_save = train_metadata[columns_to_write]
folds_to_save.to_csv(OUTPUT_FILENAME, index=False)