In [1]:
from pathlib import Path
from pprint import pprint
from tqdm import tqdm

import pandas as pd
import numpy as np

import h5py
from io import BytesIO
from PIL import Image

from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

In [2]:
# Seed kept with the run configuration.
# NOTE(review): none of the cells shown here pass SEED to a splitter — confirm
# whether it is still needed or should be wired into a shuffled split.
SEED = 2022
# Number of cross-validation folds produced by every splitting strategy.
FOLDS = 5
# Directory holding the competition files; train-metadata.csv is read from here.
INPUT_PATH = Path("/kaggle/input/isic-2024-challenge/")
# CSV file that receives the per-sample fold assignments.
OUTPUT_FILENAME = "folds.csv"

In [3]:
# Columns of train-metadata.csv that the splitting cells rely on.
id_column = "isic_id"
target_column = "target"
group_column = "patient_id"

# Names of the fold-assignment columns that the splitting cells add.
gkf_fold_column = "gkf_fold"
sgkf_fold_column = "sgkf_fold"
tsgkf_fold_column = "tsgkf_fold"

# low_memory=False: parse the file in a single pass so column dtypes are
# inferred from the whole column instead of chunk by chunk.
metadata_path = INPUT_PATH / "train-metadata.csv"
train_metadata = pd.read_csv(metadata_path, low_memory=False)
print(f"Train data size: {train_metadata.shape}")
train_metadata.head()

Train data size: (401059, 55)


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


In [4]:
# --- StratifiedGroupKFold at patient level -----------------------------------
# Collapse the metadata to one row per patient. A patient is labelled 1 when at
# least one of their samples has a positive target, 0 otherwise. Splitting this
# patient-level table keeps all samples of a patient in the same fold and
# stratifies on the patient-level label.
patient_df = (
    train_metadata.groupby(group_column)[target_column]
    .sum()
    .gt(0)
    .astype(int)
    .reset_index()
)

folds = np.zeros(train_metadata.shape[0], dtype=int)
# shuffle is left at its default (False), so the split is deterministic.
sgkf = StratifiedGroupKFold(n_splits=FOLDS)
patient_splits = sgkf.split(
    patient_df, y=patient_df[target_column], groups=patient_df[group_column]
)
for fold_idx, (_, val_idx) in enumerate(patient_splits):
    # split() yields positional indices, so select with .iloc (not label-based .loc).
    val_patients = patient_df[group_column].iloc[val_idx]
    # A boolean mask indexes the numpy array by position and therefore does not
    # depend on train_metadata carrying a default RangeIndex.
    in_val_fold = train_metadata[group_column].isin(val_patients).to_numpy()
    folds[in_val_fold] = fold_idx + 1
# 0 is the "unassigned" sentinel; every sample must belong to exactly one fold.
assert (folds > 0).all(), "some samples were not assigned to a fold"
train_metadata[sgkf_fold_column] = folds

# --- Per-fold summary --------------------------------------------------------
# For each fold, "dev" is everything outside the fold and "validation" is the
# fold itself. Print the patient-level class balance of both parts and collect
# sample-level statistics into one table.
fold_attributes = []
for fold in np.unique(train_metadata[sgkf_fold_column]):
    is_val = train_metadata[sgkf_fold_column] == fold
    dev_df = train_metadata.loc[~is_val, :]
    val_df = train_metadata.loc[is_val, :]

    print(f"Fold: {fold} | Proportion of malignant patients in dev")
    dev_patient_label = dev_df.groupby(group_column)[target_column].sum().gt(0).astype(int)
    print(dev_patient_label.value_counts(normalize=True))

    print(f"Fold: {fold} | Proportion of malignant patients in validation")
    val_patient_label = val_df.groupby(group_column)[target_column].sum().gt(0).astype(int)
    print(val_patient_label.value_counts(normalize=True))

    dev_num_samples = dev_df.shape[0]
    dev_num_pos_samples = dev_df[target_column].sum()
    val_num_samples = val_df.shape[0]
    val_num_pos_samples = val_df[target_column].sum()

    fold_attributes.append({
        "Fold": fold,
        "DevNumSamples": dev_num_samples,
        "DevNumPosSamples": dev_num_pos_samples,
        "DevNumNegSamples": dev_num_samples - dev_num_pos_samples,
        "DevTargetMean": dev_df[target_column].mean(),
        "DevNumPatients": dev_df[group_column].nunique(),
        "DevAvgSamplesPerPatient": dev_df.groupby(group_column)[id_column].count().mean(),
        "ValNumSamples": val_num_samples,
        "ValNumPosSamples": val_num_pos_samples,
        "ValNumNegSamples": val_num_samples - val_num_pos_samples,
        "ValTargetMean": val_df[target_column].mean(),
        "ValNumPatients": val_df[group_column].nunique(),
        "ValAvgSamplesPerPatient": val_df.groupby(group_column)[id_column].count().mean()
    })
    print("\n")
fold_attributes_df = pd.DataFrame(fold_attributes)
fold_attributes_df

Fold: 1 | Proportion of malignant patients in dev
target
0    0.751501
1    0.248499
Name: proportion, dtype: float64
Fold: 1 | Proportion of malignant patients in validation
target
0    0.751196
1    0.248804
Name: proportion, dtype: float64


Fold: 2 | Proportion of malignant patients in dev
target
0    0.751799
1    0.248201
Name: proportion, dtype: float64
Fold: 2 | Proportion of malignant patients in validation
target
0    0.75
1    0.25
Name: proportion, dtype: float64


Fold: 3 | Proportion of malignant patients in dev
target
0    0.751501
1    0.248499
Name: proportion, dtype: float64
Fold: 3 | Proportion of malignant patients in validation
target
0    0.751196
1    0.248804
Name: proportion, dtype: float64


Fold: 4 | Proportion of malignant patients in dev
target
0    0.751799
1    0.248201
Name: proportion, dtype: float64
Fold: 4 | Proportion of malignant patients in validation
target
0    0.75
1    0.25
Name: proportion, dtype: float64


Fold: 5 | Proportion of malignant pa

Unnamed: 0,Fold,DevNumSamples,DevNumPosSamples,DevNumNegSamples,DevTargetMean,DevNumPatients,DevAvgSamplesPerPatient,ValNumSamples,ValNumPosSamples,ValNumNegSamples,ValTargetMean,ValNumPatients,ValAvgSamplesPerPatient
0,1,329019,301,328718,0.000915,833,394.980792,72040,92,71948,0.001277,209,344.688995
1,2,310519,315,310204,0.001014,834,372.32494,90540,78,90462,0.000861,208,435.288462
2,3,323293,316,322977,0.000977,833,388.106843,77766,77,77689,0.00099,209,372.086124
3,4,322301,318,321983,0.000987,834,386.452038,78758,75,78683,0.000952,208,378.644231
4,5,319104,322,318782,0.001009,834,382.618705,81955,71,81884,0.000866,208,394.014423


In [5]:
# --- Plain GroupKFold on the sample-level table -------------------------------
# All samples of a patient go to the same fold; the fold number (1-based) of the
# validation split each sample falls into is stored in gkf_fold_column.
gkf = GroupKFold(n_splits=FOLDS)
folds = np.zeros(train_metadata.shape[0], dtype=int)
sample_splits = gkf.split(
    train_metadata,
    y=train_metadata[target_column],
    groups=train_metadata[group_column],
)
for fold_number, (_, val_positions) in enumerate(sample_splits, start=1):
    folds[val_positions] = fold_number
train_metadata[gkf_fold_column] = folds

# --- Per-fold summary ---------------------------------------------------------
# "dev" is every sample outside the fold, "validation" is the fold itself.
fold_attributes = []
for fold in np.unique(train_metadata[gkf_fold_column]):
    belongs_to_fold = train_metadata[gkf_fold_column] == fold
    dev_df = train_metadata.loc[~belongs_to_fold, :]
    val_df = train_metadata.loc[belongs_to_fold, :]

    # Patient-level class balance: a patient counts as 1 when any sample is positive.
    print(f"Fold: {fold} | Proportion of malignant patients in dev")
    dev_patient_flags = dev_df.groupby(group_column)[target_column].agg(lambda t: int(t.sum() > 0))
    print(dev_patient_flags.value_counts(normalize=True))

    print(f"Fold: {fold} | Proportion of malignant patients in validation")
    val_patient_flags = val_df.groupby(group_column)[target_column].agg(lambda t: int(t.sum() > 0))
    print(val_patient_flags.value_counts(normalize=True))

    # Sample-level statistics for both parts of the split.
    dev_total = dev_df.shape[0]
    dev_positive = dev_df[target_column].sum()
    val_total = val_df.shape[0]
    val_positive = val_df[target_column].sum()

    fold_attributes.append({
        "Fold": fold,
        "DevNumSamples": dev_total,
        "DevNumPosSamples": dev_positive,
        "DevNumNegSamples": dev_total - dev_positive,
        "DevTargetMean": dev_df[target_column].mean(),
        "DevNumPatients": dev_df[group_column].nunique(),
        "DevAvgSamplesPerPatient": dev_df.groupby(group_column)[id_column].count().mean(),
        "ValNumSamples": val_total,
        "ValNumPosSamples": val_positive,
        "ValNumNegSamples": val_total - val_positive,
        "ValTargetMean": val_df[target_column].mean(),
        "ValNumPatients": val_df[group_column].nunique(),
        "ValAvgSamplesPerPatient": val_df.groupby(group_column)[id_column].count().mean()
    })
    print("\n")
fold_attributes_df = pd.DataFrame(fold_attributes)
fold_attributes_df

Fold: 1 | Proportion of malignant patients in dev
target
0    0.745215
1    0.254785
Name: proportion, dtype: float64
Fold: 1 | Proportion of malignant patients in validation
target
0    0.776699
1    0.223301
Name: proportion, dtype: float64


Fold: 2 | Proportion of malignant patients in dev
target
0    0.7503
1    0.2497
Name: proportion, dtype: float64
Fold: 2 | Proportion of malignant patients in validation
target
0    0.755981
1    0.244019
Name: proportion, dtype: float64


Fold: 3 | Proportion of malignant patients in dev
target
0    0.7494
1    0.2506
Name: proportion, dtype: float64
Fold: 3 | Proportion of malignant patients in validation
target
0    0.759615
1    0.240385
Name: proportion, dtype: float64


Fold: 4 | Proportion of malignant patients in dev
target
0    0.7503
1    0.2497
Name: proportion, dtype: float64
Fold: 4 | Proportion of malignant patients in validation
target
0    0.755981
1    0.244019
Name: proportion, dtype: float64


Fold: 5 | Proportion of malignan

Unnamed: 0,Fold,DevNumSamples,DevNumPosSamples,DevNumNegSamples,DevTargetMean,DevNumPatients,DevAvgSamplesPerPatient,ValNumSamples,ValNumPosSamples,ValNumNegSamples,ValTargetMean,ValNumPatients,ValAvgSamplesPerPatient
0,1,320845,316,320529,0.000985,836,383.785885,80214,77,80137,0.00096,206,389.38835
1,2,320847,315,320532,0.000982,833,385.170468,80212,78,80134,0.000972,209,383.789474
2,3,320848,313,320535,0.000976,834,384.709832,80211,80,80131,0.000997,208,385.629808
3,4,320848,313,320535,0.000976,833,385.171669,80211,80,80131,0.000997,209,383.784689
4,5,320848,315,320533,0.000982,832,385.634615,80211,78,80133,0.000972,210,381.957143


In [6]:
def _assign_target_balanced_folds(df, group_col, target_col, n_folds):
    """Assign each group to a fold while balancing sample counts and positives.

    Groups are ordered by their number of samples (largest first) and processed
    in consecutive chunks of ``n_folds`` groups. Within a chunk every fold
    receives exactly one group: the fold that has accumulated the most
    positives so far receives the group with the fewest positives, and so on.
    Groups left over after the last full chunk are assigned to folds
    0, 1, ... in order.

    Parameters
    ----------
    df : pandas.DataFrame
        Sample-level table.
    group_col : str
        Column identifying the group (all samples of a group share a fold).
    target_col : str
        Numeric target column; its per-group sum is what gets balanced.
    n_folds : int
        Number of folds.

    Returns
    -------
    pandas.Series
        1-based fold number for every row of ``df``, aligned with ``df.index``.
    """
    # Groups ordered by sample count, largest first.
    group_ids = df[group_col].value_counts().index.to_numpy()
    # Positives per group, computed once instead of re-scanning the whole
    # frame for every single group.
    group_positives = (
        df.groupby(group_col)[target_col].sum().reindex(group_ids).to_numpy(dtype=float)
    )

    n_in_full_chunks = (len(group_ids) // n_folds) * n_folds
    fold_positive_totals = np.zeros(n_folds)
    fold_of_group = {}
    for start in range(0, n_in_full_chunks, n_folds):
        chunk_positives = group_positives[start:start + n_folds]
        groups_by_positives_asc = np.argsort(chunk_positives)
        folds_by_total_desc = np.argsort(-fold_positive_totals)
        # Pair the fullest fold with the group that has the fewest positives.
        fold_positive_totals[folds_by_total_desc] = (
            fold_positive_totals[folds_by_total_desc] + chunk_positives[groups_by_positives_asc]
        )
        chunk_ids = group_ids[start:start + n_folds]
        for group_id, fold in zip(chunk_ids[groups_by_positives_asc], folds_by_total_desc):
            fold_of_group[group_id] = int(fold)
    # Leftover groups (fewer than n_folds) go to folds 0, 1, ... in order.
    for fold, group_id in enumerate(group_ids[n_in_full_chunks:]):
        fold_of_group[group_id] = fold

    # A single vectorised lookup writes the whole column at once.
    return df[group_col].map(fold_of_group).astype(int) + 1


train_metadata[tsgkf_fold_column] = _assign_target_balanced_folds(
    train_metadata, group_column, target_column, FOLDS
)

# --- Per-fold summary ---------------------------------------------------------
# "dev" is every sample outside the fold, "validation" is the fold itself.
fold_attributes = []
for fold in np.unique(train_metadata[tsgkf_fold_column]):
    is_val = train_metadata[tsgkf_fold_column] == fold
    dev_df = train_metadata.loc[~is_val, :]
    val_df = train_metadata.loc[is_val, :]

    # Patient-level class balance: a patient counts as 1 when any sample is positive.
    print(f"Fold: {fold} | Proportion of malignant patients in dev")
    dev_patient_label = dev_df.groupby(group_column)[target_column].sum().gt(0).astype(int)
    print(dev_patient_label.value_counts(normalize=True))

    print(f"Fold: {fold} | Proportion of malignant patients in validation")
    val_patient_label = val_df.groupby(group_column)[target_column].sum().gt(0).astype(int)
    print(val_patient_label.value_counts(normalize=True))

    dev_num_samples = dev_df.shape[0]
    dev_num_pos_samples = dev_df[target_column].sum()
    val_num_samples = val_df.shape[0]
    val_num_pos_samples = val_df[target_column].sum()

    fold_attributes.append({
        "Fold": fold,
        "DevNumSamples": dev_num_samples,
        "DevNumPosSamples": dev_num_pos_samples,
        "DevNumNegSamples": dev_num_samples - dev_num_pos_samples,
        "DevTargetMean": dev_df[target_column].mean(),
        "DevNumPatients": dev_df[group_column].nunique(),
        "DevAvgSamplesPerPatient": dev_df.groupby(group_column)[id_column].count().mean(),
        "ValNumSamples": val_num_samples,
        "ValNumPosSamples": val_num_pos_samples,
        "ValNumNegSamples": val_num_samples - val_num_pos_samples,
        "ValTargetMean": val_df[target_column].mean(),
        "ValNumPatients": val_df[group_column].nunique(),
        "ValAvgSamplesPerPatient": val_df.groupby(group_column)[id_column].count().mean()
    })
    print("\n")
fold_attributes_df = pd.DataFrame(fold_attributes)
fold_attributes_df

Fold: 1 | Proportion of malignant patients in dev
target
0    0.737095
1    0.262905
Name: proportion, dtype: float64
Fold: 1 | Proportion of malignant patients in validation
target
0    0.808612
1    0.191388
Name: proportion, dtype: float64


Fold: 2 | Proportion of malignant patients in dev
target
0    0.756303
1    0.243697
Name: proportion, dtype: float64
Fold: 2 | Proportion of malignant patients in validation
target
0    0.732057
1    0.267943
Name: proportion, dtype: float64


Fold: 3 | Proportion of malignant patients in dev
target
0    0.754197
1    0.245803
Name: proportion, dtype: float64
Fold: 3 | Proportion of malignant patients in validation
target
0    0.740385
1    0.259615
Name: proportion, dtype: float64


Fold: 4 | Proportion of malignant patients in dev
target
0    0.755396
1    0.244604
Name: proportion, dtype: float64
Fold: 4 | Proportion of malignant patients in validation
target
0    0.735577
1    0.264423
Name: proportion, dtype: float64


Fold: 5 | Proportion

Unnamed: 0,Fold,DevNumSamples,DevNumPosSamples,DevNumNegSamples,DevTargetMean,DevNumPatients,DevAvgSamplesPerPatient,ValNumSamples,ValNumPosSamples,ValNumNegSamples,ValTargetMean,ValNumPatients,ValAvgSamplesPerPatient
0,1,317810,316,317494,0.000994,833,381.52461,83249,77,83172,0.000925,209,398.320574
1,2,322942,315,322627,0.000975,833,387.685474,78117,78,78039,0.000999,209,373.76555
2,3,320210,316,319894,0.000987,834,383.944844,80849,77,80772,0.000952,208,388.697115
3,4,321000,310,320690,0.000966,834,384.892086,80059,83,79976,0.001037,208,384.899038
4,5,322274,315,321959,0.000977,834,386.419664,78785,78,78707,0.00099,208,378.774038


In [7]:
# Sample id, patient id and the three fold assignments: the columns that make
# up the exported table. Preview the first ten rows.
columns_to_write = [
    id_column,
    group_column,
    gkf_fold_column,
    sgkf_fold_column,
    tsgkf_fold_column,
]
train_metadata.loc[:, columns_to_write].head(10)

Unnamed: 0,isic_id,patient_id,gkf_fold,sgkf_fold,tsgkf_fold
0,ISIC_0015670,IP_1235828,4,4,2
1,ISIC_0015845,IP_8170065,1,3,5
2,ISIC_0015864,IP_6724798,5,2,5
3,ISIC_0015902,IP_4111386,2,2,5
4,ISIC_0024200,IP_8313778,1,3,4
5,ISIC_0035502,IP_3026693,5,4,2
6,ISIC_0051648,IP_0218255,1,3,4
7,ISIC_0051665,IP_7734648,2,5,5
8,ISIC_0051710,IP_1307115,2,3,1
9,ISIC_0051758,IP_2180091,5,2,1


In [8]:
# Export the fold assignments; the DataFrame index is not written to the file.
fold_assignments = train_metadata.loc[:, columns_to_write]
fold_assignments.to_csv(OUTPUT_FILENAME, index=False)