In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import GroupKFold
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# --- Paths -------------------------------------------------------------------
# Directory the training metadata is read from, and the name of the CSV file
# this notebook writes (relative to the working directory).
INPUT_PATH = Path("../input/isic-2024-challenge/")
OUTPUT_FILENAME = "folds.csv"

# --- Cross-validation --------------------------------------------------------
# Number of group-wise validation folds to create.
FOLDS = 5
# NOTE(review): SEED is not referenced by any cell visible in this notebook;
# confirm whether the splitter was meant to use it.
SEED = 2022

In [3]:
# Load the training metadata table. low_memory=False makes pandas read the
# file in a single pass rather than in internal chunks, which avoids
# mixed-dtype inference on wide CSVs.
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False)
print(f"Train data size: {train_metadata.shape}")

Train data size: (401059, 55)


In [4]:
# Preview the first five rows to eyeball the available columns.
train_metadata.head(n=5)

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


In [5]:
# Names of the metadata columns this notebook reads.
id_column, target_column, group_column = "isic_id", "target", "patient_id"

# Names of the columns this notebook adds to the table.
fold_column, multiclass_target_column, sample_weight_column = (
    "fold",
    "multiclass_target",
    "sample_weight",
)

# Assign every row to one of FOLDS validation folds so that all rows sharing a
# group value (patient) land in the same fold. Folds are numbered 1..FOLDS;
# 0 is the "not assigned yet" sentinel and must not survive the loop.
# NOTE(review): GroupKFold does not use the target when splitting, so the
# positive rows are not balanced across folds on purpose here — confirm that
# this is intended.
folds = np.zeros(train_metadata.shape[0], dtype=int)

gkf = GroupKFold(n_splits=FOLDS)
for fold_idx, (_, val_idx) in enumerate(
    gkf.split(train_metadata, train_metadata[target_column], train_metadata[group_column])
):
    # Each row appears in exactly one validation split; tag it with that fold.
    folds[val_idx] = fold_idx + 1

train_metadata[fold_column] = folds

# Sanity checks: nothing left on the sentinel, and no group spans two folds.
assert ((folds >= 1) & (folds <= FOLDS)).all(), "some rows were not assigned to a fold"
assert (
    train_metadata.groupby(group_column)[fold_column].nunique().le(1).all()
), f"at least one {group_column} is spread over several folds"

In [6]:
# Derive a five-way label from the binary target, the presence of a lesion_id
# and the iddx_3 diagnosis text:
#   * target == 0 without lesion_id -> "0_Benign:Weak"
#   * target == 0 with lesion_id    -> "0_Benign:Strong"
#   * lesion_id present and iddx_3 mentions BCC / Melanoma / SCC -> "1_*:Strong"
# The column is reset to "" first so the cell can be re-run safely.
has_lesion_id = train_metadata["lesion_id"].notnull()
is_benign = train_metadata[target_column] == 0

train_metadata[multiclass_target_column] = ""
train_metadata.loc[~has_lesion_id & is_benign, multiclass_target_column] = "0_Benign:Weak"
train_metadata.loc[has_lesion_id & is_benign, multiclass_target_column] = "0_Benign:Strong"

# Diagnosis substring -> label. Applied in this order; a later match overwrites
# an earlier one, exactly like sequential .loc assignments would.
malignant_labels = {
    "Basal cell carcinoma": "1_BCC:Strong",
    "Melanoma": "1_Melanoma:Strong",
    "Squamous cell carcinoma": "1_SCC:Strong",
}
for diagnosis, label in malignant_labels.items():
    # regex=False: the diagnosis names are literal text, not patterns.
    # na=False: rows with a missing iddx_3 are explicit non-matches, so the
    # mask is a clean boolean Series instead of an object Series with NaN.
    mentions_diagnosis = train_metadata["iddx_3"].str.contains(diagnosis, regex=False, na=False)
    train_metadata.loc[has_lesion_id & mentions_diagnosis, multiclass_target_column] = label

# Every row must have received a label ...
unlabeled = train_metadata[multiclass_target_column] == ""
assert not unlabeled.any(), f"{int(unlabeled.sum())} rows received no multiclass label"

# ... and the "0_"/"1_" prefix of that label must agree with the binary target,
# otherwise a diagnosis match has overwritten a benign row.
labelled_malignant = train_metadata[multiclass_target_column].str.startswith("1_")
assert (
    labelled_malignant == (train_metadata[target_column] == 1)
).all(), "multiclass label prefix disagrees with the binary target"

In [7]:
# Sample weight assigned to each multiclass label. The benign classes get small
# weights, the malignant classes large ones; SCC is weighted twice as heavily
# as BCC and melanoma.
class_weights = {
    "0_Benign:Weak": 1,
    "0_Benign:Strong": 2,
    "1_BCC:Strong": 100,
    "1_Melanoma:Strong": 100,
    "1_SCC:Strong": 200,
}

In [8]:
# Look up the weight of each row's multiclass label.
train_metadata[sample_weight_column] = train_metadata[multiclass_target_column].map(class_weights)

# Series.map yields NaN for labels missing from the mapping; fail loudly
# instead of silently writing empty weights to the output file.
unweighted = train_metadata[sample_weight_column].isna()
assert not unweighted.any(), (
    "labels without a class weight: "
    f"{sorted(train_metadata.loc[unweighted, multiclass_target_column].unique())}"
)

In [9]:
# Columns (and their order) selected for the output table; previewed here on
# the first ten rows.
columns_to_write = [
    id_column,
    group_column,
    multiclass_target_column,
    fold_column,
    sample_weight_column,
]
train_metadata.loc[:, columns_to_write].head(10)

Unnamed: 0,isic_id,patient_id,multiclass_target,fold,sample_weight
0,ISIC_0015670,IP_1235828,0_Benign:Weak,4,1
1,ISIC_0015845,IP_8170065,0_Benign:Strong,1,2
2,ISIC_0015864,IP_6724798,0_Benign:Weak,5,1
3,ISIC_0015902,IP_4111386,0_Benign:Weak,2,1
4,ISIC_0024200,IP_8313778,0_Benign:Weak,1,1
5,ISIC_0035502,IP_3026693,0_Benign:Weak,5,1
6,ISIC_0051648,IP_0218255,0_Benign:Weak,1,1
7,ISIC_0051665,IP_7734648,0_Benign:Weak,2,1
8,ISIC_0051710,IP_1307115,0_Benign:Weak,2,1
9,ISIC_0051758,IP_2180091,0_Benign:Weak,5,1


In [10]:
# Absolute number of rows per multiclass label, most frequent first.
train_metadata.loc[:, multiclass_target_column].value_counts(normalize=False)

multiclass_target
0_Benign:Weak        379001
0_Benign:Strong       21665
1_BCC:Strong            163
1_Melanoma:Strong       157
1_SCC:Strong             73
Name: count, dtype: int64

In [11]:
# Share of rows per multiclass label (fractions summing to 1).
train_metadata.loc[:, multiclass_target_column].value_counts(normalize=True)

multiclass_target
0_Benign:Weak        0.945001
0_Benign:Strong      0.054019
1_BCC:Strong         0.000406
1_Melanoma:Strong    0.000391
1_SCC:Strong         0.000182
Name: proportion, dtype: float64

In [12]:
# Write only the selected columns to disk; index=False keeps the positional
# row index out of the CSV.
train_metadata.loc[:, columns_to_write].to_csv(path_or_buf=OUTPUT_FILENAME, index=False)