In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import GroupKFold

In [2]:
# Notebook-wide configuration.
# NOTE(review): SEED is not referenced by any cell visible in this notebook —
# confirm whether it is still needed.
SEED = 2022
# Number of cross-validation folds passed to GroupKFold.
FOLDS = 5
# Directory that holds train-metadata.csv.
INPUT_PATH = Path("../input/isic-2024-challenge/")
# Name of the CSV file written by the last cell (relative to the working directory).
OUTPUT_FILENAME = "folds.csv"

In [3]:
# Read the training metadata table; low_memory=False makes pandas parse the
# file in a single pass instead of in internal chunks.
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False)
print(f"Train data size: {train_metadata.shape}")

Train data size: (401059, 55)


In [4]:
# Preview the first five rows of the raw metadata.
train_metadata.head(n=5)

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


In [5]:
# Column names used by the cells that follow.
id_column, target_column, group_column = "isic_id", "target", "patient_id"
fold_column = "fold"
final_target_column = "final_target"
sample_weight_column = "sample_weight"

# Split rows into FOLDS validation folds, grouping by patient_id so that all
# rows of one patient land in the same fold. Folds are numbered 1..FOLDS.
gkf = GroupKFold(n_splits=FOLDS)
folds = np.zeros(len(train_metadata), dtype=int)
patient_splits = gkf.split(
    X=train_metadata,
    y=train_metadata[target_column],
    groups=train_metadata[group_column],
)
for fold_number, (_, val_idx) in enumerate(patient_splits, start=1):
    # Only the validation indices matter here: each row is a validation row
    # in exactly one split, which becomes its fold number.
    folds[val_idx] = fold_number

train_metadata[fold_column] = folds

In [6]:
# Build the multi-class label column `final_target`:
#   -1 -> target == 0 and no lesion_id
#    0 -> target == 0 and a lesion_id is present
#    1 -> lesion_id present and iddx_3 contains "Basal cell carcinoma"
#    2 -> lesion_id present and iddx_3 contains "Melanoma"
#    3 -> lesion_id present and iddx_3 contains "Squamous cell carcinoma"
# Rules are applied in this order, so a later rule overwrites an earlier one.
has_lesion_id = train_metadata["lesion_id"].notnull()
is_negative = train_metadata[target_column] == 0
diagnosis = train_metadata["iddx_3"]

train_metadata[final_target_column] = np.nan
train_metadata.loc[~has_lesion_id & is_negative, final_target_column] = -1
train_metadata.loc[has_lesion_id & is_negative, final_target_column] = 0

diagnosis_classes = (
    (1, "Basal cell carcinoma"),
    (2, "Melanoma"),
    (3, "Squamous cell carcinoma"),
)
for class_id, diagnosis_name in diagnosis_classes:
    # iddx_3 has missing values: na=False turns them into an explicit False
    # instead of relying on how `&` treats NaN. regex=False matches the
    # diagnosis name as a literal substring rather than as a pattern.
    diagnosis_matches = diagnosis.str.contains(diagnosis_name, regex=False, na=False)
    train_metadata.loc[has_lesion_id & diagnosis_matches, final_target_column] = class_id

# Every row must have been covered by one of the rules above.
unlabeled_rows = int(train_metadata[final_target_column].isnull().sum())
assert unlabeled_rows == 0, f"{unlabeled_rows} rows did not match any final_target rule"
train_metadata[final_target_column] = train_metadata[final_target_column].astype(int)

In [7]:
# Derive one weight per final_target class and attach it to every row.
class_counts = train_metadata[final_target_column].value_counts()
positive_counts = class_counts[class_counts.index > 0]

# Classes with a label > 0 share a total weight of 0.1, split in proportion
# to how many rows each class has.
positive_share = positive_counts * 0.1 / positive_counts.sum()
pos_sample_weights = {int(label): positive_share[label] for label in positive_share.index}

# Fixed weights for labels -1 and 0, followed by the computed positive-class weights.
sample_weights = {-1: 0.3, 0: 0.6, **pos_sample_weights}

train_metadata[sample_weight_column] = train_metadata[final_target_column].map(sample_weights)

In [8]:
# Rows labeled -1 become class 0 in the final label column; their
# sample_weight column is left as already computed.
train_metadata[final_target_column] = train_metadata[final_target_column].replace(-1, 0)

In [9]:
# Columns exported to the folds file, in output order.
columns_to_write = [
    id_column,
    group_column,
    final_target_column,
    fold_column,
    sample_weight_column,
]
# Preview the first ten rows of the export.
train_metadata.loc[:, columns_to_write].head(n=10)

Unnamed: 0,isic_id,patient_id,final_target,fold,sample_weight
0,ISIC_0015670,IP_1235828,0,4,0.3
1,ISIC_0015845,IP_8170065,0,1,0.6
2,ISIC_0015864,IP_6724798,0,5,0.3
3,ISIC_0015902,IP_4111386,0,2,0.3
4,ISIC_0024200,IP_8313778,0,1,0.3
5,ISIC_0035502,IP_3026693,0,5,0.3
6,ISIC_0051648,IP_0218255,0,1,0.3
7,ISIC_0051665,IP_7734648,0,2,0.3
8,ISIC_0051710,IP_1307115,0,2,0.3
9,ISIC_0051758,IP_2180091,0,5,0.3


In [10]:
# Number of rows per final_target class.
train_metadata[final_target_column].value_counts()

final_target
0    400666
1       163
2       157
3        73
Name: count, dtype: int64

In [11]:
# Same breakdown as fractions of all rows (normalize=True).
train_metadata[final_target_column].value_counts(normalize=True)

final_target
0    0.999020
1    0.000406
2    0.000391
3    0.000182
Name: proportion, dtype: float64

In [12]:
# Write the selected columns to OUTPUT_FILENAME; index=False omits the
# DataFrame index from the CSV.
train_metadata[columns_to_write].to_csv(OUTPUT_FILENAME, index=False)