In [1]:
import gc
from pathlib import Path

import h5py

import pandas as pd
import numpy as np

import isic_cnn_multi_predict
import isic_cnn_binary_predict
import isic_boosting_predict
from isic_boosting_predict import pauc_80, PAUC

from isic_helper import time_to_str

In [2]:
# Column names shared by every dataframe in this notebook.
id_column, target_column = "isic_id", "target"
group_column, fold_column = "patient_id", "gkf_fold"

# Kaggle input datasets: competition data and the precomputed fold assignment.
INPUT_PATH = Path("/kaggle/input") / "isic-2024-challenge"
FOLDS_PATH = Path("/kaggle/input") / "isic-scd-folds"

# -1 keeps the real test set and runs every fold; any other value is used by the
# fold-selection cell to evaluate on that train fold when the test set has 3 rows.
TEST_FOLD = -1

# Accumulates the runtime returned by each prediction run below.
TOTAL_RUNTIME = 0

# Row count used to extrapolate the measured runtime to a full submission.
EXPECTED_TEST_SIZE = 500_000

In [3]:
# Read the raw metadata tables; the literal string "NA" is parsed as missing.
train_metadata = pd.read_csv(
    INPUT_PATH / "train-metadata.csv",
    na_values=["NA"],
    low_memory=False,
)
test_metadata = pd.read_csv(
    INPUT_PATH / "test-metadata.csv",
    na_values=["NA"],
    low_memory=False,
)

# Attach the precomputed fold assignment; the inner join keeps only rows that
# appear in both the metadata and the folds file.
folds_df = pd.read_csv(FOLDS_PATH / "folds.csv")
train_metadata = pd.merge(
    train_metadata,
    folds_df,
    how="inner",
    on=[id_column, group_column],
)
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

# Image stores are opened read-only and stay open for the CNN prediction loop.
train_images = h5py.File(INPUT_PATH / "train-image.hdf5", "r")
test_images = h5py.File(INPUT_PATH / "test-image.hdf5", "r")

# The categorical / continuous column lists are taken from the train call only;
# the test call's extra return values are discarded.
train_metadata, cat_cols, cont_cols = isic_cnn_binary_predict.cnn_feature_engineering(
    train_metadata
)
test_metadata, _, _ = isic_cnn_binary_predict.cnn_feature_engineering(test_metadata)

Train data size: (401059, 58)
Test data size: (3, 44)


In [4]:
# Decide what to predict on and which fold models to run.
if (TEST_FOLD == -1) or (test_metadata.shape[0] != 3):
    # Default path: keep the loaded test set and use every fold found in train.
    folds_to_run = np.unique(train_metadata[fold_column])
else:
    # Local validation path (a fold was requested and the test set has exactly
    # 3 rows): the requested train fold stands in for the test set, images are
    # read from the train store, and the labels are kept in `test_targets`.
    folds_to_run = [TEST_FOLD]
    test_images = train_images
    test_metadata = train_metadata.loc[train_metadata[fold_column] == TEST_FOLD].reset_index(drop=True)
    test_targets = test_metadata[target_column]

# Number of rows actually predicted; used later to extrapolate the runtime.
sample_size = len(test_metadata)

In [5]:
# Display the number of rows that will be predicted in this run.
sample_size

3

In [6]:
# CNN stage.
# For every trained CNN this cell (1) loads the out-of-fold (OOF) train
# predictions stored in the model's dataset, and (2) runs inference on the test
# images. OOF columns get a `_<mode>` suffix so that the same architecture
# trained in different modes does not collide. Results are accumulated in
# `cnn_oof_train_preds_df` / `cnn_oof_test_preds_df`, one row per `id_column`.
cnn_model_names = ["efficientnet_b0", "efficientnet_b1", "efficientnet_b0", "efficientnet_b1", "efficientnet_b2"]
cnn_versions = ["v1", "v1", "v1", "v1", "v1"]
cnn_modes = ["trainmulti", "trainmulti", "trainbinary", "trainbinary", "trainbinary"]
cnn_paths = [
    f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}"
    for model_name, version, mode in zip(cnn_model_names, cnn_versions, cnn_modes)
]

all_cnn_oof_columns = []
for idx, (path, model_name, version, mode) in enumerate(
    zip(cnn_paths, cnn_model_names, cnn_versions, cnn_modes)
):
    print("\n")

    # --- OOF train predictions -------------------------------------------
    cnn_oof_train_preds_model_df = pd.read_csv(f"{path}/oof_train_preds_{model_name}_{version}.csv")
    cnn_oof_columns = [col for col in cnn_oof_train_preds_model_df if col.startswith("oof_")]
    cnn_oof_new_columns = [f"{col}_{mode}" for col in cnn_oof_columns]
    columns_mapper = dict(zip(cnn_oof_columns, cnn_oof_new_columns))
    cnn_oof_train_preds_model_df = cnn_oof_train_preds_model_df.rename(columns=columns_mapper)
    all_cnn_oof_columns += cnn_oof_new_columns
    if idx == 0:
        cnn_oof_train_preds_df = cnn_oof_train_preds_model_df[[id_column] + cnn_oof_new_columns].copy()
    else:
        cnn_oof_train_preds_df = cnn_oof_train_preds_df.merge(
            cnn_oof_train_preds_model_df[[id_column] + cnn_oof_new_columns], on=id_column, how="left"
        )
    # Every model must cover the same number of rows as the accumulated frame.
    assert cnn_oof_train_preds_df.shape[0] == cnn_oof_train_preds_model_df.shape[0], (
        f"OOF train row count mismatch for {model_name}_{version} ({mode})"
    )

    # --- Test predictions ------------------------------------------------
    if mode == "trainmulti":
        cnn_oof_test_preds_model_df, runtime = isic_cnn_multi_predict.run(
            test_metadata, test_images, model_name, version, path, folds_to_run
        )
    elif mode == "trainbinary":
        cnn_oof_test_preds_model_df, runtime = isic_cnn_binary_predict.run(
            test_metadata, test_images, model_name, version, path, folds_to_run, cat_cols, cont_cols
        )
    else:
        # Without this branch an unknown mode would silently reuse the previous
        # model's predictions and runtime (or fail with a NameError on the
        # first iteration).
        raise ValueError(f"Unsupported CNN mode {mode!r} for {model_name}_{version}")
    cnn_oof_test_preds_model_df = cnn_oof_test_preds_model_df.rename(columns=columns_mapper)
    if idx == 0:
        cnn_oof_test_preds_df = cnn_oof_test_preds_model_df[[id_column] + cnn_oof_new_columns].copy()
    else:
        cnn_oof_test_preds_df = cnn_oof_test_preds_df.merge(
            cnn_oof_test_preds_model_df[[id_column] + cnn_oof_new_columns], on=id_column, how="left"
        )
    assert cnn_oof_test_preds_df.shape[0] == cnn_oof_test_preds_model_df.shape[0], (
        f"Test prediction row count mismatch for {model_name}_{version} ({mode})"
    )

    # Accumulate the inference time reported by the predictor.
    TOTAL_RUNTIME += runtime



Predicting for efficientnet_b0_v1
{'debug': False,
 'ext': '2020,2019',
 'fold_column': 'gkf_fold',
 'image_size': 64,
 'init_lr': 3e-05,
 'mixed_precision': 'fp16',
 'mode': 'trainmulti',
 'n_tta': 8,
 'num_epochs': 20,
 'num_workers': 8,
 'seed': 2022,
 'train_batch_size': 64,
 'val_batch_size': 512}

Fold 1
Step: 1/1

Fold 2
Step: 1/1

Fold 3
Step: 1/1

Fold 4
Step: 1/1

Fold 5
Step: 1/1
Time taken: 4.81 s
Predictions generated for efficientnet_b0_v1


Predicting for efficientnet_b1_v1
{'debug': False,
 'ext': '2020,2019',
 'fold_column': 'gkf_fold',
 'image_size': 92,
 'init_lr': 3e-05,
 'mixed_precision': 'fp16',
 'mode': 'trainmulti',
 'n_tta': 8,
 'num_epochs': 20,
 'num_workers': 8,
 'seed': 2022,
 'train_batch_size': 64,
 'val_batch_size': 512}

Fold 1
Step: 1/1

Fold 2
Step: 1/1

Fold 3
Step: 1/1

Fold 4
Step: 1/1

Fold 5
Step: 1/1
Time taken: 4.78 s
Predictions generated for efficientnet_b1_v1


Predicting for efficientnet_b0_v1
{'debug': False,
 'down_sampling': True,
 'f

In [7]:
# Drop the CNN-engineered metadata frames; the raw CSVs are re-read in the next
# cell for the boosting feature engineering.
del train_metadata, test_metadata
# Last expression of the cell, so the value returned by gc.collect() is displayed.
gc.collect()

69162

In [8]:
# Boosting stage inputs: re-read the raw metadata (the CNN-engineered frames
# were deleted), attach the folds, apply the boosting feature engineering and
# add the CNN predictions as extra features.
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False, na_values=["NA"])
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv", low_memory=False, na_values=["NA"])

folds_df = pd.read_csv(FOLDS_PATH / "folds.csv")
train_metadata = train_metadata.merge(folds_df, on=[id_column, group_column], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

# Re-apply the local-validation switch used in the fold-selection cell.
# `test_metadata` has just been re-read from disk, so without this the boosting
# models would see the 3-row placeholder test set while `cnn_oof_test_preds_df`
# holds predictions for the TEST_FOLD rows, and the left merge below would
# produce only NaN CNN features. With TEST_FOLD == -1 this is a no-op.
use_validation_fold = (TEST_FOLD != -1) and (test_metadata.shape[0] == 3)
if use_validation_fold:
    test_metadata = train_metadata[train_metadata[fold_column] == TEST_FOLD].reset_index(drop=True)

# Only the engineered frame is kept; the second return value is discarded.
train_metadata, _ = isic_boosting_predict.boosting_feature_engineering(train_metadata)
test_metadata, _ = isic_boosting_predict.boosting_feature_engineering(test_metadata)

# Add the CNN OOF / test predictions as features (skipped when no CNN was run).
if len(all_cnn_oof_columns) > 0:
    train_metadata = train_metadata.merge(cnn_oof_train_preds_df, on=id_column, how="left")
    test_metadata = test_metadata.merge(cnn_oof_test_preds_df, on=id_column, how="left")

Train data size: (401059, 58)
Test data size: (3, 44)


In [9]:
# Boosting stage.
# For every gradient-boosting model this cell loads its out-of-fold (OOF) train
# predictions and predicts on the test metadata. Results are accumulated in
# `boosting_oof_train_preds_df` / `boosting_oof_test_preds_df`, one row per
# `id_column`.
boosting_model_names = ["xgb", "xgb", "lgb", "cb"]
boosting_versions = ["v4", "v5", "v6", "v1"]
boosting_modes = ["train", "train", "train", "train"]
boosting_paths = [f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}" for model_name, version, mode in zip(
    boosting_model_names, boosting_versions, boosting_modes)]

all_boosting_oof_columns = []
for idx, path in enumerate(boosting_paths):
    print("\n")
    model_name = boosting_model_names[idx]
    version = boosting_versions[idx]
    mode = boosting_modes[idx]

    # --- OOF train predictions -------------------------------------------
    boosting_oof_train_preds_model_df = pd.read_csv(f"{path}/oof_train_preds_{model_name}_{version}.csv")
    boosting_oof_columns = [col for col in boosting_oof_train_preds_model_df if col.startswith("oof_")]
    all_boosting_oof_columns += boosting_oof_columns
    if idx == 0:
        boosting_oof_train_preds_df = boosting_oof_train_preds_model_df[[id_column] + boosting_oof_columns].copy()
    else:
        boosting_oof_train_preds_df = boosting_oof_train_preds_df.merge(boosting_oof_train_preds_model_df[[id_column] + boosting_oof_columns], on=id_column, how="left")
    # Compare the accumulated frame with this model's frame. The previous check
    # compared the accumulated frame with itself and therefore could never fail.
    assert boosting_oof_train_preds_df.shape[0] == boosting_oof_train_preds_model_df.shape[0], (
        f"OOF train row count mismatch for {model_name}_{version}"
    )

    # --- Test predictions ------------------------------------------------
    boosting_oof_test_preds_model_df, runtime = isic_boosting_predict.run(train_metadata, test_metadata, model_name, version, path, folds_to_run)
    if idx == 0:
        boosting_oof_test_preds_df = boosting_oof_test_preds_model_df[[id_column] + boosting_oof_columns].copy()
    else:
        boosting_oof_test_preds_df = boosting_oof_test_preds_df.merge(boosting_oof_test_preds_model_df[[id_column] + boosting_oof_columns], on=id_column, how="left")
    # Same correction as above: check against the per-model frame.
    assert boosting_oof_test_preds_df.shape[0] == boosting_oof_test_preds_model_df.shape[0], (
        f"Test prediction row count mismatch for {model_name}_{version}"
    )

    # Accumulate the prediction time reported by the predictor.
    TOTAL_RUNTIME += runtime



Predicting for xgb_v4
{'best_num_rounds': {'fold_1': 82,
                     'fold_2': 152,
                     'fold_3': 20,
                     'fold_4': 142,
                     'fold_5': 77},
 'config': {'_key': None,
            '_parent': None,
            '_temp': False,
            'fold_column': 'gkf_fold',
            'model_name': 'xgb_v4',
            'models_output_dir': 'models',
            'sampling_ratio': 0.01,
            'seed': 2022},
 'cv_auc_avg': 0.9680055039541848,
 'cv_auc_oof': 0.947856723136131,
 'cv_auc_std': 0.006492120739229583,
 'cv_pauc_avg': 0.17415984683425967,
 'cv_pauc_oof': 0.1540868271122474,
 'cv_pauc_std': 0.005848210144159227,
 'es_rounds': 150,
 'num_rounds': 2000,
 'params': {'alpha': 0.6779926606782505,
            'colsample_bylevel': 0.5476090898823716,
            'colsample_bynode': 0.9928601203635129,
            'colsample_bytree': 0.8437772277074493,
            'disable_default_eval_metric': True,
            'enable_categorica

In [10]:
# Linearly extrapolate the runtime measured on `sample_size` rows to the
# expected size of the hidden test set.
# NOTE(review): with a very small sample the measured time presumably contains
# fixed per-model overhead, which this scaling multiplies as well — treat the
# figure as a rough upper bound and confirm on a larger sample.
factor = EXPECTED_TEST_SIZE / sample_size
expected_total_runtime = TOTAL_RUNTIME * factor
print(f"Expected total runtime during submission: {time_to_str(expected_total_runtime)}")

Expected total runtime during submission: 95383 min 5.27 sec


In [11]:
# Final blend: weighted sum of the percentile ranks of the selected model
# predictions.
ensemble_preds_df = boosting_oof_test_preds_df.merge(cnn_oof_test_preds_df, on=id_column, how="left")

# Prediction columns entering the blend and their weights (same order).
oof_columns = ['oof_xgb_v4',
 'oof_xgb_v5',
 'oof_lgb_v6',
 'oof_cb_v1',
 'oof_efficientnet_b1_v1_trainbinary',
 'oof_efficientnet_b2_v1_trainbinary']

weights = [3.9905912950723423,
 3.882215991236038,
 5.566407318116632,
 10.0,
 4.358581822764368,
 0.9840327592527358]

# zip() stops at the shorter list, so a column/weight mismatch would silently
# drop models from the blend; fail loudly instead.
assert len(oof_columns) == len(weights), (
    f"{len(oof_columns)} ensemble columns but {len(weights)} weights"
)
missing_columns = [col for col in oof_columns if col not in ensemble_preds_df.columns]
assert not missing_columns, f"Missing prediction columns: {missing_columns}"

ensemble_preds = 0
for oof_column, weight in zip(oof_columns, weights):
    # rank(pct=True) maps each model's scores to percentile ranks, so models
    # with different output scales contribute comparably.
    ensemble_preds += ensemble_preds_df[oof_column].rank(pct=True).values * weight
# NOTE(review): the result is a weighted sum of ranks, not a probability; divide
# by sum(weights) if a score in [0, 1] is required.
ensemble_preds_df[target_column] = ensemble_preds
ensemble_preds_df.head()

Unnamed: 0,isic_id,oof_xgb_v4,oof_xgb_v5,oof_lgb_v6,oof_cb_v1,oof_efficientnet_b0_v1_trainmulti,oof_efficientnet_b0_v1_AKIEC_trainmulti,oof_efficientnet_b0_v1_BCC_trainmulti,oof_efficientnet_b0_v1_BKL_trainmulti,oof_efficientnet_b0_v1_DF_trainmulti,...,oof_efficientnet_b1_v1_DF_trainmulti,oof_efficientnet_b1_v1_MEL_trainmulti,oof_efficientnet_b1_v1_NV_trainmulti,oof_efficientnet_b1_v1_SCC_trainmulti,oof_efficientnet_b1_v1_VASC_trainmulti,oof_efficientnet_b1_v1_unknown_trainmulti,oof_efficientnet_b0_v1_trainbinary,oof_efficientnet_b1_v1_trainbinary,oof_efficientnet_b2_v1_trainbinary,target
0,ISIC_0015657,0.229051,0.311791,0.465946,0.220589,0.001234,0.000142,0.000799,0.000456,3.516355e-05,...,7.87291e-06,2.8e-05,3.8e-05,1.03214e-05,5.414601e-07,0.99976,0.002732,0.000534,0.002771,26.15756
1,ISIC_0015729,0.141931,0.25371,0.247114,0.032359,2.5e-05,2e-06,1.5e-05,2.8e-05,2.128295e-06,...,6.767254e-07,1e-06,2.1e-05,2.178217e-07,1.77239e-07,0.99997,0.001435,0.000102,0.000247,11.449412
2,ISIC_0015740,0.325655,0.366158,0.206437,0.045649,3.4e-05,3e-06,2.6e-05,8e-06,6.783895e-07,...,2.987788e-06,3e-06,1e-05,3.863554e-06,3.349879e-07,0.999949,0.001809,0.000159,0.000853,19.956686


In [12]:
# Summary statistics of the blended score, as a quick sanity check.
ensemble_preds_df[target_column].describe()

count     3.000000
mean     19.187886
std       7.384152
min      11.449412
25%      15.703049
50%      19.956686
75%      23.057123
max      26.157560
Name: target, dtype: float64

In [13]:
# Preview the two columns that make up the submission file.
ensemble_preds_df[[id_column, target_column]].head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,26.15756
1,ISIC_0015729,11.449412
2,ISIC_0015740,19.956686


In [14]:
# Write the submission: id and blended score only, without the dataframe index.
ensemble_preds_df[[id_column, target_column]].to_csv("submission.csv", index=False)