In [1]:
import gc
from pathlib import Path

import h5py

import pandas as pd
import numpy as np

import isic_cnn_binary_predict

from isic_helper import time_to_str

In [2]:
# Column names used throughout the notebook.
id_column, target_column = "isic_id", "target"
group_column, fold_column = "patient_id", "gkf_fold"

# Dataset locations on the Kaggle filesystem.
INPUT_PATH = Path("/kaggle/input/isic-2024-challenge/")
FOLDS_PATH = Path("/kaggle/input/isic-scd-folds")

# Running sum of the runtimes reported by the prediction calls further down.
TOTAL_RUNTIME = 0
# -1 keeps the real test set; any other value selects that fold of the training
# data as a stand-in test set (only when the placeholder test set is loaded).
TEST_FOLD = -1

# Test-set size used to extrapolate the measured runtime to a full submission.
EXPECTED_TEST_SIZE = 500_000

In [3]:
# Fold assignments are joined onto the training metadata; the inner join keeps
# only rows that appear in both tables.
folds_df = pd.read_csv(FOLDS_PATH / "folds.csv")
train_metadata = pd.merge(
    pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False, na_values=["NA"]),
    folds_df,
    on=[id_column, group_column],
    how="inner",
)
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv", low_memory=False, na_values=["NA"])
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

# Image archives are opened read-only and stay open for the prediction cells.
train_images = h5py.File(INPUT_PATH / "train-image.hdf5", "r")
test_images = h5py.File(INPUT_PATH / "test-image.hdf5", "r")

# The column lists returned for the training data are the ones passed on to
# the prediction call; those returned for the test data are discarded.
train_metadata, cat_cols, cont_cols = isic_cnn_binary_predict.cnn_feature_engineering(train_metadata)
test_metadata, _, _ = isic_cnn_binary_predict.cnn_feature_engineering(test_metadata)

Train data size: (401059, 58)
Test data size: (3, 44)


In [4]:
# Default: predict on the real test set with every fold's model. Only when a
# debug fold is requested AND the 3-row placeholder test set is loaded do we
# swap in that fold of the training data (and its images) as the test set.
if (TEST_FOLD == -1) or (len(test_metadata) != 3):
    folds_to_run = np.unique(train_metadata[fold_column])
else:
    folds_to_run = [TEST_FOLD]
    test_images = train_images
    test_metadata = train_metadata[train_metadata[fold_column] == TEST_FOLD].reset_index(drop=True)
    # Ground truth for the held-out fold, taken from the swapped-in metadata.
    test_targets = test_metadata[target_column]
sample_size = len(test_metadata)
sample_size

3

In [5]:
# One entry per CNN; the three lists are aligned by position.
cnn_model_names = ["efficientnet_b0"]
cnn_versions = ["v2"]
cnn_modes = ["trainbinary"]
# zip() silently truncates to the shortest list, so fail loudly on a mismatch.
if not (len(cnn_model_names) == len(cnn_versions) == len(cnn_modes)):
    raise ValueError("cnn_model_names, cnn_versions and cnn_modes must have the same length")
cnn_paths = [f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}" for model_name, version, mode in zip(cnn_model_names, cnn_versions, cnn_modes)]

# Start from zero so that re-running this cell does not double-count runtime.
TOTAL_RUNTIME = 0

all_cnn_oof_columns = []
for idx, path in enumerate(cnn_paths):
    print("\n")
    model_name = cnn_model_names[idx]
    version = cnn_versions[idx]
    mode = cnn_modes[idx]

    # Saved out-of-fold training predictions: suffix every "oof_" column with
    # the training mode so columns from different models cannot collide.
    cnn_oof_train_preds_model_df = pd.read_csv(f"{path}/oof_train_preds_{model_name}_{version}.csv")
    cnn_oof_columns = [col for col in cnn_oof_train_preds_model_df if col.startswith("oof_")]
    cnn_oof_new_columns = [f"{col}_{mode}" for col in cnn_oof_columns]
    columns_mapper = dict(zip(cnn_oof_columns, cnn_oof_new_columns))
    cnn_oof_train_preds_model_df = cnn_oof_train_preds_model_df.rename(columns=columns_mapper)
    all_cnn_oof_columns += cnn_oof_new_columns
    if idx == 0:
        cnn_oof_train_preds_df = cnn_oof_train_preds_model_df[[id_column] + cnn_oof_new_columns].copy()
    else:
        cnn_oof_train_preds_df = cnn_oof_train_preds_df.merge(cnn_oof_train_preds_model_df[[id_column] + cnn_oof_new_columns], on=id_column, how="left")
    # A left merge on duplicated ids would multiply rows; checked for every
    # model, mirroring the check on the test predictions below.
    assert cnn_oof_train_preds_df.shape[0] == cnn_oof_train_preds_model_df.shape[0], \
        f"Row count mismatch in train OOF predictions for {model_name}_{version}"

    # Predict on the (possibly swapped-in) test set with the selected folds.
    if mode == "trainmulti":
        # NOTE(review): `isic_cnn_multi_predict` is not imported in the visible
        # import cell; this branch raises NameError until that import is added.
        cnn_oof_test_preds_model_df, runtime = isic_cnn_multi_predict.run(test_metadata, test_images, model_name, version, path, folds_to_run)
    elif mode == "trainbinary":
        cnn_oof_test_preds_model_df, runtime = isic_cnn_binary_predict.run(test_metadata, test_images, model_name, version, path, folds_to_run, cat_cols, cont_cols)
    else:
        # Without this branch an unknown mode would reuse the previous model's
        # predictions (or hit a NameError on the first model).
        raise ValueError(f"Unsupported CNN mode: {mode!r}")
    cnn_oof_test_preds_model_df = cnn_oof_test_preds_model_df.rename(columns=columns_mapper)
    if idx == 0:
        cnn_oof_test_preds_df = cnn_oof_test_preds_model_df[[id_column] + cnn_oof_new_columns].copy()
    else:
        cnn_oof_test_preds_df = cnn_oof_test_preds_df.merge(cnn_oof_test_preds_model_df[[id_column] + cnn_oof_new_columns], on=id_column, how="left")
    assert cnn_oof_test_preds_df.shape[0] == cnn_oof_test_preds_model_df.shape[0], \
        f"Row count mismatch in test predictions for {model_name}_{version}"

    TOTAL_RUNTIME += runtime



Predicting for efficientnet_b0_v2
{'debug': False,
 'down_sampling': True,
 'fold_column': 'gkf_fold',
 'image_size': 64,
 'init_lr': 3e-05,
 'load_checkpoint': True,
 'mixed_precision': 'fp16',
 'mode': 'trainbinary',
 'n_tta': 7,
 'num_epochs': 20,
 'num_workers': 8,
 'sampling_rate': 0.01,
 'seed': 2022,
 'train_batch_size': 64,
 'use_meta': True,
 'val_batch_size': 512}

Fold 1
Step: 1/1

Fold 2
Step: 1/1

Fold 3
Step: 1/1

Fold 4
Step: 1/1

Fold 5
Step: 1/1
Time taken: 5.16 s
Predictions generated for efficientnet_b0_v2


In [6]:
# Extrapolate the measured prediction runtime to a full-size submission.
# The estimate scales by (a) the ratio of the expected test size to the number
# of rows actually predicted and (b) the ratio of all folds to the folds that
# were actually run, so it stays correct whether one fold or every fold ran.
# Skipped when no rows were predicted, to avoid dividing by zero.
if (TEST_FOLD != -1) and (sample_size > 0):
    factor = EXPECTED_TEST_SIZE / sample_size
    fold_factor = len(np.unique(train_metadata[fold_column])) / len(folds_to_run)
    expected_total_runtime = TOTAL_RUNTIME * factor * fold_factor
    print(f"Expected total runtime during submission: {time_to_str(expected_total_runtime)}")

In [7]:
# Work on a copy so the raw per-model test predictions stay untouched.
ensemble_preds_df = cnn_oof_test_preds_df.copy()

# Blend weight for each prediction column.
weights = {'oof_efficientnet_b0_v2_trainbinary': 1}

# Each column is converted to percentile ranks before the weighted sum, so
# only the ordering of a model's predictions contributes to the blend.
ensemble_preds = sum(
    ensemble_preds_df[column].rank(pct=True).values * coefficient
    for column, coefficient in weights.items()
)
ensemble_preds_df[target_column] = ensemble_preds
ensemble_preds_df.head()

Unnamed: 0,isic_id,oof_efficientnet_b0_v2_trainbinary,target
0,ISIC_0015657,0.001824,1.0
1,ISIC_0015729,7.3e-05,0.666667
2,ISIC_0015740,5.6e-05,0.333333


In [8]:
# Summary statistics of the blended score as a quick sanity check.
ensemble_preds_df.loc[:, target_column].describe()

count    3.000000
mean     0.666667
std      0.333333
min      0.333333
25%      0.500000
50%      0.666667
75%      0.833333
max      1.000000
Name: target, dtype: float64

In [9]:
# Preview the two columns that are written to the submission file.
ensemble_preds_df.loc[:, [id_column, target_column]].head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,1.0
1,ISIC_0015729,0.666667
2,ISIC_0015740,0.333333


In [10]:
# Write only the id and blended score columns, without the index.
ensemble_preds_df.to_csv("submission.csv", columns=[id_column, target_column], index=False)

In [11]:
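# Optional evaluation against held-out targets. `test_targets` is only defined
# when TEST_FOLD != -1 and the 3-row placeholder test set was loaded, so these
# lines can only be uncommented in that debug configuration.
# from isic_helper import compute_auc, compute_pauc
# testing_pauc = compute_pauc(test_targets, ensemble_preds_df[target_column], min_tpr=0.8)
# testing_auc = compute_auc(test_targets, ensemble_preds_df[target_column])
# testing_auc, testing_pauc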