In [1]:
import gc
import time
from pathlib import Path
from dataclasses import dataclass

import h5py

import pandas as pd
import numpy as np

import isic_cnn_binary_predict
import isic_cnn_psuedo_train_predict

from isic_helper import time_to_str

In [2]:
id_column = "isic_id"
target_column = "target"
group_column = "patient_id"
fold_column = "gkf_fold"

INPUT_PATH = Path("/kaggle/input/isic-2024-challenge/")
FOLDS_PATH = Path("/kaggle/input/isic-scd-folds")

TOTAL_RUNTIME = 0
TEST_FOLD = -1

EXPECTED_TEST_SIZE = 500000

PSEUDO_LABELLING = True

In [3]:
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False, na_values=["NA"])
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv", low_memory=False, na_values=["NA"])

folds_df = pd.read_csv(FOLDS_PATH / "folds.csv")
train_metadata = train_metadata.merge(folds_df, on=[id_column, group_column], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

train_images = h5py.File(INPUT_PATH / "train-image.hdf5", mode="r")
test_images = h5py.File(INPUT_PATH / "test-image.hdf5", mode="r")

train_metadata, cat_cols, cont_cols = isic_cnn_binary_predict.cnn_feature_engineering(train_metadata)
test_metadata, _, _ = isic_cnn_binary_predict.cnn_feature_engineering(test_metadata)

Train data size: (401059, 58)
Test data size: (3, 44)


In [4]:
if (TEST_FOLD != -1) and (test_metadata.shape[0] == 3):
    test_metadata = train_metadata[train_metadata[fold_column] == TEST_FOLD].reset_index(drop=True)
    test_images = train_images
    folds_to_run = [TEST_FOLD]
    test_targets = test_metadata[target_column]
else:
    folds_to_run = np.unique(train_metadata[fold_column])
sample_size = test_metadata.shape[0];sample_size

3

In [5]:
cnn_model_names = ["efficientnet_b0"]
cnn_versions = ["v2"]
cnn_modes = ["trainbinary"]
cnn_paths = [f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}" for model_name, version, mode in zip(cnn_model_names, cnn_versions, cnn_modes)]

all_cnn_oof_columns = []
for idx, path in enumerate(cnn_paths):
    print("\n")
    model_name = cnn_model_names[idx]
    version = cnn_versions[idx]
    mode = cnn_modes[idx]
    
    cnn_oof_train_preds_model_df = pd.read_csv(f"{path}/oof_train_preds_{model_name}_{version}.csv")
    cnn_oof_columns = [col for col in cnn_oof_train_preds_model_df if col.startswith("oof_")]
    cnn_oof_new_columns = [f"{col}_{mode}" for col in cnn_oof_columns]
    columns_mapper = dict(zip(cnn_oof_columns, cnn_oof_new_columns))
    cnn_oof_train_preds_model_df = cnn_oof_train_preds_model_df.rename(columns=columns_mapper)
    all_cnn_oof_columns += cnn_oof_new_columns
    if idx == 0:
        cnn_oof_train_preds_df = cnn_oof_train_preds_model_df[[id_column] + cnn_oof_new_columns].copy()
    else:
        cnn_oof_train_preds_df = cnn_oof_train_preds_df.merge(cnn_oof_train_preds_model_df[[id_column] + cnn_oof_new_columns], on=id_column, how="left")
        assert cnn_oof_train_preds_df.shape[0] == cnn_oof_train_preds_model_df.shape[0]
    
    if mode == "trainmulti":
        cnn_oof_test_preds_model_df, runtime = isic_cnn_multi_predict.run(test_metadata, test_images, model_name, version, path, folds_to_run)
    elif mode == "trainbinary":
        cnn_oof_test_preds_model_df, runtime = isic_cnn_binary_predict.run(test_metadata, test_images, model_name, version, path, folds_to_run, cat_cols, cont_cols)
    cnn_oof_test_preds_model_df = cnn_oof_test_preds_model_df.rename(columns=columns_mapper)
    if idx == 0:
        cnn_oof_test_preds_df = cnn_oof_test_preds_model_df[[id_column] + cnn_oof_new_columns].copy()
    else:
        cnn_oof_test_preds_df = cnn_oof_test_preds_df.merge(cnn_oof_test_preds_model_df[[id_column] + cnn_oof_new_columns], on=id_column, how="left")
    assert cnn_oof_test_preds_df.shape[0] == cnn_oof_test_preds_model_df.shape[0]
    
    TOTAL_RUNTIME += runtime



Predicting for efficientnet_b0_v2
{'debug': False,
 'down_sampling': True,
 'fold_column': 'gkf_fold',
 'image_size': 64,
 'init_lr': 3e-05,
 'load_checkpoint': True,
 'mixed_precision': 'fp16',
 'mode': 'trainbinary',
 'n_tta': 7,
 'num_epochs': 20,
 'num_workers': 8,
 'sampling_rate': 0.01,
 'seed': 2022,
 'train_batch_size': 64,
 'use_meta': True,
 'val_batch_size': 512}

Fold 1
Step: 1/1

Fold 2
Step: 1/1

Fold 3
Step: 1/1

Fold 4
Step: 1/1

Fold 5
Step: 1/1
Time taken: 4.87 s
Predictions generated for efficientnet_b0_v2


In [6]:
weights = {'oof_efficientnet_b0_v2_trainbinary': 1}

ensemble_train_preds_df = cnn_oof_train_preds_df.copy()
ensemble_test_preds_df = cnn_oof_test_preds_df.copy()
ensemble_train_preds = 0
ensemble_test_preds = 0
for oof_column, weight in weights.items():
    ensemble_train_preds += ensemble_train_preds_df[oof_column].rank(pct=True).values * weight
    ensemble_test_preds += ensemble_test_preds_df[oof_column].rank(pct=True).values * weight
ensemble_train_preds_df["ensemble"] = ensemble_train_preds
ensemble_test_preds_df["ensemble"] = ensemble_test_preds

In [7]:
if PSEUDO_LABELLING:
    print("Pseudo-Labelling")
    psuedo_start_time = time.time()
    train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False, na_values=["NA"])
    test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv", low_memory=False, na_values=["NA"])

    folds_df = pd.read_csv(FOLDS_PATH / "folds.csv")
    train_metadata = train_metadata.merge(folds_df, on=[id_column, group_column], how="inner")
    print(f"Train data size: {train_metadata.shape}")
    print(f"Test data size: {test_metadata.shape}")

    folds = train_metadata[fold_column]

    train_metadata["label"] = train_metadata["target"].astype(float)

    test_psuedo_metadata = test_metadata.copy()
    test_psuedo_metadata = test_psuedo_metadata.merge(ensemble_test_preds_df[[id_column, "ensemble"]].rename(columns={"ensemble": "label"}), on=id_column, how="left")
    test_psuedo_metadata["label"] = test_psuedo_metadata["label"].rank(pct=True)

    test_psuedo_metadata = test_psuedo_metadata[test_psuedo_metadata["label"] >= 0.5].sort_values("label", ascending=False).reset_index(drop=True)
    num_pseudo_samples = min(2500, test_psuedo_metadata.shape[0])
    test_psuedo_metadata = test_psuedo_metadata.loc[:(num_pseudo_samples - 1), :].reset_index(drop=True)
    assert test_psuedo_metadata.shape[0] == num_pseudo_samples

    train_metadata, cat_cols, cont_cols = isic_cnn_psuedo_train_predict.cnn_feature_engineering(train_metadata)
    emb_szs = isic_cnn_psuedo_train_predict.get_emb_szs(cat_cols)
    test_psuedo_metadata, _, _ = isic_cnn_psuedo_train_predict.cnn_feature_engineering(test_psuedo_metadata)
    test_metadata, _, _ = isic_cnn_psuedo_train_predict.cnn_feature_engineering(test_metadata)
    print(f"Added {num_pseudo_samples} samples from test data")

    @dataclass
    class TrainBinaryConfig:
        mode: str = "trainbinary"
        fold_column: str = "gkf_fold"
        mixed_precision: bool = "fp16"
        image_size: int = 64
        train_batch_size: int = 64
        val_batch_size: int = 512
        num_workers: int = 4
        init_lr: float = 3e-5
        num_epochs: int = 2
        n_tta: int = 7
        sampling_rate: float = 0.5
        use_meta: bool = True
        seed: int = 2022

        model_name: str = "efficientnet_b0"
        version: str = "v8"
        model_dir: str = "isic-scd-efficientnet-b0-v8-trainbinary"
        pretrained_weights_dir: str = "/kaggle/input/isic-scd-efficientnet-b0-v2-trainbinary"

    args = TrainBinaryConfig()
    all_folds = np.unique(folds)
    for idx, fold in enumerate(all_folds):
        args.fold = fold
        oof_pseudo_train_fold_preds_df, oof_pseudo_test_fold_preds_df = isic_cnn_psuedo_train_predict.main(args, 
                                                                                                           train_metadata, train_images, 
                                                                                                           test_psuedo_metadata, 
                                                                                                           test_metadata, test_images,
                                                                                                           emb_szs, cat_cols, cont_cols)
        if idx == 0:
            oof_pseudo_train_preds_df = oof_pseudo_train_fold_preds_df.copy()
            oof_pseudo_test_preds_df = oof_pseudo_test_fold_preds_df.copy()
        else:
            oof_pseudo_train_preds_df = pd.concat([oof_pseudo_train_preds_df, oof_pseudo_train_fold_preds_df], ignore_index=True)
            oof_pseudo_test_preds_df = oof_pseudo_test_preds_df.merge(oof_pseudo_test_fold_preds_df[[id_column, group_column, f"oof_{args.model_name}_{args.version}_fold_{args.fold}"]],
                                                                      on=[id_column, group_column], how="left")

    oof_pseudo_test_preds_df[f"oof_{args.model_name}_{args.version}"] = oof_pseudo_test_preds_df[[f"oof_{args.model_name}_{args.version}_fold_{i}" for i in all_folds]].mean(axis=1)

    oof_train_preds_df = ensemble_train_preds_df.merge(oof_pseudo_train_preds_df[[id_column, "fold", target_column, f"oof_{args.model_name}_{args.version}"]], on=id_column, how="left")
    oof_test_preds_df = ensemble_test_preds_df.merge(oof_pseudo_test_preds_df[[id_column, f"oof_{args.model_name}_{args.version}"]], on=id_column, how="left")

    final_oof_columns = ["ensemble", f"oof_{args.model_name}_{args.version}"]
    all_folds = np.unique(oof_train_preds_df["fold"])
    final_weights = isic_cnn_psuedo_train_predict.blend_optimizer(
        oof_train_preds_df, final_oof_columns, all_folds,
        init_points=50, n_iter=100
    )

    print(final_weights)

    final_ensemble_train_preds_df = oof_train_preds_df.copy()
    final_ensemble_test_preds_df = oof_test_preds_df.copy()
    final_ensemble_train_preds = 0
    final_ensemble_test_preds = 0
    for oof_column, weight in final_weights.items():
        final_ensemble_train_preds += final_ensemble_train_preds_df[oof_column].rank(pct=True).values * weight
        final_ensemble_test_preds += final_ensemble_test_preds_df[oof_column].rank(pct=True).values * weight
    final_ensemble_train_preds_df[target_column] = final_ensemble_train_preds
    final_ensemble_test_preds_df[target_column] = final_ensemble_test_preds
    pseudo_runtime = time.time() - psuedo_start_time
    print(f"Psuedo Labelling took: {time_to_str(pseudo_runtime)}")
else:
    final_ensemble_train_preds_df = ensemble_train_preds_df.copy()
    final_ensemble_test_preds_df = ensemble_test_preds_df.copy()
    final_ensemble_train_preds_df = final_ensemble_train_preds_df.rename(columns={"ensemble": target_column})
    final_ensemble_test_preds_df = final_ensemble_test_preds_df.rename(columns={"ensemble": target_column})

Pseudo-Labelling
Train data size: (401059, 58)
Test data size: (3, 44)
Added 2 samples from test data
Loading pretrained weights from /kaggle/input/isic-scd-efficientnet-b0-v2-trainbinary
Pretrained weights loaded successfully
Fold 1 | Epoch 1 | LR 0.0000300
Epoch: 1 | Step: 1/15 | Loss: 2.25930 | Smooth loss: 2.25930
Epoch: 1 | Step: 1/157
Epoch: 1 | Step: 10/157
Epoch: 1 | Step: 20/157
Epoch: 1 | Step: 30/157
Epoch: 1 | Step: 40/157
Epoch: 1 | Step: 50/157
Epoch: 1 | Step: 60/157
Epoch: 1 | Step: 70/157
Epoch: 1 | Step: 80/157
Epoch: 1 | Step: 90/157
Epoch: 1 | Step: 100/157
Epoch: 1 | Step: 110/157
Epoch: 1 | Step: 120/157
Epoch: 1 | Step: 130/157
Epoch: 1 | Step: 140/157
Epoch: 1 | Step: 150/157
Fold: 1 | Epoch: 1 | LR: 0.0000300 | Train loss: 1.42742 | Val loss: 0.00797 | Val AUC: 0.98022 | Val pAUC: 0.18632
pAUC: 0.00000 --> 0.18632, saving model...
Epoch 1 took 1m 15s
Fold 1 | Epoch 2 | LR 0.0002962
Epoch: 2 | Step: 1/15 | Loss: 0.78428 | Smooth loss: 0.78428
Epoch: 2 | Step: 1/

In [8]:
if (TEST_FOLD != -1):
    factor = EXPECTED_TEST_SIZE / sample_size
    expected_total_runtime = TOTAL_RUNTIME * factor * 5
    print(f"Expected total runtime during submission: {time_to_str(expected_total_runtime)}")

In [9]:
final_ensemble_test_preds_df[[id_column, target_column]].head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,9.304098
1,ISIC_0015729,6.202732
2,ISIC_0015740,3.101366


In [10]:
final_ensemble_test_preds_df[[id_column, target_column]].to_csv("submission.csv", index=False)

In [11]:
# from isic_helper import compute_pauc, compute_auc
# testing_pauc = compute_pauc(test_targets, final_ensemble_test_preds_df[target_column], min_tpr=0.8)
# testing_auc = compute_auc(test_targets, final_ensemble_test_preds_df[target_column])
# testing_auc, testing_pauc