In [1]:
import gc
import time
import json
import joblib
from pprint import pprint
from typing import List, Dict
from collections import defaultdict

from pathlib import Path

import pandas as pd
import numpy as np

import h5py
from io import BytesIO
from PIL import Image

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

from timm import create_model

import albumentations as A
from albumentations.pytorch import ToTensorV2

from accelerate import Accelerator

from isic_helper import DotDict
from isic_multi_predict import main as get_dnn_multi_predictions

In [2]:
# Column names shared by the metadata, fold and prediction frames used below.
id_column = "isic_id"
target_column = "target"
group_column = "patient_id"
fold_column = "fold"

# Root directory of the competition data (metadata CSVs and image HDF5 files
# are read relative to it in later cells).
INPUT_PATH = Path("/kaggle/input/isic-2024-challenge/")

In [3]:
# Stage 1: CNN predictions that are merged into the metadata frames as extra
# columns for the boosting models. The lists are parallel (one entry per model).
cnn_model_names = ["efficientnet_b0"]
cnn_versions = ["v3"]
cnn_modes = ["train"]
cnn_paths = [
    f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}"
    for model_name, version, mode in zip(cnn_model_names, cnn_versions, cnn_modes)
]

all_oof_columns = []
for idx, (path, model_name, version, mode) in enumerate(
    zip(cnn_paths, cnn_model_names, cnn_versions, cnn_modes)
):
    # Train side: stored out-of-fold predictions; every "oof_*" column is kept.
    oof_train_preds_model_df = pd.read_csv(f"{path}/oof_preds_{model_name}_{version}.csv")
    oof_columns = [col for col in oof_train_preds_model_df.columns if col.startswith("oof_")]
    all_oof_columns.extend(oof_columns)

    train_part = oof_train_preds_model_df[[id_column] + oof_columns]
    if idx > 0:
        oof_train_preds_df = oof_train_preds_df.merge(train_part, on=id_column, how="inner")
        # The inner join must not drop rows relative to this model's frame.
        assert oof_train_preds_df.shape[0] == oof_train_preds_model_df.shape[0]
    else:
        oof_train_preds_df = train_part.copy()

    # Inference settings are taken from the metadata saved with the run.
    with open(f"{path}/{model_name}_{version}_run_metadata.json", "r") as f:
        run_metadata = json.load(f)
    run_params = run_metadata["params"]
    mixed_precision = run_params["mixed_precision"]
    image_size = run_params["image_size"]
    batch_size = run_params["val_batch_size"]
    n_tta = run_params["n_tta"]

    # Test side: the same columns, produced by running the saved models.
    oof_test_preds_model_df, _ = get_dnn_multi_predictions(
        model_name, version, path, mixed_precision, image_size, batch_size, n_tta
    )
    test_part = oof_test_preds_model_df[[id_column] + oof_columns]
    if idx > 0:
        oof_test_preds_df = oof_test_preds_df.merge(test_part, on=id_column, how="inner")
    else:
        oof_test_preds_df = test_part.copy()
    assert oof_test_preds_df.shape[0] == oof_test_preds_model_df.shape[0]

Fold 1
Step: 1/1
Fold 2
Step: 1/1
Fold 3
Step: 1/1
Fold 4
Step: 1/1
Fold 5
Step: 1/1
Time taken: 15.34 s
Finished predicting


In [4]:
# Peek at the stacked out-of-fold CNN predictions for the training rows.
oof_train_preds_df.head()

Unnamed: 0,isic_id,oof_efficientnet_b0_v3,oof_efficientnet_b0_v3_AKIEC,oof_efficientnet_b0_v3_BCC,oof_efficientnet_b0_v3_BKL,oof_efficientnet_b0_v3_DF,oof_efficientnet_b0_v3_MEL,oof_efficientnet_b0_v3_NV,oof_efficientnet_b0_v3_SCC,oof_efficientnet_b0_v3_VASC,oof_efficientnet_b0_v3_unknown
0,ISIC_0015845,0.001611,0.000285,0.001315,0.00064,0.0001816614,2.8e-05,0.000342,0.000268,2.862305e-06,0.996938
1,ISIC_0024200,0.001537,0.000349,0.000938,0.000595,5.102142e-05,0.000158,0.000528,0.000441,2.101307e-06,0.996939
2,ISIC_0051648,7.3e-05,3e-06,5e-06,2.2e-05,4.555343e-07,6.6e-05,6.8e-05,2e-06,6.524533e-09,0.999834
3,ISIC_0051896,9.3e-05,1.4e-05,1.8e-05,4.2e-05,2.046736e-06,6.3e-05,9.9e-05,1.2e-05,2.786641e-08,0.999749
4,ISIC_0052026,0.001067,0.00024,0.000564,0.000495,3.249337e-05,0.000124,0.000417,0.000379,1.21553e-06,0.997748


In [5]:
# Peek at the matching CNN prediction columns computed for the test rows.
oof_test_preds_df.head()

Unnamed: 0,isic_id,oof_efficientnet_b0_v3,oof_efficientnet_b0_v3_AKIEC,oof_efficientnet_b0_v3_BCC,oof_efficientnet_b0_v3_BKL,oof_efficientnet_b0_v3_DF,oof_efficientnet_b0_v3_MEL,oof_efficientnet_b0_v3_NV,oof_efficientnet_b0_v3_SCC,oof_efficientnet_b0_v3_VASC,oof_efficientnet_b0_v3_unknown
0,ISIC_0015657,0.001107,0.000123,0.000784,0.000193,3.776647e-05,0.0001911019,0.000244,0.0001311617,5.026696e-06,0.99829
1,ISIC_0015729,8e-06,2e-06,3e-06,6e-06,4.584421e-07,4.916471e-06,3e-05,2.749715e-07,0.0001159906,0.999838
2,ISIC_0015740,1e-05,1e-06,7e-06,3e-06,2.163147e-06,6.565386e-07,2e-06,2.075606e-06,8.807461e-07,0.999981


In [6]:
# Stage 2 configuration: the models whose test predictions are blended.
# Gradient-boosting models (parallel lists: one entry per model).
boosting_model_names = ["xgb", "xgb"]
boosting_versions = ["v1", "v4"]
boosting_modes = ["train", "train"]
boosting_oof_columns = [f"oof_{model_name}_{version}" for model_name, version in zip(boosting_model_names, boosting_versions)]
boosting_paths = [f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}" for model_name, version, mode in zip(
    boosting_model_names, boosting_versions, boosting_modes)]

# Image model for the blend. NOTE: these assignments rebind the cnn_* names
# that the earlier OOF-feature cell set to the efficientnet_b0 configuration.
cnn_model_names = ["efficientnet_b2"]
cnn_versions = ["v3"]
cnn_modes = ["pretrain"]
cnn_oof_columns = [f"oof_{model_name}_{version}" for model_name, version in zip(cnn_model_names, cnn_versions)]
cnn_paths = [f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}" for model_name, version, mode in zip(
    cnn_model_names, cnn_versions, cnn_modes)]

# Order matters: the blend cell zips this list against `weights`.
blend_oof_columns = boosting_oof_columns + cnn_oof_columns

# One blend weight per entry of blend_oof_columns, in the same order.
weights = [4.332719131764335, 1.8340030190599919, 1.8661240013799958]

In [7]:
# Categorical metadata columns, grouped by the encoding their names suggest.
# NOTE(review): neither list is referenced by the visible cells of this
# notebook — presumably the persisted encoder loaded in
# get_boosting_predictions expects these column names; confirm.
ord_categorical_features = [
    "sex",
    "tbp_lv_location",
    "tbp_tile_type",
    "tbp_lv_location_simple",
]

ohe_categorical_features = [
    "anatom_site_general", 
    "attribution",
]

# Short codes substituted for the long `attribution` strings in
# boosting_feature_engineering. Because a plain dict is passed to Series.map,
# attribution values that are not keys here become NaN.
attribution_mapper = {
    "Memorial Sloan Kettering Cancer Center": "MSKCC",
    "ACEMID MIA": "ACEMIDMIA",
    "Department of Dermatology, Hospital Clínic de Barcelona": "DoD_HCB",
    "University Hospital of Basel": "UHB",
    "Frazer Institute, The University of Queensland, Dermatology Research Centre": "FI_TUQ-DRC",
    "Department of Dermatology, University of Athens, Andreas Syggros Hospital of Skin and Venereal Diseases, Alexander Stratigos, Konstantinos Liopyris": "DoD_UA",
    "ViDIR Group, Department of Dermatology, Medical University of Vienna": "ViDIR"
}

def boosting_norm_feature(df, value_col, group_cols, err=1e-5):
    """Add a within-group z-score of ``value_col`` to a merged copy of ``df``.

    The per-group mean and standard deviation are joined back onto ``df`` as
    ``{value_col}_mean`` / ``{value_col}_std`` and stay in the result. The new
    feature ``{value_col}_patient_norm`` is ``(value - mean) / (std + err)``.
    Missing values are not filled (e.g. a group whose std is NaN yields NaN).

    Args:
        df: input frame; it is not modified, a merged frame is returned.
        value_col: name of the numeric column to normalise.
        group_cols: column name(s) defining the groups.
        err: constant added to the std to avoid division by zero.

    Returns:
        ``(merged_frame, feature_name)``.
    """
    mean_col = f"{value_col}_mean"
    std_col = f"{value_col}_std"

    group_stats = (
        df.groupby(group_cols)[value_col]
        .agg(["mean", "std"])
        .rename(columns={"mean": mean_col, "std": std_col})
        .reset_index()
    )
    merged = df.merge(group_stats, on=group_cols, how="left")

    feature_name = f"{value_col}_patient_norm"
    centered = merged[value_col] - merged[mean_col]
    merged[feature_name] = centered / (merged[std_col] + err)
    return merged, feature_name

def boosting_feature_engineering(df):
    """Prepare the tabular features consumed by the boosting models.

    Steps: fill missing ``sex`` / ``anatom_site_general`` with sentinel
    strings, shorten ``tbp_tile_type`` and ``attribution`` labels, add a
    per-patient normalised copy of every column in ``cols_to_norm`` and add
    ``num_images`` (number of rows per ``patient_id``).

    Args:
        df: metadata frame. It is left untouched; all edits happen on a copy.

    Returns:
        ``(engineered_frame, numerical_features)`` where ``numerical_features``
        lists the raw columns, their ``*_patient_norm`` versions and
        ``num_images``.
    """
    # Work on a copy: the label re-mapping below is not idempotent (mapping an
    # already shortened label yields NaN), so mutating the caller's frame would
    # corrupt it if this function were run on the same frame again.
    df = df.copy()

    df["sex"] = df["sex"].fillna("missing_sex")
    df["anatom_site_general"] = df["anatom_site_general"].fillna("missing_anatom_site_general")
    df["tbp_tile_type"] = df["tbp_tile_type"].map({"3D: white": "white", "3D: XP": "XP"})
    df["attribution"] = df["attribution"].map(attribution_mapper)

    cols_to_norm = [
        "age_approx",
        "clin_size_long_diam_mm",
        "tbp_lv_A", "tbp_lv_Aext",
        "tbp_lv_B", "tbp_lv_Bext",
        "tbp_lv_C", "tbp_lv_Cext",
        "tbp_lv_H", "tbp_lv_Hext",
        "tbp_lv_L", "tbp_lv_Lext",
        "tbp_lv_areaMM2", "tbp_lv_area_perim_ratio",
        "tbp_lv_color_std_mean",
        "tbp_lv_deltaA", "tbp_lv_deltaB", "tbp_lv_deltaL", "tbp_lv_deltaLB", "tbp_lv_deltaLBnorm",
        "tbp_lv_eccentricity",
        "tbp_lv_minorAxisMM", "tbp_lv_nevi_confidence", "tbp_lv_norm_border",
        "tbp_lv_norm_color", "tbp_lv_perimeterMM",
        "tbp_lv_radial_color_std_max", "tbp_lv_stdL", "tbp_lv_stdLExt",
        "tbp_lv_symm_2axis", "tbp_lv_symm_2axis_angle",
        "tbp_lv_x", "tbp_lv_y", "tbp_lv_z"
    ]
    numerical_features = cols_to_norm[:]
    for col in cols_to_norm:
        # Each call returns a new merged frame that also carries the helper
        # "<col>_mean" / "<col>_std" columns.
        df, feature_name = boosting_norm_feature(df, col, ["patient_id"])
        numerical_features.append(feature_name)

    # Number of rows recorded for the patient each row belongs to.
    df["num_images"] = df["patient_id"].map(df.groupby("patient_id")["isic_id"].count())
    numerical_features.append("num_images")
    return df, numerical_features


# class PAUC:
#     def get_final_error(self, error, weight):
#         return error

#     def is_max_optimal(self):
#         return True

#     def evaluate(self, approxes, target, weight):
#         y_true = target.astype(int)
#         y_pred = approxes[0].astype(float)
        
#         score = compute_pauc(y_true, y_pred, min_tpr=0.8)
        
#         return score, 1.0


def pauc_80(y_train, y_pred):
    """Return ``compute_pauc(y_train, y_pred, min_tpr=0.8)``.

    NOTE(review): ``compute_pauc`` is neither imported nor defined in the
    visible part of this notebook, so calling this function would raise
    ``NameError`` unless the name is provided elsewhere — confirm.
    """
    return compute_pauc(y_train, y_pred, min_tpr=0.8)


def get_boosting_predictions(train, test, model_name, version, path, oof_column):
    """Average the per-fold test predictions of one saved boosting model.

    Reads ``{model_name}_{version}_run_metadata.json`` (pretty-printed) and the
    persisted encoder from ``path``, fits the encoder on ``train`` and uses it
    to transform ``test``. For every distinct value of the fold column in
    ``train`` the matching estimator under ``path / "models"`` is loaded and
    its last ``predict_proba`` column is stored; the fold columns are averaged.

    Args:
        train: engineered training frame (must contain the fold column).
        test: engineered test frame (must contain the id column).
        model_name, version: identify the artefact file names.
        path: ``pathlib.Path`` of the model dataset directory.
        oof_column: name given to the averaged prediction column.

    Returns:
        ``(frame_with_id_and_oof_column, elapsed_seconds)``.
    """
    started_at = time.time()
    artefact_prefix = f"{model_name}_{version}"

    with open(path / f"{artefact_prefix}_run_metadata.json", "r") as f:
        run_metadata = json.load(f)
    pprint(run_metadata)

    with open(path / f"{artefact_prefix}_encoder.joblib", "rb") as f:
        mixed_encoded_preprocessor = joblib.load(f)

    # The loaded preprocessor is (re)fitted on the training frame before it
    # transforms the test frame.
    X_test = mixed_encoded_preprocessor.fit(train).transform(test)
    print(f"Total number of columns: {len(X_test.columns)}")

    all_folds = np.unique(train[fold_column])
    test_predictions_df = pd.DataFrame({id_column: test[id_column]})
    prediction_columns = []
    for fold in all_folds:
        with open(path / f"models/{artefact_prefix}_fold_{fold}.txt", "rb") as f:
            estimator = joblib.load(f)
        prediction_column = f"fold_{fold}"
        # The last predict_proba column is kept as the score.
        test_predictions_df[prediction_column] = estimator.predict_proba(X_test)[:, -1]
        prediction_columns.append(prediction_column)

    test_predictions_df[oof_column] = test_predictions_df[prediction_columns].mean(axis=1)
    elapsed = time.time() - started_at
    return test_predictions_df[[id_column, oof_column]], elapsed

In [8]:
# Load the raw metadata for the boosting stage ("NA" strings are read as missing).
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False, na_values=["NA"])
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv", low_memory=False, na_values=["NA"])

# Attach the fold assignment; the inner join keeps only rows present in folds.csv.
folds_df = pd.read_csv("/kaggle/input/isic-scd-folds/folds.csv")
train_metadata = train_metadata.merge(folds_df, on=[id_column, group_column], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

# Same feature engineering on both frames; the feature list is taken from train.
train_metadata, numerical_features = boosting_feature_engineering(train_metadata)
test_metadata, _ = boosting_feature_engineering(test_metadata)

Train data size: (401059, 56)
Test data size: (3, 44)


In [9]:
# Add the stage-1 CNN prediction columns as extra features (left join on the id).
# NOTE: this cell is not idempotent — re-running it merges the columns again and
# appends all_oof_columns to numerical_features a second time.
train_metadata = train_metadata.merge(oof_train_preds_df, on=id_column, how="left")
test_metadata = test_metadata.merge(oof_test_preds_df, on=id_column, how="left")
numerical_features += all_oof_columns

In [10]:
# Collect one averaged test-prediction column per boosting model.
# The first model creates ensemble_preds_df; later models are joined on the id.
for idx, (model_name, version, path, oof_column) in enumerate(zip(boosting_model_names, boosting_versions, boosting_paths, boosting_oof_columns)):
    print(f"Generating predictions for {model_name}_{version}")
    model_preds_df, _ = get_boosting_predictions(
        train_metadata, 
        test_metadata,
        model_name, 
        version, 
        Path(path),
        oof_column
    )
    print("\n")
    if idx == 0:
        ensemble_preds_df = model_preds_df.copy()
    else:
        ensemble_preds_df = ensemble_preds_df.merge(model_preds_df, on=id_column, how="left")

Generating predictions for xgb_v1
{'best_num_rounds': {'fold_1': 775,
                     'fold_2': 991,
                     'fold_3': 229,
                     'fold_4': 516,
                     'fold_5': 712},
 'config': {'_key': None,
            '_parent': None,
            '_temp': False,
            'model_name': 'xgb_v1',
            'models_output_dir': 'models',
            'sampling_ratio': 0.01,
            'seed': 2022},
 'cv_auc_avg': 0.9752792248538455,
 'cv_auc_oof': 0.9705394843285675,
 'cv_auc_std': 0.009140741018392141,
 'cv_pauc_avg': 0.18016309405063816,
 'cv_pauc_oof': 0.1755785967509135,
 'cv_pauc_std': 0.008152214905482494,
 'es_rounds': 250,
 'num_rounds': 3000,
 'params': {'alpha': 0.6779926606782505,
            'colsample_bylevel': 0.5476090898823716,
            'colsample_bynode': 0.9928601203635129,
            'colsample_bytree': 0.8437772277074493,
            'disable_default_eval_metric': True,
            'enable_categorical': True,
            'la

In [11]:
# Drop the boosting-stage frames; the metadata is re-read from CSV for the
# image-model stage below.
del train_metadata, test_metadata
gc.collect()

18021

In [12]:
# Integer codes for the categorical inputs of the image model.
# Each mapping is a defaultdict so that Series.map falls back to code 0 for
# values that are not listed. Caveat: a defaultdict *stores* every missing key
# it is asked for, so these dicts can grow as data is mapped through them.
feature_mapping_dict = {
    "sex": defaultdict(lambda: 0, {
        "missing_sex": 0,
        "female": 1,
        "male": 2,
    }),
    "anatom_site_general": defaultdict(lambda: 0, {
        "missing_anatom_site_general": 0,
        "lower extremity": 1,
        "head/neck": 2,
        "posterior torso": 3,
        "anterior torso": 4,
        "upper extremity": 5,
    }),
    "tbp_tile_type": defaultdict(lambda: 0, {
        "3D: white": 0,
        "3D: XP": 1,
    }),
    "tbp_lv_location": defaultdict(lambda: 0, {
        "Unknown": 0,
        "Right Leg - Upper": 1,
        "Head & Neck": 2,
        "Torso Back Top Third": 3,
        "Torso Front Top Half": 4,
        "Right Arm - Upper": 5,
        "Left Leg - Upper": 6,
        "Torso Front Bottom Half": 7,
        "Left Arm - Upper": 8,
        "Right Leg": 9,
        "Torso Back Middle Third": 10,
        "Right Arm - Lower": 11,
        "Right Leg - Lower": 12,
        "Left Leg - Lower": 13,
        "Left Arm - Lower": 14,
        "Left Leg": 15,
        "Torso Back Bottom Third": 16,
        "Left Arm": 17,
        "Right Arm": 18,
        "Torso Front": 19,
        "Torso Back": 20
    }),
    "tbp_lv_location_simple": defaultdict(lambda: 0, {
        "Unknown": 0,
        "Right Leg": 1,
        "Head & Neck": 2,
        "Torso Back": 3,
        "Torso Front": 4,
        "Right Arm": 5,
        "Left Leg": 6,
        "Left Arm": 7,
    }),
}


def get_emb_szs(cat_cols):
    """Return ``{column: (num_embeddings, embedding_dim)}`` for ``cat_cols``.

    ``num_embeddings`` is the largest code of the column's mapping plus one,
    and ``embedding_dim`` is ``min(600, round(1.6 * num_embeddings ** 0.56))``.

    The size is derived from the codes rather than from ``len(mapping)``:
    looking up an unseen category in a defaultdict inserts it (with code 0),
    so the length depends on which data has already been mapped and could stop
    matching the embedding tables stored in a checkpoint. For the mappings as
    written above both definitions give the same numbers.
    NOTE(review): assumes the checkpoints were trained with the un-grown
    sizes — confirm against the training notebook.

    Args:
        cat_cols: names of categorical columns present in feature_mapping_dict.
    """
    emb_szs = {}
    for col in cat_cols:
        n_categories = max(feature_mapping_dict[col].values()) + 1
        emb_szs[col] = (n_categories, min(600, round(1.6 * n_categories ** 0.56)))
    return emb_szs


def cnn_norm_feature(df, value_col, group_cols, err=1e-5):
    """Add a within-group z-score of ``value_col``, with missing values set to 0.

    Per-group mean and std are joined onto a merged copy of ``df`` as
    ``{value_col}_mean`` / ``{value_col}_std``. The feature
    ``{value_col}_patient_norm`` is ``(value - mean) / (std + err)``; any NaN
    in the result (e.g. when the group's std is NaN) is replaced by 0.

    Args:
        df: input frame; it is not modified, a merged frame is returned.
        value_col: name of the numeric column to normalise.
        group_cols: column name(s) defining the groups.
        err: constant added to the std to avoid division by zero.

    Returns:
        ``(merged_frame, feature_name)``.
    """
    stat_names = ["mean", "std"]
    per_group = df.groupby(group_cols)[value_col].agg(stat_names)
    per_group.columns = [f"{value_col}_{name}" for name in stat_names]
    df = df.merge(per_group.reset_index(), on=group_cols, how="left")

    feature_name = f"{value_col}_patient_norm"
    z_score = (df[value_col] - df[f"{value_col}_mean"]) / (df[f"{value_col}_std"] + err)
    df[feature_name] = z_score.fillna(0)
    return df, feature_name


def cnn_feature_engineering(df):
    """Build the categorical and continuous metadata inputs of the image model.

    Steps: scale ``age_approx`` (missing -> 0, then divided by 90), encode the
    categorical columns with ``feature_mapping_dict``, add ``num_images``
    (``log1p`` of the number of rows per ``patient_id``) and add a per-patient
    normalised copy of every column in ``cols_to_norm``.

    Args:
        df: metadata frame. It is left untouched; all edits happen on a copy.

    Returns:
        ``(engineered_frame, cat_cols, cont_cols)``.

    Raises:
        AssertionError: if any continuous feature still contains a missing value.
    """
    # Work on a copy: the edits below are not idempotent (age would be divided
    # by 90 again, already encoded categories would be re-mapped), so mutating
    # the caller's frame would corrupt it on a second call.
    df = df.copy()

    df["age_approx"] = df["age_approx"].fillna(0) / 90
    df["sex"] = df["sex"].fillna("missing_sex").map(feature_mapping_dict["sex"])
    df["anatom_site_general"] = (
        df["anatom_site_general"]
        .fillna("missing_anatom_site_general")
        .map(feature_mapping_dict["anatom_site_general"])
    )
    for col in ("tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple"):
        df[col] = df[col].map(feature_mapping_dict[col])

    cat_cols = ["sex", "anatom_site_general",
                "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple"]

    # Computed once, before the merges below: they are left joins against
    # one-row-per-patient statistics, so the rows of df stay as they are.
    df["num_images"] = np.log1p(df["patient_id"].map(df.groupby("patient_id")["isic_id"].count()))

    cols_to_norm = [
        "age_approx",
        "clin_size_long_diam_mm",
        "tbp_lv_A", "tbp_lv_Aext",
        "tbp_lv_B", "tbp_lv_Bext",
        "tbp_lv_C", "tbp_lv_Cext",
        "tbp_lv_H", "tbp_lv_Hext",
        "tbp_lv_L", "tbp_lv_Lext",
        "tbp_lv_areaMM2", "tbp_lv_area_perim_ratio",
        "tbp_lv_color_std_mean",
        "tbp_lv_deltaA", "tbp_lv_deltaB", "tbp_lv_deltaL", "tbp_lv_deltaLB", "tbp_lv_deltaLBnorm",
        "tbp_lv_eccentricity",
        "tbp_lv_minorAxisMM", "tbp_lv_nevi_confidence", "tbp_lv_norm_border",
        "tbp_lv_norm_color", "tbp_lv_perimeterMM",
        "tbp_lv_radial_color_std_max", "tbp_lv_stdL", "tbp_lv_stdLExt",
        "tbp_lv_symm_2axis", "tbp_lv_symm_2axis_angle",
        "tbp_lv_x", "tbp_lv_y", "tbp_lv_z"
    ]
    cont_cols = cols_to_norm[:]
    for col in cols_to_norm:
        df, feature_name = cnn_norm_feature(df, col, ["patient_id"])
        cont_cols.append(feature_name)
    cont_cols.append("num_images")

    # The network cannot consume NaN, so fail loudly if any is left.
    assert df[cont_cols].isnull().sum().sum() == 0
    return df, cat_cols, cont_cols

def test_augment(image_size, mean=None, std=None):
    """Build the inference-time transform: resize, normalise, convert to tensor.

    Args:
        image_size: side length passed to ``A.Resize`` for both height and width.
        mean, std: normalisation statistics. They are forwarded to
            ``A.Normalize`` only when both are given; otherwise the
            ``A.Normalize`` defaults are used.

    Returns:
        An ``A.Compose`` pipeline.
    """
    normalize_kwargs = {"max_pixel_value": 255.0, "p": 1.0}
    if mean is not None and std is not None:
        normalize_kwargs.update(mean=mean, std=std)

    steps = [
        A.Resize(image_size, image_size),
        A.Normalize(**normalize_kwargs),
        ToTensorV2(),
    ]
    return A.Compose(steps, p=1.0)


class ISICDataset(Dataset):
    """Dataset yielding an image plus optional metadata tensors per metadata row.

    Args:
        metadata: frame with one row per sample; needs an ``isic_id`` column
            (and ``target`` unless ``infer`` is set).
        images: container indexed by ``isic_id`` whose entries yield the
            encoded image bytes through ``[()]`` (an open HDF5 file is passed
            by this notebook).
        augment: callable applied as ``augment(image=...)["image"]``, or None.
        use_meta: when true, the categorical / continuous columns are returned
            as tensors; otherwise scalar zero placeholders are returned.
        cat_cols, cont_cols: column names read when ``use_meta`` is true.
        infer: when true, no target is returned.
    """

    def __init__(self, metadata, images, augment,
                 use_meta=False, cat_cols: List = None, cont_cols: List = None,
                 infer=False):
        self.metadata = metadata
        self.images = images
        self.augment = augment
        self.use_meta = use_meta
        self.cat_cols = cat_cols
        self.cont_cols = cont_cols
        self.infer = infer
        # Cached once; __len__ does not look at the frame again.
        self.length = len(self.metadata)

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        record = self.metadata.iloc[index]

        # Decode the stored bytes into an array.
        encoded = self.images[record["isic_id"]][()]
        image = np.array(Image.open(BytesIO(encoded)))
        if self.augment is not None:
            image = self.augment(image=image)["image"].float()

        if not self.use_meta:
            # Placeholders keep the returned tuple shape independent of use_meta.
            x_cat = torch.tensor(0)
            x_cont = torch.tensor(0)
        else:
            x_cat = torch.tensor([record[name] for name in self.cat_cols], dtype=torch.long)
            x_cont = torch.tensor([record[name] for name in self.cont_cols], dtype=torch.float)

        if self.infer:
            return image, x_cat, x_cont
        return image, x_cat, x_cont, torch.tensor(record["target"])

    
class ISICNet(nn.Module):
    """Image classifier with an optional metadata branch; outputs one logit.

    The backbone is created through ``create_model`` with ``num_classes=0`` and
    ``global_pool=""``; its output is average-pooled in ``forward``.

    * ``use_meta=False``: a linear layer maps the pooled features to the logit.
    * ``use_meta=True``: the pooled features are projected to 256 units, the
      metadata (embedded categoricals + batch-normalised continuous values)
      is reduced to 64 units, and a classifier on their concatenation produces
      the logit.

    In training mode the image head is averaged over five dropout masks.

    Args:
        model_name: backbone name passed to ``create_model``.
        pretrained: forwarded to ``create_model``.
        use_meta: enable the metadata branch.
        cat_cols, cont_cols: metadata column names (used when ``use_meta``).
        emb_szs: ``{column: (num_embeddings, embedding_dim)}`` for ``cat_cols``.
    """

    def __init__(
        self,
        model_name,
        pretrained=True,
        use_meta=False,
        cat_cols: List = None, cont_cols: List = None, emb_szs: Dict = None,
    ):
        super().__init__()
        self.model = create_model(
            model_name=model_name,
            pretrained=pretrained,
            in_chans=3,
            num_classes=0,
            global_pool="",
        )
        in_dim = self.model.num_features
        self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        self.use_meta = use_meta

        if not use_meta:
            # Image-only variant: the head directly produces the logit.
            self.linear = nn.Linear(in_dim, 1)
            return

        self.linear = nn.Linear(in_dim, 256)

        self.embeddings = nn.ModuleList(
            [nn.Embedding(emb_szs[col][0], emb_szs[col][1]) for col in cat_cols]
        )
        self.embedding_dropout = nn.Dropout(0.1)
        n_emb = sum(emb_szs[col][1] for col in cat_cols)
        n_cont = len(cont_cols)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        self.meta = nn.Sequential(
            nn.Linear(n_emb + n_cont, 256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.BatchNorm1d(64),
            nn.SiLU(),
            nn.Dropout(0.1),
        )
        self.classifier = nn.Linear(256 + 64, 1)

    def forward(self, images, x_cat=None, x_cont=None):
        feature_map = self.model(images)
        pooled = F.adaptive_avg_pool2d(feature_map, 1).reshape(len(images), -1)

        if not self.training:
            x_image = self.linear(pooled)
        else:
            # Multi-sample dropout: average the head over all dropout masks.
            x_image = sum(self.linear(drop(pooled)) for drop in self.dropouts)
            x_image = x_image / len(self.dropouts)

        if not self.use_meta:
            return x_image

        # One embedding per categorical column, concatenated along features.
        embedded = torch.cat(
            [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)], 1
        )
        embedded = self.embedding_dropout(embedded)
        x_meta = self.meta(torch.cat([embedded, self.bn_cont(x_cont)], 1))
        return self.classifier(torch.cat([x_image, x_meta], 1))


def get_trans(img, iteration):
    """Return the test-time-augmentation view number ``iteration`` of ``img``.

    Dimensions 2 and 3 of ``img`` are the ones transformed. For
    ``iteration >= 6`` they are transposed first. ``iteration % 6`` then
    selects: 0 identity, 1 flip along dim 2, 2 flip along dim 3,
    3 / 4 / 5 rotation by one / two / three quarter turns.
    """
    if iteration >= 6:
        img = img.transpose(2, 3)

    views = {
        0: lambda t: t,
        1: lambda t: torch.flip(t, dims=[2]),
        2: lambda t: torch.flip(t, dims=[3]),
        3: lambda t: torch.rot90(t, 1, dims=[2, 3]),
        4: lambda t: torch.rot90(t, 2, dims=[2, 3]),
        5: lambda t: torch.rot90(t, 3, dims=[2, 3]),
    }
    view = views.get(iteration % 6)
    # No matching entry means no branch applies and None is returned.
    return None if view is None else view(img)

    
def predict(model, test_dataloader, accelerator, n_tta, use_meta, log_interval=10):
    """Run inference with test-time augmentation and return probabilities.

    For every batch the model is evaluated on ``n_tta`` views produced by
    ``get_trans``; the sigmoid outputs are averaged, gathered through
    ``accelerator.gather`` and concatenated over all batches.

    Args:
        model: network returning logits; called as ``model(images, x_cat,
            x_cont)`` when ``use_meta`` is true, else ``model(images)``.
        test_dataloader: iterable of ``(images, x_cat, x_cont)`` batches.
        accelerator: object providing ``gather``.
        n_tta: number of augmented views per batch (at least 1).
        use_meta: whether the metadata tensors are passed to the model.
        log_interval: progress is printed on the first step and every
            ``log_interval`` steps.

    Returns:
        NumPy array with the averaged probabilities of all batches.

    Raises:
        ValueError: if ``n_tta`` is smaller than 1.
    """
    if n_tta < 1:
        raise ValueError(f"n_tta must be at least 1, got {n_tta}")

    model.eval()
    test_probs = []
    total_steps = len(test_dataloader)
    with torch.no_grad():
        for step, (images, x_cat, x_cont) in enumerate(test_dataloader):
            # Probabilities (not logits) are averaged across the views.
            probs = 0
            for i in range(n_tta):
                view = get_trans(images, i)
                if use_meta:
                    logits_iter = model(view, x_cat, x_cont)
                else:
                    logits_iter = model(view)
                probs += torch.sigmoid(logits_iter)
            probs /= n_tta

            test_probs.append(accelerator.gather(probs))

            if (step == 0) or ((step + 1) % log_interval == 0):
                print(f"Step: {step + 1}/{total_steps}")

    test_probs = torch.cat(test_probs).cpu().numpy()
    return test_probs


def get_dnn_predictions(train, test, images, model_name, version, path, oof_column):
    """Average the per-fold test predictions of one saved image model.

    Reads ``{model_name}_{version}_run_metadata.json`` from ``path`` (its
    ``params`` are pretty-printed and supply image size, batch size,
    ``use_meta``, mixed-precision mode and ``n_tta``). For every distinct value
    of the fold column in ``train`` a fresh ``ISICNet`` is built, the state
    stored under ``path / "models" / "fold_{fold}"`` is loaded and ``predict``
    is run on the test loader; the fold columns are then averaged.

    NOTE: ``cat_cols``, ``cont_cols`` and ``emb_szs`` are read from the
    notebook's global scope, so the cell defining them must have run first.

    Args:
        train: frame providing the fold column.
        test: engineered test metadata (must contain the id column).
        images: image container handed to ``ISICDataset``.
        model_name, version: identify the backbone and the metadata file.
        path: ``pathlib.Path`` of the model dataset directory.
        oof_column: name given to the averaged prediction column.

    Returns:
        ``(frame_with_id_and_oof_column, elapsed_seconds)``.
    """
    start_time = time.time()
    with open(path / f"{model_name}_{version}_run_metadata.json", "r") as f:
        run_metadata = json.load(f)
    params = run_metadata["params"]
    pprint(params)

    image_size = params["image_size"]
    batch_size = params["val_batch_size"]
    use_meta = params["use_meta"]
    mixed_precision = params["mixed_precision"]
    n_tta = params["n_tta"]

    test_dataset = ISICDataset(
        test, images, augment=test_augment(image_size),
        use_meta=use_meta,
        cat_cols=cat_cols,
        cont_cols=cont_cols,
        infer=True
    )
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        drop_last=False,
        pin_memory=True,
    )

    all_folds = np.unique(train[fold_column])
    test_predictions_df = pd.DataFrame({id_column: test[id_column]})
    for fold in all_folds:
        print(f"\nFold {fold}")
        accelerator = Accelerator(mixed_precision=mixed_precision)

        model = ISICNet(model_name=model_name, pretrained=False,
                        use_meta=use_meta,
                        cat_cols=cat_cols,
                        cont_cols=cont_cols,
                        emb_szs=emb_szs,)
        model = model.to(accelerator.device)

        # Prepare the raw loader under a separate name: rebinding
        # test_dataloader here would hand an already prepared loader to the
        # next fold's accelerator and wrap it again on every iteration.
        model, fold_dataloader = accelerator.prepare(model, test_dataloader)
        accelerator.load_state(path / f"models/fold_{fold}")

        test_predictions_df[f"fold_{fold}"] = predict(
            model, fold_dataloader, accelerator, n_tta=n_tta, use_meta=use_meta
        )

    fold_columns = [f"fold_{fold}" for fold in all_folds]
    test_predictions_df[oof_column] = test_predictions_df[fold_columns].mean(axis=1)
    end_time = time.time()
    return test_predictions_df[[id_column, oof_column]], (end_time - start_time)

In [13]:
# Reload the raw metadata for the image-model stage (the boosting-stage frames
# were deleted above and carry different feature encodings).
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False, na_values=["NA"])
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv", low_memory=False, na_values=["NA"])

folds_df = pd.read_csv("/kaggle/input/isic-scd-folds/folds.csv")
train_metadata = train_metadata.merge(folds_df, on=[id_column, group_column], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

# Read-only HDF5 image stores.
# NOTE(review): neither handle is closed in the visible cells, and train_images
# is not used by any visible cell — confirm whether it is needed.
train_images = h5py.File(INPUT_PATH / "train-image.hdf5", mode="r")
test_images = h5py.File(INPUT_PATH / "test-image.hdf5", mode="r")

# cat_cols, cont_cols and emb_szs are read as globals by get_dnn_predictions.
train_metadata, cat_cols, cont_cols = cnn_feature_engineering(train_metadata)
test_metadata, _, _ = cnn_feature_engineering(test_metadata)
emb_szs = get_emb_szs(cat_cols)

Train data size: (401059, 56)
Test data size: (3, 44)


In [14]:
# Append each image model's averaged test prediction to ensemble_preds_df,
# which was created by the boosting prediction loop (left join on the id).
for idx, (model_name, version, path, oof_column) in enumerate(zip(cnn_model_names, cnn_versions, cnn_paths, cnn_oof_columns)):
    print(f"Generating predictions for {model_name}_{version}")
    model_preds_df, _ = get_dnn_predictions(
        train_metadata, 
        test_metadata,
        test_images,
        model_name, 
        version, 
        Path(path),
        oof_column
    )
    print("\n")
    ensemble_preds_df = ensemble_preds_df.merge(model_preds_df, on=id_column, how="left")

Generating predictions for efficientnet_b2_v3
{'debug': False,
 'down_sampling': True,
 'image_size': 128,
 'init_lr': 3e-05,
 'mixed_precision': 'fp16',
 'mode': 'pretrain',
 'n_tta': 8,
 'num_epochs': 20,
 'num_workers': 8,
 'seed': 2022,
 'train_batch_size': 64,
 'use_meta': True,
 'val_batch_size': 512}

Fold 1
Step: 1/1

Fold 2
Step: 1/1

Fold 3
Step: 1/1

Fold 4
Step: 1/1

Fold 5
Step: 1/1




In [15]:
# Per-model test predictions gathered so far (one column per blended model).
ensemble_preds_df

Unnamed: 0,isic_id,oof_xgb_v1,oof_xgb_v4,oof_efficientnet_b2_v3
0,ISIC_0015657,0.141293,0.231877,0.005821
1,ISIC_0015729,0.016932,0.14802,0.000531
2,ISIC_0015740,0.017129,0.325397,0.001743


In [16]:
# Weighted rank blend: each model's predictions are converted to percentile
# ranks, scaled by the model's weight and summed. The weights are not
# normalised, so the result is a score rather than a probability.
weighted_ranks = [
    ensemble_preds_df[blend_oof_column].rank(pct=True).values * weight
    for blend_oof_column, weight in zip(blend_oof_columns, weights)
]
ensemble_preds = sum(weighted_ranks)
ensemble_preds_df[target_column] = ensemble_preds
ensemble_preds_df.head()

Unnamed: 0,isic_id,oof_xgb_v1,oof_xgb_v4,oof_efficientnet_b2_v3,target
0,ISIC_0015657,0.141293,0.231877,0.005821,7.421512
1,ISIC_0015729,0.016932,0.14802,0.000531,2.677615
2,ISIC_0015740,0.017129,0.325397,0.001743,5.966565


In [17]:
# Summary statistics of the blended score.
ensemble_preds_df[target_column].describe()

count    3.000000
mean     5.355231
std      2.430316
min      2.677615
25%      4.322090
50%      5.966565
75%      6.694038
max      7.421512
Name: target, dtype: float64

In [18]:
# Preview the two columns that go into the submission file.
ensemble_preds_df[[id_column, target_column]].head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,7.421512
1,ISIC_0015729,2.677615
2,ISIC_0015740,5.966565


In [19]:
# Write the submission: id and blended score only, without the index column.
ensemble_preds_df[[id_column, target_column]].to_csv("submission.csv", index=False)