In [1]:
import gc
import time
import json
import joblib
from pprint import pprint
from typing import List, Dict
from collections import defaultdict

from pathlib import Path

import pandas as pd
import numpy as np

import h5py
from io import BytesIO
from PIL import Image

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

from timm import create_model

import albumentations as A
from albumentations.pytorch import ToTensorV2

from accelerate import Accelerator

from isic_helper import DotDict

In [2]:
# Column names shared by every stage of the pipeline.
id_column, target_column = "isic_id", "target"
group_column, fold_column = "patient_id", "fold"

# Root directory of the competition data.
INPUT_PATH = Path("/kaggle/input/isic-2024-challenge/")

# Each run is (model name, version, mode); the per-field lists and the derived
# OOF column names / dataset paths below are all built from these tuples.
_BOOSTING_RUNS = (
    ("xgb", "v1", "train"),
    ("xgb", "v2", "train"),
    ("lgb", "v6", "train"),
)
boosting_model_names = [run[0] for run in _BOOSTING_RUNS]
boosting_versions = [run[1] for run in _BOOSTING_RUNS]
boosting_modes = [run[2] for run in _BOOSTING_RUNS]
boosting_oof_columns = [
    f"oof_{model_name}_{version}" for model_name, version, _ in _BOOSTING_RUNS
]
boosting_paths = [
    f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}"
    for model_name, version, mode in _BOOSTING_RUNS
]

_CNN_RUNS = (
    ("efficientnet_b2", "v1", "pretrain"),
    ("efficientnet_b2", "v2", "pretrain"),
    ("resnet18", "v1", "pretrain"),
)
cnn_model_names = [run[0] for run in _CNN_RUNS]
cnn_versions = [run[1] for run in _CNN_RUNS]
cnn_modes = [run[2] for run in _CNN_RUNS]
cnn_oof_columns = [
    f"oof_{model_name}_{version}" for model_name, version, _ in _CNN_RUNS
]
cnn_paths = [
    f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}"
    for model_name, version, mode in _CNN_RUNS
]

# Boosting columns first, then CNN columns; `weights` is paired with this list
# position by position when the ensemble is built.
oof_columns = [*boosting_oof_columns, *cnn_oof_columns]

weights = [
    9.184200253001404,   # oof_xgb_v1
    0.8552034877824392,  # oof_xgb_v2
    2.4021046742664947,  # oof_lgb_v6
    4.764737619598357,   # oof_efficientnet_b2_v1
    3.2308662644235215,  # oof_efficientnet_b2_v2
    1.282657759225282,   # oof_resnet18_v1
]

# Rows sampled from train for a dry run, the test size the runtime estimate is
# scaled to, and the running total of measured prediction time in seconds.
SAMPLE_SIZE = 5000
EXPECTED_TEST_SIZE = 500000
TOTAL_RUNTIME = 0

In [3]:
# Raw numeric metadata columns; each one also gets a per-patient normalised
# variant in boosting_feature_engineering.
numerical_features = [
    "age_approx",
    "clin_size_long_diam_mm",
    "tbp_lv_A",
    "tbp_lv_Aext",
    "tbp_lv_B",
    "tbp_lv_Bext",
    "tbp_lv_C",
    "tbp_lv_Cext",
    "tbp_lv_H",
    "tbp_lv_Hext",
    "tbp_lv_L",
    "tbp_lv_Lext",
    "tbp_lv_areaMM2",
    "tbp_lv_area_perim_ratio",
    "tbp_lv_color_std_mean",
    "tbp_lv_deltaA",
    "tbp_lv_deltaB",
    "tbp_lv_deltaL",
    "tbp_lv_deltaLB",
    "tbp_lv_deltaLBnorm",
    "tbp_lv_eccentricity",
    "tbp_lv_minorAxisMM",
    "tbp_lv_nevi_confidence",
    "tbp_lv_norm_border",
    "tbp_lv_norm_color",
    "tbp_lv_perimeterMM",
    "tbp_lv_radial_color_std_max",
    "tbp_lv_stdL",
    "tbp_lv_stdLExt",
    "tbp_lv_symm_2axis",
    "tbp_lv_symm_2axis_angle",
    "tbp_lv_x",
    "tbp_lv_y",
    "tbp_lv_z",
]

# Categorical columns, grouped by name into "ord" and "ohe" sets
# (presumably ordinal vs one-hot encoding in the saved encoder; verify).
ord_categorical_features = ["sex", "tbp_lv_location", "tbp_tile_type", "tbp_lv_location_simple"]
ohe_categorical_features = ["anatom_site_general", "attribution"]

# Long attribution strings -> short codes; applied in boosting_preprocess.
attribution_mapper = dict([
    ("Memorial Sloan Kettering Cancer Center", "MSKCC"),
    ("ACEMID MIA", "ACEMIDMIA"),
    ("Department of Dermatology, Hospital Clínic de Barcelona", "DoD_HCB"),
    ("University Hospital of Basel", "UHB"),
    ("Frazer Institute, The University of Queensland, Dermatology Research Centre", "FI_TUQ-DRC"),
    ("Department of Dermatology, University of Athens, Andreas Syggros Hospital of Skin and Venereal Diseases, Alexander Stratigos, Konstantinos Liopyris", "DoD_UA"),
    ("ViDIR Group, Department of Dermatology, Medical University of Vienna", "ViDIR"),
])

def boosting_preprocess(df):
    """Clean the categorical metadata columns used by the boosting models.

    Missing `anatom_site_general` / `sex` values are replaced by placeholder
    labels, `tbp_tile_type` is shortened to "white" / "XP" (any other value
    becomes NaN), and `attribution` is replaced by its short code from
    `attribution_mapper` (unmapped values become NaN).

    The frame is modified in place and also returned.
    """
    placeholders = {"anatom_site_general": "missing_site", "sex": "missing_sex"}
    for column, placeholder in placeholders.items():
        df[column] = df[column].fillna(placeholder)

    tile_labels = {"3D: white": "white", "3D: XP": "XP"}
    df["tbp_tile_type"] = df["tbp_tile_type"].map(tile_labels)
    df["attribution"] = df["attribution"].map(attribution_mapper)
    return df

def norm_feature(df, value_col, group_cols=[group_column], err=1e-5):
    """Add a z-score of `value_col` computed within each group.

    Args:
        df: input frame; it is not modified, a merged copy is returned.
        value_col: numeric column to normalise.
        group_cols: columns defining the groups (defaults to the patient id).
        err: small constant added to the group std to avoid division by zero.

    Returns:
        (merged_df, feature_name). `merged_df` carries three extra columns:
        `<value_col>_mean`, `<value_col>_std` and `<value_col>_patient_norm`.
        `feature_name` is the name of the last one.
    """
    aggregations = ["mean", "std"]
    group_stats = df.groupby(group_cols)[value_col].agg(aggregations)
    group_stats.columns = [f"{value_col}_{stat}" for stat in aggregations]
    group_stats = group_stats.reset_index()

    merged = df.merge(group_stats, on=group_cols, how="left")

    feature_name = f"{value_col}_patient_norm"
    centered = merged[value_col] - merged[f"{value_col}_mean"]
    scale = merged[f"{value_col}_std"] + err
    merged[feature_name] = centered / scale
    return merged, feature_name

def count_features(df, col):
    """Attach per-patient counts of each distinct value of `col`.

    A pivot table with one row per patient and one column per value of `col`
    (cell = number of ids, 0 when absent) is left-merged back onto `df`.

    Returns:
        (merged_df, feature_cols), where `feature_cols` lists the new count
        columns, one per distinct value of `col`.
    """
    counts = df[[id_column, group_column, col]].pivot_table(
        index=group_column,
        columns=col,
        values=id_column,
        aggfunc="count",
        fill_value=0,
    )
    feature_cols = counts.columns.tolist()
    counts = counts.reset_index()
    counts.index.name = None
    merged = df.merge(counts, on=group_column, how="left")
    return merged, feature_cols

def boosting_feature_engineering(df, err=1e-5):
    """Derive the hand-crafted numeric features used by the boosting models.

    The ratio / interaction columns are written onto `df` in place. The
    per-patient normalisation step then rebinds `df` to the merged frame
    returned by `norm_feature`, so the returned frame is a new object that
    also carries the `<col>_mean` / `<col>_std` helper columns.

    Args:
        df: metadata frame containing the raw `tbp_lv_*`, `age_approx`,
            `clin_size_long_diam_mm`, patient-id and image-id columns.
        err: small constant; only used in the `color_uniformity` denominator.

    Returns:
        (df, new_num_cols): the augmented frame and the names of the new
        feature columns, in creation order.

    NOTE(review): apart from `color_uniformity`, the divisions have no epsilon,
    so a zero denominator produces inf/NaN rather than raising.
    NOTE(review): the saved models were presumably trained on exactly these
    definitions, so any formula change here needs a matching retrain. Confirm
    before "fixing" anything below.
    """
    new_num_cols = []
    
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    new_num_cols += ["lesion_size_ratio"]
    
    df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / df["tbp_lv_perimeterMM"]**2
    new_num_cols += ["lesion_shape_index"]
    
    df["hue_contrast"] = np.abs(df["tbp_lv_H"] - df["tbp_lv_Hext"])
    new_num_cols += ["hue_contrast"]
    
    df["luminance_contrast"] = np.abs(df["tbp_lv_L"] - df["tbp_lv_Lext"])
    new_num_cols += ["luminance_contrast"]
    
    df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"]**2 +
                                            df["tbp_lv_deltaB"]**2 +
                                            df["tbp_lv_deltaL"]**2)
    new_num_cols += ["lesion_color_difference"]
    
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    new_num_cols += ["border_complexity"]
    
    # The only ratio that uses `err` in its denominator.
    df["color_uniformity"] = df["tbp_lv_color_std_mean"] / (df["tbp_lv_radial_color_std_max"] + err)
    new_num_cols += ["color_uniformity"]
    
    df["position_distance_3d"] = np.sqrt(df["tbp_lv_x"]**2 +
                                         df["tbp_lv_y"]**2 +
                                         df["tbp_lv_z"]**2)
    new_num_cols += ["position_distance_3d"]
    
    df["perimeter_to_area_ratio"] = df["tbp_lv_perimeterMM"] / df["tbp_lv_areaMM2"]
    new_num_cols += ["perimeter_to_area_ratio"]
    
    df["area_to_perimeter_ratio"] = df["tbp_lv_areaMM2"] / df["tbp_lv_perimeterMM"]
    new_num_cols += ["area_to_perimeter_ratio"]
    
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    new_num_cols += ["lesion_visibility_score"]
    
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    new_num_cols += ["symmetry_border_consistency"]
    
    # Product over sum of the two scores.
    df["consistency_symmetry_border"] = (df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"] /
                                         (df["tbp_lv_symm_2axis"] + df["tbp_lv_norm_border"]))
    new_num_cols += ["consistency_symmetry_border"]
    
    df["color_consistency"] = df["tbp_lv_stdL"] / df["tbp_lv_Lext"]
    new_num_cols += ["color_consistency"]
    
    # NOTE(review): numerator and denominator are the same product, so this is
    # 1.0 wherever the product is finite and non-zero, NaN otherwise. It looks
    # like product/sum (as in consistency_symmetry_border) was intended.
    # Left untouched on purpose: changing it only at inference time would
    # diverge from whatever the saved models were trained on.
    df["consistency_color"] = (df["tbp_lv_stdL"] * df["tbp_lv_Lext"] /
                               (df["tbp_lv_stdL"] * df["tbp_lv_Lext"]))
    new_num_cols += ["consistency_color"]
    
    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    new_num_cols += ["size_age_interaction"]
    
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    new_num_cols += ["hue_color_std_interaction"]
    
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] +
                                   df["tbp_lv_norm_color"] +
                                   df["tbp_lv_eccentricity"]) / 3
    new_num_cols += ["lesion_severity_index"]
    
    # Builds on two features created above, so it must come after them.
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    new_num_cols += ["shape_complexity_index"]
    
    df["color_contrast_index"] = (df["tbp_lv_deltaA"] +
                                  df["tbp_lv_deltaB"] + 
                                  df["tbp_lv_deltaL"] +
                                  df["tbp_lv_deltaLBnorm"])
    new_num_cols += ["color_contrast_index"]
    
    df["log_lesion_area"] = np.log1p(df["tbp_lv_areaMM2"])
    new_num_cols += ["log_lesion_area"]
    
    df["normalized_lesion_size"] = df["clin_size_long_diam_mm"] / df["age_approx"]
    new_num_cols += ["normalized_lesion_size"]
    
    # Despite the name this is the mean of the inner and outer hue, not a difference.
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    new_num_cols += ["mean_hue_difference"]
    
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"]**2 +
                                      df["tbp_lv_deltaB"]**2 + 
                                      df["tbp_lv_deltaL"]**2) / 3)
    new_num_cols += ["std_dev_contrast"]
    
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + 
                                         df["tbp_lv_area_perim_ratio"] +
                                         df["tbp_lv_symm_2axis"]) / 3
    new_num_cols += ["color_shape_composite_index"]
    
    df["lesion_orientation_3d"] = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    new_num_cols += ["lesion_orientation_3d"]
    
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + 
                                      df["tbp_lv_deltaB"] + 
                                      df["tbp_lv_deltaL"]) / 3
    new_num_cols += ["overall_color_difference"]
    
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    new_num_cols += ["symmetry_perimeter_interaction"]
    
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] +
                                        df["tbp_lv_eccentricity"] +
                                        df["tbp_lv_norm_color"] +
                                        df["tbp_lv_symm_2axis"]) / 4
    new_num_cols += ["comprehensive_lesion_index"]
    
    df["color_variance_ratio"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_stdLExt"]
    new_num_cols += ["color_variance_ratio"]
    
    df["border_color_interaction"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    new_num_cols += ["border_color_interaction"]
    
    df["border_color_interaction_2"] = ((df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]) /
                                        (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"]))
    new_num_cols += ["border_color_interaction_2"]
    
    df["size_color_contrast_ratio"] = df["clin_size_long_diam_mm"] / df["tbp_lv_deltaLBnorm"]
    new_num_cols += ["size_color_contrast_ratio"]
    
    df["age_normalized_nevi_confidence"] = df["tbp_lv_nevi_confidence"] / df["age_approx"]
    new_num_cols += ["age_normalized_nevi_confidence"]
    
    df["age_normalized_nevi_confidence_2"] = np.sqrt(df["tbp_lv_nevi_confidence"]**2 + df["age_approx"]**2)
    new_num_cols += ["age_normalized_nevi_confidence_2"]
    
    df["color_asymmetry_index"] = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    new_num_cols += ["color_asymmetry_index"]
    
    df["volume_approximation_3d"] = df["tbp_lv_areaMM2"] * np.sqrt(df["tbp_lv_x"]**2 +
                                                                   df["tbp_lv_y"]**2 +
                                                                   df["tbp_lv_z"]**2)
    new_num_cols += ["volume_approximation_3d"]
    
    df["color_range"] = (np.abs(df["tbp_lv_L"] - df["tbp_lv_Lext"]) +
                         np.abs(df["tbp_lv_A"] - df["tbp_lv_Aext"]) +
                         np.abs(df["tbp_lv_B"] - df["tbp_lv_Bext"]))
    new_num_cols += ["color_range"]
    
    df["shape_color_consistency"] = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    new_num_cols += ["shape_color_consistency"]
    
    df["border_length_ratio"] = df["tbp_lv_perimeterMM"] / np.sqrt(2 * df["tbp_lv_areaMM2"])
    new_num_cols += ["border_length_ratio"]
    
    df["age_size_symmetry_index"] = (df["age_approx"] *
                                     df["clin_size_long_diam_mm"] *
                                     df["tbp_lv_symm_2axis"])
    new_num_cols += ["age_size_symmetry_index"]
    
    df["age_area_symmetry"] = (df["age_approx"] *
                               df["tbp_lv_areaMM2"] *
                               df["tbp_lv_symm_2axis"])
    new_num_cols += ["age_area_symmetry"]
    
    # Per-patient z-score of every raw numeric column. Each call merges the
    # group mean/std back in, so `df` is rebound to a new frame here.
    for col in numerical_features:
        df, feature_name = norm_feature(df, col)
        new_num_cols += [feature_name]
    
    # Number of rows (images) belonging to the same patient within this frame.
    df["num_images"] = df[group_column].map(df.groupby(group_column)[id_column].count())
    new_num_cols += ["num_images"]

    return df, new_num_cols


# class PAUC:
#     def get_final_error(self, error, weight):
#         return error

#     def is_max_optimal(self):
#         return True

#     def evaluate(self, approxes, target, weight):
#         y_true = target.astype(int)
#         y_pred = approxes[0].astype(float)
        
#         score = compute_pauc(y_true, y_pred, min_tpr=0.8)
        
#         return score, 1.0


def pauc_80(y_train, y_pred):
    """Return the partial-AUC score of `y_pred` against `y_train` with min_tpr=0.8.

    NOTE(review): `compute_pauc` is neither defined nor imported in the visible
    part of this notebook, so calling this function raises NameError unless the
    name is provided elsewhere. Confirm where it should come from.
    """
    return compute_pauc(y_train, y_pred, min_tpr=0.8)


def get_boosting_predictions(train, test, model_name, version, path, oof_column):
    """Predict `test` with every saved fold model of one boosting run.

    Args:
        train: training frame; used to re-fit the saved encoder and to
            enumerate the fold ids found in its fold column.
        test: frame to predict; must contain the id column.
        model_name, version: identify the run's artefact file names.
        path: `Path` of the dataset directory holding the artefacts.
        oof_column: name of the output column with the fold-averaged score.

    Returns:
        (predictions, seconds): a frame with the id column and `oof_column`,
        and the wall-clock time spent in this function.

    NOTE: the artefacts are loaded with joblib (pickle-based); only load files
    from a trusted source.
    """
    started = time.time()

    metadata_file = path / f"{model_name}_{version}_run_metadata.json"
    with open(metadata_file, "r") as f:
        run_metadata = json.load(f)
    pprint(run_metadata)

    encoder_file = path / f"{model_name}_{version}_encoder.joblib"
    with open(encoder_file, "rb") as f:
        mixed_encoded_preprocessor = joblib.load(f)

    # The encoder is re-fitted on the training frame before transforming test.
    X_test = mixed_encoded_preprocessor.fit(train).transform(test)
    print(f"Total number of columns: {len(X_test.columns)}")

    fold_ids = np.unique(train[fold_column])
    fold_names = [f"fold_{fold}" for fold in fold_ids]

    predictions = pd.DataFrame({id_column: test[id_column]})
    for fold, fold_name in zip(fold_ids, fold_names):
        with open(path / f"models/{model_name}_{version}_fold_{fold}.txt", "rb") as f:
            estimator = joblib.load(f)
        # Last column of predict_proba = probability of the last class.
        predictions[fold_name] = estimator.predict_proba(X_test)[:, -1]

    # Simple mean over the folds.
    predictions[oof_column] = predictions[fold_names].mean(axis=1)
    elapsed = time.time() - started
    return predictions[[id_column, oof_column]], elapsed

In [4]:
# Load the competition metadata; the literal string "NA" is parsed as missing.
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False, na_values=["NA"])
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv", low_memory=False, na_values=["NA"])

# Attach the precomputed fold assignment. The inner join keeps only training
# rows whose (id, patient) pair appears in folds.csv.
folds_df = pd.read_csv("/kaggle/input/isic-scd-folds/folds.csv")
train_metadata = train_metadata.merge(folds_df, on=[id_column, group_column], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

# Same cleaning and feature engineering for both frames.
train_metadata = boosting_preprocess(train_metadata)
test_metadata = boosting_preprocess(test_metadata)

train_metadata, new_num_cols = boosting_feature_engineering(train_metadata)
test_metadata, _ = boosting_feature_engineering(test_metadata)

Train data size: (401059, 56)
Test data size: (3, 44)


In [5]:
# A 3-row test file is treated as a dry run (presumably the placeholder test
# set available outside a real submission; confirm). In that case a fixed
# random sample of training rows stands in for the test set.
if test_metadata.shape[0] == 3:
    test_metadata = train_metadata.sample(n=SAMPLE_SIZE, random_state=42)
# One prediction column per boosting run, merged on the id column.
for idx, (model_name, version, path, oof_column) in enumerate(zip(boosting_model_names, boosting_versions, boosting_paths, boosting_oof_columns)):
    print(f"Generating predictions for {model_name}_{version}")
    model_preds_df, runtime = get_boosting_predictions(
        train_metadata, 
        test_metadata,
        model_name, 
        version, 
        Path(path),
        oof_column
    )
    print("\n")
    # The first run creates the ensemble frame; later runs are merged onto it.
    if idx == 0:
        ensemble_preds_df = model_preds_df.copy()
    else:
        ensemble_preds_df = ensemble_preds_df.merge(model_preds_df, on=id_column, how="left")
    # NOTE: accumulates into a notebook-level counter, so re-running this cell
    # without resetting TOTAL_RUNTIME double-counts the time.
    TOTAL_RUNTIME += runtime

Generating predictions for xgb_v1
{'best_num_rounds': {'fold_1': 82,
                     'fold_2': 173,
                     'fold_3': 20,
                     'fold_4': 142,
                     'fold_5': 77},
 'config': {'_key': None,
            '_parent': None,
            '_temp': False,
            'model_name': 'xgb_v1',
            'models_output_dir': 'models',
            'sampling_ratio': 0.01,
            'seed': 2022},
 'cv_auc_avg': 0.9673757713901316,
 'cv_auc_oof': 0.9464921630675764,
 'cv_auc_std': 0.006521637731286478,
 'cv_pauc_avg': 0.17368981947890508,
 'cv_pauc_oof': 0.15296119492851015,
 'cv_pauc_std': 0.00587196296972231,
 'es_rounds': 150,
 'num_rounds': 2000,
 'params': {'alpha': 0.6779926606782505,
            'colsample_bylevel': 0.5476090898823716,
            'colsample_bynode': 0.9928601203635129,
            'colsample_bytree': 0.8437772277074493,
            'disable_default_eval_metric': True,
            'enable_categorical': True,
            'lambd

In [6]:
# Drop the boosting-stage frames; the CNN stage below re-reads the metadata
# from disk and applies its own preprocessing.
del train_metadata, test_metadata
gc.collect()

88

In [7]:
class _DefaultZeroDict(dict):
    """Category -> integer-code mapping whose unknown keys resolve to 0.

    Unlike `collections.defaultdict`, a failed lookup does NOT store the key.
    This matters because `Series.map` looks every value up through
    `__getitem__`: with a defaultdict each unseen category (or NaN) would be
    inserted, growing `len(mapping)`, which `get_emb_szs` uses as the
    embedding-table size. A grown table no longer matches the shapes stored in
    the trained checkpoints.
    """

    def __missing__(self, key):
        # Fall back to code 0 without mutating the mapping.
        return 0


# Integer codes for every categorical column fed to the CNN metadata branch.
# Code 0 doubles as the "missing / unknown" bucket for each column.
feature_mapping_dict = {
    "sex": _DefaultZeroDict({
        "missing_sex": 0,
        "female": 1,
        "male": 2,
    }),
    "anatom_site_general": _DefaultZeroDict({
        "missing_anatom_site_general": 0,
        "lower extremity": 1,
        "head/neck": 2,
        "posterior torso": 3,
        "anterior torso": 4,
        "upper extremity": 5,
    }),
    "tbp_tile_type": _DefaultZeroDict({
        "3D: white": 0,
        "3D: XP": 1,
    }),
    "tbp_lv_location": _DefaultZeroDict({
        "Unknown": 0,
        "Right Leg - Upper": 1,
        "Head & Neck": 2,
        "Torso Back Top Third": 3,
        "Torso Front Top Half": 4,
        "Right Arm - Upper": 5,
        "Left Leg - Upper": 6,
        "Torso Front Bottom Half": 7,
        "Left Arm - Upper": 8,
        "Right Leg": 9,
        "Torso Back Middle Third": 10,
        "Right Arm - Lower": 11,
        "Right Leg - Lower": 12,
        "Left Leg - Lower": 13,
        "Left Arm - Lower": 14,
        "Left Leg": 15,
        "Torso Back Bottom Third": 16,
        "Left Arm": 17,
        "Right Arm": 18,
        "Torso Front": 19,
        "Torso Back": 20
    }),
    "tbp_lv_location_simple": _DefaultZeroDict({
        "Unknown": 0,
        "Right Leg": 1,
        "Head & Neck": 2,
        "Torso Back": 3,
        "Torso Front": 4,
        "Right Arm": 5,
        "Left Leg": 6,
        "Left Arm": 7,
    }),
}


def cnn_preprocess(df):
    """Encode the metadata columns consumed by the CNN's tabular branch.

    `age_approx` has missing values set to 0 and is divided by 90. `sex` and
    `anatom_site_general` get a placeholder label for missing values. All five
    categorical columns are then replaced by their integer codes from
    `feature_mapping_dict`.

    The frame is modified in place and also returned.
    """
    df["age_approx"] = df["age_approx"].fillna(0)
    df["age_approx"] = df["age_approx"] / 90

    # Only these two columns get an explicit placeholder before encoding.
    placeholders = (
        ("sex", "missing_sex"),
        ("anatom_site_general", "missing_anatom_site_general"),
    )
    for column, placeholder in placeholders:
        df[column] = df[column].fillna(placeholder)

    encoded_columns = (
        "sex",
        "anatom_site_general",
        "tbp_tile_type",
        "tbp_lv_location",
        "tbp_lv_location_simple",
    )
    for column in encoded_columns:
        df[column] = df[column].map(feature_mapping_dict[column])
    return df


def get_emb_szs(cat_cols):
    """Return `{column: (num_categories, embedding_dim)}` for each categorical column.

    `num_categories` is the current number of keys in the column's entry of
    `feature_mapping_dict`; the dimension is `round(1.6 * n ** 0.56)`, capped
    at 600.
    """
    def _shape_for(col):
        cardinality = len(feature_mapping_dict[col])
        return cardinality, min(600, round(1.6 * cardinality ** 0.56))

    return {col: _shape_for(col) for col in cat_cols}


def cnn_feature_engineering(df, stats_dict=None):
    """Prepare the continuous metadata columns for the CNN's tabular branch.

    Adds `num_images` (log1p of the number of rows per patient in `df`) and
    standardises every continuous column except `num_images` and `age_approx`.

    Args:
        df: metadata frame; modified in place.
        stats_dict: optional `{column: {"mean": ..., "std": ...}}`. When None
            the statistics are computed from `df`; otherwise the given ones
            are applied (e.g. train statistics applied to test).

    Returns:
        (df, cat_cols, cont_cols, stats_dict)
    """
    cat_cols = [
        "sex",
        "anatom_site_general",
        "tbp_tile_type",
        "tbp_lv_location",
        "tbp_lv_location_simple",
    ]
    cont_cols = [
        "age_approx",
        "clin_size_long_diam_mm",
        "tbp_lv_A", "tbp_lv_Aext",
        "tbp_lv_B", "tbp_lv_Bext",
        "tbp_lv_C", "tbp_lv_Cext",
        "tbp_lv_H", "tbp_lv_Hext",
        "tbp_lv_L", "tbp_lv_Lext",
        "tbp_lv_areaMM2",
        "tbp_lv_area_perim_ratio",
        "tbp_lv_color_std_mean",
        "tbp_lv_deltaA", "tbp_lv_deltaB", "tbp_lv_deltaL",
        "tbp_lv_deltaLB", "tbp_lv_deltaLBnorm",
        "tbp_lv_eccentricity",
        "tbp_lv_minorAxisMM",
        "tbp_lv_nevi_confidence",
        "tbp_lv_norm_border",
        "tbp_lv_norm_color",
        "tbp_lv_perimeterMM",
        "tbp_lv_radial_color_std_max",
        "tbp_lv_stdL", "tbp_lv_stdLExt",
        "tbp_lv_symm_2axis", "tbp_lv_symm_2axis_angle",
        "tbp_lv_x", "tbp_lv_y", "tbp_lv_z",
        "num_images",
    ]

    # Rows per patient within this frame, compressed with log1p.
    images_per_patient = df.groupby("patient_id")["isic_id"].count()
    df["num_images"] = np.log1p(df["patient_id"].map(images_per_patient))

    # These two columns are passed through without standardisation.
    passthrough = ("num_images", "age_approx")
    scaled_cols = [col for col in cont_cols if col not in passthrough]

    if stats_dict is None:
        stats_dict = {
            col: {"mean": df[col].mean(), "std": df[col].std()}
            for col in scaled_cols
        }

    for col in scaled_cols:
        col_stats = stats_dict[col]
        df[col] = (df[col] - col_stats["mean"]) / col_stats["std"]

    return df, cat_cols, cont_cols, stats_dict

def test_augment(image_size, mean=None, std=None):
    """Build the deterministic inference transform: resize, normalise, to tensor.

    Args:
        image_size: target side length; images are resized to a square.
        mean, std: normalisation statistics. They are only used when BOTH are
            given; otherwise `A.Normalize` falls back to its own defaults.

    Returns:
        An `albumentations.Compose` pipeline.
    """
    normalize_kwargs = {"max_pixel_value": 255.0, "p": 1.0}
    if mean is not None and std is not None:
        normalize_kwargs.update(mean=mean, std=std)

    steps = [
        A.Resize(image_size, image_size),
        A.Normalize(**normalize_kwargs),
        ToTensorV2(),
    ]
    return A.Compose(steps, p=1.0)


class ISICDataset(Dataset):
    """Dataset yielding a decoded image plus optional tabular metadata.

    Args:
        metadata: frame with one row per sample; needs an `isic_id` column
            (and `target` when `infer` is False).
        images: mapping from `isic_id` to an object whose `[()]` yields the
            encoded image bytes (the notebook passes an open `h5py.File`).
        augment: callable applied as `augment(image=...)["image"]`, or None.
        use_meta: when True, `cat_cols` / `cont_cols` are read from the row.
        cat_cols, cont_cols: names of the categorical / continuous columns.
        infer: when True, items carry no target.
    """

    def __init__(self, metadata, images, augment,
                 use_meta=False, cat_cols: List = None, cont_cols: List = None,
                 infer=False):
        self.metadata, self.images, self.augment = metadata, images, augment
        self.use_meta = use_meta
        self.cat_cols, self.cont_cols = cat_cols, cont_cols
        self.length = len(self.metadata)
        self.infer = infer

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        """Return `(image, x_cat, x_cont)` and, unless `infer`, the target."""
        row = self.metadata.iloc[index]

        encoded = self.images[row["isic_id"]][()]
        image = np.array(Image.open(BytesIO(encoded)))
        if self.augment is not None:
            image = self.augment(image=image)["image"].float()

        if self.use_meta:
            cat_values = [row[col] for col in self.cat_cols]
            cont_values = [row[col] for col in self.cont_cols]
            x_cat = torch.tensor(cat_values, dtype=torch.long)
            x_cont = torch.tensor(cont_values, dtype=torch.float)
        else:
            # Scalar placeholders keep the item structure uniform.
            x_cat, x_cont = torch.tensor(0), torch.tensor(0)

        sample = (image, x_cat, x_cont)
        if self.infer:
            return sample
        return sample + (torch.tensor(row["target"]),)

    
class ISICNet(nn.Module):
    """timm image backbone with an optional tabular-metadata branch.

    Without metadata the pooled image features go straight through a linear
    layer to one logit. With metadata the image features are projected to 256
    dims, the metadata (categorical embeddings + batch-normed continuous
    values) is reduced to 64 dims by a small MLP, and the concatenation is
    classified by a final linear layer.

    Args:
        model_name: timm architecture name.
        pretrained: whether timm should load pretrained backbone weights.
        use_meta: enable the metadata branch.
        cat_cols, cont_cols: column names of the categorical / continuous inputs.
        emb_szs: `{column: (num_categories, embedding_dim)}`.
    """

    def __init__(
        self,
        model_name,
        pretrained=True,
        use_meta=False,
        cat_cols: List = None, cont_cols: List = None, emb_szs: Dict = None,
    ):
        super().__init__()
        # Backbone without classifier head or pooling: it emits a feature map.
        self.model = create_model(
            model_name=model_name,
            pretrained=pretrained,
            in_chans=3,
            num_classes=0,
            global_pool="",
        )
        in_dim = self.model.num_features
        self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        self.use_meta = use_meta

        if not use_meta:
            self.linear = nn.Linear(in_dim, 1)
            return

        self.linear = nn.Linear(in_dim, 256)

        emb_counts = [emb_szs[col][0] for col in cat_cols]
        emb_dims = [emb_szs[col][1] for col in cat_cols]
        self.embeddings = nn.ModuleList(
            [nn.Embedding(count, dim) for count, dim in zip(emb_counts, emb_dims)]
        )
        self.embedding_dropout = nn.Dropout(0.1)
        n_emb = sum(emb_dims)
        n_cont = len(cont_cols)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        self.meta = nn.Sequential(
            nn.Linear(n_emb + n_cont, 256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.BatchNorm1d(64),
            nn.SiLU(),
            nn.Dropout(0.1),
        )
        self.classifier = nn.Linear(256 + 64, 1)

    def forward(self, images, x_cat=None, x_cont=None):
        """Return one logit per sample, shape `(batch, 1)`."""
        feature_map = self.model(images)
        pooled = F.adaptive_avg_pool2d(feature_map, 1).reshape(len(images), -1)

        if self.training:
            # Multi-sample dropout: average the head over several dropout masks.
            x_image = sum(self.linear(drop(pooled)) for drop in self.dropouts)
            x_image = x_image / len(self.dropouts)
        else:
            x_image = self.linear(pooled)

        if not self.use_meta:
            return x_image

        # Column i of x_cat feeds embedding i.
        embedded = torch.cat(
            [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)], 1
        )
        embedded = self.embedding_dropout(embedded)
        x_cont = self.bn_cont(x_cont)
        x_meta = self.meta(torch.cat([embedded, x_cont], 1))
        return self.classifier(torch.cat([x_image, x_meta], 1))


def get_trans(img, iteration):
    """Return the test-time-augmentation view number `iteration` of a batch.

    `img` is indexed as (batch, channel, dim 2, dim 3). Views 0-5 are: identity,
    flip along dim 2, flip along dim 3, and rotations by 90/180/270 degrees in
    the (2, 3) plane. From `iteration` 6 onwards the same six views are applied
    to the batch with dims 2 and 3 transposed.
    """
    view = img.transpose(2, 3) if iteration >= 6 else img
    variant = iteration % 6

    if variant == 0:
        return view

    flip_dim = {1: 2, 2: 3}
    if variant in flip_dim:
        return torch.flip(view, dims=[flip_dim[variant]])

    quarter_turns = {3: 1, 4: 2, 5: 3}
    if variant in quarter_turns:
        return torch.rot90(view, quarter_turns[variant], dims=[2, 3])

    
def predict(model, test_dataloader, accelerator, n_tta, use_meta, log_interval=10):
    """Run inference with test-time augmentation and return probabilities.

    For every batch the model is evaluated on `n_tta` views from `get_trans`.
    The per-view sigmoid probabilities (not the logits) are averaged.

    Args:
        model: network returning logits; called as `model(images)` or
            `model(images, x_cat, x_cont)` depending on `use_meta`.
        test_dataloader: yields `(images, x_cat, x_cont)` batches.
        accelerator: object whose `gather` collects tensors across processes.
        n_tta: number of augmented views per batch; must be >= 1.
        use_meta: whether to pass the metadata tensors to the model.
        log_interval: print progress on the first step and then every
            `log_interval` steps.

    Returns:
        NumPy array with the concatenated per-sample probabilities, in the
        order the dataloader produced them.
    """
    model.eval()
    test_probs = []
    total_steps = len(test_dataloader)
    with torch.no_grad():
        for step, (images, x_cat, x_cont) in enumerate(test_dataloader):
            # Only the probabilities are accumulated; the previous running sum
            # of raw logits was never used and has been dropped.
            probs = 0
            for i in range(n_tta):
                view = get_trans(images, i)
                if use_meta:
                    logits_iter = model(view, x_cat, x_cont)
                else:
                    logits_iter = model(view)
                probs += torch.sigmoid(logits_iter)
            probs /= n_tta

            probs = accelerator.gather(probs)
            test_probs.append(probs)

            if (step == 0) or ((step + 1) % log_interval == 0):
                print(f"Step: {step + 1}/{total_steps}")

    test_probs = torch.cat(test_probs).cpu().numpy()
    return test_probs


def get_dnn_predictions(train, test, images, model_name, version, path, oof_column):
    """Predict `test` with every saved fold checkpoint of one CNN run.

    Args:
        train: training frame; only used to enumerate the fold ids.
        test: frame to predict; must contain the id column and the metadata
            columns the dataset reads.
        images: image store passed through to `ISICDataset`.
        model_name, version: identify the run's metadata file and architecture.
        path: `Path` of the dataset directory holding the artefacts.
        oof_column: name of the output column with the fold-averaged score.

    Returns:
        (predictions, seconds): a frame with the id column and `oof_column`,
        and the wall-clock time spent in this function.

    NOTE: `cat_cols`, `cont_cols` and `emb_szs` are read from notebook globals,
    so the cell defining them must have run before this function is called.
    """
    started = time.time()

    with open(path / f"{model_name}_{version}_run_metadata.json", "r") as f:
        run_metadata = json.load(f)
    params = run_metadata["params"]
    pprint(params)

    image_size = params["image_size"]
    batch_size = params["val_batch_size"]
    use_meta = params["use_meta"]

    test_dataloader = DataLoader(
        ISICDataset(
            test,
            images,
            augment=test_augment(image_size),
            use_meta=use_meta,
            cat_cols=cat_cols,
            cont_cols=cont_cols,
            infer=True,
        ),
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        drop_last=False,
        pin_memory=True,
    )

    fold_ids = np.unique(train[fold_column])
    fold_names = [f"fold_{fold}" for fold in fold_ids]

    predictions = pd.DataFrame({id_column: test[id_column]})
    for fold, fold_name in zip(fold_ids, fold_names):
        print(f"\nFold {fold}")
        # A fresh Accelerator and a freshly built model per fold. The
        # dataloader variable is rebound to the prepared loader each time.
        accelerator = Accelerator(mixed_precision=params["mixed_precision"])
        model = ISICNet(
            model_name=model_name,
            pretrained=False,
            use_meta=use_meta,
            cat_cols=cat_cols,
            cont_cols=cont_cols,
            emb_szs=emb_szs,
        ).to(accelerator.device)

        model, test_dataloader = accelerator.prepare(model, test_dataloader)
        accelerator.load_state(path / f"models/fold_{fold}")

        predictions[fold_name] = predict(
            model, test_dataloader, accelerator,
            n_tta=params["n_tta"], use_meta=use_meta,
        )

    # Simple mean over the folds.
    predictions[oof_column] = predictions[fold_names].mean(axis=1)
    elapsed = time.time() - started
    return predictions[[id_column, oof_column]], elapsed

In [8]:
# Re-read the raw metadata: the CNN stage uses its own preprocessing, not the
# boosting features computed earlier.
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False, na_values=["NA"])
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv", low_memory=False, na_values=["NA"])

# The inner join keeps only training rows that have a fold assignment.
folds_df = pd.read_csv("/kaggle/input/isic-scd-folds/folds.csv")
train_metadata = train_metadata.merge(folds_df, on=[id_column, group_column], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

train_metadata = cnn_preprocess(train_metadata)
test_metadata = cnn_preprocess(test_metadata)

# Normalisation statistics come from train and are re-applied to test.
train_metadata, cat_cols, cont_cols, stats_dict = cnn_feature_engineering(train_metadata)
test_metadata, _, _, _ = cnn_feature_engineering(test_metadata, stats_dict=stats_dict)
# cat_cols, cont_cols and emb_szs are notebook globals read inside get_dnn_predictions.
emb_szs = get_emb_szs(cat_cols)

# Image stores, opened read-only; they are not closed anywhere in the visible notebook.
train_images = h5py.File(INPUT_PATH / "train-image.hdf5", mode="r")
test_images = h5py.File(INPUT_PATH / "test-image.hdf5", mode="r")

Train data size: (401059, 56)
Test data size: (3, 44)


In [9]:
# Dry run on a 3-row test file: use the same fixed random sample of training
# rows (same n and seed as the boosting stage) and read images from the
# training store.
if test_metadata.shape[0] == 3:
    test_metadata = train_metadata.sample(n=SAMPLE_SIZE, random_state=42)
    test_images = train_images
# One prediction column per CNN run, merged onto the frame started by the
# boosting stage. This cell therefore requires `ensemble_preds_df` to exist.
for idx, (model_name, version, path, oof_column) in enumerate(zip(cnn_model_names, cnn_versions, cnn_paths, cnn_oof_columns)):
    print(f"Generating predictions for {model_name}_{version}")
    model_preds_df, runtime = get_dnn_predictions(
        train_metadata, 
        test_metadata,
        test_images,
        model_name, 
        version, 
        Path(path),
        oof_column
    )
    print("\n")
    ensemble_preds_df = ensemble_preds_df.merge(model_preds_df, on=id_column, how="left")
    # NOTE: re-running this cell adds the runtime again and merges duplicate columns.
    TOTAL_RUNTIME += runtime

Generating predictions for efficientnet_b2_v1
{'debug': False,
 'down_sampling': True,
 'image_size': 128,
 'init_lr': 3e-05,
 'mixed_precision': 'fp16',
 'mode': 'pretrain',
 'n_tta': 8,
 'num_epochs': 20,
 'num_workers': 8,
 'seed': 2022,
 'train_batch_size': 64,
 'use_meta': True,
 'val_batch_size': 512}

Fold 1
Step: 1/10
Step: 10/10

Fold 2
Step: 1/10
Step: 10/10

Fold 3
Step: 1/10
Step: 10/10

Fold 4
Step: 1/10
Step: 10/10

Fold 5
Step: 1/10
Step: 10/10


Generating predictions for efficientnet_b2_v2
{'debug': False,
 'down_sampling': True,
 'image_size': 92,
 'init_lr': 3e-05,
 'mixed_precision': 'fp16',
 'mode': 'pretrain',
 'n_tta': 8,
 'num_epochs': 20,
 'num_workers': 8,
 'seed': 2022,
 'train_batch_size': 64,
 'use_meta': True,
 'val_batch_size': 512}

Fold 1
Step: 1/10
Step: 10/10

Fold 2
Step: 1/10
Step: 10/10

Fold 3
Step: 1/10
Step: 10/10

Fold 4
Step: 1/10
Step: 10/10

Fold 5
Step: 1/10
Step: 10/10


Generating predictions for resnet18_v1
{'debug': False,
 'down_sampli

In [10]:
# Linearly extrapolate the measured prediction time from SAMPLE_SIZE rows to
# EXPECTED_TEST_SIZE rows.
# NOTE(review): only meaningful when the SAMPLE_SIZE dry-run branch was taken;
# otherwise TOTAL_RUNTIME was measured on a different number of rows.
factor = EXPECTED_TEST_SIZE / SAMPLE_SIZE
expected_total_runtime = TOTAL_RUNTIME * factor
total_runtime_minutes = int(expected_total_runtime // 60)
total_runtime_seconds = expected_total_runtime % 60
print(f"Expected total runtime during submission: {total_runtime_minutes} mins and {total_runtime_seconds} secs")

Expected total runtime during submission: 334 mins and 16.996917724609375 secs


In [11]:
# Show the collected per-model predictions: the id column plus one column per model.
ensemble_preds_df

Unnamed: 0,isic_id,oof_xgb_v1,oof_xgb_v2,oof_lgb_v6,oof_efficientnet_b2_v1,oof_efficientnet_b2_v2,oof_resnet18_v1
0,ISIC_6973879,0.007072,0.002608,0.001650,0.000019,0.000090,6.663231e-06
1,ISIC_5407194,0.005565,0.001158,0.000617,0.000014,0.000107,7.876630e-07
2,ISIC_5273739,0.008066,0.005405,0.006381,0.000225,0.002308,6.009169e-05
3,ISIC_0802250,0.006882,0.002557,0.003496,0.000042,0.000395,1.347116e-05
4,ISIC_8084953,0.117304,0.086990,0.129695,0.000091,0.000566,2.165027e-05
...,...,...,...,...,...,...,...
4995,ISIC_7957551,0.005735,0.001501,0.000910,0.000083,0.000278,4.376952e-06
4996,ISIC_7499278,0.006262,0.002116,0.001097,0.000013,0.000145,1.715483e-06
4997,ISIC_5754512,0.007842,0.003500,0.004969,0.000041,0.000576,3.080450e-05
4998,ISIC_2067724,0.010438,0.005192,0.005186,0.000068,0.000262,1.566706e-05


In [12]:
# Weighted rank ensemble: each model's scores are converted to percentile ranks
# within this frame, scaled by the model's weight and summed. The result is an
# unnormalised score, not a probability.
# NOTE(review): zip() stops at the shorter of oof_columns / weights without
# warning, so the two lists must be kept the same length. `idx` is unused.
ensemble_preds = 0
for idx, (oof_column, weight) in enumerate(zip(oof_columns, weights)):
    ensemble_preds += ensemble_preds_df[oof_column].rank(pct=True).values * weight
ensemble_preds_df[target_column] = ensemble_preds
ensemble_preds_df.head()

Unnamed: 0,isic_id,oof_xgb_v1,oof_xgb_v2,oof_lgb_v6,oof_efficientnet_b2_v1,oof_efficientnet_b2_v2,oof_resnet18_v1,target
0,ISIC_6973879,0.007072,0.002608,0.00165,1.9e-05,9e-05,6.663231e-06,9.28609
1,ISIC_5407194,0.005565,0.001158,0.000617,1.4e-05,0.000107,7.87663e-07,2.695662
2,ISIC_5273739,0.008066,0.005405,0.006381,0.000225,0.002308,6.009169e-05,16.697294
3,ISIC_0802250,0.006882,0.002557,0.003496,4.2e-05,0.000395,1.347116e-05,12.144622
4,ISIC_8084953,0.117304,0.08699,0.129695,9.1e-05,0.000566,2.165027e-05,18.554648


In [13]:
# Summary statistics of the ensemble score, as a quick sanity check.
ensemble_preds_df[target_column].describe()

count    5000.000000
mean       10.862057
std         5.553865
min         0.035798
25%         6.314943
50%        10.404242
75%        15.374151
max        21.717738
Name: target, dtype: float64

In [14]:
# Preview the two columns that are written to the submission file.
ensemble_preds_df[[id_column, target_column]].head()

Unnamed: 0,isic_id,target
0,ISIC_6973879,9.28609
1,ISIC_5407194,2.695662
2,ISIC_5273739,16.697294
3,ISIC_0802250,12.144622
4,ISIC_8084953,18.554648


In [15]:
# Write the id and ensemble score columns to submission.csv, without the index.
ensemble_preds_df[[id_column, target_column]].to_csv("submission.csv", index=False)