In [1]:
import json
import joblib
from pprint import pprint
from tqdm import tqdm
from pathlib import Path

import pandas as pd
import numpy as np

import lightgbm as lgb
import catboost as cb

import h5py
from io import BytesIO
from PIL import Image

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

from timm import create_model

import albumentations as A
from albumentations.pytorch import ToTensorV2

from accelerate import Accelerator

from isic_helper import DotDict, get_folds

In [2]:
model_names = ["cb", "lgb", "resnet18", "resnet18"]
versions = ["v1", "v3", "v4", "v5"]
paths = [f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-train" for model_name, version in zip(model_names, versions)]

weights = [0.5182956326764501,
 0.5603260166616933,
 0.3794339142480809,
 0.7507678469548426]

In [3]:
id_column = "isic_id"
target_column = "target"
group_column = "patient_id"

In [4]:
def count_features(df, col):
    tmp = df[[id_column, group_column, col]].pivot_table(
        values=id_column, 
        index=group_column, 
        columns=col, 
        aggfunc="count", 
        fill_value=0)
    feature_cols = tmp.columns.tolist()
    tmp.reset_index(inplace=True)
    tmp.index.name = None
    df = df.merge(tmp, on=group_column, how="left")
    return df, feature_cols

def mean_features(df, col, val):
    tmp = df[[id_column, group_column, col, val]].pivot_table(
        values=val, 
        index=group_column, 
        columns=col, 
        aggfunc="mean", 
        fill_value=0)
    tmp.columns = [f"{c}_{val}_mean" for c in tmp.columns.tolist()]
    feature_cols = tmp.columns.tolist()
    tmp.reset_index(inplace=True)
    tmp.index.name = None
    df = df.merge(tmp, on=group_column, how="left")
    return df, feature_cols


def stat_features(df, group_cols, value_col, stats):
    tmp = df.groupby(group_cols)[value_col].agg(stats)
    tmp.columns = [f"{value_col}_{stat}" for stat in stats]
    tmp.reset_index(inplace=True)
    df = df.merge(tmp, on=group_cols, how="left")
    df[f"{value_col}_mean_diff"] = df[value_col] - df[f"{value_col}_mean"]
    return df


def feature_engineering(df):
    new_num_cols = []
    
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    new_num_cols += ["lesion_size_ratio"]
    
    df["lesion_distance"] = np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    new_num_cols += ["lesion_distance"]
    
    df["hue_contrast"] = df["tbp_lv_H"] - df["tbp_lv_Hext"]
    df, feature_cols = mean_features(df, "anatom_site_general", "hue_contrast")
    new_num_cols += feature_cols
    
    df, feature_cols = count_features(df, "anatom_site_general")
    new_num_cols += feature_cols
    
    df["tbp_lv_A_diff"] =  df["tbp_lv_Aext"] - df["tbp_lv_A"]
    df = stat_features(df, ["patient_id", "tbp_lv_location"], "tbp_lv_A_diff", ["mean"])
    new_num_cols += ["tbp_lv_A_diff_mean_diff"]
    
    df["tbp_lv_B_diff"] =  df["tbp_lv_Bext"] - df["tbp_lv_B"]
    df = stat_features(df, ["patient_id", "tbp_lv_location"], "tbp_lv_B_diff", ["mean"])
    new_num_cols += ["tbp_lv_B_diff_mean_diff"]
    
    df["tbp_lv_L_diff"] =  df["tbp_lv_Lext"] - df["tbp_lv_L"]
    df = stat_features(df, ["patient_id", "tbp_lv_location"], "tbp_lv_L_diff", ["mean"])
    new_num_cols += ["tbp_lv_L_diff_mean_diff"]
    
    df["tbp_lv_L_std_diff"] =  df["tbp_lv_stdLExt"] - df["tbp_lv_stdL"]
    df = stat_features(df, ["patient_id", "tbp_lv_location"], "tbp_lv_L_std_diff", ["mean"])
    new_num_cols += ["tbp_lv_L_std_diff_mean_diff"]
    
    df["color_uniformity"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_radial_color_std_max"]
    df, feature_cols = mean_features(df, "anatom_site_general", "color_uniformity")
    new_num_cols += feature_cols
    
    df["radius"] = np.cos(df["tbp_lv_symm_2axis_angle"]) * np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    new_num_cols += ["radius"]
    
    return df, new_num_cols

In [5]:
label_mapping = {
    "2024": {
        "Hidradenoma": "unknown",
        "Lichen planus like keratosis": "BKL",
        "Pigmented benign keratosis": "BKL",
        "Seborrheic keratosis": "BKL",
        "Solar lentigo": "BKL",
        "Nevus": "NV",
        "Angiofibroma": "unknown",
        "Dermatofibroma": "DF",
        "Fibroepithelial polyp": "unknown",
        "Scar": "unknown",
        "Hemangioma": "unknown",
        "Trichilemmal or isthmic-catagen or pilar cyst": "unknown",
        "Lentigo NOS": "BKL",
        "Verruca": "unknown",
        "Solar or actinic keratosis": "AKIEC",
        "Atypical intraepithelial melanocytic proliferation": "unknown",
        "Atypical melanocytic neoplasm": "unknown",
        "Basal cell carcinoma": "BCC",
        "Squamous cell carcinoma in situ": "SCC",
        "Squamous cell carcinoma, Invasive": "SCC",
        "Squamous cell carcinoma, NOS": "SCC",
        "Melanoma in situ": "MEL",
        "Melanoma Invasive": "MEL",
        "Melanoma metastasis": "MEL",
        "Melanoma, NOS": "MEL"
    },
    "2020": {
        "nevus": "NV",
        "melanoma": "MEL",
        "seborrheic keratosis": "BKL",
        "lentigo NOS": "BKL",
        "lichenoid keratosis": "BKL",
        "other": "unknown",
        "solar lentigo": "BKL",
        "scar": "unknown",
        "cafe-au-lait macule": "unknown",
        "atypical melanocytic proliferation": "unknown",
        "pigmented benign keratosis": "BKL"
    },
    "2019": {
        "nevus": "NV",
        "melanoma": "MEL",
        "seborrheic keratosis": "BKL",
        "pigmented benign keratosis": "BKL",
        "dermatofibroma": "DF",
        "squamous cell carcinoma": "SCC",
        "basal cell carcinoma": "BCC",
        "vascular lesion": "VASC",
        "actinic keratosis": "AKIEC",
        "solar lentigo": "BKL",
    },
}


all_labels = np.unique(list(label_mapping["2024"].values()) +
                       list(label_mapping["2020"].values()) +
                       list(label_mapping["2019"].values()))
label2idx = {label: idx for idx, label in enumerate(all_labels)}
malignant_labels = ["BCC", "MEL", "SCC"]
malignant_idx = [label2idx[label] for label in malignant_labels]

In [6]:
def val_augment(image_size):
    transform = A.Compose([A.Resize(image_size, image_size), ToTensorV2()], p=1.0)
    return transform


class ISICDataset(Dataset):
    def __init__(self, metadata, images, augment, infer=False):
        self.metadata = metadata
        self.images = images
        self.augment = augment
        self.length = len(self.metadata)
        self.infer = infer

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        row = self.metadata.iloc[index]

        image = np.array(Image.open(BytesIO(self.images[row["isic_id"]][()])))
        image = self.augment(image=image)["image"]

        data = image.float().div(255)

        if not self.infer:
            label = torch.tensor(row["label"]).long()
            return data, label

        return data

    
class ISICNet(nn.Module):
    def __init__(self, model_name, out_dim, pretrained=True, infer=False):
        super(ISICNet, self).__init__()
        self.infer = infer
        self.model = create_model(
            model_name=model_name,
            pretrained=pretrained,
            in_chans=3,
            num_classes=0,
            global_pool="",
        )
        self.classifier = nn.Linear(self.model.num_features, out_dim)

        self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])

    def forward(self, data):
        image = data
        x = self.model(image)
        bs = len(image)
        pool = F.adaptive_avg_pool2d(x, 1).reshape(bs, -1)

        if self.training:
            logit = 0
            for i in range(len(self.dropouts)):
                logit += self.classifier(self.dropouts[i](pool))
            logit = logit / len(self.dropouts)
        else:
            logit = self.classifier(pool)
        return logit
    

def get_trans(img, iteration):
    if iteration >= 6:
        img = img.transpose(2, 3)
    if iteration % 6 == 0:
        return img
    elif iteration % 6 == 1:
        return torch.flip(img, dims=[2])
    elif iteration % 6 == 2:
        return torch.flip(img, dims=[3])
    elif iteration % 6 == 3:
        return torch.rot90(img, 1, dims=[2, 3])
    elif iteration % 6 == 4:
        return torch.rot90(img, 2, dims=[2, 3])
    elif iteration % 6 == 5:
        return torch.rot90(img, 3, dims=[2, 3])

    
def predict(model, test_dataloader, accelerator, out_dim, n_tta, malignant_idx, log_interval=50):
    model.eval()
    test_probs = []
    total_steps = len(test_dataloader)
    with torch.no_grad():
        for step, data in enumerate(test_dataloader):
            logits = torch.zeros((data.shape[0], out_dim)).to(accelerator.device)
            probs = torch.zeros((data.shape[0], out_dim)).to(accelerator.device)
            for idx in range(n_tta):
                logits_iter = model(get_trans(data, idx))
                logits += logits_iter
                probs += logits_iter.softmax(1)
            logits /= n_tta
            probs /= n_tta

            logits, probs = accelerator.gather((logits, probs))
            test_probs.append(probs)

            if step % log_interval == 0:
                print(
                    f"Step: {step + 1}/{total_steps}"
                )

    test_probs = torch.cat(test_probs).cpu().numpy()
    if out_dim == 9:
        binary_probs = test_probs[:, malignant_idx].sum(1)
    else:
        binary_probs = test_probs[:, 1]
    return binary_probs

In [7]:
INPUT_PATH = Path("../input/isic-2024-challenge/")

train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False)
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv")

folds_df = get_folds()
train_metadata = train_metadata.merge(folds_df, on=["isic_id", "patient_id"], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

train_metadata, new_num_cols = feature_engineering(train_metadata.copy())
test_metadata, _ = feature_engineering(test_metadata.copy())

test_images = h5py.File(INPUT_PATH / "test-image.hdf5", mode="r")

Train data size: (401059, 57)
Test data size: (3, 44)


In [8]:
def get_boosting_predictions(train, test, test_images, model_name, version, path):
    with open(path / f"{model_name}_{version}_encoder.joblib", "rb") as f:
        mixed_encoded_preprocessor = joblib.load(f)

    enc = mixed_encoded_preprocessor.fit(train)

    for col in mixed_encoded_preprocessor.feature_names_in_:
        if col not in test.columns:
            test[col] = np.nan

    X_test = enc.transform(test)

    columns_for_model = len(X_test.columns)
    print(f"Total number of columns: {columns_for_model}")

    with open(path / f"{model_name}_{version}_run_metadata.json", "r") as f:
        run_metadata = json.load(f)
        
    all_folds = np.unique(train["fold"])
    test_predictions_df = pd.DataFrame({id_column: test_metadata[id_column]})
    for fold in all_folds:
        model_filepath = path / f"models/{model_name}_{version}_fold_{fold}.txt"
        if "lgb" in model_name:
            model = lgb.Booster(model_file=model_filepath)
            test_predictions_df[f"fold_{fold}"] = model.predict(X_test, num_iteration=run_metadata["best_num_rounds"][f"fold_{fold}"])
        elif "cb" in model_name:
            model = cb.CatBoostClassifier(use_best_model=True)
            model.load_model(model_filepath)
            test_predictions_df[f"fold_{fold}"] = model.predict_proba(X_test)[:, -1]
    test_predictions_df[target_column] = test_predictions_df[[f"fold_{fold}" for fold in all_folds]].mean(axis=1)
    return test_predictions_df[[id_column, target_column]]


def get_dnn_predictions(train, test, test_images, model_name, version, path):
    with open(path / f"{model_name}_{version}_run_metadata.json", "r") as f:
        run_metadata = json.load(f)
    pprint(run_metadata)
    
    test_dataset = ISICDataset(
        test, test_images, augment=val_augment(run_metadata["params"]["image_size"]), infer=True
    )
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=run_metadata["params"]["val_batch_size"],
        shuffle=False,
        num_workers=2,
        drop_last=False,
        pin_memory=True,
    )
    
    all_folds = np.unique(train["fold"])
    test_predictions_df = pd.DataFrame({id_column: test_metadata[id_column]})
    for fold in all_folds:
        accelerator = Accelerator(
            mixed_precision=run_metadata["params"]["mixed_precision"],
        )
        
        model = ISICNet(model_name=model_name, out_dim=run_metadata["params"]["out_dim"], pretrained=False, infer=True)
        model = model.to(accelerator.device)
        
        model, test_dataloader = accelerator.prepare(model, test_dataloader)
        model_filepath = path / f"models/fold_{fold}"
        accelerator.load_state(model_filepath)

        test_predictions_df[f"fold_{fold}"] = predict(model, test_dataloader, accelerator, out_dim=run_metadata["params"]["out_dim"], n_tta=run_metadata["params"]["n_tta"], malignant_idx=malignant_idx)
    test_predictions_df[target_column] = test_predictions_df[[f"fold_{fold}" for fold in all_folds]].mean(axis=1)
    return test_predictions_df[[id_column, target_column]]

In [9]:
model_predict_function_topology = {
    "lgb": get_boosting_predictions,
    "cb": get_boosting_predictions,
    "resnet18": get_dnn_predictions,
}

In [10]:
ensemble_preds = 0
previous_model_name = None
for idx, (model_name, version, path, weight) in enumerate(zip(model_names, versions, paths, weights)):
    print(f"Generating predictions for {model_name}_{version}")
    model_preds_df = model_predict_function_topology[model_name](train_metadata, test_metadata, test_images, model_name, version, Path(path))
    if idx == 0:
        ensemble_preds_df = model_preds_df.copy()
    else:
        ensemble_preds_df = ensemble_preds_df.merge(model_preds_df, on=id_column, how="inner", suffixes=(f"_{previous_model_name}", ""))
    ensemble_preds += ensemble_preds_df[target_column].rank(pct=True).values * weight
    previous_model_name = model_name
    print("\n")
ensemble_preds_df.rename(columns={target_column: f"{target_column}_{previous_model_name}"}, inplace=True)
ensemble_preds_df[target_column] = ensemble_preds

Generating predictions for cb_v1
Total number of columns: 60


Generating predictions for lgb_v3
Total number of columns: 60


Generating predictions for resnet18_v4
{'best_num_epochs': {'fold_1': 3,
                     'fold_2': 6,
                     'fold_3': 6,
                     'fold_4': 6,
                     'fold_5': 6},
 'cv_auc_avg': 0.9387605558205776,
 'cv_auc_oof': 0.9312259623350531,
 'cv_auc_std': 0.009309164824754284,
 'cv_pauc_avg': 0.15499991455799894,
 'cv_pauc_oof': 0.14688454600952008,
 'cv_pauc_std': 0.0074426762959741875,
 'params': {'image_size': 128,
            'learning_rate': 0.001,
            'mixed_precision': 'fp16',
            'n_tta': 10,
            'num_epochs': 6,
            'num_workers': 4,
            'out_dim': 2,
            'seed': 2022,
            'train_batch_size': 64,
            'val_batch_size': 512},
 'val_auc_scores': {'fold_1': 0.9487945075875744,
                    'fold_2': 0.9486271552841299,
                    'fold_3':

In [11]:
ensemble_preds_df.head()

Unnamed: 0,isic_id,target_cb,target_lgb,target_resnet18,target_resnet18.1,target
0,ISIC_0015657,0.000182,3.6e-05,0.010517,0.083855,1.521536
1,ISIC_0015729,2.3e-05,2.8e-05,8.7e-05,0.090158,0.98653
2,ISIC_0015740,0.000113,4.9e-05,0.001205,0.181383,1.90958


In [12]:
ensemble_preds_df[target_column].describe()

count    3.000000
mean     1.472549
std      0.463471
min      0.986530
25%      1.254033
50%      1.521536
75%      1.715558
max      1.909580
Name: target, dtype: float64

In [13]:
ensemble_preds_df[[id_column, target_column]].head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,1.521536
1,ISIC_0015729,0.98653
2,ISIC_0015740,1.90958


In [14]:
ensemble_preds_df[[id_column, target_column]].to_csv("submission.csv", index=False)