In [1]:
from pathlib import Path
from pprint import pprint

import pandas as pd
import numpy as np

from bayes_opt import BayesianOptimization
from isic_helper import compute_pauc, compute_auc

In [2]:
# Column names shared by every out-of-fold (OOF) prediction file.
id_column, target_column, fold_column = "isic_id", "target", "fold"
# Column that receives the blended OOF predictions.
ensemble_column = "oof_preds_ensemble"

# Disabled alternative line-up that additionally includes an efficientnet_b2 model:
# model_names = ["xgb", "xgb", "lgb", "lgb", "efficientnet_b2"]
# versions = ["v1", "v2", "v5", "v6", "v1"]
# modes = ["train", "train", "train", "train", "pretrain"]

# Active line-up: position i of each list describes the same model.
model_names = ["xgb", "xgb", "lgb", "lgb"]
versions = ["v1", "v2", "v5", "v6"]
modes = ["train", "train", "train", "train"]

# One input directory per model; underscores in a model name become hyphens in the path.
paths = [
    f"/kaggle/input/isic-scd-{name.replace('_', '-')}-{ver}-{run_mode}"
    for name, ver, run_mode in zip(model_names, versions, modes)
]

In [3]:
# Score containers filled in later: {model key or "ensemble": {"fold_<k>": score}}.
val_auc_scores = {}
val_pauc_scores = {}

# Build one frame holding every model's OOF column. The first file contributes
# all of its columns; each later file only contributes its own prediction column.
for idx, path in enumerate(paths):
    model_name = model_names[idx]
    version = versions[idx]
    model_identifier = f"{model_name}_{version}"
    oof_preds_model_df = pd.read_csv(f"{path}/oof_preds_{model_identifier}.csv")
    if idx == 0:
        oof_preds_df = oof_preds_model_df.copy()
    else:
        n_rows_before = oof_preds_df.shape[0]
        # validate="one_to_one" rejects duplicated ids on either side instead of
        # letting them multiply rows.
        oof_preds_df = oof_preds_df.merge(
            oof_preds_model_df[[id_column, f"oof_{model_identifier}"]],
            on=id_column,
            how="inner",
            validate="one_to_one",
        )
        # An inner merge silently drops unmatched ids, so compare against BOTH
        # inputs: checking only the incoming file would miss rows lost from the
        # frame accumulated so far.
        assert oof_preds_df.shape[0] == oof_preds_model_df.shape[0] == n_rows_before, (
            f"Row mismatch after merging {model_identifier}: "
            f"{n_rows_before} rows before, {oof_preds_model_df.shape[0]} in file, "
            f"{oof_preds_df.shape[0]} after merge"
        )
    val_auc_scores[model_identifier] = {}
    val_pauc_scores[model_identifier] = {}
val_auc_scores["ensemble"] = {}
val_pauc_scores["ensemble"] = {}

# Prediction columns in load order, and the distinct fold labels.
oof_columns = [col for col in oof_preds_df.columns if col.startswith("oof")]
all_folds = np.unique(oof_preds_df[fold_column])

In [4]:
def blend_optimizer(oof_preds_df, oof_columns, init_points=20, n_iter=100,
                    fold_column="fold", target_column="target",
                    min_tpr=0.8, random_state=2022):
    """Search blend weights that maximise the mean per-fold pAUC of a rank blend.

    Within each fold every column in ``oof_columns`` is converted to percentile
    ranks; the blend is the weighted sum of those ranks. The objective handed to
    ``BayesianOptimization`` is the average of ``compute_pauc`` over the folds.

    Args:
        oof_preds_df: Frame holding the fold, target and prediction columns.
        oof_columns: Names of the prediction columns to blend.
        init_points: Passed to ``optimizer.maximize`` as ``init_points``.
        n_iter: Passed to ``optimizer.maximize`` as ``n_iter``.
        fold_column: Column holding the fold label.
        target_column: Column holding the ground-truth label.
        min_tpr: Forwarded to ``compute_pauc`` so the objective uses the same
            setting as the evaluation cell, which passes ``min_tpr=0.8``.
        random_state: Seed forwarded to ``BayesianOptimization``.

    Returns:
        List of weights, one per entry of ``oof_columns`` and in that order,
        each searched within [0.0, 10.0].

    Raises:
        ValueError: If ``oof_columns`` is empty.
    """
    n_models = len(oof_columns)
    if n_models == 0:
        raise ValueError("oof_columns must contain at least one prediction column")

    weight_names = [f"w{i}" for i in range(n_models)]
    pbounds = {name: (0.0, 10.0) for name in weight_names}

    # Targets and percentile ranks do not depend on the weights, so compute them
    # once per fold instead of on every objective evaluation. Folds are taken
    # from the frame that was passed in rather than from a notebook global.
    fold_data = []
    for fold in np.unique(oof_preds_df[fold_column]):
        fold_mask = oof_preds_df[fold_column] == fold
        fold_target = np.array(oof_preds_df.loc[fold_mask, target_column])
        fold_ranks = [
            oof_preds_df.loc[fold_mask, oof_column].rank(pct=True).values
            for oof_column in oof_columns
        ]
        fold_data.append((fold_target, fold_ranks))

    def mean_fold_pauc(**ws):
        # Look weights up by name: the optimiser may hand the keyword arguments
        # over in a different order than pbounds was declared (e.g. sorted by
        # name, where "w10" precedes "w2"), which would pair weights with the
        # wrong columns if they were taken positionally.
        candidate = [ws[name] for name in weight_names]
        score = 0
        for fold_target, fold_ranks in fold_data:
            fold_ensemble_preds = 0
            for weight, ranks in zip(candidate, fold_ranks):
                fold_ensemble_preds += weight * ranks
            score += compute_pauc(fold_target, fold_ensemble_preds, min_tpr=min_tpr)
        return score / len(fold_data)

    optimizer = BayesianOptimization(
        f=mean_fold_pauc,
        pbounds=pbounds,
        random_state=random_state,
    )

    optimizer.maximize(
        init_points=init_points,
        n_iter=n_iter,
    )

    best = optimizer.max
    weights = [best["params"][name] for name in weight_names]

    print(f"Best pAUC: {best['target']}")
    print(f"Best weights: {weights}")
    return weights


# Fit the blend weights, overriding the default budget with
# init_points=40 and n_iter=40.
optimizer_budget = {"init_points": 40, "n_iter": 40}
weights = blend_optimizer(oof_preds_df, oof_columns, **optimizer_budget)

|   iter    |  target   |    w0     |    w1     |    w2     |    w3     |
-------------------------------------------------------------------------
| 1         | 0.1737    | 0.09359   | 4.991     | 1.134     | 0.4997    |
| 2         | 0.1747    | 6.854     | 4.87      | 8.977     | 6.475     |
| 3         | 0.1749    | 8.97      | 7.211     | 8.314     | 8.276     |
| 4         | 0.1751    | 8.336     | 9.57      | 3.68      | 4.948     |
| 5         | 0.1744    | 3.395     | 6.194     | 9.775     | 0.9643    |
| 6         | 0.175     | 7.442     | 2.925     | 2.987     | 7.525     |
| 7         | 0.1737    | 0.1866    | 5.237     | 8.644     | 3.888     |
| 8         | 0.1744    | 2.122     | 4.752     | 5.647     | 3.494     |
| 9         | 

In [5]:
# Best blend weights returned by blend_optimizer, one per entry of `oof_columns`.
weights

[10.0, 6.718043378657121, 0.0, 5.87588471239134]

In [6]:
# The weights were fitted in `oof_columns` order but are paired with
# (model_name, version) below. zip() would silently truncate or mis-pair them if
# the two orderings ever diverged, so fail loudly instead.
model_columns = [f"oof_{model_name}_{version}" for model_name, version in zip(model_names, versions)]
assert model_columns == list(oof_columns), (
    f"Weight order mismatch: weights follow {list(oof_columns)} but models are {model_columns}"
)
assert len(weights) == len(model_columns), "Expected exactly one weight per model"

# Per-fold scores for every single model (raw predictions) and for the blend
# (weighted sum of within-fold percentile ranks).
all_folds = np.unique(oof_preds_df[fold_column])
for fold in all_folds:
    fold_index = oof_preds_df[oof_preds_df[fold_column] == fold].index
    fold_target = oof_preds_df.loc[fold_index, target_column]
    fold_ensemble_preds = 0
    for model_name, version, weight in zip(model_names, versions, weights):
        model_key = f"{model_name}_{version}"
        fold_model_preds = oof_preds_df.loc[fold_index, f"oof_{model_key}"]
        fold_ensemble_preds += fold_model_preds.rank(pct=True).values * weight

        val_auc_scores[model_key][f"fold_{fold}"] = compute_auc(fold_target, fold_model_preds)
        val_pauc_scores[model_key][f"fold_{fold}"] = compute_pauc(fold_target, fold_model_preds, min_tpr=0.8)

    # Store the blend so it can be scored over all folds at once below.
    oof_preds_df.loc[fold_index, ensemble_column] = fold_ensemble_preds
    val_auc_scores["ensemble"][f"fold_{fold}"] = compute_auc(fold_target, fold_ensemble_preds)
    val_pauc_scores["ensemble"][f"fold_{fold}"] = compute_pauc(fold_target, fold_ensemble_preds, min_tpr=0.8)


def summarize_cv_scores(score_key, preds_column):
    """Print and return the cross-validation summary for one prediction column.

    Args:
        score_key: Key into ``val_auc_scores`` / ``val_pauc_scores``.
        preds_column: Column of ``oof_preds_df`` scored over all folds at once.

    Returns:
        Dict with the all-rows ("oof") scores and the mean / std of the
        per-fold scores, for both AUC and pAUC.
    """
    fold_auc = list(val_auc_scores[score_key].values())
    fold_pauc = list(val_pauc_scores[score_key].values())
    summary = {
        "auc_oof": compute_auc(oof_preds_df[target_column], oof_preds_df[preds_column]),
        "pauc_oof": compute_pauc(oof_preds_df[target_column], oof_preds_df[preds_column], min_tpr=0.8),
        "auc_avg": np.mean(fold_auc),
        "pauc_avg": np.mean(fold_pauc),
        "auc_std": np.std(fold_auc),
        "pauc_std": np.std(fold_pauc),
    }

    print("Val AUC scores:")
    pprint(val_auc_scores[score_key])
    print("Val PAUC scores:")
    pprint(val_pauc_scores[score_key])

    print(f"CV AUC OOF: {summary['auc_oof']}")
    print(f"CV PAUC OOF: {summary['pauc_oof']}")
    print(f"CV AUC AVG: {summary['auc_avg']}")
    print(f"CV PAUC AVG: {summary['pauc_avg']}")
    print(f"CV AUC STD: {summary['auc_std']}")
    print(f"CV PAUC STD: {summary['pauc_std']}")
    return summary


# Report every individual model, then the blend.
for model_name, version, weight in zip(model_names, versions, weights):
    print(f"Model: {model_name}_{version} | Weightage: {weight}")
    summarize_cv_scores(f"{model_name}_{version}", f"oof_{model_name}_{version}")
    print("\n")

ensemble_summary = summarize_cv_scores("ensemble", ensemble_column)

# Keep the ensemble results available under their original names.
cv_ensemble_auc_oof = ensemble_summary["auc_oof"]
cv_ensemble_pauc_oof = ensemble_summary["pauc_oof"]
cv_ensemble_auc_avg = ensemble_summary["auc_avg"]
cv_ensemble_pauc_avg = ensemble_summary["pauc_avg"]
cv_ensemble_auc_std = ensemble_summary["auc_std"]
cv_ensemble_pauc_std = ensemble_summary["pauc_std"]

Model: xgb_v1 | Weightage: 10.0
Val AUC scores:
{'fold_1': 0.9763855695822203,
 'fold_2': 0.9572885288935904,
 'fold_3': 0.9635023430382749,
 'fold_4': 0.9709627048208559,
 'fold_5': 0.9687397106157167}
Val PAUC scores:
{'fold_1': 0.18367412688887158,
 'fold_2': 0.16537818384974395,
 'fold_3': 0.17194383259911888,
 'fold_4': 0.17426448565474031,
 'fold_5': 0.17318846840205077}
CV AUC OOF: 0.9464921630675764
CV PAUC OOF: 0.15296119492851015
CV AUC AVG: 0.9673757713901316
CV PAUC AVG: 0.17368981947890508
CV AUC STD: 0.006521637731286478
CV PAUC STD: 0.00587196296972231


Model: xgb_v2 | Weightage: 6.718043378657121
Val AUC scores:
{'fold_1': 0.9751129113471103,
 'fold_2': 0.963817176741778,
 'fold_3': 0.9639184585241667,
 'fold_4': 0.9684925309805195,
 'fold_5': 0.9623259344160846}
Val PAUC scores:
{'fold_1': 0.18166898925849218,
 'fold_2': 0.17040633221405424,
 'fold_3': 0.17186716127341475,
 'fold_4': 0.17183237448677788,
 'fold_5': 0.16723127288063078}
CV AUC OOF: 0.9584420660973526
C

In [7]:
# Mean value of each model's OOF column and of the blend, split by target class.
blend_report_columns = oof_columns + ["oof_preds_ensemble"]
oof_preds_df.pivot_table(index="target", values=blend_report_columns, aggfunc="mean")

Unnamed: 0_level_0,oof_lgb_v5,oof_lgb_v6,oof_preds_ensemble,oof_xgb_v1,oof_xgb_v2
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.011055,0.011006,11.286765,0.014516,0.009843
1,0.394695,0.38626,21.839171,0.328668,0.339775


In [8]:
# Pairwise correlation between the individual models' OOF columns and the blend.
oof_preds_df.loc[:, oof_columns + ["oof_preds_ensemble"]].corr()

Unnamed: 0,oof_xgb_v1,oof_xgb_v2,oof_lgb_v5,oof_lgb_v6,oof_preds_ensemble
oof_xgb_v1,1.0,0.875487,0.882708,0.866551,0.322001
oof_xgb_v2,0.875487,1.0,0.893905,0.914023,0.309227
oof_lgb_v5,0.882708,0.893905,1.0,0.938991,0.337036
oof_lgb_v6,0.866551,0.914023,0.938991,1.0,0.334305
oof_preds_ensemble,0.322001,0.309227,0.337036,0.334305,1.0


In [9]:
# from pathlib import Path
# import pandas as pd
# import numpy as np
# INPUT_PATH = Path("../input/isic-2024-challenge/")
# train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False, na_values=["NA"])

# min(0, train_metadata["tbp_lv_A"].min())

# np.log1p(train_metadata["tbp_lv_z"])

# feature_mapping_dict = {
#     "age_approx": {
#         "missing_age_approx": 0,
#         "5.0": 1,
#         "15.0": 2,
#         "20.0": 3,
#         "25.0": 4,
#         "30.0": 5,
#         "35.0": 6,
#         "40.0": 7,
#         "45.0": 8,
#         "50.0": 9,
#         "55.0": 10,
#         "60.0": 11,
#         "65.0": 12,
#         "70.0": 13,
#         "75.0": 14,
#         "80.0": 15,
#         "85.0": 16,
#     },
#     "sex": {
#         "missing_sex": 0,
#         "female": 1,
#         "male": 2,
#     },
#     "anatom_site_general": {
#         "missing_anatom_site_general": 0,
#         "lower extremity": 1,
#         "head/neck": 2,
#         "posterior torso": 3,
#         "anterior torso": 4,
#         "upper extremity": 5,
#     },
#     "tbp_tile_type": {
#         "3D: white": 0,
#         "3D: XP": 1,
#     },
#     "tbp_lv_location": {
#         "Right Leg - Upper": 0,
#         "Head & Neck": 1,
#         "Torso Back Top Third": 2,
#         "Torso Front Top Half": 3,
#         "Right Arm - Upper": 4,
#         "Left Leg - Upper": 5,
#         "Torso Front Bottom Half": 6,
#         "Left Arm - Upper": 7,
#         "Right Leg": 8,
#         "Torso Back Middle Third": 9,
#         "Right Arm - Lower": 10,
#         "Right Leg - Lower": 11,
#         "Left Leg - Lower": 12,
#         "Left Arm - Lower": 13,
#         "Unknown": 14,
#         "Left Leg": 15,
#         "Torso Back Bottom Third": 16,
#         "Left Arm": 17,
#         "Right Arm": 18,
#         "Torso Front": 19,
#         "Torso Back": 20
#     },
#     "tbp_lv_location_simple": {
#         "Right Leg": 0,
#         "Head & Neck": 1,
#         "Torso Back": 2,
#         "Torso Front": 3,
#         "Right Arm": 4,
#         "Left Leg": 5,
#         "Left Arm": 6,
#         "Unknown": 7
#     }
# }

# import torch.nn as nn
# import torch
# from timm import create_model
# from typing import List, Dict
# import h5py
# from PIL import Image
# from io import BytesIO
# from torch.utils.data import Dataset, DataLoader
# import torch.nn.functional as F

# class ISICNet(nn.Module):
#     def __init__(
#         self,
#         model_name,
#         pretrained=True,
#         use_meta=False,
#         cat_cols: List = None, cont_cols: List = None, emb_szs: Dict = None,
#     ):
#         super(ISICNet, self).__init__()
#         self.model = create_model(
#             model_name=model_name,
#             pretrained=pretrained,
#             in_chans=3,
#             num_classes=0,
#             global_pool="",
#         )
#         in_dim = self.model.num_features
#         self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
#         self.use_meta = use_meta
#         if use_meta:
#             self.linear = nn.Linear(in_dim, 512)

#             self.embeddings = nn.ModuleList([nn.Embedding(emb_szs[col][0], emb_szs[col][1]) for col in cat_cols])
#             self.embedding_dropout = nn.Dropout(0.1)
#             n_emb = sum([emb_szs[col][1] for col in cat_cols])
#             n_cont = len(cont_cols)
#             self.bn_cont = nn.BatchNorm1d(n_cont)
#             self.meta = nn.Sequential(
#                 nn.Linear(n_emb + n_cont, 512),
#                 nn.BatchNorm1d(512),
#                 nn.SiLU(),
#                 nn.Dropout(0.3),
#                 nn.Linear(512, 128),
#                 nn.BatchNorm1d(128),
#                 nn.SiLU(),
#                 nn.Dropout(0.1),
#             )
#             self.classifier = nn.Linear(512 + 128, 1)
#         else:
#             self.linear = nn.Linear(in_dim, 1)

#     def forward(self, images, x_cat=None, x_cont=None):
#         x = self.model(images)
#         bs = len(images)
#         pool = F.adaptive_avg_pool2d(x, 1).reshape(bs, -1)
#         if self.training:
#             x_image = 0
#             for i in range(len(self.dropouts)):
#                 x_image += self.linear(self.dropouts[i](pool))
#             x_image = x_image / len(self.dropouts)
#         else:
#             x_image = self.linear(pool)

#         if self.use_meta:
#             x_cat = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
#             x_cat = torch.cat(x_cat, 1)
#             x_cat = self.embedding_dropout(x_cat)
#             x_cont = self.bn_cont(x_cont)
#             x_meta = self.meta(torch.cat([x_cat, x_cont], 1))
#             x = torch.cat([x_image, x_meta], 1)
#             logits = self.classifier(x)
#         else:
#             logits = x_image
#         return logits

# def preprocess(df):
#     df["age_approx"] = df["age_approx"].fillna("missing_age_approx").astype(str)
#     df["age_approx"] = df["age_approx"].map(feature_mapping_dict["age_approx"])
#     df["sex"] = df["sex"].fillna("missing_sex")
#     df["sex"] = df["sex"].map(feature_mapping_dict["sex"])
#     df["anatom_site_general"] = df["anatom_site_general"].fillna("missing_anatom_site_general")
#     df["anatom_site_general"] = df["anatom_site_general"].map(feature_mapping_dict["anatom_site_general"])
#     df["tbp_tile_type"] = df["tbp_tile_type"].map(feature_mapping_dict["tbp_tile_type"])
#     df["tbp_lv_location"] = df["tbp_lv_location"].map(feature_mapping_dict["tbp_lv_location"])
#     df["tbp_lv_location_simple"] = df["tbp_lv_location_simple"].map(feature_mapping_dict["tbp_lv_location_simple"])
#     return df


# def get_emb_szs(cat_cols):
#     emb_szs = {}
#     for col in cat_cols:
#         emb_szs[col] = (len(feature_mapping_dict[col]), min(600, round(1.6 * len(feature_mapping_dict[col]) ** 0.56)))
#     return emb_szs


# def norm_feature(df, value_col, group_cols, err=1e-5):
#     stats = ["mean", "std"]
#     tmp = df.groupby(group_cols)[value_col].agg(stats)
#     tmp.columns = [f"{value_col}_{stat}" for stat in stats]
#     tmp.reset_index(inplace=True)
#     df = df.merge(tmp, on=group_cols, how="left")
#     feature_name = f"{value_col}_patient_norm"
#     df[feature_name] = ((df[value_col] - df[f"{value_col}_mean"]) / (df[f"{value_col}_std"] + err))
#     return df, feature_name


# def feature_engineering(df):
#     cat_cols = ["age_approx", "sex", "anatom_site_general",
#                 "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple"]
#     cont_cols = ["clin_size_long_diam_mm",
#                  "tbp_lv_A", "tbp_lv_Aext",
#                  "tbp_lv_B", "tbp_lv_Bext",
#                  "tbp_lv_C", "tbp_lv_Cext",
#                  "tbp_lv_H", "tbp_lv_Hext",
#                  "tbp_lv_L", "tbp_lv_Lext",
#                  "tbp_lv_areaMM2", "tbp_lv_area_perim_ratio",
#                  "tbp_lv_color_std_mean",
#                  "tbp_lv_deltaA", "tbp_lv_deltaB", "tbp_lv_deltaL", "tbp_lv_deltaLB", "tbp_lv_deltaLBnorm",
#                  "tbp_lv_eccentricity",
#                  "tbp_lv_minorAxisMM", "tbp_lv_nevi_confidence", "tbp_lv_norm_border",
#                  "tbp_lv_norm_color", "tbp_lv_perimeterMM",
#                  "tbp_lv_radial_color_std_max", "tbp_lv_stdL", "tbp_lv_stdLExt",
#                  "tbp_lv_symm_2axis", "tbp_lv_symm_2axis_angle",
#                  "tbp_lv_x", "tbp_lv_y", "tbp_lv_z"
#                  ]

#     # for col in cont_cols:
#     #     df, feature_name = norm_feature(df, col, ["patient_id"])
#     #     cont_cols.append(feature_name)
#     df["num_images"] = df["patient_id"].map(df.groupby("patient_id")["isic_id"].count())
#     cont_cols.append("num_images")

#     for col in cont_cols:
#         min_value = min(0, df[col].min())
#         df[col] = np.log(df[col] - min_value + 1)
#     return df, cat_cols, cont_cols


# def get_data(data_dir):
#     train_metadata = pd.read_csv(f"{data_dir}/train-metadata.csv", low_memory=False)
#     train_images = h5py.File(f"{data_dir}/train-image.hdf5", mode="r")

#     print(f"Train metadata: {train_metadata.shape}")

#     print(f"Preprocessing metadata...")
#     train_metadata = preprocess(train_metadata)

#     print(f"Feature engineering...")
#     train_metadata, cat_cols, cont_cols = feature_engineering(train_metadata)

#     emb_szs = get_emb_szs(cat_cols)
#     return train_metadata, train_images, cat_cols, cont_cols, emb_szs

# train_metadata, train_images, cat_cols, cont_cols, emb_szs = get_data(INPUT_PATH)

# train_metadata[cont_cols].isnull().sum()

# class ISICDataset(Dataset):
#     def __init__(self, metadata, images, augment,
#                  use_meta=False, cat_cols: List = None, cont_cols: List = None,
#                  infer=False):
#         self.metadata = metadata
#         self.images = images
#         self.augment = augment
#         self.use_meta = use_meta
#         self.cat_cols = cat_cols
#         self.cont_cols = cont_cols
#         self.length = len(self.metadata)
#         self.infer = infer

#     def __len__(self):
#         return self.length

#     def __getitem__(self, index):
#         row = self.metadata.iloc[index]
#         image = np.array(Image.open(BytesIO(self.images[row["isic_id"]][()])))
#         if self.augment is not None:
#             image = self.augment(image=image)["image"].float()

#         if self.use_meta:
#             x_cat = torch.tensor([row[col] for col in self.cat_cols], dtype=torch.long)
#             x_cont = torch.tensor([row[col] for col in self.cont_cols], dtype=torch.float)
#         else:
#             x_cat = torch.tensor(0)
#             x_cont = torch.tensor(0)

#         if self.infer:
#             return image, x_cat, x_cont
#         else:
#             target = torch.tensor(row["target"])
#             return image, x_cat, x_cont, target

# m = ISICNet(model_name="resnet18", pretrained=False, use_meta=True, cat_cols=cat_cols, cont_cols=cont_cols, emb_szs=emb_szs )

# from albumentations.pytorch import ToTensorV2
# import albumentations as A
# def val_augment(image_size, mean=None, std=None):
#     if mean is not None and std is not None:
#         normalize = A.Normalize(mean=mean, std=std, max_pixel_value=255.0, p=1.0)
#     else:
#         normalize = A.Normalize(max_pixel_value=255.0, p=1.0)
#     transform = A.Compose(
#         [A.Resize(image_size, image_size), normalize, ToTensorV2()], p=1.0
#     )
#     return transform

# tr_da = ISICDataset(train_metadata, train_images, val_augment(32), use_meta=True, cat_cols=cat_cols, cont_cols=cont_cols)
# tr_dl = DataLoader(tr_da, shuffle=True, batch_size=2)

# for step, (images, x_cat, x_cont, targets) in enumerate(tr_dl):
#     break

# m(images, x_cat, x_cont)

