In [1]:
from pathlib import Path
from pprint import pprint

import pandas as pd
import numpy as np

from bayes_opt import BayesianOptimization
from isic_helper import compute_pauc, compute_auc

In [2]:
def blend_optimizer(oof_preds_df, oof_columns, folds, init_points = 20, n_iter = 100,
                    fold_column = "fold", target_column = "target", random_state = 2022):
    """Search for blend weights that maximise the mean per-fold pAUC of a rank ensemble.

    Within every fold each prediction column is converted to percentile ranks
    (``Series.rank(pct=True)``); the ensemble score of a row is the weighted sum
    of those ranks. ``BayesianOptimization`` searches one weight per column in
    the box ``[0.0, 10.0]`` and the objective is ``compute_pauc`` averaged over
    ``folds``.

    Parameters
    ----------
    oof_preds_df : pandas.DataFrame
        Frame holding ``fold_column``, ``target_column`` and every column named
        in ``oof_columns``.
    oof_columns : sequence of str
        Prediction columns to blend; the returned weights follow this order.
    folds : sequence
        Fold identifiers to score on (values found in ``fold_column``).
    init_points : int
        Number of random exploration steps handed to ``optimizer.maximize``.
    n_iter : int
        Number of Bayesian optimisation steps handed to ``optimizer.maximize``.
    fold_column : str
        Name of the fold-id column (default ``"fold"``).
    target_column : str
        Name of the ground-truth column (default ``"target"``).
    random_state : int
        Seed passed to ``BayesianOptimization`` (default ``2022``).

    Returns
    -------
    list of float
        Best weight found for each entry of ``oof_columns``, in the same order.

    Raises
    ------
    ValueError
        If ``oof_columns`` or ``folds`` is empty.
    """
    weight_names = [f"w{i}" for i in range(len(oof_columns))]
    if not weight_names:
        raise ValueError("oof_columns must contain at least one prediction column")
    if len(folds) == 0:
        raise ValueError("folds must contain at least one fold id")

    pbounds = {name: (0.0, 10.0) for name in weight_names}

    # The fold masks, targets and percentile ranks do not depend on the weights,
    # so compute them once instead of on every objective evaluation.
    # NOTE(review): this reuses the same target array across calls and therefore
    # assumes compute_pauc does not modify its inputs - confirm.
    fold_data = []
    for fold in folds:
        fold_rows = oof_preds_df.loc[oof_preds_df[fold_column] == fold]
        fold_target = np.array(fold_rows[target_column])
        fold_ranks = [fold_rows[oof_column].rank(pct=True).values for oof_column in oof_columns]
        fold_data.append((fold_target, fold_ranks))

    def mean_fold_pauc(weights):
        """Average pAUC over the folds for one candidate weight vector."""
        score = 0
        for fold_target, fold_ranks in fold_data:
            # Accumulate column by column, in oof_columns order.
            fold_ensemble_preds = 0
            for weight, ranks in zip(weights, fold_ranks):
                fold_ensemble_preds += weight * ranks
            # NOTE(review): min_tpr is left at compute_pauc's default here, while the
            # evaluation code elsewhere in the notebook passes min_tpr=0.8 - confirm they agree.
            score += compute_pauc(fold_target, fold_ensemble_preds)
        return score / len(fold_data)

    def q(**ws):
        # Look each weight up by name. Relying on the order of the keyword
        # arguments is unsafe: an optimizer that passes parameters in
        # alphabetical order puts "w10" before "w2", which would pair weights
        # with the wrong columns once more than ten columns are blended.
        return mean_fold_pauc([ws[name] for name in weight_names])

    optimizer = BayesianOptimization(
        f=q,
        pbounds=pbounds,
        random_state=random_state,
    )

    optimizer.maximize(
        init_points=init_points,
        n_iter=n_iter,
    )

    best = optimizer.max
    weights = [best["params"][name] for name in weight_names]

    print(f"Best pAUC: {best['target']}")
    print(f"Best weights: {weights}")
    return weights

In [3]:
# Column names shared by the cells below.
id_column = "isic_id"
target_column = "target"
ensemble_column = "oof_ensemble"
# NOTE(review): fold_column is not referenced in this cell; fold ids are read
# from the "fold" column further down - confirm which name is intended.
fold_column = "gkf_fold"

# Gradient-boosting models: name / version / training mode, aligned by position.
boosting_model_names = ["xgb", "xgb", "lgb", "lgb", "xgb"]
boosting_versions = ["v1", "v3", "v1", "v6", "v5"]
boosting_modes = ["train", "train", "train", "train", "train"]

# CNN models: name / version / training mode, aligned by position.
cnn_model_names = ["efficientnet_b0", "efficientnet_b1", "tf_efficientnet_b1_ns", "mobilevitv2_200"]
cnn_versions = ["v2", "v1", "v1", "v1"]
cnn_modes = ["trainbinary", "trainbinary", "trainbinary", "trainbinary"]

model_names = boosting_model_names + cnn_model_names
versions = boosting_versions + cnn_versions
modes = boosting_modes + cnn_modes
# One input directory per model; underscores in the model name become dashes.
paths = [f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}" for model_name, version, mode in zip(model_names, versions, modes)]

# Load each model's out-of-fold predictions and join them into one frame keyed by id_column.
val_auc_scores = {}
val_pauc_scores = {}
oof_columns = []
for idx, path in enumerate(paths):
    model_name = model_names[idx]
    version = versions[idx]
    mode = modes[idx]
    oof_preds_model_df = pd.read_csv(f"{path}/oof_train_preds_{model_name}_{version}.csv")
    if mode in ["trainbinary", "trainmulti"]:
        # Suffix the column with the training mode for these two modes.
        oof_column = f"oof_{model_name}_{version}_{mode}"
        oof_preds_model_df = oof_preds_model_df.rename(columns={f"oof_{model_name}_{version}": oof_column})
    else:
        oof_column = f"oof_{model_name}_{version}"
    if idx == 0:
        # The first model's frame supplies every non-prediction column.
        oof_preds_df = oof_preds_model_df.copy()
    else:
        n_rows_before = oof_preds_df.shape[0]
        # validate="one_to_one" makes pandas raise if id_column is duplicated on
        # either side, which would otherwise silently multiply rows.
        oof_preds_df = oof_preds_df.merge(
            oof_preds_model_df[[id_column, oof_column]],
            on=id_column, how="inner", validate="one_to_one",
        )
        assert oof_preds_df.shape[0] == oof_preds_model_df.shape[0], (
            f"{oof_column}: {oof_preds_model_df.shape[0] - oof_preds_df.shape[0]} "
            f"rows have no matching {id_column} in the combined frame"
        )
        # The inner join also drops combined-frame rows that this model lacks;
        # report it instead of letting the ensemble shrink unnoticed.
        n_rows_dropped = n_rows_before - oof_preds_df.shape[0]
        if n_rows_dropped:
            print(f"WARNING: merging {oof_column} dropped {n_rows_dropped} rows from the combined frame")
    
    val_auc_scores[f"{model_name}_{version}_{mode}"] = {}
    val_pauc_scores[f"{model_name}_{version}_{mode}"] = {}
    oof_columns.append(oof_column)

# Optimise one blend weight per prediction column across all folds.
all_folds = np.unique(oof_preds_df["fold"])
weights = blend_optimizer(
    oof_preds_df, oof_columns, all_folds,
    init_points=50, n_iter=100
)

|   iter    |  target   |    w0     |    w1     |    w2     |    w3     |    w4     |    w5     |    w6     |    w7     |    w8     |
-------------------------------------------------------------------------------------------------------------------------------------
| [30m1         | [30m0.1841    | [30m0.09359   | [30m4.991     | [30m1.134     | [30m0.4997    | [30m6.854     | [30m4.87      | [30m8.977     | [30m6.475     | [30m8.97      |
| [35m2         | [35m0.1851    | [35m7.211     | [35m8.314     | [35m8.276     | [35m8.336     | [35m9.57      | [35m3.68      | [35m4.948     | [35m3.395     | [35m6.194     |
| [30m3         | [30m0.1845    | [30m9.775     | [30m0.9643    | [30m7.442     | [30m2.925     | [30m2.987     | [30m7.525     | [30m0.1866    | [30m5.237     | [30m8.644     |
| [30m4         | [30m0.1847    | [30m3.888     | [30m2.122     | [30m4.752     | [30m5.647     | [30m3.494     | [30m9.759     | [30m0.3782    | [30m7.943

In [4]:
# Show the prediction columns in the order they were passed to blend_optimizer.
pprint(oof_columns)

['oof_xgb_v1',
 'oof_xgb_v3',
 'oof_lgb_v1',
 'oof_lgb_v6',
 'oof_xgb_v5',
 'oof_efficientnet_b0_v2_trainbinary',
 'oof_efficientnet_b1_v1_trainbinary',
 'oof_tf_efficientnet_b1_ns_v1_trainbinary',
 'oof_mobilevitv2_200_v1_trainbinary']


In [5]:
# Show the optimised blend weights; weights[i] belongs to oof_columns[i].
pprint(weights)

[6.585365438709367,
 8.398017510540917,
 7.8452109076080125,
 5.187451888483045,
 3.2073486066662666,
 4.472284519831247,
 1.237978868777819,
 5.465443244560972,
 1.7122373594122717]


In [6]:
# Score every base model and the weighted rank blend on each fold.
val_auc_scores["ensemble"] = {}
val_pauc_scores["ensemble"] = {}
for fold in all_folds:
    fold_key = f"fold_{fold}"
    fold_index = oof_preds_df.loc[oof_preds_df["fold"] == fold].index
    fold_target = oof_preds_df.loc[fold_index, target_column]

    # One weighted percentile-rank array per model, summed after the loop.
    weighted_fold_ranks = []
    for model_name, version, mode, oof_column, weight in zip(model_names, versions, modes, oof_columns, weights):
        model_key = f"{model_name}_{version}_{mode}"
        fold_model_preds = oof_preds_df.loc[fold_index, oof_column]
        weighted_fold_ranks.append(fold_model_preds.rank(pct=True).values * weight)

        # Individual models are scored on their unranked predictions.
        val_auc_scores[model_key][fold_key] = compute_auc(fold_target, fold_model_preds)
        val_pauc_scores[model_key][fold_key] = compute_pauc(fold_target, fold_model_preds, min_tpr=0.8)

    fold_ensemble_preds = sum(weighted_fold_ranks)
    oof_preds_df.loc[fold_index, ensemble_column] = fold_ensemble_preds
    val_auc_scores["ensemble"][fold_key] = compute_auc(fold_target, fold_ensemble_preds)
    val_pauc_scores["ensemble"][fold_key] = compute_pauc(fold_target, fold_ensemble_preds, min_tpr=0.8)

# Per-model report: fold scores, pooled out-of-fold score, mean and std over folds.
for model_name, version, mode, oof_column, weight in zip(model_names, versions, modes, oof_columns, weights):
    model_key = f"{model_name}_{version}_{mode}"
    model_fold_aucs = list(val_auc_scores[model_key].values())
    model_fold_paucs = list(val_pauc_scores[model_key].values())

    print(f"Model: {model_name}_{version}_{mode} | Weightage: {weight}")

    print("Val AUC scores:")
    pprint(val_auc_scores[model_key])
    print("Val PAUC scores:")
    pprint(val_pauc_scores[model_key])

    # Pooled metrics use all rows at once, not a per-fold average.
    cv_model_auc_oof = compute_auc(oof_preds_df[target_column], oof_preds_df[oof_column])
    cv_model_pauc_oof = compute_pauc(oof_preds_df[target_column], oof_preds_df[oof_column], min_tpr=0.8)

    cv_model_auc_avg = np.mean(model_fold_aucs)
    cv_model_pauc_avg = np.mean(model_fold_paucs)

    cv_model_auc_std = np.std(model_fold_aucs)
    cv_model_pauc_std = np.std(model_fold_paucs)

    print(f"CV AUC OOF: {cv_model_auc_oof}")
    print(f"CV PAUC OOF: {cv_model_pauc_oof}")
    print(f"CV AUC AVG: {cv_model_auc_avg}")
    print(f"CV PAUC AVG: {cv_model_pauc_avg}")
    print(f"CV AUC STD: {cv_model_auc_std}")
    print(f"CV PAUC STD: {cv_model_pauc_std}")
    print("\n")

# Same report for the blended predictions.
ensemble_fold_aucs = list(val_auc_scores["ensemble"].values())
ensemble_fold_paucs = list(val_pauc_scores["ensemble"].values())

print("Val AUC scores:")
pprint(val_auc_scores["ensemble"])
print("Val PAUC scores:")
pprint(val_pauc_scores["ensemble"])

cv_ensemble_auc_oof = compute_auc(oof_preds_df[target_column], oof_preds_df[ensemble_column])
cv_ensemble_pauc_oof = compute_pauc(oof_preds_df[target_column], oof_preds_df[ensemble_column], min_tpr=0.8)

cv_ensemble_auc_avg = np.mean(ensemble_fold_aucs)
cv_ensemble_pauc_avg = np.mean(ensemble_fold_paucs)

cv_ensemble_auc_std = np.std(ensemble_fold_aucs)
cv_ensemble_pauc_std = np.std(ensemble_fold_paucs)

print(f"CV AUC OOF: {cv_ensemble_auc_oof}")
print(f"CV PAUC OOF: {cv_ensemble_pauc_oof}")
print(f"CV AUC AVG: {cv_ensemble_auc_avg}")
print(f"CV PAUC AVG: {cv_ensemble_pauc_avg}")
print(f"CV AUC STD: {cv_ensemble_auc_std}")
print(f"CV PAUC STD: {cv_ensemble_pauc_std}")
print(f"CV PAUC RANGE: ({cv_ensemble_pauc_avg - cv_ensemble_pauc_std}, {cv_ensemble_pauc_avg + cv_ensemble_pauc_std})")

Model: xgb_v1_train | Weightage: 6.585365438709367
Val AUC scores:
{'fold_1': 0.9882204160440182,
 'fold_2': 0.9753371436177736,
 'fold_3': 0.9653197576468532,
 'fold_4': 0.9830886298685901,
 'fold_5': 0.9803896534831356}
Val PAUC scores:
{'fold_1': 0.19097631345282237,
 'fold_2': 0.18025960362546575,
 'fold_3': 0.1700443648525539,
 'fold_4': 0.18525867017758416,
 'fold_5': 0.1839772787996366}
CV AUC OOF: 0.9240157408906537
CV PAUC OOF: 0.1327374228525281
CV AUC AVG: 0.9784711201320742
CV PAUC AVG: 0.18210324618161255
CV AUC STD: 0.007782429969803381
CV PAUC STD: 0.006942847520007067


Model: xgb_v3_train | Weightage: 8.398017510540917
Val AUC scores:
{'fold_1': 0.9789757767096575,
 'fold_2': 0.9653996222993153,
 'fold_3': 0.9664791092086708,
 'fold_4': 0.9677780759007127,
 'fold_5': 0.9726160386562468}
Val PAUC scores:
{'fold_1': 0.1851343535234871,
 'fold_2': 0.17305110094437967,
 'fold_3': 0.17426167775267995,
 'fold_4': 0.17145455566509835,
 'fold_5': 0.17612597262179827}
CV AUC OO

In [7]:
# Mean value of every model column and of the ensemble column, grouped by target.
# The ensemble column holds weighted sums of per-fold percentile ranks, so its
# scale differs from the individual model columns.
oof_preds_df.pivot_table(index="target", values=oof_columns + [ensemble_column], aggfunc="mean")

Unnamed: 0_level_0,oof_efficientnet_b0_v2_trainbinary,oof_efficientnet_b1_v1_trainbinary,oof_ensemble,oof_lgb_v1,oof_lgb_v6,oof_mobilevitv2_200_v1_trainbinary,oof_tf_efficientnet_b1_ns_v1_trainbinary,oof_xgb_v1,oof_xgb_v3,oof_xgb_v5
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.00229,0.000992,22.035476,0.001558,0.011029,0.001209,0.001437,0.007329,0.010824,0.022984
1,0.32989,0.230465,42.923396,0.152065,0.482563,0.296823,0.272554,0.178794,0.337296,0.345672


In [8]:
# Pairwise correlation between the model prediction columns and the ensemble column.
# The ensemble column is built from per-fold percentile ranks while the model
# columns are used as stored, so its row/column compares two different scales.
oof_preds_df[oof_columns + [ensemble_column]].corr()

Unnamed: 0,oof_xgb_v1,oof_xgb_v3,oof_lgb_v1,oof_lgb_v6,oof_xgb_v5,oof_efficientnet_b0_v2_trainbinary,oof_efficientnet_b1_v1_trainbinary,oof_tf_efficientnet_b1_ns_v1_trainbinary,oof_mobilevitv2_200_v1_trainbinary,oof_ensemble
oof_xgb_v1,1.0,0.534844,0.645251,0.698925,0.412683,0.651226,0.587173,0.573828,0.559409,0.195024
oof_xgb_v3,0.534844,1.0,0.515177,0.708221,0.754328,0.500615,0.419937,0.387086,0.410881,0.331636
oof_lgb_v1,0.645251,0.515177,1.0,0.621281,0.451947,0.674625,0.652587,0.598209,0.593908,0.154989
oof_lgb_v6,0.698925,0.708221,0.621281,1.0,0.625106,0.615857,0.496669,0.517019,0.501281,0.352436
oof_xgb_v5,0.412683,0.754328,0.451947,0.625106,1.0,0.429455,0.357309,0.330237,0.364992,0.342505
oof_efficientnet_b0_v2_trainbinary,0.651226,0.500615,0.674625,0.615857,0.429455,1.0,0.765765,0.726273,0.713612,0.151064
oof_efficientnet_b1_v1_trainbinary,0.587173,0.419937,0.652587,0.496669,0.357309,0.765765,1.0,0.742009,0.754796,0.092533
oof_tf_efficientnet_b1_ns_v1_trainbinary,0.573828,0.387086,0.598209,0.517019,0.330237,0.726273,0.742009,1.0,0.73495,0.112099
oof_mobilevitv2_200_v1_trainbinary,0.559409,0.410881,0.593908,0.501281,0.364992,0.713612,0.754796,0.73495,1.0,0.096616
oof_ensemble,0.195024,0.331636,0.154989,0.352436,0.342505,0.151064,0.092533,0.112099,0.096616,1.0
