In [1]:
from pathlib import Path
from pprint import pprint

import pandas as pd
import numpy as np

from bayes_opt import BayesianOptimization
from isic_helper import compute_pauc, compute_auc

In [2]:
# Column names shared by the OOF prediction files and the cells below.
id_column = "isic_id"
target_column = "target"
ensemble_column = "oof_preds_ensemble"

# Cross-validation scheme; selects the name stored in `fold_column`.
# NOTE(review): the visible cells index the frame with the literal "fold"
# column rather than `fold_column` -- confirm whether this switch is still needed.
fold_method = "gkf"
if fold_method == "gkf":
    print("Using GroupKFold")
    fold_column = "gkf_fold"
elif fold_method == "sgkf":
    print("Using StratifiedGroupKFold")
    fold_column = "sgkf_fold"
else:
    raise ValueError(f"Fold method {fold_method} not supported")

# Earlier ensemble configurations, kept for reference.
# model_names = ["lgb", "cb", "xgb", "efficientnet_b0", "mobilevitv2_200", "efficientnet_b1"]
# versions = ["v5", "v2", "v1", "v3", "v1", "v1"]
# modes = ["train", "train", "train", "train", "train", "pretrain"]

# model_names = ["lgb", "cb", "xgb", "efficientnet_b2"]
# versions = ["v5", "v2", "v1", "v1"]
# modes = ["train", "train", "train", "pretrain"]

# model_names = ["lgb", "cb", "xgb", "efficientnet_b1", "efficientnet_b2", "efficientnet_b3"]
# versions = ["v5", "v2", "v1", "v1", "v1", "v1"]
# modes = ["train", "train", "train", "pretrain", "pretrain", "pretrain"]

# Active ensemble members; the three lists are parallel (one entry per model).
model_names = ["lgb", "xgb", "efficientnet_b2", "efficientnet_b3"]
versions = ["v5", "v1", "v1", "v1"]
modes = ["train", "train", "pretrain", "pretrain"]


def build_input_paths(model_names, versions, modes):
    """Return the Kaggle input directory of every ensemble member.

    Args:
        model_names: Model names; underscores are replaced by dashes in the path.
        versions: Version tag of each model, parallel to ``model_names``.
        modes: Mode tag of each model, parallel to ``model_names``.

    Returns:
        A list with one ``/kaggle/input/isic-scd-<name>-<version>-<mode>`` string
        per model, in input order.

    Raises:
        ValueError: If the three lists differ in length. ``zip`` would otherwise
            silently drop the trailing entries of the longer lists.
    """
    if not (len(model_names) == len(versions) == len(modes)):
        raise ValueError(
            "model_names, versions and modes must have the same length, got "
            f"{len(model_names)}, {len(versions)} and {len(modes)}"
        )
    return [
        f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}"
        for model_name, version, mode in zip(model_names, versions, modes)
    ]


paths = build_input_paths(model_names, versions, modes)

Using GroupKFold


In [3]:
# Score containers filled by the evaluation cell:
# {model identifier or "ensemble": {fold name: score}}.
val_auc_scores = {}
val_pauc_scores = {}

for idx, (path, model_name, version) in enumerate(zip(paths, model_names, versions)):
    model_identifier = f"{model_name}_{version}"
    oof_preds_model_df = pd.read_csv(f"{path}/oof_preds_{model_identifier}.csv")
    if idx == 0:
        # The first file is taken whole; later files only contribute their
        # own prediction column.
        oof_preds_df = oof_preds_model_df.copy()
    else:
        # Join on the configured id column (previously a hard-coded "isic_id",
        # which would break as soon as `id_column` changed).
        oof_preds_df = oof_preds_df.merge(
            oof_preds_model_df[[id_column, f"oof_{model_identifier}"]],
            on=id_column,
            how="inner",
        )
        # NOTE(review): this compares against the newly loaded frame only; rows
        # of earlier models dropped by the inner join go unnoticed when this
        # model covers fewer ids -- confirm that is acceptable.
        assert oof_preds_df.shape[0] == oof_preds_model_df.shape[0], (
            f"Row count changed after merging {model_identifier}: "
            f"{oof_preds_df.shape[0]} merged vs {oof_preds_model_df.shape[0]} loaded"
        )
    val_auc_scores[model_identifier] = {}
    val_pauc_scores[model_identifier] = {}

val_auc_scores["ensemble"] = {}
val_pauc_scores["ensemble"] = {}

In [4]:
# Per-model prediction columns. The blended column name also starts with
# "oof", so it is excluded explicitly; otherwise re-running this cell after
# the ensemble column has been written would treat the blend as a model.
oof_columns = [
    col
    for col in oof_preds_df.columns
    if col.startswith("oof") and col != ensemble_column
]
# Distinct fold labels present in the merged frame.
all_folds = np.unique(oof_preds_df["fold"])

In [5]:
def blend_optimizer(oof_preds_df, oof_columns, init_points = 20, n_iter = 100,
                    target_column = "target", fold_column = "fold", min_tpr = 0.8):
    """Search blend weights that maximize the mean per-fold pAUC of a rank blend.

    For every fold, each prediction column is converted to percentile ranks
    within that fold; the blend is the weighted sum of those ranks. The
    objective handed to ``BayesianOptimization`` is the average of
    ``compute_pauc`` over the folds.

    Args:
        oof_preds_df: Frame holding the fold, target and prediction columns.
        oof_columns: Names of the prediction columns to blend.
        init_points: Forwarded to ``optimizer.maximize(init_points=...)``.
        n_iter: Forwarded to ``optimizer.maximize(n_iter=...)``.
        target_column: Name of the label column.
        fold_column: Name of the fold assignment column.
        min_tpr: Forwarded to ``compute_pauc``; matches the value used by the
            reporting cell of this notebook.

    Returns:
        A list with one weight per entry of ``oof_columns``, in the same order.

    Raises:
        ValueError: If ``oof_columns`` is empty or the frame contains no folds.
    """
    oof_columns = list(oof_columns)
    if not oof_columns:
        raise ValueError("oof_columns must contain at least one prediction column")

    weight_names = [f"w{i}" for i in range(len(oof_columns))]
    pbounds = {name: (0.0, 10.0) for name in weight_names}

    # Fold targets and percentile ranks do not depend on the weights, so they
    # are computed once here instead of on every objective evaluation. Folds
    # are derived from the frame that was passed in, not from a notebook global.
    fold_values = oof_preds_df[fold_column]
    fold_cache = []
    for fold in np.unique(fold_values):
        fold_rows = oof_preds_df.loc[fold_values == fold]
        fold_target = np.array(fold_rows[target_column])
        fold_ranks = [fold_rows[oof_column].rank(pct=True).values for oof_column in oof_columns]
        fold_cache.append((fold_target, fold_ranks))
    if not fold_cache:
        raise ValueError("oof_preds_df contains no folds to evaluate")

    def objective(**params):
        # Read the weights by name rather than by keyword order: bayes_opt
        # releases that sort parameter names alphabetically would deliver
        # "w10" before "w2" and misalign weights with columns.
        weights = [params[name] for name in weight_names]
        score = 0
        for fold_target, fold_ranks in fold_cache:
            fold_ensemble_preds = 0
            for weight, ranks in zip(weights, fold_ranks):
                # `weight * ranks` allocates a new array, so the cached ranks
                # are never modified.
                fold_ensemble_preds += weight * ranks
            score += compute_pauc(fold_target, fold_ensemble_preds, min_tpr=min_tpr)
        return score / len(fold_cache)

    optimizer = BayesianOptimization(
        f=objective,
        pbounds=pbounds,
        random_state=2022,
    )

    optimizer.maximize(
        init_points=init_points,
        n_iter=n_iter,
    )

    best = optimizer.max
    weights = [best["params"][name] for name in weight_names]

    print(f"Best pAUC: {best['target']}")
    print(f"Best weights: {weights}")
    return weights


# Optimize the blend weights with init_points=40 and n_iter=40.
weights = blend_optimizer(oof_preds_df, oof_columns, init_points=40, n_iter=40)

|   iter    |  target   |    w0     |    w1     |    w2     |    w3     |
-------------------------------------------------------------------------
| [30m1         | [30m0.1786    | [30m0.09359   | [30m4.991     | [30m1.134     | [30m0.4997    |
| [30m2         | [30m0.1773    | [30m6.854     | [30m4.87      | [30m8.977     | [30m6.475     |
| [30m3         | [30m0.1781    | [30m8.97      | [30m7.211     | [30m8.314     | [30m8.276     |
| [35m4         | [35m0.1792    | [35m8.336     | [35m9.57      | [35m3.68      | [35m4.948     |
| [30m5         | [30m0.1774    | [30m3.395     | [30m6.194     | [30m9.775     | [30m0.9643    |
| [30m6         | [30m0.1777    | [30m7.442     | [30m2.925     | [30m2.987     | [30m7.525     |
| [30m7         | [30m0.1742    | [30m0.1866    | [30m5.237     | [30m8.644     | [30m3.888     |
| [30m8         | [30m0.1772    | [30m2.122     | [30m4.752     | [30m5.647     | [30m3.494     |
| [30m9         | 

In [6]:
# Show the optimized blend weights (one per entry of oof_columns).
weights

[2.9316833237526385, 7.016298039373616, 3.582776640762239, 1.581709784928026]

In [7]:
def report_cv_scores(fold_auc_scores, fold_pauc_scores, oof_target, oof_preds):
    """Print per-fold and aggregate CV metrics for one set of predictions.

    Args:
        fold_auc_scores: Mapping of fold name to AUC.
        fold_pauc_scores: Mapping of fold name to pAUC.
        oof_target: Labels of all out-of-fold rows.
        oof_preds: Predictions of all out-of-fold rows.

    Returns:
        Tuple ``(auc_oof, pauc_oof, auc_avg, pauc_avg, auc_std, pauc_std)``:
        the metrics over all rows at once, then the mean and standard deviation
        of the per-fold values.
    """
    print("Val AUC scores:")
    pprint(fold_auc_scores)
    print("Val PAUC scores:")
    pprint(fold_pauc_scores)

    auc_oof = compute_auc(oof_target, oof_preds)
    pauc_oof = compute_pauc(oof_target, oof_preds, min_tpr=0.8)

    auc_per_fold = list(fold_auc_scores.values())
    pauc_per_fold = list(fold_pauc_scores.values())
    auc_avg = np.mean(auc_per_fold)
    pauc_avg = np.mean(pauc_per_fold)
    auc_std = np.std(auc_per_fold)
    pauc_std = np.std(pauc_per_fold)

    print(f"CV AUC OOF: {auc_oof}")
    print(f"CV PAUC OOF: {pauc_oof}")
    print(f"CV AUC AVG: {auc_avg}")
    print(f"CV PAUC AVG: {pauc_avg}")
    print(f"CV AUC STD: {auc_std}")
    print(f"CV PAUC STD: {pauc_std}")
    return auc_oof, pauc_oof, auc_avg, pauc_avg, auc_std, pauc_std


# The weights were optimized in `oof_columns` order. Keying them by column
# name keeps each model paired with its own weight even if that order differs
# from `model_names`.
weight_by_column = dict(zip(oof_columns, weights))

all_folds = np.unique(oof_preds_df["fold"])
for fold in all_folds:
    fold_index = oof_preds_df[oof_preds_df["fold"] == fold].index
    fold_target = oof_preds_df.loc[fold_index, target_column]
    fold_ensemble_preds = 0
    for model_name, version in zip(model_names, versions):
        model_identifier = f"{model_name}_{version}"
        oof_column = f"oof_{model_identifier}"
        weight = weight_by_column[oof_column]
        fold_model_preds = oof_preds_df.loc[fold_index, oof_column]
        # Blend percentile ranks computed within the fold, not raw predictions.
        fold_ensemble_preds += fold_model_preds.rank(pct=True).values * weight

        val_auc_scores[model_identifier][f"fold_{fold}"] = compute_auc(fold_target, fold_model_preds)
        val_pauc_scores[model_identifier][f"fold_{fold}"] = compute_pauc(fold_target, fold_model_preds, min_tpr=0.8)

    # Store the blend so later cells can inspect it next to the model columns.
    oof_preds_df.loc[fold_index, ensemble_column] = fold_ensemble_preds
    val_auc_scores["ensemble"][f"fold_{fold}"] = compute_auc(fold_target, fold_ensemble_preds)
    val_pauc_scores["ensemble"][f"fold_{fold}"] = compute_pauc(fold_target, fold_ensemble_preds, min_tpr=0.8)

# Per-model report.
for model_name, version in zip(model_names, versions):
    model_identifier = f"{model_name}_{version}"
    oof_column = f"oof_{model_identifier}"
    weight = weight_by_column[oof_column]
    print(f"Model: {model_name}_{version} | Weightage: {weight}")

    (
        cv_model_auc_oof,
        cv_model_pauc_oof,
        cv_model_auc_avg,
        cv_model_pauc_avg,
        cv_model_auc_std,
        cv_model_pauc_std,
    ) = report_cv_scores(
        val_auc_scores[model_identifier],
        val_pauc_scores[model_identifier],
        oof_preds_df[target_column],
        oof_preds_df[oof_column],
    )
    print("\n")

# Ensemble report.
(
    cv_ensemble_auc_oof,
    cv_ensemble_pauc_oof,
    cv_ensemble_auc_avg,
    cv_ensemble_pauc_avg,
    cv_ensemble_auc_std,
    cv_ensemble_pauc_std,
) = report_cv_scores(
    val_auc_scores["ensemble"],
    val_pauc_scores["ensemble"],
    oof_preds_df[target_column],
    oof_preds_df[ensemble_column],
)

Model: lgb_v5 | Weightage: 2.9316833237526385
Val AUC scores:
{'fold_1': 0.9798763448762824,
 'fold_2': 0.9502058596813567,
 'fold_3': 0.9700671400581549,
 'fold_4': 0.969838569286934,
 'fold_5': 0.9668910080418592}
Val PAUC scores:
{'fold_1': 0.18493155147135198,
 'fold_2': 0.15736031155895627,
 'fold_3': 0.17765128352323065,
 'fold_4': 0.1733665704622393,
 'fold_5': 0.1709486398724156}
CV AUC OOF: 0.9676101211731352
CV PAUC OOF: 0.17291670433806983
CV AUC AVG: 0.9673757843889174
CV PAUC AVG: 0.17285167137763877
CV AUC STD: 0.009639647709957823
CV PAUC STD: 0.00908656875652946


Model: xgb_v1 | Weightage: 7.016298039373616
Val AUC scores:
{'fold_1': 0.980802518544136,
 'fold_2': 0.9508768595287258,
 'fold_3': 0.9684103218479739,
 'fold_4': 0.9696985278668998,
 'fold_5': 0.9672836294473094}
Val PAUC scores:
{'fold_1': 0.18716467529874567,
 'fold_2': 0.15925984589082187,
 'fold_3': 0.1758745054972482,
 'fold_4': 0.1741027949834683,
 'fold_5': 0.17239045958783386}
CV AUC OOF: 0.954863605

In [8]:
# Inspect the merged OOF frame (ids, fold, target, per-model predictions and the blended column).
oof_preds_df

Unnamed: 0,isic_id,patient_id,fold,target,oof_lgb_v5,oof_xgb_v1,oof_efficientnet_b2_v1,oof_efficientnet_b3_v1,oof_preds_ensemble
0,ISIC_0015670,IP_1235828,4,0,0.001568,0.007305,0.000974,0.001768,7.594819
1,ISIC_0015845,IP_8170065,1,0,0.954632,0.945787,0.008223,0.006001,14.624643
2,ISIC_0015864,IP_6724798,5,0,0.001870,0.000914,0.000187,0.000047,4.264615
3,ISIC_0015902,IP_4111386,2,0,0.005178,0.001571,0.000183,0.000772,8.806803
4,ISIC_0024200,IP_8313778,1,0,0.003908,0.002689,0.000443,0.001020,10.023809
...,...,...,...,...,...,...,...,...,...
401050,ISIC_9999937,IP_1140263,3,0,0.069581,0.049572,0.011383,0.025917,14.210894
401051,ISIC_9999951,IP_5678181,3,0,0.001908,0.001645,0.000608,0.000232,6.024592
401052,ISIC_9999960,IP_0076153,2,0,0.011912,0.002285,0.000112,0.001012,9.472541
401053,ISIC_9999964,IP_5231513,5,0,0.001390,0.001006,0.000852,0.000021,5.182937


In [9]:
# Mean prediction per target class for every model column and for the blend.
# Column names come from the config cell instead of repeated string literals.
oof_preds_df.pivot_table(
    index=target_column,
    values=oof_columns + [ensemble_column],
    aggfunc="mean",
)

Unnamed: 0_level_0,oof_efficientnet_b2_v1,oof_efficientnet_b3_v1,oof_lgb_v5,oof_preds_ensemble,oof_xgb_v1
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.007566,0.008614,0.010912,7.549537,0.010073
1,0.180364,0.208613,0.391226,14.49733,0.327865


In [10]:
# Pairwise correlation between the model predictions and the blend.
# NOTE(review): the blend is a weighted sum of within-fold percentile ranks,
# while the model columns hold raw scores; a rank correlation
# (`.corr(method="spearman")`) may describe their agreement better -- confirm.
oof_preds_df[oof_columns + [ensemble_column]].corr()

Unnamed: 0,oof_lgb_v5,oof_xgb_v1,oof_efficientnet_b2_v1,oof_efficientnet_b3_v1,oof_preds_ensemble
oof_lgb_v5,1.0,0.87825,0.439815,0.424659,0.335562
oof_xgb_v1,0.87825,1.0,0.422706,0.423687,0.316518
oof_efficientnet_b2_v1,0.439815,0.422706,1.0,0.730233,0.314256
oof_efficientnet_b3_v1,0.424659,0.423687,0.730233,1.0,0.308206
oof_preds_ensemble,0.335562,0.316518,0.314256,0.308206,1.0
