In [1]:
from pathlib import Path
from pprint import pprint

import pandas as pd
import numpy as np

from bayes_opt import BayesianOptimization

from isic_helper import get_folds
from isic_helper import compute_pauc, compute_auc

In [2]:
# Column names shared by the OOF prediction frames.
id_column = "isic_id"
target_column = "target"
ensemble_column = "oof_preds_ensemble"

# Models to blend; versions[i] is the training-run version of model_names[i].
model_names = ["efficientnet_b0", "mobilevitv2_200", "boosting"]
versions = ["v3", "v1", "v1"]

# Input directory per model: the directory name uses hyphens where the model
# name uses underscores.
paths = [
    "/kaggle/input/isic-scd-" + name.replace("_", "-") + f"-{ver}-train"
    for name, ver in zip(model_names, versions)
]

In [3]:
# Per-model score containers: {"<model>_<version>": {"fold_<k>": score}}.
# They are created empty here and filled in by the evaluation cell.
val_auc_scores = {}
val_pauc_scores = {}

# Load each model's OOF predictions and join them into one frame keyed by id.
# The first file is kept whole; every later file contributes only its own
# prediction column.
for idx, (model_name, version, path) in enumerate(zip(model_names, versions, paths)):
    model_key = f"{model_name}_{version}"
    oof_preds_model_df = pd.read_csv(f"{path}/oof_preds_{model_key}.csv")
    if idx == 0:
        oof_preds_df = oof_preds_model_df.copy()
    else:
        n_rows_before = oof_preds_df.shape[0]
        oof_preds_df = oof_preds_df.merge(
            oof_preds_model_df[[id_column, f"oof_{model_key}"]],
            on=id_column,
            how="inner",
        )
        # Every row of the new model must have found a partner ...
        assert oof_preds_df.shape[0] == oof_preds_model_df.shape[0], (
            f"{model_key}: merged frame has {oof_preds_df.shape[0]} rows, "
            f"model file has {oof_preds_model_df.shape[0]}"
        )
        # ... and the inner join must not have dropped (or duplicated) rows of
        # the frame accumulated so far, which the check above cannot detect.
        assert oof_preds_df.shape[0] == n_rows_before, (
            f"{model_key}: merge changed row count from {n_rows_before} "
            f"to {oof_preds_df.shape[0]}"
        )
    val_auc_scores[model_key] = {}
    val_pauc_scores[model_key] = {}
val_auc_scores["ensemble"] = {}
val_pauc_scores["ensemble"] = {}

In [4]:
# One prediction column per configured (model, version) pair, in the same order
# as `model_names` / `versions`. Building the list explicitly (rather than
# prefix-matching on "oof") keeps the optimised weights aligned with the models
# and keeps this cell idempotent: the "oof_preds_ensemble" column added to the
# frame later in the notebook is never mistaken for a model.
oof_columns = [f"oof_{model_name}_{version}" for model_name, version in zip(model_names, versions)]
missing_columns = [col for col in oof_columns if col not in oof_preds_df.columns]
assert not missing_columns, f"Missing OOF columns: {missing_columns}"

# Distinct fold labels present in the merged frame.
all_folds = np.unique(oof_preds_df["fold"])

In [5]:
def blend_optimizer(oof_preds_df, oof_columns, init_points=20, n_iter=100):
    """Search for rank-blend weights that maximise the mean per-fold pAUC.

    Within each fold every model column is converted to percentile ranks, the
    ranks are combined as a weighted sum, and the blend is scored with
    ``compute_pauc``. The objective handed to ``BayesianOptimization`` is the
    average of those per-fold scores.

    Parameters
    ----------
    oof_preds_df : pandas.DataFrame
        Frame containing the columns ``"fold"``, ``"target"`` and every column
        named in ``oof_columns``.
    oof_columns : list of str
        Prediction columns to blend; the returned weights follow this order.
    init_points : int, default 20
        Forwarded to ``optimizer.maximize(init_points=...)``.
    n_iter : int, default 100
        Forwarded to ``optimizer.maximize(n_iter=...)``.

    Returns
    -------
    list of float
        Best weight found for each entry of ``oof_columns``; each weight is
        searched in ``[0.0, 1.0]``.

    Notes
    -----
    ``compute_pauc`` is called without ``min_tpr``, i.e. with its default.
    NOTE(review): the evaluation cell passes ``min_tpr=0.8`` explicitly;
    confirm the default matches.
    """
    weight_names = [f"w{i}" for i in range(len(oof_columns))]
    pbounds = {name: (0.0, 1.0) for name in weight_names}

    # Targets and percentile ranks do not depend on the weights, so compute
    # them once per fold instead of on every objective evaluation. Folds are
    # taken from the frame that was passed in rather than from a notebook
    # global.
    fold_data = []
    for fold in np.unique(oof_preds_df["fold"]):
        fold_df = oof_preds_df.loc[oof_preds_df["fold"] == fold]
        fold_target = np.array(fold_df["target"])
        fold_ranks = [fold_df[oof_column].rank(pct=True).values for oof_column in oof_columns]
        fold_data.append((fold_target, fold_ranks))

    def mean_fold_pauc(weights):
        """Average pAUC over folds of the weighted sum of per-fold ranks."""
        score = 0
        for fold_target, fold_ranks in fold_data:
            fold_ensemble_preds = 0
            for weight, ranks in zip(weights, fold_ranks):
                fold_ensemble_preds += weight * ranks
            score += compute_pauc(fold_target, fold_ensemble_preds)
        return score / len(fold_data)

    def q(**ws):
        # Look weights up by name. Relying on the keyword-argument order is
        # fragile: if the optimiser passes parameter names sorted as strings,
        # "w10" precedes "w2" and positional zipping would pair weights with
        # the wrong columns once there are more than ten models.
        return mean_fold_pauc([ws[name] for name in weight_names])

    optimizer = BayesianOptimization(
        f=q,
        pbounds=pbounds,
        random_state=2022,
    )

    optimizer.maximize(
        init_points=init_points,
        n_iter=n_iter,
    )

    best_params = optimizer.max["params"]
    weights = [best_params[name] for name in weight_names]

    print(f"Best pAUC: {optimizer.max['target']}")
    print(f"Best weights: {weights}")
    return weights


# Optimise the blend weights with 40 initial points and 40 further iterations.
weights = blend_optimizer(oof_preds_df, oof_columns, init_points=40, n_iter=40)

|   iter    |  target   |    w0     |    w1     |    w2     |
-------------------------------------------------------------
| [30m1         | [30m0.1712    | [30m0.009359  | [30m0.4991    | [30m0.1134    |
| [35m2         | [35m0.1765    | [35m0.04997   | [35m0.6854    | [35m0.487     |
| [35m3         | [35m0.1779    | [35m0.8977    | [35m0.6475    | [35m0.897     |
| [30m4         | [30m0.1776    | [30m0.7211    | [30m0.8314    | [30m0.8276    |
| [30m5         | [30m0.1742    | [30m0.8336    | [30m0.957     | [30m0.368     |
| [35m6         | [35m0.1785    | [35m0.4948    | [35m0.3395    | [35m0.6194    |
| [30m7         | [30m0.1776    | [30m0.9775    | [30m0.09643   | [30m0.7442    |
| [35m8         | [35m0.179     | [35m0.2925    | [35m0.2987    | [35m0.7525    |
| [30m9         | [30m0.1779    | [30m0.01866   | [30m0.5237    | [30m0.8644    |
| [30m10        | [30m0.1786    | [30m0.3888    | [30m0.2122    | [30m0.4752    |
| [30

In [6]:
# Show the blend weights returned by the optimiser (one per entry of `oof_columns`).
weights

[0.40952716182378507, 0.3352538589099779, 0.9978620785343839]

In [7]:
# `min_tpr` argument passed to every `compute_pauc` call in this cell.
MIN_TPR = 0.8

# `weights[i]` was optimised for `oof_columns[i]`, but the loops below pair the
# weights with `model_names` / `versions`. Fail loudly if the two orderings
# disagree, because zip() would otherwise silently mis-weight or truncate.
model_keys = [f"{model_name}_{version}" for model_name, version in zip(model_names, versions)]
assert list(oof_columns) == [f"oof_{key}" for key in model_keys], (
    "oof_columns is not aligned with model_names/versions"
)
assert len(weights) == len(model_keys), "expected exactly one weight per model"


def _report_cv_scores(score_key, preds_column):
    """Print per-fold and aggregate AUC / pAUC for one prediction column.

    `score_key` selects the per-fold dicts in `val_auc_scores` /
    `val_pauc_scores`; `preds_column` is the column of `oof_preds_df` scored
    over all rows at once ("OOF"). Returns
    (auc_oof, pauc_oof, auc_avg, pauc_avg, auc_std, pauc_std).
    """
    print("Val AUC scores:")
    pprint(val_auc_scores[score_key])
    print("Val PAUC scores:")
    pprint(val_pauc_scores[score_key])

    auc_oof = compute_auc(oof_preds_df[target_column], oof_preds_df[preds_column])
    pauc_oof = compute_pauc(oof_preds_df[target_column], oof_preds_df[preds_column], min_tpr=MIN_TPR)

    fold_aucs = list(val_auc_scores[score_key].values())
    fold_paucs = list(val_pauc_scores[score_key].values())
    auc_avg = np.mean(fold_aucs)
    pauc_avg = np.mean(fold_paucs)
    auc_std = np.std(fold_aucs)
    pauc_std = np.std(fold_paucs)

    print(f"CV AUC OOF: {auc_oof}")
    print(f"CV PAUC OOF: {pauc_oof}")
    print(f"CV AUC AVG: {auc_avg}")
    print(f"CV PAUC AVG: {pauc_avg}")
    print(f"CV AUC STD: {auc_std}")
    print(f"CV PAUC STD: {pauc_std}")
    return auc_oof, pauc_oof, auc_avg, pauc_avg, auc_std, pauc_std


# Score every model and the weighted rank blend fold by fold, and store the
# blended predictions in `ensemble_column`.
all_folds = np.unique(oof_preds_df["fold"])
for fold in all_folds:
    fold_index = oof_preds_df[oof_preds_df["fold"] == fold].index
    fold_target = oof_preds_df.loc[fold_index, target_column]
    fold_ensemble_preds = 0
    for model_name, version, weight in zip(model_names, versions, weights):
        fold_model_preds = oof_preds_df.loc[fold_index, f"oof_{model_name}_{version}"]
        # The blend works on within-fold percentile ranks, not raw predictions.
        fold_ensemble_preds += fold_model_preds.rank(pct=True).values * weight

        val_auc_scores[f"{model_name}_{version}"][f"fold_{fold}"] = compute_auc(fold_target, fold_model_preds)
        val_pauc_scores[f"{model_name}_{version}"][f"fold_{fold}"] = compute_pauc(fold_target, fold_model_preds, min_tpr=MIN_TPR)

    oof_preds_df.loc[fold_index, ensemble_column] = fold_ensemble_preds
    val_auc_scores["ensemble"][f"fold_{fold}"] = compute_auc(fold_target, fold_ensemble_preds)
    val_pauc_scores["ensemble"][f"fold_{fold}"] = compute_pauc(fold_target, fold_ensemble_preds, min_tpr=MIN_TPR)

# Per-model report.
for model_name, version, weight in zip(model_names, versions, weights):
    print(f"Model: {model_name}_{version} | Weightage: {weight}")

    (
        cv_model_auc_oof,
        cv_model_pauc_oof,
        cv_model_auc_avg,
        cv_model_pauc_avg,
        cv_model_auc_std,
        cv_model_pauc_std,
    ) = _report_cv_scores(f"{model_name}_{version}", f"oof_{model_name}_{version}")
    print("\n")

# Ensemble report.
(
    cv_ensemble_auc_oof,
    cv_ensemble_pauc_oof,
    cv_ensemble_auc_avg,
    cv_ensemble_pauc_avg,
    cv_ensemble_auc_std,
    cv_ensemble_pauc_std,
) = _report_cv_scores("ensemble", ensemble_column)

Model: efficientnet_b0_v3 | Weightage: 0.40952716182378507
Val AUC scores:
{'fold_1': 0.9647128642848474,
 'fold_2': 0.9561526110431694,
 'fold_3': 0.928968189589547,
 'fold_4': 0.9572782381350537,
 'fold_5': 0.9422154578270037}
Val PAUC scores:
{'fold_1': 0.17332295716313081,
 'fold_2': 0.17211846439265513,
 'fold_3': 0.14460742409304764,
 'fold_4': 0.16890264067589317,
 'fold_5': 0.16295930451521776}
CV AUC OOF: 0.9488530223132682
CV PAUC OOF: 0.16344138790084992
CV AUC AVG: 0.9498654721759243
CV PAUC AVG: 0.1643821581679889
CV AUC STD: 0.012729033001532868
CV PAUC STD: 0.010519388794311729


Model: mobilevitv2_200_v1 | Weightage: 0.3352538589099779
Val AUC scores:
{'fold_1': 0.966520645083606,
 'fold_2': 0.9442821095178396,
 'fold_3': 0.939597346844542,
 'fold_4': 0.9521441452122151,
 'fold_5': 0.9482206984734033}
Val PAUC scores:
{'fold_1': 0.17764565195090418,
 'fold_2': 0.15499976641689273,
 'fold_3': 0.15599892675743465,
 'fold_4': 0.16321788696010278,
 'fold_5': 0.1654141976144

In [8]:
# Display the merged OOF frame for inspection.
oof_preds_df

Unnamed: 0,isic_id,patient_id,fold,target,oof_efficientnet_b0_v3,oof_mobilevitv2_200_v1,oof_boosting_v1,oof_preds_ensemble
0,ISIC_0015845,IP_8170065,1,0,0.001611,0.000786,0.962175,1.657423
1,ISIC_0024200,IP_8313778,1,0,0.001537,0.000341,0.003955,1.364924
2,ISIC_0051648,IP_0218255,1,0,0.000073,0.000061,0.000754,0.561902
3,ISIC_0051896,IP_7438238,1,0,0.000093,0.000101,0.001005,0.720226
4,ISIC_0052026,IP_6422845,1,0,0.001067,0.001424,0.003502,1.368746
...,...,...,...,...,...,...,...,...
401054,ISIC_9999292,IP_7322743,5,0,0.000065,0.000359,0.001204,0.719170
401055,ISIC_9999411,IP_3713422,5,0,0.000192,0.000020,0.002482,0.921298
401056,ISIC_9999466,IP_8892837,5,0,0.000584,0.000305,0.002631,1.152047
401057,ISIC_9999919,IP_3026867,5,0,0.000478,0.000427,0.010574,1.431744


In [9]:
# Mean prediction of every model and of the ensemble, split by target class.
# Uses the configured column names instead of repeating the string literals.
oof_preds_df.pivot_table(index=target_column, values=oof_columns + [ensemble_column], aggfunc="mean")

Unnamed: 0_level_0,oof_boosting_v1,oof_efficientnet_b0_v3,oof_mobilevitv2_200_v1,oof_preds_ensemble
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.009603,0.001342,0.000884,0.870547
1,0.371107,0.04659,0.049913,1.671866


In [10]:
# Pairwise correlation between the model predictions and the ensemble.
# Uses the configured ensemble column name instead of repeating the literal.
oof_preds_df[oof_columns + [ensemble_column]].corr()

Unnamed: 0,oof_efficientnet_b0_v3,oof_mobilevitv2_200_v1,oof_boosting_v1,oof_preds_ensemble
oof_efficientnet_b0_v3,1.0,0.474561,0.276729,0.196216
oof_mobilevitv2_200_v1,0.474561,1.0,0.266288,0.161165
oof_boosting_v1,0.276729,0.266288,1.0,0.328019
oof_preds_ensemble,0.196216,0.161165,0.328019,1.0
