In [1]:
import json
import joblib
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

import lightgbm as lgb
from lightgbm.callback import log_evaluation, early_stopping
import catboost as cb

from isic_helper import DotDict
from isic_helper import get_folds
from isic_helper import compute_auc, compute_pauc

pd.options.display.max_columns = 1000

In [2]:
cfg = DotDict()
cfg.seed = 2022

cfg.models_output_dir = "models"
cfg.cb_model_name = "cb_v1"
cfg.lgb_model_name = "lgb_v1"

In [3]:
INPUT_PATH = Path("../input/isic-2024-challenge/")
MODELS_OUTPUT_PATH = Path(f"{cfg.models_output_dir}")
MODELS_OUTPUT_PATH.mkdir(exist_ok=True)

train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False)
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv")

folds_df = get_folds()
train_metadata = train_metadata.merge(folds_df, on=["isic_id", "patient_id"], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {train_metadata.shape}")

Train data size: (401059, 57)
Test data size: (401059, 57)


In [4]:
train_metadata["target"].value_counts(normalize=True)

target
0    0.99902
1    0.00098
Name: proportion, dtype: float64

In [5]:
def feature_engineering(df):
    df["lesion_size_ratio"]              = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["hue_contrast"]                   = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["normalized_lesion_size"]         = df["clin_size_long_diam_mm"] / df["age_approx"]
    df["overall_color_difference"]       = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    
    patient_num_images = df.groupby("patient_id", as_index=False)["isic_id"].count().rename(columns={"isic_id": "num_images"})
    df = df.merge(patient_num_images, on="patient_id", how="left")

    new_num_cols = [
        "num_images",
        "lesion_size_ratio",
        "normalized_lesion_size", 
        "overall_color_difference"
    ]
    
    new_cat_cols = []
    
    return df, new_num_cols, new_cat_cols

train_metadata, new_num_cols, new_cat_cols = feature_engineering(train_metadata.copy())

In [6]:
id_column = "isic_id"
target_column = "final_target"
group_column = "patient_id"
drop_features = ["image_type"] + train_metadata.columns[~np.in1d(train_metadata.columns, test_metadata.columns)].tolist() # target column removed
categorical_features = ["sex", "anatom_site_general", 
                        "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple",
                        "attribution", "copyright_license"
                       ] + new_cat_cols
numerical_features = train_metadata.columns[~np.in1d(train_metadata.columns, 
                                                     [id_column] + [group_column] + categorical_features + drop_features)]

In [7]:
mixed_encoded_preprocessor = ColumnTransformer(
    [
        ("numerical", "passthrough", numerical_features),
        (
            "categorical",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2, encoded_missing_value=-1, dtype=int),
            categorical_features,
        ),
    ],
    verbose_feature_names_out=False,

)
mixed_encoded_preprocessor.set_output(transform="pandas")

In [8]:
with open("encoder.joblib", "wb") as f:
    joblib.dump(mixed_encoded_preprocessor, f)

In [9]:
train_ids = train_metadata[id_column]
groups = train_metadata[group_column]
folds = train_metadata["fold"]

enc = mixed_encoded_preprocessor.fit(train_metadata)
X_train = enc.transform(train_metadata)
y_train = train_metadata[target_column]

In [10]:
columns_for_model = len(X_train.columns)
print(f"Total number of columns: {columns_for_model}")

Total number of columns: 41


In [11]:
def pauc_80(preds, data):
    score_value = compute_pauc(data.get_label(), preds, min_tpr=0.8)   
    return 'pauc_80', score_value, True

In [12]:
class PAUC:
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, weight):
        y_true = target.astype(int)
        y_pred = approxes[0].astype(float)
        
        score = compute_pauc(y_true, y_pred)
        
        return score, 1.0

In [13]:
cb_params = {
    'objective': 'Logloss',
    "random_state": cfg.seed,
#     "colsample_bylevel": 0.3, # 0.01, 0.1
    "iterations": 2500,
    "learning_rate": 0.05,
    "cat_features": categorical_features,
    "max_depth": 8,
    "l2_leaf_reg": 5,
    "verbose": 50,
    "early_stopping_rounds": 100,
    "eval_metric": "AUC",
    "task_type": "GPU",
    "devices": "0"
}

In [14]:
cb_best_num_rounds = {}
cb_val_auc_scores = {}
cb_val_pauc_scores = {}
all_folds = np.sort(folds.unique())
cb_oof_predictions = np.zeros(X_train.shape[0])
for fold in all_folds:
    print(f"Running fold: {fold}")
    dev_index = folds != fold
    val_index = folds == fold
    
    X_dev = X_train.loc[dev_index, :]
    y_dev = y_train[dev_index]
    
    X_val = X_train.loc[val_index, :]
    y_val = y_train[val_index]
    
    model = cb.CatBoostClassifier(**cb_params)
    
    model.fit(X_dev, y_dev, eval_set=(X_val, y_val), )
    model.save_model(MODELS_OUTPUT_PATH / f"{cfg.cb_model_name}_fold_{fold}.txt")
    cb_best_num_rounds[f"fold_{fold}"] = model.best_iteration_
    
    val_preds = model.predict_proba(X_val)[:, -1]
    cb_val_pauc_scores[f"fold_{fold}"] = compute_pauc(y_val, val_preds)
    cb_val_auc_scores[f"fold_{fold}"] = model.best_score_["validation"]["AUC"]
    cb_oof_predictions[val_index] = val_preds
    print("\n")

Running fold: 1


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7910135	best: 0.7910135 (0)	total: 12.6s	remaining: 8h 45m 4s
50:	test: 0.9112246	best: 0.9112246 (50)	total: 15s	remaining: 12m 1s
100:	test: 0.9317476	best: 0.9318910 (98)	total: 17.8s	remaining: 7m 2s
150:	test: 0.9393459	best: 0.9396319 (147)	total: 20.6s	remaining: 5m 20s
200:	test: 0.9425077	best: 0.9429066 (189)	total: 23.3s	remaining: 4m 27s
250:	test: 0.9454024	best: 0.9454167 (249)	total: 26.1s	remaining: 3m 54s
300:	test: 0.9479286	best: 0.9479707 (299)	total: 28.9s	remaining: 3m 31s
350:	test: 0.9488702	best: 0.9489912 (319)	total: 31.7s	remaining: 3m 13s
400:	test: 0.9488592	best: 0.9491112 (368)	total: 34.5s	remaining: 3m
450:	test: 0.9484326	best: 0.9494343 (414)	total: 37.3s	remaining: 2m 49s
500:	test: 0.9482795	best: 0.9494343 (414)	total: 40.1s	remaining: 2m 39s
bestTest = 0.94943434
bestIteration = 414
Shrink model to first 415 iterations.


Running fold: 2


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7750924	best: 0.7750924 (0)	total: 55.6ms	remaining: 2m 18s
50:	test: 0.9206608	best: 0.9211227 (49)	total: 2.71s	remaining: 2m 10s
100:	test: 0.9308538	best: 0.9309177 (94)	total: 5.45s	remaining: 2m 9s
150:	test: 0.9368564	best: 0.9368564 (150)	total: 8.21s	remaining: 2m 7s
200:	test: 0.9406352	best: 0.9406352 (200)	total: 11s	remaining: 2m 5s
250:	test: 0.9408953	best: 0.9410430 (213)	total: 13.7s	remaining: 2m 2s
300:	test: 0.9441121	best: 0.9441121 (298)	total: 16.5s	remaining: 2m
350:	test: 0.9460928	best: 0.9462559 (343)	total: 19.3s	remaining: 1m 57s
400:	test: 0.9479153	best: 0.9479153 (400)	total: 22s	remaining: 1m 55s
450:	test: 0.9479091	best: 0.9483044 (409)	total: 24.8s	remaining: 1m 52s
500:	test: 0.9492125	best: 0.9493437 (499)	total: 27.5s	remaining: 1m 49s
550:	test: 0.9489245	best: 0.9494674 (539)	total: 30.3s	remaining: 1m 47s
600:	test: 0.9490053	best: 0.9494674 (539)	total: 33s	remaining: 1m 44s
650:	test: 0.9500226	best: 0.9503380 (649)	total: 35.7s	re

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8307567	best: 0.8307567 (0)	total: 56.4ms	remaining: 2m 20s
50:	test: 0.9300241	best: 0.9300241 (50)	total: 2.51s	remaining: 2m
100:	test: 0.9407637	best: 0.9407637 (100)	total: 5.28s	remaining: 2m 5s
150:	test: 0.9464350	best: 0.9471806 (147)	total: 8.05s	remaining: 2m 5s
200:	test: 0.9505292	best: 0.9508083 (191)	total: 10.8s	remaining: 2m 3s
250:	test: 0.9520975	best: 0.9521679 (240)	total: 13.6s	remaining: 2m 1s
300:	test: 0.9537444	best: 0.9539196 (292)	total: 16.4s	remaining: 1m 59s
350:	test: 0.9545068	best: 0.9545068 (350)	total: 19.2s	remaining: 1m 57s
400:	test: 0.9551710	best: 0.9551710 (400)	total: 21.9s	remaining: 1m 54s
450:	test: 0.9548962	best: 0.9552124 (408)	total: 24.7s	remaining: 1m 52s
500:	test: 0.9549462	best: 0.9552124 (408)	total: 27.5s	remaining: 1m 49s
bestTest = 0.9552124143
bestIteration = 408
Shrink model to first 409 iterations.


Running fold: 4


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7926639	best: 0.7926639 (0)	total: 54.7ms	remaining: 2m 16s
50:	test: 0.9361942	best: 0.9361942 (50)	total: 2.63s	remaining: 2m 6s
100:	test: 0.9470345	best: 0.9473748 (94)	total: 5.38s	remaining: 2m 7s
150:	test: 0.9514520	best: 0.9517281 (145)	total: 8.14s	remaining: 2m 6s
200:	test: 0.9554927	best: 0.9554927 (200)	total: 10.9s	remaining: 2m 4s
250:	test: 0.9574988	best: 0.9576829 (249)	total: 13.6s	remaining: 2m 2s
300:	test: 0.9585658	best: 0.9589404 (289)	total: 16.4s	remaining: 2m
350:	test: 0.9592127	best: 0.9593363 (326)	total: 19.2s	remaining: 1m 57s
400:	test: 0.9591904	best: 0.9593673 (389)	total: 22s	remaining: 1m 54s
450:	test: 0.9596305	best: 0.9598421 (444)	total: 24.7s	remaining: 1m 52s
500:	test: 0.9600862	best: 0.9601966 (495)	total: 27.5s	remaining: 1m 49s
550:	test: 0.9608502	best: 0.9608502 (550)	total: 30.2s	remaining: 1m 46s
600:	test: 0.9604439	best: 0.9610423 (559)	total: 32.9s	remaining: 1m 44s
650:	test: 0.9610859	best: 0.9614055 (642)	total: 35.7s

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7781612	best: 0.7781612 (0)	total: 55.6ms	remaining: 2m 18s
50:	test: 0.9192275	best: 0.9192275 (50)	total: 2.45s	remaining: 1m 57s
100:	test: 0.9391043	best: 0.9391043 (100)	total: 5.27s	remaining: 2m 5s
150:	test: 0.9479375	best: 0.9479375 (150)	total: 8.03s	remaining: 2m 4s
200:	test: 0.9501294	best: 0.9501294 (200)	total: 10.8s	remaining: 2m 3s
250:	test: 0.9488298	best: 0.9504511 (231)	total: 13.6s	remaining: 2m 1s
300:	test: 0.9486533	best: 0.9504511 (231)	total: 16.3s	remaining: 1m 59s
bestTest = 0.9504511356
bestIteration = 231
Shrink model to first 232 iterations.




In [15]:
cb_oof_preds_df = pd.DataFrame({
    id_column: train_ids,
    group_column: groups,
    "fold": folds,
    target_column: y_train,
    f"oof_{cfg.cb_model_name}": cb_oof_predictions
})
cb_oof_preds_df.to_csv(f"oof_preds_{cfg.cb_model_name}.csv", index=False)
cb_oof_preds_df.head()

Unnamed: 0,isic_id,patient_id,fold,final_target,oof_cb_v1
0,ISIC_0015670,IP_1235828,4,0,7.6e-05
1,ISIC_0015845,IP_8170065,1,0,0.455154
2,ISIC_0015864,IP_6724798,5,0,7.7e-05
3,ISIC_0015902,IP_4111386,2,0,8e-05
4,ISIC_0024200,IP_8313778,1,0,0.0006


In [16]:
cb_val_auc_scores

{'fold_1': 0.9494343400001526,
 'fold_2': 0.9503380060195923,
 'fold_3': 0.955212414264679,
 'fold_4': 0.9614055156707764,
 'fold_5': 0.950451135635376}

In [17]:
cb_val_pauc_scores

{'fold_1': 0.16176580074155472,
 'fold_2': 0.16379234653749836,
 'fold_3': 0.16503569155507852,
 'fold_4': 0.16790786337372549,
 'fold_5': 0.16277633946384645}

In [18]:
cb_cv_auc_oof = compute_auc(cb_oof_preds_df[target_column], cb_oof_preds_df[f"oof_{cfg.cb_model_name}"])
cb_cv_pauc_oof = compute_pauc(cb_oof_preds_df[target_column], cb_oof_preds_df[f"oof_{cfg.cb_model_name}"], min_tpr=0.8)

cb_cv_auc_avg = np.mean(list(cb_val_auc_scores.values()))
cb_cv_pauc_avg = np.mean(list(cb_val_pauc_scores.values()))

In [19]:
print(f"CV AUC OOF: {cb_cv_auc_oof}")
print(f"CV PAUC OOF: {cb_cv_pauc_oof}")
print(f"CV AUC AVG: {cb_cv_auc_avg}")
print(f"CV PAUC AVG: {cb_cv_pauc_avg}")

CV AUC OOF: 0.952448562456487
CV PAUC OOF: 0.1632807063262568
CV AUC AVG: 0.9533682823181152
CV PAUC AVG: 0.16425560833434072


In [20]:
lgb_params = {
    "objective": "binary",
#     "boosting_type": "dart",
    "class_weight": "scale_pos_weight",
    "metric": "auc",
    "num_leaves": 63,
    "learning_rate": 0.01,
    "bagging_freq": 5,
    "bagging_fraction": 0.6,
    "bagging_seed": cfg.seed,
    "feature_fraction": 0.3,
    "feature_fraction_seed": cfg.seed,
    "lambda_l1": 0.95,
    "lambda_l2": 0.95,
#     "min_child_samples": 60,
#     "extra_trees": True,
    "verbosity": -1,
    "device": "gpu"
}
num_rounds = 3000
es_rounds = 200
log_rounds = 50

In [21]:
lgb_best_num_rounds = {}
lgb_val_auc_scores = {}
lgb_val_pauc_scores = {}
all_folds = np.sort(folds.unique())
lgb_oof_predictions = np.zeros(X_train.shape[0])
for fold in all_folds:
    print(f"Running fold: {fold}")
    dev_index = folds != fold
    val_index = folds == fold
    
    X_dev = X_train.loc[dev_index, :]
    y_dev = y_train[dev_index]
    
    X_val = X_train.loc[val_index, :]
    y_val = y_train[val_index]
    
    lgb_dataset_dev = lgb.Dataset(X_dev, label=y_dev, categorical_feature=categorical_features, free_raw_data=False)
    lgb_dataset_val = lgb.Dataset(X_val, label=y_val, categorical_feature=categorical_features, free_raw_data=False)
    
    model = lgb.train(
        lgb_params,
        lgb_dataset_dev,
        num_boost_round=num_rounds,
        valid_sets=[lgb_dataset_val],
        feval=pauc_80,
        callbacks=[early_stopping(stopping_rounds=es_rounds), log_evaluation(log_rounds)],
    )
    
    model.save_model(MODELS_OUTPUT_PATH / f"{cfg.lgb_model_name}_fold_{fold}.txt")
    lgb_best_num_rounds[f"fold_{fold}"] = model.best_iteration
    val_scores = dict(model.best_score["valid_0"])
    
    lgb_val_auc_scores[f"fold_{fold}"] = val_scores["auc"]
    lgb_val_pauc_scores[f"fold_{fold}"] = val_scores["pauc_80"]
    
    val_preds = model.predict(X_val, num_iteration=model.best_iteration)
    lgb_oof_predictions[val_index] = val_preds
    print("\n")

Running fold: 1




Training until validation scores don't improve for 200 rounds
[50]	valid_0's auc: 0.93506	valid_0's pauc_80: 0.151442
[100]	valid_0's auc: 0.937338	valid_0's pauc_80: 0.151539
[150]	valid_0's auc: 0.941223	valid_0's pauc_80: 0.153938
[200]	valid_0's auc: 0.945263	valid_0's pauc_80: 0.15787
[250]	valid_0's auc: 0.947002	valid_0's pauc_80: 0.159147
[300]	valid_0's auc: 0.9489	valid_0's pauc_80: 0.160842
[350]	valid_0's auc: 0.949703	valid_0's pauc_80: 0.161416
[400]	valid_0's auc: 0.950503	valid_0's pauc_80: 0.162055
[450]	valid_0's auc: 0.950738	valid_0's pauc_80: 0.162146
[500]	valid_0's auc: 0.951037	valid_0's pauc_80: 0.162375
[550]	valid_0's auc: 0.951913	valid_0's pauc_80: 0.163204
[600]	valid_0's auc: 0.951903	valid_0's pauc_80: 0.163168
[650]	valid_0's auc: 0.952175	valid_0's pauc_80: 0.163382
[700]	valid_0's auc: 0.952223	valid_0's pauc_80: 0.163357
[750]	valid_0's auc: 0.952246	valid_0's pauc_80: 0.163274
Early stopping, best iteration is:
[568]	valid_0's auc: 0.95248	valid_0's

In [22]:
lgb_oof_preds_df = pd.DataFrame({
    id_column: train_ids,
    group_column: groups,
    "fold": folds,
    target_column: y_train,
    f"oof_{cfg.lgb_model_name}": lgb_oof_predictions
})
lgb_oof_preds_df.to_csv(f"oof_preds_{cfg.lgb_model_name}.csv", index=False)
lgb_oof_preds_df.head()

Unnamed: 0,isic_id,patient_id,fold,final_target,oof_lgb_v1
0,ISIC_0015670,IP_1235828,4,0,4e-05
1,ISIC_0015845,IP_8170065,1,0,0.493837
2,ISIC_0015864,IP_6724798,5,0,0.000111
3,ISIC_0015902,IP_4111386,2,0,8.3e-05
4,ISIC_0024200,IP_8313778,1,0,0.000676


In [23]:
lgb_val_auc_scores

{'fold_1': 0.9524802412232688,
 'fold_2': 0.9471366230794189,
 'fold_3': 0.9424190544233817,
 'fold_4': 0.9623170183823988,
 'fold_5': 0.9500881067276934}

In [24]:
lgb_val_pauc_scores

{'fold_1': 0.16378409765484395,
 'fold_2': 0.16105856024492304,
 'fold_3': 0.15530701289139032,
 'fold_4': 0.16902790430669776,
 'fold_5': 0.1628304162278929}

In [25]:
lgb_cv_auc_oof = compute_auc(lgb_oof_preds_df[target_column], lgb_oof_preds_df[f"oof_{cfg.lgb_model_name}"])
lgb_cv_pauc_oof = compute_pauc(lgb_oof_preds_df[target_column], lgb_oof_preds_df[f"oof_{cfg.lgb_model_name}"], min_tpr=0.8)

lgb_cv_auc_avg = np.mean(list(lgb_val_auc_scores.values()))
lgb_cv_pauc_avg = np.mean(list(lgb_val_pauc_scores.values()))

In [26]:
print(f"CV AUC OOF: {lgb_cv_auc_oof}")
print(f"CV PAUC OOF: {lgb_cv_pauc_oof}")
print(f"CV AUC AVG: {lgb_cv_auc_avg}")
print(f"CV PAUC AVG: {lgb_cv_pauc_avg}")

CV AUC OOF: 0.9351820916647064
CV PAUC OOF: 0.14685486641840567
CV AUC AVG: 0.9508882087672322
CV PAUC AVG: 0.16240159826514958


In [27]:
cb_metadata = {
    "params": cb_params,
    "cb_best_num_rounds": cb_best_num_rounds,
    "cb_val_auc_scores": cb_val_auc_scores,
    "cb_val_pauc_scores": cb_val_pauc_scores,
    "cb_cv_auc_oof": cb_cv_auc_oof,
    "cb_cv_pauc_oof": cb_cv_pauc_oof,
    "cb_cv_auc_avg": cb_cv_auc_avg,
    "cb_cv_pauc_avg": cb_cv_pauc_avg
}

with open("cb_run_metadata.json", "w") as f:
    json.dump(cb_metadata, f)

In [28]:
lgb_metadata = {
    "params": lgb_params,
    "num_rounds": num_rounds,
    "es_rounds": es_rounds,
    "lgb_best_num_rounds": lgb_best_num_rounds,
    "lgb_val_auc_scores": lgb_val_auc_scores,
    "lgb_val_pauc_scores": lgb_val_pauc_scores,
    "lgb_cv_auc_oof": lgb_cv_auc_oof,
    "lgb_cv_pauc_oof": lgb_cv_pauc_oof,
    "lgb_cv_auc_avg": lgb_cv_auc_avg,
    "lgb_cv_pauc_avg": lgb_cv_pauc_avg
}

with open("lgb_run_metadata.json", "w") as f:
    json.dump(lgb_metadata, f)