In [1]:
import json
import joblib
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

import catboost as cb

from isic_helper import DotDict
from isic_helper import get_folds
from isic_helper import compute_auc, compute_pauc

pd.options.display.max_columns = 1000

In [2]:
cfg = DotDict()
cfg.seed = 2022

cfg.models_output_dir = "models"
cfg.model_name = "cb_v1"

In [3]:
INPUT_PATH = Path("../input/isic-2024-challenge/")
MODELS_OUTPUT_PATH = Path(f"{cfg.models_output_dir}")
MODELS_OUTPUT_PATH.mkdir(exist_ok=True)

train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False)
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv")

folds_df = get_folds()
train_metadata = train_metadata.merge(folds_df, on=["isic_id", "patient_id"], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {train_metadata.shape}")

Train data size: (401059, 57)
Test data size: (401059, 57)


In [4]:
train_metadata["target"].value_counts(normalize=True)

target
0    0.99902
1    0.00098
Name: proportion, dtype: float64

In [5]:
def feature_engineering(df):
    df["lesion_size_ratio"]              = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["hue_contrast"]                   = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["normalized_lesion_size"]         = df["clin_size_long_diam_mm"] / df["age_approx"]
    df["overall_color_difference"]       = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    
    patient_num_images = df.groupby("patient_id", as_index=False)["isic_id"].count().rename(columns={"isic_id": "num_images"})
    df = df.merge(patient_num_images, on="patient_id", how="left")

    new_num_cols = [
        "num_images",
        "lesion_size_ratio",
        "hue_contrast",
        "normalized_lesion_size", 
        "overall_color_difference"
    ]
    
    new_cat_cols = []
    
    return df, new_num_cols, new_cat_cols

train_metadata, new_num_cols, new_cat_cols = feature_engineering(train_metadata.copy())
test_metadata, new_num_cols, new_cat_cols = feature_engineering(test_metadata.copy())

In [6]:
id_column = "isic_id"
target_column = "final_target"
group_column = "patient_id"
drop_features = ["image_type"] + train_metadata.columns[~np.in1d(train_metadata.columns, test_metadata.columns)].tolist() # target column removed
categorical_features = ["sex", "anatom_site_general", 
                        "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple",
                        "attribution", "copyright_license"
                       ] + new_cat_cols
numerical_features = train_metadata.columns[~np.in1d(train_metadata.columns, 
                                                     [id_column] + [group_column] + categorical_features + drop_features)]

In [7]:
mixed_encoded_preprocessor = ColumnTransformer(
    [
        ("numerical", "passthrough", numerical_features),
        (
            "categorical",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2, encoded_missing_value=-1, dtype=int),
            categorical_features,
        ),
    ],
    verbose_feature_names_out=False,

)
mixed_encoded_preprocessor.set_output(transform="pandas")

In [8]:
with open(f"{cfg.model_name}_encoder.joblib", "wb") as f:
    joblib.dump(mixed_encoded_preprocessor, f)

In [9]:
train_ids = train_metadata[id_column]
groups = train_metadata[group_column]
folds = train_metadata["fold"]

enc = mixed_encoded_preprocessor.fit(train_metadata)
X_train = enc.transform(train_metadata)
y_train = train_metadata[target_column]

In [10]:
columns_for_model = len(X_train.columns)
print(f"Total number of columns: {columns_for_model}")

Total number of columns: 46


In [11]:
class PAUC:
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, weight):
        y_true = target.astype(int)
        y_pred = approxes[0].astype(float)
        
        score = compute_pauc(y_true, y_pred, min_tpr=0.8)
        
        return score, 1.0

In [12]:
params = {
    "objective": "Logloss",
#     "boosting_type": "Ordered",
    "iterations": 3000,
    "learning_rate": 0.05,
    "max_depth": 8,
    "l2_leaf_reg": 5,
    "verbose": 50,
    "custom_metric": "AUC",
    "random_state": cfg.seed,
}
es_rounds = 200

In [13]:
best_num_rounds = {}
val_auc_scores = {}
val_pauc_scores = {}
all_folds = np.sort(folds.unique())
oof_predictions = np.zeros(X_train.shape[0])
for fold in all_folds:
    print(f"Running fold: {fold}")
    dev_index = folds != fold
    val_index = folds == fold
    
    X_dev = X_train.loc[dev_index, :]
    y_dev = y_train[dev_index]
    
    X_val = X_train.loc[val_index, :]
    y_val = y_train[val_index]
    
    model = cb.CatBoostClassifier(**params, eval_metric=PAUC())
    
    cb_dataset_dev = cb.Pool(X_dev, y_dev, cat_features=categorical_features)
    cb_dataset_val = cb.Pool(X_val, y_val, cat_features=categorical_features)
    
    model.fit(cb_dataset_dev, eval_set=cb_dataset_val, 
              early_stopping_rounds=es_rounds, 
              metric_period=params["verbose"], use_best_model=True,
              init_model=None)
    model.save_model(MODELS_OUTPUT_PATH / f"{cfg.model_name}_fold_{fold}.txt")
    best_num_rounds[f"fold_{fold}"] = model.best_iteration_
    
    val_preds = model.predict_proba(X_val)[:, -1]
    val_pauc_scores[f"fold_{fold}"] = compute_pauc(y_val, val_preds)
    val_auc_scores[f"fold_{fold}"] = compute_auc(y_val, val_preds)
    oof_predictions[val_index] = val_preds
    print("\n")

Running fold: 1


Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'compute_pauc':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../tmp/ipykernel_18/1392371515.py", line 12:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.0773160	test: 0.0667397	best: 0.0667397 (0)	total: 1.83s	remaining: 1h 31m 26s
50:	learn: 0.1617511	test: 0.1427070	best: 0.1427920 (49)	total: 29.6s	remaining: 28m 33s
100:	learn: 0.1695377	test: 0.1510287	best: 0.1510287 (100)	total: 56.3s	remaining: 26m 56s
150:	learn: 0.1749511	test: 0.1523460	best: 0.1531773 (138)	total: 1m 23s	remaining: 26m 13s
200:	learn: 0.1792676	test: 0.1558899	best: 0.1562087 (199)	total: 1m 49s	remaining: 25m 31s
250:	learn: 0.1824340	test: 0.1556460	best: 0.1564580 (215)	total: 2m 15s	remaining: 24m 47s
300:	learn: 0.1845519	test: 0.1561612	best: 0.1564580 (215)	total: 2m 42s	remaining: 24m 13s
350:	learn: 0.1863370	test: 0.1563331	best: 0.1566920 (318)	total: 3m 8s	remaining: 23m 43s
400:	learn: 0.1881107	test: 0.1569603	best: 0.1575768 (377)	total: 3m 35s	remaining: 23m 13s
450:	learn: 0.1892066	test: 0.1562203	best: 0.1575768 (377)	total: 4m 1s	remaining: 22m 43s
500:	learn: 0.1901295	test: 0.1560792	best: 0.1575768 (377)	total: 4m 27s	rema

Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'compute_pauc':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../tmp/ipykernel_18/1392371515.py", line 12:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.0797063	test: 0.0693681	best: 0.0693681 (0)	total: 682ms	remaining: 34m 3s
50:	learn: 0.1645189	test: 0.1392392	best: 0.1392392 (50)	total: 28.8s	remaining: 27m 43s
100:	learn: 0.1704676	test: 0.1454901	best: 0.1454901 (100)	total: 54.4s	remaining: 26m
150:	learn: 0.1750580	test: 0.1473624	best: 0.1481590 (143)	total: 1m 17s	remaining: 24m 25s
200:	learn: 0.1797583	test: 0.1500709	best: 0.1501549 (194)	total: 1m 42s	remaining: 23m 49s
250:	learn: 0.1833866	test: 0.1539478	best: 0.1539478 (250)	total: 2m 7s	remaining: 23m 15s
300:	learn: 0.1848204	test: 0.1555990	best: 0.1555990 (300)	total: 2m 33s	remaining: 22m 53s
350:	learn: 0.1865480	test: 0.1566863	best: 0.1567282 (344)	total: 2m 59s	remaining: 22m 31s
400:	learn: 0.1873926	test: 0.1577563	best: 0.1579346 (387)	total: 3m 25s	remaining: 22m 10s
450:	learn: 0.1884949	test: 0.1591272	best: 0.1591272 (450)	total: 3m 52s	remaining: 21m 53s
500:	learn: 0.1896205	test: 0.1601734	best: 0.1603777 (489)	total: 4m 19s	remaining: 

Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'compute_pauc':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../tmp/ipykernel_18/1392371515.py", line 12:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.0837277	test: 0.0550993	best: 0.0550993 (0)	total: 737ms	remaining: 36m 49s
50:	learn: 0.1692352	test: 0.1542168	best: 0.1542168 (50)	total: 28.9s	remaining: 27m 49s
100:	learn: 0.1784490	test: 0.1569074	best: 0.1573890 (93)	total: 56.5s	remaining: 27m
150:	learn: 0.1817130	test: 0.1589482	best: 0.1591586 (141)	total: 1m 24s	remaining: 26m 25s
200:	learn: 0.1846809	test: 0.1603619	best: 0.1604668 (192)	total: 1m 49s	remaining: 25m 29s
250:	learn: 0.1868858	test: 0.1618618	best: 0.1621593 (249)	total: 2m 15s	remaining: 24m 42s
300:	learn: 0.1885287	test: 0.1621100	best: 0.1623176 (263)	total: 2m 40s	remaining: 24m 3s
350:	learn: 0.1901712	test: 0.1617996	best: 0.1623176 (263)	total: 3m 7s	remaining: 23m 31s
400:	learn: 0.1912355	test: 0.1613296	best: 0.1623176 (263)	total: 3m 33s	remaining: 23m 5s
450:	learn: 0.1922869	test: 0.1608894	best: 0.1623176 (263)	total: 4m	remaining: 22m 37s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.1623176424
bestIterati

Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'compute_pauc':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../tmp/ipykernel_18/1392371515.py", line 12:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.0868856	test: 0.0594133	best: 0.0594133 (0)	total: 714ms	remaining: 35m 40s
50:	learn: 0.1629072	test: 0.1484561	best: 0.1484561 (50)	total: 28.2s	remaining: 27m 12s
100:	learn: 0.1739177	test: 0.1592293	best: 0.1593289 (99)	total: 55.1s	remaining: 26m 20s
150:	learn: 0.1805194	test: 0.1615364	best: 0.1619004 (148)	total: 1m 22s	remaining: 25m 51s
200:	learn: 0.1841056	test: 0.1628301	best: 0.1628301 (200)	total: 1m 49s	remaining: 25m 19s
250:	learn: 0.1863896	test: 0.1641922	best: 0.1645619 (243)	total: 2m 16s	remaining: 24m 55s
300:	learn: 0.1878141	test: 0.1646323	best: 0.1646340 (299)	total: 2m 41s	remaining: 24m 12s
350:	learn: 0.1890722	test: 0.1657706	best: 0.1658341 (345)	total: 3m 7s	remaining: 23m 35s
400:	learn: 0.1903008	test: 0.1659765	best: 0.1662885 (368)	total: 3m 34s	remaining: 23m 7s
450:	learn: 0.1911273	test: 0.1658987	best: 0.1662885 (368)	total: 4m	remaining: 22m 41s
500:	learn: 0.1921420	test: 0.1650067	best: 0.1662885 (368)	total: 4m 26s	remaining: 2

Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'compute_pauc':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../tmp/ipykernel_18/1392371515.py", line 12:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.0750779	test: 0.1049415	best: 0.1049415 (0)	total: 708ms	remaining: 35m 24s
50:	learn: 0.1660872	test: 0.1446424	best: 0.1446424 (50)	total: 28.3s	remaining: 27m 17s
100:	learn: 0.1751420	test: 0.1528695	best: 0.1528695 (100)	total: 55.8s	remaining: 26m 40s
150:	learn: 0.1798026	test: 0.1589252	best: 0.1589252 (150)	total: 1m 23s	remaining: 26m 7s
200:	learn: 0.1824342	test: 0.1584612	best: 0.1597190 (179)	total: 1m 49s	remaining: 25m 22s
250:	learn: 0.1846858	test: 0.1595538	best: 0.1597299 (249)	total: 2m 15s	remaining: 24m 38s
300:	learn: 0.1862431	test: 0.1617145	best: 0.1617948 (298)	total: 2m 39s	remaining: 23m 47s
350:	learn: 0.1871104	test: 0.1618818	best: 0.1622370 (329)	total: 3m 4s	remaining: 23m 12s
400:	learn: 0.1880379	test: 0.1611827	best: 0.1622370 (329)	total: 3m 31s	remaining: 22m 49s
450:	learn: 0.1891804	test: 0.1608976	best: 0.1622370 (329)	total: 3m 57s	remaining: 22m 24s
500:	learn: 0.1900282	test: 0.1595106	best: 0.1622370 (329)	total: 4m 23s	remaini

In [14]:
oof_preds_df = pd.DataFrame({
    id_column: train_ids,
    group_column: groups,
    "fold": folds,
    target_column: y_train,
    f"oof_{cfg.model_name}": oof_predictions
})
oof_preds_df.to_csv(f"oof_preds_{cfg.model_name}.csv", index=False)
oof_preds_df.head()

Unnamed: 0,isic_id,patient_id,fold,final_target,oof_cb_v1
0,ISIC_0015670,IP_1235828,4,0,0.000198
1,ISIC_0015845,IP_8170065,1,0,0.3925
2,ISIC_0015864,IP_6724798,5,0,2.3e-05
3,ISIC_0015902,IP_4111386,2,0,1.8e-05
4,ISIC_0024200,IP_8313778,1,0,0.000797


In [15]:
val_auc_scores

{'fold_1': 0.9442278150615123,
 'fold_2': 0.9548778232358236,
 'fold_3': 0.9505865395415009,
 'fold_4': 0.9604536633762214,
 'fold_5': 0.9512827552399264}

In [16]:
val_pauc_scores

{'fold_1': 0.1575768055646264,
 'fold_2': 0.16761408614928966,
 'fold_3': 0.16231764236063442,
 'fold_4': 0.16628848385768302,
 'fold_5': 0.16223697973913237}

In [17]:
cv_auc_oof = compute_auc(oof_preds_df[target_column], oof_preds_df[f"oof_{cfg.model_name}"])
cv_pauc_oof = compute_pauc(oof_preds_df[target_column], oof_preds_df[f"oof_{cfg.model_name}"], min_tpr=0.8)

cv_auc_avg = np.mean(list(val_auc_scores.values()))
cv_pauc_avg = np.mean(list(val_pauc_scores.values()))

print(f"CV AUC OOF: {cv_auc_oof}")
print(f"CV PAUC OOF: {cv_pauc_oof}")
print(f"CV AUC AVG: {cv_auc_avg}")
print(f"CV PAUC AVG: {cv_pauc_avg}")

CV AUC OOF: 0.9500921995411991
CV PAUC OOF: 0.16129688470731848
CV AUC AVG: 0.9522857192909969
CV PAUC AVG: 0.16320679953427314


In [18]:
params

{'objective': 'Logloss',
 'iterations': 3000,
 'learning_rate': 0.05,
 'max_depth': 8,
 'l2_leaf_reg': 5,
 'verbose': 50,
 'custom_metric': 'AUC',
 'random_state': 2022}

In [19]:
metadata = {
    "params": params,
    "best_num_rounds": best_num_rounds,
    "val_auc_scores": val_auc_scores,
    "val_pauc_scores": val_pauc_scores,
    "cv_auc_oof": cv_auc_oof,
    "cv_pauc_oof": cv_pauc_oof,
    "cv_auc_avg": cv_auc_avg,
    "cv_pauc_avg": cv_pauc_avg
}

with open("run_metadata.json", "w") as f:
    json.dump(metadata, f)