In [1]:
import json
import joblib
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

import catboost as cb

from isic_helper import DotDict
from isic_helper import get_folds
from isic_helper import compute_auc, compute_pauc

pd.options.display.max_columns = 1000

In [2]:
cfg = DotDict()
cfg.seed = 2022

cfg.models_output_dir = "models"
cfg.model_name = "cb_v1"

In [3]:
INPUT_PATH = Path("../input/isic-2024-challenge/")
MODELS_OUTPUT_PATH = Path(f"{cfg.models_output_dir}")
MODELS_OUTPUT_PATH.mkdir(exist_ok=True)

train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False)
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv")

folds_df = get_folds()
train_metadata = train_metadata.merge(folds_df, on=["isic_id", "patient_id"], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {train_metadata.shape}")

Train data size: (401059, 57)
Test data size: (401059, 57)


In [4]:
train_metadata["target"].value_counts(normalize=True)

target
0    0.99902
1    0.00098
Name: proportion, dtype: float64

In [5]:
def feature_engineering(df):
    df["lesion_size_ratio"]              = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["hue_contrast"]                   = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["normalized_lesion_size"]         = df["clin_size_long_diam_mm"] / df["age_approx"]
    df["overall_color_difference"]       = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    
    patient_num_images = df.groupby("patient_id", as_index=False)["isic_id"].count().rename(columns={"isic_id": "num_images"})
    df = df.merge(patient_num_images, on="patient_id", how="left")

    new_num_cols = [
        "num_images",
        "lesion_size_ratio",
        "hue_contrast",
        "normalized_lesion_size", 
        "overall_color_difference"
    ]
    
    new_cat_cols = []
    
    return df, new_num_cols, new_cat_cols

train_metadata, new_num_cols, new_cat_cols = feature_engineering(train_metadata.copy())
test_metadata, new_num_cols, new_cat_cols = feature_engineering(test_metadata.copy())

In [6]:
id_column = "isic_id"
target_column = "final_target"
group_column = "patient_id"
drop_features = ["image_type"] + train_metadata.columns[~np.in1d(train_metadata.columns, test_metadata.columns)].tolist() # target column removed
categorical_features = ["sex", "anatom_site_general", 
                        "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple",
                        "attribution", "copyright_license"
                       ] + new_cat_cols
numerical_features = train_metadata.columns[~np.in1d(train_metadata.columns, 
                                                     [id_column] + [group_column] + categorical_features + drop_features)]

In [7]:
mixed_encoded_preprocessor = ColumnTransformer(
    [
        ("numerical", "passthrough", numerical_features),
        (
            "categorical",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2, encoded_missing_value=-1, dtype=int),
            categorical_features,
        ),
    ],
    verbose_feature_names_out=False,

)
mixed_encoded_preprocessor.set_output(transform="pandas")

In [8]:
with open(f"{cfg.model_name}_encoder.joblib", "wb") as f:
    joblib.dump(mixed_encoded_preprocessor, f)

In [9]:
train_ids = train_metadata[id_column]
groups = train_metadata[group_column]
folds = train_metadata["fold"]

enc = mixed_encoded_preprocessor.fit(train_metadata)
X_train = enc.transform(train_metadata)
y_train = train_metadata[target_column]

In [10]:
columns_for_model = len(X_train.columns)
print(f"Total number of columns: {columns_for_model}")

Total number of columns: 46


In [11]:
class PAUC:
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, weight):
        y_true = target.astype(int)
        y_pred = approxes[0].astype(float)
        
        score = compute_pauc(y_true, y_pred, min_tpr=0.8)
        
        return score, 1.0

In [12]:
params = {
    "objective": "Logloss",
#     "boosting_type": "Ordered",
    "iterations": 3000,
    "learning_rate": 0.05,
    "max_depth": 8,
    "l2_leaf_reg": 5,
    "verbose": 50,
    "custom_metric": "AUC",
    "task_type": "GPU",
    "devices": "0",
    "random_state": cfg.seed,
}
es_rounds = 200

In [13]:
best_num_rounds = {}
val_auc_scores = {}
val_pauc_scores = {}
all_folds = np.sort(folds.unique())
oof_predictions = np.zeros(X_train.shape[0])
for fold in all_folds:
    print(f"Running fold: {fold}")
    dev_index = folds != fold
    val_index = folds == fold
    
    X_dev = X_train.loc[dev_index, :]
    y_dev = y_train[dev_index]
    
    X_val = X_train.loc[val_index, :]
    y_val = y_train[val_index]
    
    model = cb.CatBoostClassifier(**params, eval_metric=PAUC())
    
    cb_dataset_dev = cb.Pool(X_dev, y_dev, cat_features=categorical_features)
    cb_dataset_val = cb.Pool(X_val, y_val, cat_features=categorical_features)
    
    model.fit(cb_dataset_dev, eval_set=cb_dataset_val, 
              early_stopping_rounds=es_rounds, 
              metric_period=params["verbose"], use_best_model=True,
              init_model=None)
    model.save_model(MODELS_OUTPUT_PATH / f"{cfg.model_name}_fold_{fold}.txt")
    best_num_rounds[f"fold_{fold}"] = model.best_iteration_
    
    val_preds = model.predict_proba(X_val)[:, -1]
    val_pauc_scores[f"fold_{fold}"] = compute_pauc(y_val, val_preds)
    val_auc_scores[f"fold_{fold}"] = compute_auc(y_val, val_preds)
    oof_predictions[val_index] = val_preds
    print("\n")

Running fold: 1


Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'compute_pauc':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../tmp/ipykernel_25/1392371515.py", line 12:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.0734167	test: 0.1208176	best: 0.1208176 (0)	total: 7.89s	remaining: 6h 34m 15s
50:	learn: 0.1528272	test: 0.1395086	best: 0.1408190 (44)	total: 11.1s	remaining: 10m 44s
100:	learn: 0.1653599	test: 0.1494783	best: 0.1495240 (99)	total: 14.6s	remaining: 6m 59s
150:	learn: 0.1741007	test: 0.1536138	best: 0.1538425 (141)	total: 18s	remaining: 5m 40s
200:	learn: 0.1795383	test: 0.1569665	best: 0.1569665 (200)	total: 21.5s	remaining: 4m 59s
250:	learn: 0.1827266	test: 0.1577486	best: 0.1581364 (239)	total: 24.9s	remaining: 4m 32s
300:	learn: 0.1852169	test: 0.1580744	best: 0.1581725 (296)	total: 28.4s	remaining: 4m 14s
350:	learn: 0.1875383	test: 0.1598226	best: 0.1598226 (350)	total: 31.8s	remaining: 3m 59s
400:	learn: 0.1895519	test: 0.1604126	best: 0.1604522 (390)	total: 35.3s	remaining: 3m 48s
450:	learn: 0.1912313	test: 0.1599136	best: 0.1609848 (421)	total: 38.7s	remaining: 3m 38s
500:	learn: 0.1926902	test: 0.1596439	best: 0.1609848 (421)	total: 42.1s	remaining: 3m 30s
550

Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'compute_pauc':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../tmp/ipykernel_25/1392371515.py", line 12:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.0924686	test: 0.0662902	best: 0.0662902 (0)	total: 157ms	remaining: 7m 49s
50:	learn: 0.1600662	test: 0.1437933	best: 0.1437933 (50)	total: 3.42s	remaining: 3m 18s
100:	learn: 0.1706627	test: 0.1490593	best: 0.1490726 (99)	total: 6.75s	remaining: 3m 13s
150:	learn: 0.1782556	test: 0.1518876	best: 0.1519599 (149)	total: 10.1s	remaining: 3m 10s
200:	learn: 0.1836477	test: 0.1539270	best: 0.1539825 (199)	total: 13.4s	remaining: 3m 7s
250:	learn: 0.1863838	test: 0.1558662	best: 0.1559655 (244)	total: 16.9s	remaining: 3m 5s
300:	learn: 0.1887643	test: 0.1567439	best: 0.1568633 (275)	total: 20.3s	remaining: 3m 1s
350:	learn: 0.1904455	test: 0.1586254	best: 0.1587156 (345)	total: 23.7s	remaining: 2m 58s
400:	learn: 0.1919135	test: 0.1598671	best: 0.1602008 (385)	total: 27s	remaining: 2m 55s
450:	learn: 0.1929985	test: 0.1608648	best: 0.1608957 (448)	total: 30.4s	remaining: 2m 51s
500:	learn: 0.1940823	test: 0.1614353	best: 0.1618550 (477)	total: 33.8s	remaining: 2m 48s
550:	learn:

Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'compute_pauc':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../tmp/ipykernel_25/1392371515.py", line 12:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.0542149	test: 0.0347709	best: 0.0347709 (0)	total: 147ms	remaining: 7m 20s
50:	learn: 0.1543162	test: 0.1346183	best: 0.1346183 (50)	total: 3.33s	remaining: 3m 12s
100:	learn: 0.1688751	test: 0.1490477	best: 0.1490477 (100)	total: 6.65s	remaining: 3m 10s
150:	learn: 0.1762507	test: 0.1540508	best: 0.1540677 (149)	total: 9.98s	remaining: 3m 8s
200:	learn: 0.1808474	test: 0.1581822	best: 0.1582325 (198)	total: 13.4s	remaining: 3m 6s
250:	learn: 0.1841179	test: 0.1586365	best: 0.1588045 (234)	total: 16.8s	remaining: 3m 3s
300:	learn: 0.1866814	test: 0.1603273	best: 0.1603326 (297)	total: 20.1s	remaining: 3m
350:	learn: 0.1885581	test: 0.1607321	best: 0.1607607 (345)	total: 23.6s	remaining: 2m 58s
400:	learn: 0.1903155	test: 0.1615002	best: 0.1618696 (391)	total: 27s	remaining: 2m 55s
450:	learn: 0.1916799	test: 0.1628145	best: 0.1629679 (435)	total: 30.3s	remaining: 2m 51s
500:	learn: 0.1928008	test: 0.1633753	best: 0.1637781 (478)	total: 33.7s	remaining: 2m 48s
550:	learn: 0.

Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'compute_pauc':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../tmp/ipykernel_25/1392371515.py", line 12:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.0669240	test: 0.0702879	best: 0.0702879 (0)	total: 159ms	remaining: 7m 55s
50:	learn: 0.1516753	test: 0.1388150	best: 0.1390192 (49)	total: 3.4s	remaining: 3m 16s
100:	learn: 0.1642901	test: 0.1470771	best: 0.1470771 (100)	total: 6.85s	remaining: 3m 16s
150:	learn: 0.1723209	test: 0.1510523	best: 0.1510857 (148)	total: 10.2s	remaining: 3m 11s
200:	learn: 0.1782513	test: 0.1542276	best: 0.1542276 (200)	total: 13.5s	remaining: 3m 7s
250:	learn: 0.1828457	test: 0.1577298	best: 0.1577298 (250)	total: 16.9s	remaining: 3m 4s
300:	learn: 0.1857219	test: 0.1590968	best: 0.1592093 (291)	total: 20.2s	remaining: 3m 1s
350:	learn: 0.1881225	test: 0.1595678	best: 0.1599387 (340)	total: 23.5s	remaining: 2m 57s
400:	learn: 0.1903176	test: 0.1595802	best: 0.1599387 (340)	total: 26.9s	remaining: 2m 54s
450:	learn: 0.1918781	test: 0.1594982	best: 0.1599387 (340)	total: 30.2s	remaining: 2m 50s
500:	learn: 0.1932275	test: 0.1610968	best: 0.1610968 (500)	total: 33.5s	remaining: 2m 47s
550:	lear

Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'compute_pauc':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../tmp/ipykernel_25/1392371515.py", line 12:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.0809995	test: 0.0891250	best: 0.0891250 (0)	total: 157ms	remaining: 7m 50s
50:	learn: 0.1518143	test: 0.1268442	best: 0.1268442 (50)	total: 3.37s	remaining: 3m 14s
100:	learn: 0.1669723	test: 0.1458558	best: 0.1459123 (94)	total: 6.73s	remaining: 3m 13s
150:	learn: 0.1759233	test: 0.1532550	best: 0.1535574 (146)	total: 10.1s	remaining: 3m 10s
200:	learn: 0.1817249	test: 0.1552741	best: 0.1557196 (194)	total: 13.6s	remaining: 3m 9s
250:	learn: 0.1857053	test: 0.1557932	best: 0.1562424 (220)	total: 16.9s	remaining: 3m 5s
300:	learn: 0.1880176	test: 0.1584211	best: 0.1586245 (281)	total: 20.3s	remaining: 3m 1s
350:	learn: 0.1896343	test: 0.1592736	best: 0.1593599 (342)	total: 23.6s	remaining: 2m 57s
400:	learn: 0.1910052	test: 0.1595454	best: 0.1595454 (400)	total: 26.9s	remaining: 2m 54s
450:	learn: 0.1923063	test: 0.1593933	best: 0.1595454 (400)	total: 30.2s	remaining: 2m 50s
500:	learn: 0.1933018	test: 0.1571572	best: 0.1595454 (400)	total: 33.5s	remaining: 2m 47s
550:	lear

In [14]:
oof_preds_df = pd.DataFrame({
    id_column: train_ids,
    group_column: groups,
    "fold": folds,
    target_column: y_train,
    f"oof_{cfg.model_name}": oof_predictions
})
oof_preds_df.to_csv(f"oof_preds_{cfg.model_name}.csv", index=False)
oof_preds_df.head()

Unnamed: 0,isic_id,patient_id,fold,final_target,oof_cb_v1
0,ISIC_0015670,IP_1235828,4,0,3.1e-05
1,ISIC_0015845,IP_8170065,1,0,0.401362
2,ISIC_0015864,IP_6724798,5,0,5.1e-05
3,ISIC_0015902,IP_4111386,2,0,3.3e-05
4,ISIC_0024200,IP_8313778,1,0,0.000363


In [15]:
val_auc_scores

{'fold_1': 0.9509077717395972,
 'fold_2': 0.9523324073203026,
 'fold_3': 0.9505116621532241,
 'fold_4': 0.9524636220688623,
 'fold_5': 0.9490689677129721}

In [16]:
val_pauc_scores

{'fold_1': 0.1621604495807423,
 'fold_2': 0.16410595585727236,
 'fold_3': 0.16377806342114784,
 'fold_4': 0.16413716289575817,
 'fold_5': 0.1595454288015405}

In [17]:
cv_auc_oof = compute_auc(oof_preds_df[target_column], oof_preds_df[f"oof_{cfg.model_name}"])
cv_pauc_oof = compute_pauc(oof_preds_df[target_column], oof_preds_df[f"oof_{cfg.model_name}"], min_tpr=0.8)

cv_auc_avg = np.mean(list(val_auc_scores.values()))
cv_pauc_avg = np.mean(list(val_pauc_scores.values()))

print(f"CV AUC OOF: {cv_auc_oof}")
print(f"CV PAUC OOF: {cv_pauc_oof}")
print(f"CV AUC AVG: {cv_auc_avg}")
print(f"CV PAUC AVG: {cv_pauc_avg}")

CV AUC OOF: 0.9509141643032035
CV PAUC OOF: 0.16259879590558052
CV AUC AVG: 0.9510568861989916
CV PAUC AVG: 0.16274541211129226


In [18]:
params

{'objective': 'Logloss',
 'iterations': 3000,
 'learning_rate': 0.05,
 'max_depth': 8,
 'l2_leaf_reg': 5,
 'verbose': 50,
 'custom_metric': 'AUC',
 'task_type': 'GPU',
 'devices': '0',
 'random_state': 2022}

In [19]:
metadata = {
    "params": params,
    "best_num_rounds": best_num_rounds,
    "val_auc_scores": val_auc_scores,
    "val_pauc_scores": val_pauc_scores,
    "cv_auc_oof": cv_auc_oof,
    "cv_pauc_oof": cv_pauc_oof,
    "cv_auc_avg": cv_auc_avg,
    "cv_pauc_avg": cv_pauc_avg
}

with open("run_metadata.json", "w") as f:
    json.dump(metadata, f)