In [1]:
import json
import joblib

from pathlib import Path

import pandas as pd
import numpy as np

import catboost as cb

from isic_helper import DotDict, get_folds

In [2]:
# Run configuration. Both values are used further down to locate the saved
# per-fold model files.
cfg = DotDict()

# Sub-directory of the artifacts folder that holds the model files.
cfg.models_output_dir = "models"
# Prefix of the per-fold model file names ("<model_name>_fold_<k>.txt").
cfg.model_name = "cb_v1"

In [3]:
# Locations of the competition data and of the artifacts written by the training run.
INPUT_PATH = Path("../input/isic-2024-challenge/")
# Plain string literal: the previous f-string had no placeholders.
ARTIFACTS_INPUT_PATH = Path("../input/isic-scd-cb-lgb-train/")
MODELS_INPUT_PATH = ARTIFACTS_INPUT_PATH / cfg.models_output_dir

# low_memory=False makes pandas read each file in one pass instead of chunk-wise.
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False)
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv", low_memory=False)

# Join the output of get_folds() onto the training rows by (isic_id, patient_id).
# how="inner" keeps only rows present on both sides, so training rows without a
# match in folds_df are dropped.
folds_df = get_folds()
train_metadata = train_metadata.merge(folds_df, on=["isic_id", "patient_id"], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

Train data size: (401059, 57)
Test data size: (3, 44)


In [4]:
# Parameters and per-fold results recorded by the training run.
run_metadata_path = ARTIFACTS_INPUT_PATH / "cb_run_metadata.json"
run_metadata = json.loads(run_metadata_path.read_text())

In [5]:
# Display the loaded run metadata (model params, per-fold rounds and scores).
run_metadata

{'params': {'objective': 'Logloss',
  'random_state': 2022,
  'iterations': 2500,
  'learning_rate': 0.05,
  'cat_features': ['sex',
   'anatom_site_general',
   'tbp_tile_type',
   'tbp_lv_location',
   'tbp_lv_location_simple',
   'attribution',
   'copyright_license'],
  'max_depth': 8,
  'l2_leaf_reg': 5,
  'verbose': 50,
  'early_stopping_rounds': 100,
  'eval_metric': 'AUC',
  'task_type': 'GPU',
  'devices': '0'},
 'cb_best_num_rounds': {'fold_1': 414,
  'fold_2': 649,
  'fold_3': 408,
  'fold_4': 642,
  'fold_5': 231},
 'cb_val_auc_scores': {'fold_1': 0.9494343400001526,
  'fold_2': 0.9503380060195923,
  'fold_3': 0.955212414264679,
  'fold_4': 0.9614055156707764,
  'fold_5': 0.950451135635376},
 'cb_val_pauc_scores': {'fold_1': 0.16176580074155472,
  'fold_2': 0.16379234653749836,
  'fold_3': 0.16503569155507852,
  'fold_4': 0.16790786337372549,
  'fold_5': 0.16277633946384645},
 'cb_cv_auc_oof': 0.952448562456487,
 'cb_cv_pauc_oof': 0.1632807063262568,
 'cb_cv_auc_avg': 0.953

In [6]:
def feature_engineering(df):
    """Add derived lesion features and a per-patient image count.

    The caller's frame is never modified: every new column is written to a
    new DataFrame. Calling the function again on its own output gives the
    same columns (a stale ``num_images`` column is replaced, not duplicated).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``isic_id``, ``patient_id``, ``tbp_lv_minorAxisMM``,
        ``clin_size_long_diam_mm``, ``tbp_lv_H``, ``tbp_lv_Hext``,
        ``age_approx``, ``tbp_lv_deltaA``, ``tbp_lv_deltaB`` and
        ``tbp_lv_deltaL``.

    Returns
    -------
    tuple
        ``(df, new_num_cols, new_cat_cols)``: the enriched frame (its index is
        reset by the merge), the list of new numeric feature names, and the
        (empty) list of new categorical feature names.
    """
    # A previous call leaves "num_images" behind; drop it so the merge below
    # does not produce "num_images_x" / "num_images_y" when the cell is re-run.
    # drop() returns a new frame, so the input is not touched.
    df = df.drop(columns=["num_images"], errors="ignore")

    # assign() returns a new frame instead of writing into the caller's one.
    df = df.assign(
        lesion_size_ratio=df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"],
        hue_contrast=(df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs(),
        normalized_lesion_size=df["clin_size_long_diam_mm"] / df["age_approx"],
        overall_color_difference=(
            df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]
        ) / 3,
    )

    # Number of rows (non-null isic_id values) per patient within this frame.
    patient_num_images = (
        df.groupby("patient_id", as_index=False)["isic_id"]
        .count()
        .rename(columns={"isic_id": "num_images"})
    )
    df = df.merge(patient_num_images, on="patient_id", how="left")

    # NOTE(review): "hue_contrast" is added to the frame but is not listed
    # here — confirm this matches the feature list used at training time.
    new_num_cols = [
        "num_images",
        "lesion_size_ratio",
        "normalized_lesion_size",
        "overall_color_difference",
    ]

    new_cat_cols = []

    return df, new_num_cols, new_cat_cols

# Only the enriched frame (first element of the returned tuple) is needed here;
# the returned column-name lists are ignored.
train_metadata = feature_engineering(train_metadata.copy())[0]
test_metadata = feature_engineering(test_metadata.copy())[0]

In [7]:
# Names of the identifier and prediction columns used for the submission.
id_column, target_column = "isic_id", "target"
# Fold label of every training row; its unique values drive the inference loop.
folds = train_metadata["fold"]

In [8]:
# SECURITY NOTE: joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts that come from a trusted source.
encoder_path = ARTIFACTS_INPUT_PATH / "encoder.joblib"
mixed_encoded_preprocessor = joblib.load(encoder_path)

In [9]:
# NOTE(review): the preprocessor loaded from encoder.joblib is fitted again here
# on the training metadata before use. If the saved object was already fitted,
# that state is discarded — confirm the refit reproduces the encoding the
# models were trained with.
enc = mixed_encoded_preprocessor.fit(train_metadata)
# Encode the test rows with the fitted preprocessor.
X_test = enc.transform(test_metadata)

In [10]:
# Number of columns in the encoded test matrix (sanity check of the encoder output).
columns_for_model = len(X_test.columns)
print(f"Total number of columns: {columns_for_model}")

Total number of columns: 41


In [11]:
# Predict the test set once per training fold; one output column per fold.
all_folds = np.sort(folds.unique())
test_predictions_df = pd.DataFrame({id_column: test_metadata[id_column]})
best_rounds = run_metadata["cb_best_num_rounds"]
for fold in all_folds:
    fold_key = f"fold_{fold}"
    model = cb.CatBoostClassifier(**run_metadata["params"])
    model.load_model(MODELS_INPUT_PATH / f"{cfg.model_name}_fold_{fold}.txt")
    # NOTE(review): CatBoost documents ntree_end as the index of the first tree
    # NOT used. Confirm cb_best_num_rounds stores a tree count rather than a
    # zero-based best iteration, otherwise the best tree is skipped.
    fold_proba = model.predict_proba(X_test, ntree_end=best_rounds[fold_key])
    # Keep the last probability column of each prediction row.
    test_predictions_df[fold_key] = fold_proba[:, -1]

In [12]:
# Preview the per-fold predictions.
test_predictions_df.head()

Unnamed: 0,isic_id,fold_1,fold_2,fold_3,fold_4,fold_5
0,ISIC_0015657,4e-05,8.1e-05,0.000129,5.9e-05,0.000102
1,ISIC_0015729,2.2e-05,2.1e-05,8e-06,1e-05,4.2e-05
2,ISIC_0015740,0.000136,0.000251,0.000364,9.4e-05,0.000285


In [13]:
# Final prediction = unweighted mean of the per-fold probabilities.
fold_columns = [f"fold_{fold}" for fold in all_folds]
test_predictions_df[target_column] = test_predictions_df[fold_columns].mean(axis=1)

In [14]:
# Preview the predictions frame including the fold-averaged target column.
test_predictions_df.head()

Unnamed: 0,isic_id,fold_1,fold_2,fold_3,fold_4,fold_5,target
0,ISIC_0015657,4e-05,8.1e-05,0.000129,5.9e-05,0.000102,8.2e-05
1,ISIC_0015729,2.2e-05,2.1e-05,8e-06,1e-05,4.2e-05,2.1e-05
2,ISIC_0015740,0.000136,0.000251,0.000364,9.4e-05,0.000285,0.000226


In [15]:
# Summary statistics of the averaged predictions.
test_predictions_df[target_column].describe()

count    3.000000
mean     0.000110
std      0.000105
min      0.000021
25%      0.000051
50%      0.000082
75%      0.000154
max      0.000226
Name: target, dtype: float64

In [16]:
# Preview the id and target columns of the predictions frame.
test_predictions_df[[id_column, target_column]].head(10)

Unnamed: 0,isic_id,target
0,ISIC_0015657,8.2e-05
1,ISIC_0015729,2.1e-05
2,ISIC_0015740,0.000226


In [17]:
# Write the submission file: one row per test image, id and averaged probability.
submission_columns = [id_column, target_column]
test_predictions_df[submission_columns].to_csv("submission.csv", index=False)