In [1]:
import json
import joblib

from pathlib import Path

import pandas as pd
import numpy as np

import catboost as cb

from isic_helper import DotDict, get_folds

In [2]:
# Run configuration; both values are used below to build artifact paths.
cfg = DotDict()

cfg.models_output_dir = "models"  # sub-directory joined onto ARTIFACTS_INPUT_PATH
cfg.model_name = "cb_v1"  # prefix of the encoder and per-fold model file names

In [3]:
# Locations of the competition data and of the artifacts to load.
INPUT_PATH = Path("../input/isic-2024-challenge/")
ARTIFACTS_INPUT_PATH = Path("../input/isic-scd-cb-train/")
MODELS_INPUT_PATH = ARTIFACTS_INPUT_PATH / cfg.models_output_dir

# low_memory=False makes pandas read each file in one pass instead of chunks.
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False)
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv", low_memory=False)

# Inner join: train rows without a match in folds_df are dropped.
# presumably folds_df supplies the `fold` column read later — verify in get_folds
folds_df = get_folds()
train_metadata = train_metadata.merge(folds_df, on=["isic_id", "patient_id"], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

Train data size: (401059, 57)
Test data size: (3, 44)


In [4]:
# Read the JSON metadata file stored alongside the artifacts.
run_metadata_path = ARTIFACTS_INPUT_PATH / "run_metadata.json"
run_metadata = json.loads(run_metadata_path.read_text())

In [5]:
def feature_engineering(df):
    """Add derived numeric features and a per-patient image count.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``isic_id``, ``patient_id``, ``tbp_lv_minorAxisMM``,
        ``clin_size_long_diam_mm``, ``tbp_lv_H``, ``tbp_lv_Hext``,
        ``age_approx``, ``tbp_lv_deltaA``, ``tbp_lv_deltaB`` and
        ``tbp_lv_deltaL``.

    Returns
    -------
    tuple
        ``(df, new_num_cols, new_cat_cols)``: the frame with the new columns
        appended, the names of the added numeric columns, and the names of
        the added categorical columns (currently an empty list).
    """
    # `assign` returns a new frame, so the caller's object is left untouched.
    # NOTE(review): denominators are not guarded against zero/missing values;
    # presumably this must match the features used at training time — verify.
    df = df.assign(
        lesion_size_ratio=df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"],
        hue_contrast=(df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs(),
        normalized_lesion_size=df["clin_size_long_diam_mm"] / df["age_approx"],
        overall_color_difference=(
            df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]
        ) / 3,
    )

    # Number of images per patient, counted within the frame passed in.
    patient_num_images = (
        df.groupby("patient_id", as_index=False)["isic_id"]
        .count()
        .rename(columns={"isic_id": "num_images"})
    )
    df = df.merge(patient_num_images, on="patient_id", how="left")

    new_num_cols = [
        "num_images",
        "lesion_size_ratio",
        "hue_contrast",
        "normalized_lesion_size",
        "overall_color_difference",
    ]

    new_cat_cols = []

    return df, new_num_cols, new_cat_cols

# Apply the same feature engineering to the train and test frames. Each call
# receives a copy, and the names are rebound to the returned frames.
# new_num_cols / new_cat_cols are assigned by both calls, so they end up
# holding the values returned by the second (test) call.
train_metadata, new_num_cols, new_cat_cols = feature_engineering(train_metadata.copy())
test_metadata, new_num_cols, new_cat_cols = feature_engineering(test_metadata.copy())

In [6]:
id_column = "isic_id"  # identifier column carried into the prediction frame
target_column = "target"  # name given to the fold-averaged prediction column
folds = train_metadata["fold"]  # per-row fold labels; unique values select the models to load

In [7]:
# joblib files are pickle-based, so loading one can execute arbitrary code:
# only load artifacts that come from a trusted source.
encoder_path = ARTIFACTS_INPUT_PATH / f"{cfg.model_name}_encoder.joblib"
mixed_encoded_preprocessor = joblib.load(encoder_path)

In [8]:
# NOTE(review): the preprocessor loaded from disk is fitted again here on the
# train metadata before the test set is transformed. If the saved artifact was
# already fitted, this refit presumably replaces that state — confirm this is
# intended and reproduces the encoding the fold models were trained with.
enc = mixed_encoded_preprocessor.fit(train_metadata)
X_test = enc.transform(test_metadata)

In [9]:
# Width of the transformed test matrix, i.e. the number of model input columns.
columns_for_model = X_test.shape[1]
print(f"Total number of columns: {columns_for_model}")

Total number of columns: 46


In [10]:
# One prediction column per fold model, keyed by the test image ids.
all_folds = np.sort(folds.unique())
test_predictions_df = pd.DataFrame({id_column: test_metadata[id_column]})
for fold in all_folds:
    model_file = MODELS_INPUT_PATH / f"{cfg.model_name}_fold_{fold}.txt"
    model = cb.CatBoostClassifier(use_best_model=True)
    model.load_model(model_file)
    # Keep the last column of predict_proba as this fold's score.
    fold_probabilities = model.predict_proba(X_test)
    test_predictions_df[f"fold_{fold}"] = fold_probabilities[:, -1]

In [11]:
# Preview the per-fold predictions for the first rows.
test_predictions_df.head()

Unnamed: 0,isic_id,fold_1,fold_2,fold_3,fold_4,fold_5
0,ISIC_0015657,0.000223,9.7e-05,0.000119,0.000137,0.000323
1,ISIC_0015729,4.3e-05,8e-06,4.2e-05,3.5e-05,3.7e-05
2,ISIC_0015740,0.000297,0.000688,0.00023,0.000972,0.000385


In [12]:
# Final score = unweighted mean of the per-fold predictions for each row.
fold_columns = [f"fold_{fold}" for fold in all_folds]
test_predictions_df[target_column] = test_predictions_df[fold_columns].mean(axis=1)

In [13]:
# Preview again, now including the fold-averaged target column.
test_predictions_df.head()

Unnamed: 0,isic_id,fold_1,fold_2,fold_3,fold_4,fold_5,target
0,ISIC_0015657,0.000223,9.7e-05,0.000119,0.000137,0.000323,0.00018
1,ISIC_0015729,4.3e-05,8e-06,4.2e-05,3.5e-05,3.7e-05,3.3e-05
2,ISIC_0015740,0.000297,0.000688,0.00023,0.000972,0.000385,0.000514


In [14]:
# Summary statistics of the averaged predictions.
test_predictions_df[target_column].describe()

count    3.000000
mean     0.000242
std      0.000247
min      0.000033
25%      0.000106
50%      0.000180
75%      0.000347
max      0.000514
Name: target, dtype: float64

In [15]:
# Show the two columns that are written to the submission file (up to 10 rows).
test_predictions_df[[id_column, target_column]].head(10)

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.00018
1,ISIC_0015729,3.3e-05
2,ISIC_0015740,0.000514


In [16]:
# Write only the id and averaged-target columns, without the index.
submission_df = test_predictions_df[[id_column, target_column]]
submission_df.to_csv("submission.csv", index=False)