In [1]:
import json
import joblib
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

import lightgbm as lgb
from lightgbm.callback import log_evaluation, early_stopping

from isic_helper import DotDict
from isic_helper import get_folds
from isic_helper import compute_auc, compute_pauc

pd.options.display.max_columns = 1000

In [2]:
# Run-level configuration shared by the rest of the notebook.
cfg = DotDict()
cfg.seed = 2022  # reused later as LightGBM's bagging_seed / feature_fraction_seed

# Directory that receives the per-fold boosters, and the prefix used in artifact file names.
cfg.models_output_dir = "models"
cfg.model_name = "lgb_v1"

In [3]:
# Locations of the competition inputs and of the directory that receives the trained boosters.
INPUT_PATH = Path("../input/isic-2024-challenge/")
MODELS_OUTPUT_PATH = Path(cfg.models_output_dir)
# parents=True: a nested output directory (e.g. "artifacts/models") must not raise FileNotFoundError.
MODELS_OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# low_memory=False makes pandas infer each column's dtype from the whole file instead of per chunk.
train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False)
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv")

# Attach the fold assignment. The join is an inner join, so rows without a match in the folds
# table disappear and duplicated keys multiply rows; report any change instead of hiding it.
folds_df = get_folds()
n_train_rows_raw = len(train_metadata)
train_metadata = train_metadata.merge(folds_df, on=["isic_id", "patient_id"], how="inner")
if len(train_metadata) != n_train_rows_raw:
    print(f"WARNING: merging folds changed the train row count from {n_train_rows_raw} to {len(train_metadata)}")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

Train data size: (401059, 57)
Test data size: (3, 44)


In [4]:
# Relative frequency of each value in the raw `target` column (class balance check).
train_metadata["target"].value_counts(normalize=True)

target
0    0.99902
1    0.00098
Name: proportion, dtype: float64

In [5]:
def feature_engineering(df):
    """Add hand-crafted lesion features and a per-patient image count.

    The four ratio/contrast columns are written directly onto ``df`` (the
    caller's frame is modified). ``num_images`` is attached through a left
    merge, so the frame that is returned is a new object.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``isic_id``, ``patient_id``, ``age_approx``,
        ``clin_size_long_diam_mm``, ``tbp_lv_minorAxisMM``, ``tbp_lv_H``,
        ``tbp_lv_Hext``, ``tbp_lv_deltaA``, ``tbp_lv_deltaB`` and
        ``tbp_lv_deltaL``.

    Returns
    -------
    tuple
        ``(frame, new_num_cols, new_cat_cols)``: the enriched frame, the names
        of the numerical columns that were added, and the names of the
        categorical columns that were added (currently none).
    """
    long_diameter = df["clin_size_long_diam_mm"]

    # Simple shape / colour descriptors derived from the existing measurements.
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"].div(long_diameter)
    df["hue_contrast"] = df["tbp_lv_H"].sub(df["tbp_lv_Hext"]).abs()
    df["normalized_lesion_size"] = long_diameter.div(df["age_approx"])
    delta_total = df["tbp_lv_deltaA"].add(df["tbp_lv_deltaB"]).add(df["tbp_lv_deltaL"])
    df["overall_color_difference"] = delta_total.div(3)

    # Number of images (rows) each patient contributes within this frame.
    images_per_patient = (
        df.groupby("patient_id")["isic_id"]
        .count()
        .rename("num_images")
        .reset_index()
    )
    df = df.merge(images_per_patient, how="left", on="patient_id")

    new_num_cols = [
        "num_images",
        "lesion_size_ratio",
        "hue_contrast",
        "normalized_lesion_size",
        "overall_color_difference",
    ]
    new_cat_cols = []

    return df, new_num_cols, new_cat_cols

# Pass copies: feature_engineering writes its new columns onto the frame it receives.
# NOTE(review): this cell rebinds train_metadata / test_metadata, so running it a second time
# feeds already-engineered frames back in and the merge on `patient_id` then yields suffixed
# duplicates of `num_images`. Re-run from the data-loading cell instead.
train_metadata, new_num_cols, new_cat_cols = feature_engineering(train_metadata.copy())
# The second call rebinds new_num_cols / new_cat_cols; the function returns the same lists for any input.
test_metadata, new_num_cols, new_cat_cols = feature_engineering(test_metadata.copy())

In [6]:
# Roles of the non-feature columns.
id_column = "isic_id"
target_column = "final_target"
group_column = "patient_id"

# Any column that exists only in the train frame is unavailable at inference time and is dropped;
# this is also what removes the target column from the feature set.
# np.isin replaces np.in1d, which is deprecated since NumPy 2.0 (same result for 1-D input).
train_only_columns = train_metadata.columns[~np.isin(train_metadata.columns, test_metadata.columns)].tolist()
drop_features = ["image_type"] + train_only_columns

categorical_features = [
    "sex", "anatom_site_general",
    "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple",
    "attribution", "copyright_license",
] + new_cat_cols

# Everything that is not an identifier, a categorical feature or a dropped column is treated as numerical.
non_numerical_columns = [id_column, group_column] + categorical_features + drop_features
numerical_features = train_metadata.columns[~np.isin(train_metadata.columns, non_numerical_columns)]

In [7]:
# Categorical columns become integer codes: missing values map to -1 and
# categories that were not seen during fit map to -2.
categorical_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-2,
    encoded_missing_value=-1,
    dtype=int,
)

# Numerical columns are forwarded untouched; only the categorical ones are encoded.
preprocessing_steps = [
    ("numerical", "passthrough", numerical_features),
    ("categorical", categorical_encoder, categorical_features),
]

mixed_encoded_preprocessor = ColumnTransformer(preprocessing_steps, verbose_feature_names_out=False)
# Return DataFrames (with the original column names) from transform instead of bare arrays.
mixed_encoded_preprocessor.set_output(transform="pandas")

In [8]:
# Fit BEFORE persisting: dumping the transformer ahead of the first fit writes an unfitted
# object to disk, which cannot transform anything once it is loaded again.
# Fitting here is idempotent with respect to any later fit on the same training frame.
mixed_encoded_preprocessor.fit(train_metadata)
with open(f"{cfg.model_name}_encoder.joblib", "wb") as f:
    joblib.dump(mixed_encoded_preprocessor, f)

In [9]:
# Book-keeping columns that are carried next to the feature matrix (ids, patient groups, fold labels).
train_ids, groups, folds = (
    train_metadata[column] for column in (id_column, group_column, "fold")
)

# Learn the categorical encodings on the training frame, then build the model inputs.
enc = mixed_encoded_preprocessor.fit(train_metadata)
X_train = enc.transform(train_metadata)
y_train = train_metadata[target_column]

In [10]:
# Width of the feature matrix that is handed to LightGBM.
columns_for_model = X_train.shape[1]
print(f"Total number of columns: {columns_for_model}")

Total number of columns: 46


In [11]:
def pauc_80(preds, data):
    """Custom LightGBM evaluation function: partial AUC with ``min_tpr=0.8``.

    Parameters
    ----------
    preds : array-like
        Scores produced by the booster for the rows of ``data``.
    data : object exposing ``get_label()``
        The dataset being evaluated; its labels are the ground truth.

    Returns
    -------
    tuple
        ``(metric_name, metric_value, is_higher_better)``; the last element is
        always ``True``.
    """
    labels = data.get_label()
    partial_auc = compute_pauc(labels, preds, min_tpr=0.8)
    return "pauc_80", partial_auc, True

In [12]:
# LightGBM hyper-parameters; both sampling seeds come from the notebook-wide cfg.seed.
params = dict(
    objective="binary",
    metric="auc",
    num_leaves=63,
    learning_rate=0.01,
    bagging_freq=5,
    bagging_fraction=0.6,
    bagging_seed=cfg.seed,
    feature_fraction=0.3,
    feature_fraction_seed=cfg.seed,
    lambda_l1=0.95,
    lambda_l2=0.95,
    verbosity=-1,
)

# Boosting budget, early-stopping patience and logging period (all in boosting rounds).
num_rounds, es_rounds, log_rounds = 2500, 200, 50

In [13]:
# Cross-validated training: one LightGBM booster per fold value.
# Per-fold results are keyed as "fold_<n>"; oof_predictions receives, for every row,
# the prediction of the booster trained without that row's fold.
best_num_rounds = {}
val_auc_scores = {}
val_pauc_scores = {}
all_folds = np.sort(folds.unique())
oof_predictions = np.zeros(X_train.shape[0])
for fold in all_folds:
    print(f"Running fold: {fold}")
    # Boolean row masks: train on every other fold, validate on the current one.
    dev_index = folds != fold
    val_index = folds == fold
    
    X_dev = X_train.loc[dev_index, :]
    y_dev = y_train[dev_index]
    
    X_val = X_train.loc[val_index, :]
    y_val = y_train[val_index]
    
    # The ordinal-encoded columns are declared to LightGBM as categorical features.
    lgb_dataset_dev = lgb.Dataset(X_dev, label=y_dev, categorical_feature=categorical_features, free_raw_data=False)
    lgb_dataset_val = lgb.Dataset(X_val, label=y_val, categorical_feature=categorical_features, free_raw_data=False)
    
    # Two metrics are reported on the validation fold: the built-in "auc" from params and
    # the custom pauc_80; early stopping uses a patience of es_rounds rounds.
    # NOTE(review): which of the two metrics triggers the stop is decided by LightGBM's
    # early_stopping callback defaults — confirm if stopping on pauc_80 alone is intended.
    model = lgb.train(
        params,
        lgb_dataset_dev,
        num_boost_round=num_rounds,
        valid_sets=[lgb_dataset_val],
        feval=pauc_80,
        callbacks=[early_stopping(stopping_rounds=es_rounds), log_evaluation(log_rounds)],
    )
    
    # Persist the booster for this fold and record its best iteration and validation scores.
    model.save_model(MODELS_OUTPUT_PATH / f"{cfg.model_name}_fold_{fold}.txt")
    best_num_rounds[f"fold_{fold}"] = model.best_iteration
    val_scores = dict(model.best_score["valid_0"])
    
    val_auc_scores[f"fold_{fold}"] = val_scores["auc"]
    val_pauc_scores[f"fold_{fold}"] = val_scores["pauc_80"]
    
    # Out-of-fold predictions, made with the trees up to the best iteration only.
    val_preds = model.predict(X_val, num_iteration=model.best_iteration)
    oof_predictions[val_index] = val_preds
    print("\n")

Running fold: 1
Training until validation scores don't improve for 200 rounds
[50]	valid_0's auc: 0.931613	valid_0's pauc_80: 0.147766
[100]	valid_0's auc: 0.937806	valid_0's pauc_80: 0.151397
[150]	valid_0's auc: 0.942852	valid_0's pauc_80: 0.155529
[200]	valid_0's auc: 0.943996	valid_0's pauc_80: 0.156272
[250]	valid_0's auc: 0.945987	valid_0's pauc_80: 0.157757
[300]	valid_0's auc: 0.94731	valid_0's pauc_80: 0.159106
[350]	valid_0's auc: 0.948495	valid_0's pauc_80: 0.160156
[400]	valid_0's auc: 0.948868	valid_0's pauc_80: 0.16053
[450]	valid_0's auc: 0.948224	valid_0's pauc_80: 0.159804
[500]	valid_0's auc: 0.949286	valid_0's pauc_80: 0.160935
[550]	valid_0's auc: 0.950253	valid_0's pauc_80: 0.161591
[600]	valid_0's auc: 0.950537	valid_0's pauc_80: 0.161587
[650]	valid_0's auc: 0.950076	valid_0's pauc_80: 0.16107
[700]	valid_0's auc: 0.950629	valid_0's pauc_80: 0.161641
[750]	valid_0's auc: 0.950383	valid_0's pauc_80: 0.16139
Early stopping, best iteration is:
[559]	valid_0's auc: 0

In [14]:
# Assemble the out-of-fold predictions next to their identifiers, fold labels and targets,
# write them to CSV and preview the first rows.
oof_column_name = f"oof_{cfg.model_name}"
oof_columns = dict(
    zip(
        [id_column, group_column, "fold", target_column, oof_column_name],
        [train_ids, groups, folds, y_train, oof_predictions],
    )
)
oof_preds_df = pd.DataFrame(oof_columns)
oof_preds_df.to_csv(f"oof_preds_{cfg.model_name}.csv", index=False)
oof_preds_df.head()

Unnamed: 0,isic_id,patient_id,fold,final_target,oof_lgb_v1
0,ISIC_0015670,IP_1235828,4,0,2.3e-05
1,ISIC_0015845,IP_8170065,1,0,0.354119
2,ISIC_0015864,IP_6724798,5,0,9.3e-05
3,ISIC_0015902,IP_4111386,2,0,6.2e-05
4,ISIC_0024200,IP_8313778,1,0,0.000751


In [15]:
# Per-fold validation AUC recorded at each booster's best iteration.
val_auc_scores

{'fold_1': 0.9506421551793852,
 'fold_2': 0.9525560711449348,
 'fold_3': 0.9443546349103343,
 'fold_4': 0.9668538081391721,
 'fold_5': 0.9495350198244137}

In [16]:
# Per-fold validation pauc_80 (partial AUC, min_tpr=0.8) recorded at each booster's best iteration.
val_pauc_scores

{'fold_1': 0.16189363377553598,
 'fold_2': 0.16460742359112585,
 'fold_3': 0.1566555858531654,
 'fold_4': 0.17275461431905254,
 'fold_5': 0.16268287305687626}

In [17]:
# Two views of the cross-validation result:
#   * OOF: metrics computed once over the pooled out-of-fold predictions;
#   * AVG: plain mean of the per-fold validation scores.
oof_targets = oof_preds_df[target_column]
oof_scores = oof_preds_df[f"oof_{cfg.model_name}"]

cv_auc_oof = compute_auc(oof_targets, oof_scores)
cv_pauc_oof = compute_pauc(oof_targets, oof_scores, min_tpr=0.8)

cv_auc_avg = np.mean([*val_auc_scores.values()])
cv_pauc_avg = np.mean([*val_pauc_scores.values()])

print(f"CV AUC OOF: {cv_auc_oof}")
print(f"CV PAUC OOF: {cv_pauc_oof}")
print(f"CV AUC AVG: {cv_auc_avg}")
print(f"CV PAUC AVG: {cv_pauc_avg}")

CV AUC OOF: 0.934603157371475
CV PAUC OOF: 0.14554897076012202
CV AUC AVG: 0.952788337839648
CV PAUC AVG: 0.16371882611915117


In [18]:
# Everything needed to reproduce or compare this run: hyper-parameters, training budget,
# per-fold results and the aggregated cross-validation scores.
metadata = dict(
    params=params,
    num_rounds=num_rounds,
    es_rounds=es_rounds,
    best_num_rounds=best_num_rounds,
    val_auc_scores=val_auc_scores,
    val_pauc_scores=val_pauc_scores,
    cv_auc_oof=cv_auc_oof,
    cv_pauc_oof=cv_pauc_oof,
    cv_auc_avg=cv_auc_avg,
    cv_pauc_avg=cv_pauc_avg,
)

with open("run_metadata.json", "w") as f:
    json.dump(metadata, f)