In [1]:
import json
import joblib

from pathlib import Path

import pandas as pd
import numpy as np

import lightgbm as lgb

from isic_helper import DotDict, get_folds

In [2]:
# Run configuration container; DotDict comes from isic_helper and is used here
# with attribute-style assignment.
cfg = DotDict()

# Sub-directory (inside the training artifacts folder) that holds the per-fold model files.
cfg.models_output_dir = "models"
# Prefix shared by the encoder, run-metadata and per-fold model artifact file names.
cfg.model_name = "lgb_v3"

In [3]:
# Column names shared by the feature helpers and the submission code below.
id_column = "isic_id"  # row identifier; counted in count_features and written to the submission
target_column = "target"  # name of the averaged prediction column in the submission
group_column = "patient_id"  # key used to aggregate per-patient pivot features

In [4]:
def count_features(df, col):
    """Attach per-group counts of each category of ``col`` to every row.

    Rows are pivoted with ``group_column`` as the index and the values of
    ``col`` as columns, counting ``id_column`` entries; group/category
    combinations that never occur are filled with 0. The resulting wide table
    is left-merged back onto ``df`` by ``group_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id_column``, ``group_column`` and ``col``.
    col : str
        Categorical column whose values become the new count columns.

    Returns
    -------
    tuple
        ``(merged_df, feature_cols)`` where ``feature_cols`` lists the new
        column names (the category values found in ``col``).
    """
    counts = pd.pivot_table(
        df[[id_column, group_column, col]],
        values=id_column,
        index=group_column,
        columns=col,
        aggfunc="count",
        fill_value=0,
    )
    # Capture the category names before the group key becomes a regular column.
    feature_cols = counts.columns.tolist()
    counts = counts.reset_index()
    merged = df.merge(counts, on=group_column, how="left")
    return merged, feature_cols

def mean_features(df, col, val):
    """Attach the per-group mean of ``val`` for each category of ``col``.

    Rows are pivoted with ``group_column`` as the index and the values of
    ``col`` as columns, averaging ``val``; group/category combinations that
    never occur are filled with 0. New columns are named
    ``"<category>_<val>_mean"`` and left-merged back onto ``df`` by
    ``group_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id_column``, ``group_column``, ``col`` and ``val``.
    col : str
        Categorical column whose values define the new columns.
    val : str
        Numeric column that is averaged.

    Returns
    -------
    tuple
        ``(merged_df, feature_cols)`` where ``feature_cols`` lists the new
        column names.
    """
    subset = df[[id_column, group_column, col, val]]
    means = pd.pivot_table(
        subset,
        values=val,
        index=group_column,
        columns=col,
        aggfunc="mean",
        fill_value=0,
    )
    means.columns = [f"{category}_{val}_mean" for category in means.columns.tolist()]
    feature_cols = means.columns.tolist()
    means = means.reset_index()
    merged = df.merge(means, on=group_column, how="left")
    return merged, feature_cols


def stat_features(df, group_cols, value_col, stats):
    """Attach group-level statistics of ``value_col`` to every row.

    ``value_col`` is aggregated per ``group_cols`` with each entry of
    ``stats``; the results are named ``"<value_col>_<stat>"`` and left-merged
    back onto ``df``. When ``"mean"`` is one of the requested statistics, an
    extra ``"<value_col>_mean_diff"`` column holds each row's deviation from
    its group mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified, a merged copy is returned.
    group_cols : str or list of str
        Column(s) defining the groups.
    value_col : str
        Column to aggregate.
    stats : str or iterable of str
        Aggregation name(s) understood by pandas ``GroupBy.agg``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the statistic columns (and the mean-diff column when
        applicable) appended.
    """
    # Accept a single stat name or any iterable; agg() must receive a list so
    # that it returns a DataFrame with one column per statistic.
    stats = [stats] if isinstance(stats, str) else list(stats)

    grouped = df.groupby(group_cols)[value_col].agg(stats)
    grouped.columns = [f"{value_col}_{stat}" for stat in stats]
    grouped = grouped.reset_index()
    df = df.merge(grouped, on=group_cols, how="left")

    # The deviation feature only makes sense when the group mean was requested;
    # previously this line raised KeyError for any stats list without "mean".
    if "mean" in stats:
        df[f"{value_col}_mean_diff"] = df[value_col] - df[f"{value_col}_mean"]
    return df


def feature_engineering(df):
    """Add the hand-crafted lesion features and report which ones are new.

    The first derived columns are written directly onto the frame that was
    passed in, so callers that need their input untouched should pass a copy.
    Later steps rebind ``df`` to merged frames returned by the helper
    functions.

    The divisions below are not guarded against zero denominators.
    NOTE(review): ``np.cos`` interprets its argument as radians -- confirm the
    units of ``tbp_lv_symm_2axis_angle``. The formulas are kept exactly as they
    are because previously trained models depend on these feature values.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata frame containing the ``tbp_lv_*`` measurement columns,
        ``clin_size_long_diam_mm``, ``anatom_site_general``,
        ``tbp_lv_location`` and ``patient_id``.

    Returns
    -------
    tuple
        ``(df, new_num_cols)`` -- the enriched frame and the names of the
        engineered numeric columns collected along the way (intermediate
        helper columns such as ``hue_contrast`` are added to the frame but not
        listed).
    """
    site_col = "anatom_site_general"
    location_keys = ["patient_id", "tbp_lv_location"]

    # Simple row-wise geometry features.
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["lesion_distance"] = np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    new_num_cols = ["lesion_size_ratio", "lesion_distance"]

    # Mean hue contrast per group and anatomical site.
    df["hue_contrast"] = df["tbp_lv_H"] - df["tbp_lv_Hext"]
    df, hue_cols = mean_features(df, site_col, "hue_contrast")
    new_num_cols.extend(hue_cols)

    # Lesion counts per group and anatomical site.
    df, site_count_cols = count_features(df, site_col)
    new_num_cols.extend(site_count_cols)

    # (new column, minuend column, subtrahend column): each difference is
    # compared with its mean over the same patient and body location.
    contrast_specs = (
        ("tbp_lv_A_diff", "tbp_lv_Aext", "tbp_lv_A"),
        ("tbp_lv_B_diff", "tbp_lv_Bext", "tbp_lv_B"),
        ("tbp_lv_L_diff", "tbp_lv_Lext", "tbp_lv_L"),
        ("tbp_lv_L_std_diff", "tbp_lv_stdLExt", "tbp_lv_stdL"),
    )
    for diff_col, minuend_col, subtrahend_col in contrast_specs:
        df[diff_col] = df[minuend_col] - df[subtrahend_col]
        df = stat_features(df, location_keys, diff_col, ["mean"])
        new_num_cols.append(f"{diff_col}_mean_diff")

    # Mean colour uniformity per group and anatomical site.
    df["color_uniformity"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_radial_color_std_max"]
    df, uniformity_cols = mean_features(df, site_col, "color_uniformity")
    new_num_cols.extend(uniformity_cols)

    # Distance from the coordinate origin scaled by the cosine of the symmetry angle.
    origin_distance = np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    df["radius"] = np.cos(df["tbp_lv_symm_2axis_angle"]) * origin_distance
    new_num_cols.append("radius")

    return df, new_num_cols

In [5]:
# Locations of the competition data and of the artifacts produced by the training run.
INPUT_PATH = Path("../input/isic-2024-challenge/")
ARTIFACTS_INPUT_PATH = Path(f"../input/isic-scd-lgb-v3-train/")
MODELS_INPUT_PATH = ARTIFACTS_INPUT_PATH / cfg.models_output_dir

train_metadata = pd.read_csv(INPUT_PATH / "train-metadata.csv", low_memory=False)
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv")

# Attach the fold assignment to the training rows. The inner join drops any
# training row that has no match in folds_df.
# NOTE(review): get_folds() is defined in isic_helper; presumably it returns
# isic_id / patient_id / fold -- confirm.
folds_df = get_folds()
train_metadata = train_metadata.merge(folds_df, on=["isic_id", "patient_id"], how="inner")
print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

# Apply the same feature engineering to both frames; copies are passed because
# feature_engineering writes columns onto its argument.
train_metadata, new_num_cols = feature_engineering(train_metadata.copy())
test_metadata, _ = feature_engineering(test_metadata.copy())

Train data size: (401059, 57)
Test data size: (3, 44)


In [6]:
# Load the preprocessing object saved alongside the trained models.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load artifacts
# from a trusted source.
with open(ARTIFACTS_INPUT_PATH / f"{cfg.model_name}_encoder.joblib", "rb") as f:
    mixed_encoded_preprocessor = joblib.load(f)
    
# The loaded preprocessor is fitted again on the engineered training frame.
# NOTE(review): if the artifact was saved already fitted this refit is
# redundant -- confirm it is intentional.
enc = mixed_encoded_preprocessor.fit(train_metadata)

# Any column the preprocessor saw during fit but the test frame lacks is added
# as all-NaN, presumably so that transform() receives every expected column.
# NOTE(review): the pivot-based count/mean columns are 0-filled on train but
# become NaN here when a category never appears in the test data -- confirm
# this is acceptable.
for col in mixed_encoded_preprocessor.feature_names_in_:
    if col not in test_metadata.columns:
        test_metadata[col] = np.nan

X_test = enc.transform(test_metadata)

# len(X_test.columns) requires transform() to return an object with a
# .columns attribute (e.g. a DataFrame).
columns_for_model = len(X_test.columns)
print(f"Total number of columns: {columns_for_model}")

Total number of columns: 60


In [7]:
# Fold label of every training row; only its unique values are used later to
# enumerate the per-fold models.
folds = train_metadata["fold"]

# Metadata written by the training run; the prediction loop reads
# run_metadata["best_num_rounds"]["fold_<k>"] from it.
with open(ARTIFACTS_INPUT_PATH / f"{cfg.model_name}_run_metadata.json", "r") as f:
    run_metadata = json.load(f)

In [8]:
# Score the test set once per fold model, collecting one prediction column per fold.
all_folds = np.sort(folds.unique())
test_predictions_df = pd.DataFrame({id_column: test_metadata[id_column]})
for fold in all_folds:
    # Each fold's booster is stored as "<model_name>_fold_<fold>.txt" in the models folder.
    model = lgb.Booster(model_file=MODELS_INPUT_PATH / f"{cfg.model_name}_fold_{fold}.txt")
    # Predict using the per-fold iteration count recorded in the run metadata.
    test_predictions_df[f"fold_{fold}"] = model.predict(X_test, num_iteration=run_metadata["best_num_rounds"][f"fold_{fold}"])

In [9]:
# Preview the per-fold predictions.
test_predictions_df.head()

Unnamed: 0,isic_id,fold_1,fold_2,fold_3,fold_4,fold_5
0,ISIC_0015657,5.6e-05,1.2e-05,4e-05,2.9e-05,4.1e-05
1,ISIC_0015729,3e-05,1.2e-05,4.4e-05,2e-05,3.2e-05
2,ISIC_0015740,6.5e-05,1.3e-05,6.2e-05,5.4e-05,4.9e-05


In [10]:
# Ensemble: the final score is the unweighted mean of the per-fold predictions.
test_predictions_df[target_column] = test_predictions_df[[f"fold_{fold}" for fold in all_folds]].mean(axis=1)

In [11]:
# Preview again, now including the averaged target column.
test_predictions_df.head()

Unnamed: 0,isic_id,fold_1,fold_2,fold_3,fold_4,fold_5,target
0,ISIC_0015657,5.6e-05,1.2e-05,4e-05,2.9e-05,4.1e-05,3.6e-05
1,ISIC_0015729,3e-05,1.2e-05,4.4e-05,2e-05,3.2e-05,2.8e-05
2,ISIC_0015740,6.5e-05,1.3e-05,6.2e-05,5.4e-05,4.9e-05,4.9e-05


In [12]:
# Summary statistics of the averaged prediction as a quick sanity check.
test_predictions_df[target_column].describe()

count    3.000000
mean     0.000037
std      0.000011
min      0.000028
25%      0.000032
50%      0.000036
75%      0.000042
max      0.000049
Name: target, dtype: float64

In [13]:
# Preview exactly the two columns that go into the submission file.
test_predictions_df[[id_column, target_column]].head(10)

Unnamed: 0,isic_id,target
0,ISIC_0015657,3.6e-05
1,ISIC_0015729,2.8e-05
2,ISIC_0015740,4.9e-05


In [14]:
# Write the submission file (id and averaged prediction only, without the index).
test_predictions_df[[id_column, target_column]].to_csv("submission.csv", index=False)