In [1]:
import time
import json
from pathlib import Path
from pprint import pprint

import h5py
import numpy as np
import pandas as pd

from torch.utils.data import DataLoader
from accelerate import Accelerator

from isic_helper import get_folds

In [2]:
# Sizes used only for the dry-run timing estimate: how many training rows are
# scored locally, and how many rows the hidden test set is expected to have.
SAMPLE_SIZE = 5_000
EXPECTED_TEST_SIZE = 500_000

# Identify the training run whose artifacts are used for inference.
model_name, version, mode = "efficientnet_b2", "v2", "pretrain"

# Directory of the attached training-run dataset; underscores in the model
# name are replaced by dashes when the directory name is built.
path = f"/kaggle/input/isic-scd-{model_name.replace('_', '-')}-{version}-{mode}"

In [3]:
import sys

# Put the training-run directory on the import path so the modules imported in
# the next cell can be resolved (presumably they live in that directory).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if path not in sys.path:
    sys.path.append(path)

In [4]:
from dataset import test_augment, ISICDataset
from models import ISICNet
from engine import predict

In [5]:
# Metadata column names: image identifier, prediction target, and the column
# named as the grouping key.
id_column, target_column, group_column = "isic_id", "target", "patient_id"

In [6]:
# Competition data directory (relative to the notebook working directory).
INPUT_PATH = Path("../input/isic-2024-challenge/")

# Fold assignments are joined onto the training metadata; the inner join keeps
# only the rows that have a matching fold assignment.
folds_df = get_folds()
train_metadata = pd.read_csv(
    INPUT_PATH / "train-metadata.csv", low_memory=False
).merge(folds_df, on=["isic_id", "patient_id"], how="inner")
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv")

print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

# Image stores are opened read-only and intentionally left open: they are
# handed to the inference function in a later cell.
test_images = h5py.File(INPUT_PATH / "test-image.hdf5", mode="r")
train_images = h5py.File(INPUT_PATH / "train-image.hdf5", mode="r")

Train data size: (401059, 58)
Test data size: (3, 44)


In [7]:
def get_dnn_predictions(train, test, test_images, model_name, version, path, folds=(1,)):
    """Score ``test`` with the saved fold models and average their predictions.

    Reads ``{model_name}_{version}_run_metadata.json`` from ``path`` to recover
    the inference settings (image size, batch size, fold method, mixed
    precision, number of TTA passes), restores each selected fold's state from
    ``path / "models/fold_{fold}"`` and averages the per-fold predictions.

    Args:
        train: Training metadata frame. Only read when ``folds`` is ``None``,
            to discover the fold ids from the fold column.
        test: Frame with an ``id_column`` column listing the images to score.
        test_images: Image store handed unchanged to ``ISICDataset``.
        model_name: Architecture name passed to ``ISICNet``; also part of the
            run-metadata file name.
        version: Run version; part of the run-metadata file name.
        path: ``pathlib.Path`` of the training-run directory.
        folds: Iterable of fold ids to ensemble. Defaults to ``(1,)``, which
            matches the previously hard-coded behaviour. Pass ``None`` to use
            every unique value of the fold column in ``train``.

    Returns:
        Tuple ``(predictions, elapsed_seconds)`` where ``predictions`` is a
        frame with ``id_column`` and ``target_column`` and ``elapsed_seconds``
        is the wall-clock time spent inside this function.

    Raises:
        ValueError: If the run's fold method is not ``"gkf"`` or ``"sgkf"``,
            or if no fold with a positive id is selected.
    """
    start_time = time.time()
    test_df = test[[id_column]].copy()
    with open(path / f"{model_name}_{version}_run_metadata.json", "r") as f:
        run_metadata = json.load(f)
    params = run_metadata["params"]
    pprint(params)

    image_size = params["image_size"]
    batch_size = params["val_batch_size"]
    fold_method = params["fold_method"]

    test_dataset = ISICDataset(
        test_df, test_images, augment=test_augment(image_size), infer=True
    )
    # Kept un-prepared on purpose: every fold wraps this same pristine loader.
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        drop_last=False,
        pin_memory=True,
    )

    if fold_method == "gkf":
        print("Using GroupKFold")
        fold_column = "gkf_fold"
    elif fold_method == "sgkf":
        print("Using StratifiedGroupKFold")
        fold_column = "sgkf_fold"
    else:
        raise ValueError(f"Fold method {fold_method} not supported")

    all_folds = np.unique(train[fold_column]) if folds is None else list(folds)
    # One filter for both inference and averaging. The original loop skipped
    # only negative folds but averaged only positive ones, so a fold 0 would
    # have been predicted and then discarded.
    # NOTE(review): if fold 0 is a real training fold, relax this to ``>= 0``.
    ensemble_folds = [fold for fold in all_folds if fold > 0]
    if not ensemble_folds:
        # Fail loudly instead of returning an all-NaN target column.
        raise ValueError(f"No folds with a positive id to ensemble in {list(all_folds)}")

    test_predictions_df = pd.DataFrame({id_column: test[id_column]})
    for fold in ensemble_folds:
        print(f"\nFold {fold}")
        # A fresh Accelerator per fold, as in the original code.
        # NOTE(review): presumably needed so load_state only sees this fold's
        # model - confirm against the Accelerate documentation.
        accelerator = Accelerator(
            mixed_precision=params["mixed_precision"],
        )

        model = ISICNet(model_name=model_name, pretrained=False)
        model = model.to(accelerator.device)

        # Bind the prepared loader to a new name so the next fold does not
        # re-prepare an already wrapped dataloader.
        model, fold_dataloader = accelerator.prepare(model, test_dataloader)
        model_filepath = path / f"models/fold_{fold}"
        accelerator.load_state(model_filepath)

        test_predictions_df[f"fold_{fold}"] = predict(
            model, fold_dataloader, accelerator, n_tta=params["n_tta"]
        )

    fold_columns = [f"fold_{fold}" for fold in ensemble_folds]
    test_predictions_df[target_column] = test_predictions_df[fold_columns].mean(axis=1)
    end_time = time.time()
    return test_predictions_df[[id_column, target_column]], (end_time - start_time)

In [8]:
# When the test metadata has exactly 3 rows, score a fixed random sample of
# the training set instead so that the runtime can be measured; otherwise
# score the test set itself.
if test_metadata.shape[0] == 3:
    inference_df = train_metadata.sample(n=SAMPLE_SIZE, random_state=42)
    inference_images = train_images
else:
    inference_df = test_metadata
    inference_images = test_images

test_preds_df, total_runtime = get_dnn_predictions(
    train_metadata,
    inference_df,
    inference_images,
    model_name,
    version,
    Path(path),
)

{'debug': False,
 'down_sampling': True,
 'fold_method': 'gkf',
 'image_size': 64,
 'init_lr': 3e-05,
 'mixed_precision': 'fp16',
 'mode': 'pretrain',
 'n_tta': 8,
 'num_epochs': 20,
 'num_workers': 8,
 'seed': 2022,
 'train_batch_size': 64,
 'val_batch_size': 512}
Using GroupKFold

Fold 1
Step: 1/10
Step: 10/10


In [9]:
# Extrapolate the measured runtime to the expected size of the hidden test set.
# The divisor is the number of rows that were actually scored: this equals
# SAMPLE_SIZE in the dry run, but it is the real test-set size when the test
# set itself was scored, where dividing by SAMPLE_SIZE would give a wrong
# estimate.
scored_rows = len(test_preds_df)
factor = EXPECTED_TEST_SIZE / scored_rows
expected_total_runtime = total_runtime * factor
total_runtime_minutes = int(expected_total_runtime // 60)
total_runtime_seconds = expected_total_runtime % 60
print(f"Expected total runtime during submission: {total_runtime_minutes} mins and {total_runtime_seconds} secs")

Expected total runtime during submission: 19 mins and 24.78290557861328 secs


In [10]:
# Preview the first five rows of the prediction frame.
test_preds_df.head(n=5)

Unnamed: 0,isic_id,target
278442,ISIC_6973879,0.000107
215021,ISIC_5407194,4e-05
209685,ISIC_5273739,0.036108
29648,ISIC_0802250,0.000168
323386,ISIC_8084953,0.002426


In [11]:
# Summary statistics of the predicted target values, as a quick sanity check.
predicted_targets = test_preds_df[target_column]
predicted_targets.describe()

count    5000.000000
mean        0.007220
std         0.027316
min         0.000004
25%         0.000406
50%         0.001199
75%         0.003784
max         0.596426
Name: target, dtype: float64

In [12]:
# Preview exactly the columns that are written to the submission file.
preview_columns = [id_column, target_column]
test_preds_df[preview_columns].head()

Unnamed: 0,isic_id,target
278442,ISIC_6973879,0.000107
215021,ISIC_5407194,4e-05
209685,ISIC_5273739,0.036108
29648,ISIC_0802250,0.000168
323386,ISIC_8084953,0.002426


In [13]:
# Write the id and target columns, without the frame index, to the submission file.
submission_df = test_preds_df[[id_column, target_column]]
submission_df.to_csv("submission.csv", index=False)