In [1]:
import time
import json
from pathlib import Path
from pprint import pprint

import h5py
import numpy as np
import pandas as pd

from torch.utils.data import DataLoader
from accelerate import Accelerator

from isic_helper import get_folds

In [2]:
# Identify which trained-model dataset this notebook loads code and weights from.
model_name = "efficientnet_b2"
version = "v1"
mode = "pretrain"

# The dataset directory name uses hyphens, so underscores in the model name
# are replaced before the pieces are joined.
path = "/kaggle/input/isic-scd-" + "-".join(
    [model_name.replace("_", "-"), version, mode]
)

# Row counts used by the (currently commented-out) runtime extrapolation cell.
SAMPLE_SIZE = 5_000
EXPECTED_TEST_SIZE = 500_000

In [3]:
import sys
# Put the model dataset directory on the import path so the project modules
# imported in the next cell can be found (presumably `dataset`, `models` and
# `engine` live in that directory -- verify against the dataset contents).
sys.path.append(path)

In [4]:
from dataset import test_augment, ISICDataset, preprocess, feature_engineering, get_emb_szs
from models import ISICNet
from engine import predict

In [5]:
# Column names shared by the cells below: image identifier, prediction target,
# and the patient grouping key.
id_column, target_column, group_column = "isic_id", "target", "patient_id"

In [6]:
# Directory holding the competition metadata CSVs and HDF5 image stores.
INPUT_PATH = Path("../input/isic-2024-challenge/")

# Fold table from the helper module; the inner join below keeps only the
# training rows that also appear in it (matched on image id and patient id).
folds_df = get_folds()
train_metadata = pd.read_csv(
    INPUT_PATH / "train-metadata.csv",
    low_memory=False,
).merge(
    folds_df,
    on=["isic_id", "patient_id"],
    how="inner",
)
test_metadata = pd.read_csv(INPUT_PATH / "test-metadata.csv")

print(f"Train data size: {train_metadata.shape}")
print(f"Test data size: {test_metadata.shape}")

# Read-only handles to the image stores. They are left open on purpose:
# `test_images` is handed to the inference function in a later cell.
train_images = h5py.File(INPUT_PATH / "train-image.hdf5", mode="r")
test_images = h5py.File(INPUT_PATH / "test-image.hdf5", mode="r")

Train data size: (401059, 56)
Test data size: (3, 44)


In [7]:
def get_dnn_predictions(train, test, test_images, model_name, version, path):
    """Predict on the test set with every fold checkpoint and average the results.

    The run metadata JSON stored next to the checkpoints supplies the image
    size, batch size, metadata flag, mixed-precision mode and TTA count. One
    dataset/dataloader is built for the test rows; then, for each distinct value
    in ``train["fold"]``, a fresh model is created, the checkpoint under
    ``path / "models/fold_<fold>"`` is loaded, and ``predict`` is run. The
    returned target is the row-wise mean of the per-fold predictions.

    Note: this function reads the notebook-level names ``cat_cols``,
    ``cont_cols``, ``emb_szs``, ``id_column`` and ``target_column``; they must
    be defined before it is called.

    Args:
        train: Training metadata frame. Only its ``"fold"`` column is read, to
            enumerate the folds to ensemble.
        test: Test metadata frame. Must contain ``id_column``; it is passed to
            ``ISICDataset`` as the rows to predict.
        test_images: Image store passed to ``ISICDataset`` (the opened test
            HDF5 file in this notebook).
        model_name: Backbone name; also the prefix of the run metadata file.
        version: Run version; the second part of the run metadata file name.
        path: Directory containing the run metadata and ``models/`` folder.
            Must support the ``/`` operator (e.g. ``pathlib.Path``).

    Returns:
        A tuple ``(predictions, elapsed_seconds)`` where ``predictions`` is a
        DataFrame with columns ``[id_column, target_column]`` and
        ``elapsed_seconds`` is the wall-clock duration of the whole call.
    """
    start_time = time.time()
    with open(path / f"{model_name}_{version}_run_metadata.json", "r") as f:
        run_metadata = json.load(f)
    params = run_metadata["params"]
    pprint(params)

    image_size = params["image_size"]
    batch_size = params["val_batch_size"]
    use_meta = params["use_meta"]

    # Build the dataset from the `test` argument. The original referenced an
    # undefined name (`test_df`), which raised NameError on every call.
    test_dataset = ISICDataset(
        test, test_images, augment=test_augment(image_size),
        use_meta=use_meta,
        cat_cols=cat_cols,
        cont_cols=cont_cols,
        infer=True
    )
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,  # keep row order aligned with `test[id_column]`
        num_workers=4,
        drop_last=False,  # every test row needs a prediction
        pin_memory=True,
    )

    fold_column = "fold"
    all_folds = np.unique(train[fold_column])
    test_predictions_df = pd.DataFrame({id_column: test[id_column]})
    for fold in all_folds:
        print(f"\nFold {fold}")
        # A new Accelerator per fold, so `load_state` only sees this fold's model.
        accelerator = Accelerator(
            mixed_precision=params["mixed_precision"],
        )

        model = ISICNet(model_name=model_name, pretrained=False,
                        use_meta=use_meta,
                        cat_cols=cat_cols,
                        cont_cols=cont_cols,
                        emb_szs=emb_szs,)
        model = model.to(accelerator.device)

        # Prepare into fold-local names: rebinding `test_dataloader` here would
        # feed an already-prepared loader back into `prepare` on the next fold.
        fold_model, fold_dataloader = accelerator.prepare(model, test_dataloader)
        model_filepath = path / f"models/fold_{fold}"
        accelerator.load_state(model_filepath)

        test_predictions_df[f"fold_{fold}"] = predict(
            fold_model,
            fold_dataloader,
            accelerator,
            n_tta=params["n_tta"],
            use_meta=use_meta,
        )

    # Ensemble: simple mean over the per-fold prediction columns.
    fold_prediction_columns = [f"fold_{fold}" for fold in all_folds]
    test_predictions_df[target_column] = test_predictions_df[fold_prediction_columns].mean(axis=1)
    end_time = time.time()
    return test_predictions_df[[id_column, target_column]], (end_time - start_time)

In [8]:
# Step 1: clean the raw test metadata with the project's `preprocess` helper.
print("Preprocessing metadata...")
test_metadata = preprocess(test_metadata)

# Step 2: derive model features; the helper also returns the categorical and
# continuous column lists that the dataset and model are configured with.
print("Feature engineering...")
test_metadata, cat_cols, cont_cols = feature_engineering(test_metadata)

# Embedding sizes for the categorical columns, consumed by ISICNet.
emb_szs = get_emb_szs(cat_cols)

Preprocessing metadata...
Feature engineering...


In [9]:
# Placeholder predictions: every test id gets a constant 0.01. While the
# `get_dnn_predictions` call in the next cell stays commented out, this
# constant frame is what ends up in submission.csv.
test_preds_df = pd.DataFrame({id_column: test_metadata[id_column], target_column: 0.01})

In [10]:
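# NOTE: model inference is currently disabled. Uncomment this call to replace
# the constant placeholder predictions with the fold-averaged model output.
# test_preds_df, total_runtime = get_dnn_predictions(train_metadata, 
#                                                    test_metadata, 
#                                                    test_images, 
#                                                    model_name, 
#                                                    version, 
#                                                    Path(path))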

In [11]:
# NOTE: disabled together with the inference cell above (it needs `total_runtime`).
# The scaling assumes the timed run covered SAMPLE_SIZE rows.
# factor = EXPECTED_TEST_SIZE / SAMPLE_SIZE
# expected_total_runtime = total_runtime * factor
# total_runtime_minutes = int(expected_total_runtime // 60)
# total_runtime_seconds = expected_total_runtime % 60
# print(f"Expected total runtime during submission: {total_runtime_minutes} mins and {total_runtime_seconds} secs")

In [12]:
# Preview the first rows of the prediction frame.
test_preds_df.head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.01
1,ISIC_0015729,0.01
2,ISIC_0015740,0.01


In [13]:
# Summary statistics of the predicted target, as a quick sanity check of its range.
test_preds_df[target_column].describe()

count    3.00
mean     0.01
std      0.00
min      0.01
25%      0.01
50%      0.01
75%      0.01
max      0.01
Name: target, dtype: float64

In [14]:
# Preview exactly the two columns that are written to the submission file.
test_preds_df[[id_column, target_column]].head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.01
1,ISIC_0015729,0.01
2,ISIC_0015740,0.01


In [15]:
# Write the submission file: id and target columns only, without the index.
test_preds_df[[id_column, target_column]].to_csv("submission.csv", index=False)