In [12]:
import pandas as pd
import subprocess
import cv2
import matplotlib.pyplot as plt
import h5py
from tqdm import tqdm
import numpy as np
import os
import random
from pathlib import Path
import shutil

In [13]:
archive_save_path = "/kaggle/input/isic_archive_malignant"
archive_meta_path = archive_save_path + "/metadata.csv"

improved_save_path = "/kaggle/input/improved_dataset/"
improved_meta_path = improved_save_path + "/metadata.csv"

save_dir = Path(f"{improved_save_path}/train_image")
save_dir.mkdir(exist_ok=True, parents=True)

orig_dataset_path = "/kaggle/input/isic-2024-challenge/train-image/image"
orig_meta_path = "/kaggle/input/isic-2024-challenge/train-metadata.csv"

In [11]:
def center_crop(img: np.ndarray):
    height = img.shape[0]
    width = img.shape[1]
    if width > height:
        diff = width - height
        img = img[:, diff // 2 : -diff // 2, :]
    elif height > width:
        diff = height - width
        img = img[diff // 2 : -diff // 2, :, :]
    else:
        pass
    return img


def save_resized(img_path_list, save_dir, size):
    for img_path in img_path_list:
        img = cv2.imread(str(img_path))
        img = center_crop(img)
        img = cv2.resize(img, (size, size), interpolation=cv2.INTER_LANCZOS4)
        save_path = f"{save_dir}/{img_path.name}"
        cv2.imwrite(save_path, img)


img_path_list = list(Path(archive_save_path).glob("*.jpg"))


save_resized(img_path_list, save_dir, 384)

for img_p in Path(orig_dataset_path).glob("*"):
    shutil.copy(img_p, save_dir)

'/kaggle/input/isic_archive_malignant_resized/metadata.csv'

In [22]:
def create_hdf5_dataset(img_paths_list, hdf5_path):
    f = h5py.File(hdf5_path, "w")

    for img_p in tqdm(img_paths_list):
        ext = os.path.splitext(img_p)[1][1:]
        if ext != "jpg":
            print("%s does not have a supported extension. Skipping!!" % (img_p))
            continue
        if ext == "JPG" or ext == "jpg":
            fin = open(img_p, "rb")
            binary_data = fin.read()
            binary_data_np = np.asarray(binary_data)
            fin.close()

        fname = os.path.splitext(os.path.basename(img_p))[0]
        f.create_dataset(fname, data=binary_data_np)
    f.close()

In [16]:
img_paths_list = list(Path(f"{improved_save_path}/train_image").glob("*.jpg"))
hdf_path = f"{improved_save_path}/train_image.hdf5"

In [23]:
create_hdf5_dataset(img_paths_list, hdf_path)

100%|██████████| 408259/408259 [00:27<00:00, 14993.97it/s]


In [32]:
archive_meta = pd.read_csv(archive_meta_path)
orig_meta = pd.read_csv(orig_meta_path)
archive_meta["patient_id"] = archive_meta["patient_id"].fillna(
    archive_meta.index.to_series()
)
archive_meta["source"] = "archive"
orig_meta["source"] = "orig"
pd.concat([orig_meta, archive_meta], axis=0).to_csv(improved_meta_path, index=False)

  orig_meta = pd.read_csv(orig_meta_path)
