In [6]:
!pip install -q -U isic-cli h5py tqdm pandas pillow

In [28]:
from pathlib import Path
import h5py
from tqdm import tqdm
import numpy as np
import pandas as pd
from PIL import Image
import io,os
from glob import glob
from io import BytesIO

# Let wide frames (the metadata table has many columns) display without truncation.
pd.set_option("display.max_columns", 1000)

In [8]:
# Folder the ISIC images are downloaded into.
images_dir = Path("images")
# Folder that receives the generated HDF5 file and the metadata CSV.
volume_dir = Path("volume") / "isic-2024-external"

In [9]:
# `volume_dir` is a nested path, so create any missing parent folders as well;
# without parents=True this raises FileNotFoundError when "volume/" does not exist yet.
volume_dir.mkdir(parents=True, exist_ok=True)

In [11]:
# Download the ISIC archive images into `images_dir` using the isic-cli tool.
# Besides the image files, the command also writes metadata.csv, attribution.txt
# and a licenses folder into the same directory (see the output below).
!isic image download {images_dir}

If you have been granted special permissions, logging in with `isic user login` might return more data.



[2KDownloading images (and metadata) (81,722 total) [90m━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [36m0:00:00[0m00:01[0m01:35[0m

[?25h

[32mSuccessfully downloaded 81,722 images to images/.[0m

[32mSuccessfully wrote 81,722 metadata records to images/metadata.csv.[0m

[32mSuccessfully wrote attributions to images/attribution.txt.[0m

[32mSuccessfully wrote 3 license(s) to images/licenses.[0m


In [13]:
# Supported image extensions (extend the list if needed).
# Entries are written WITHOUT a leading dot because the conversion loop compares
# them against the dot-less suffix of each file name; the former '.PNG' / '.BMP'
# entries could never match, so upper-case PNG/BMP files were silently skipped.
extensions = ['JPG', 'jpg', 'PNG', 'png', 'BMP', 'bmp']

# Source folder holding the images, as a plain string for glob/os.path.
# The isic_id of each image is assumed to be the file's basename without extension.
direc = os.fspath(images_dir)

# Destination of the HDF5 file that will contain the encoded images
HDF5_Dataset_name = volume_dir.joinpath('external_images.hdf5')

In [14]:
# Collect every entry of the image folder; anything that is not a supported
# image (metadata.csv, attribution.txt, the licenses folder, ...) is skipped below.
flist = glob(os.path.join(direc, '*'))

# Normalise the configured extensions (letter case, stray leading dot) so the
# comparison against the dot-less file suffix cannot miss an entry like '.PNG'.
supported_exts = {e.lstrip('.').lower() for e in extensions}

## Save the samples in an HDF5 file
# One dataset per image, keyed by the file's basename (the isic_id), holding the
# JPEG-encoded bytes. The context manager closes the file even if a sample fails.
with h5py.File(HDF5_Dataset_name, 'w') as f:
    for file in tqdm(flist):
        ext = os.path.splitext(file)[1][1:].lower()
        if ext not in supported_exts:
            print('%s does not have a supported extension. Skipping!!'%(file))
            continue
        if ext == 'jpg':
            # Already JPEG: store the file's bytes untouched.
            with open(file, 'rb') as fin:
                binary_data = fin.read()
        else:
            # If the sample is not JPEG encoded, encode it first. This is done in
            # memory, so no shared temp.jpg is left behind in the working directory.
            print('JPEG Compression is applied to sample %s'%(file))
            buffer = BytesIO()
            with Image.open(file) as img:
                # JPEG cannot store alpha/palette modes (e.g. RGBA, P); convert
                # those to RGB instead of failing inside save().
                if img.mode not in ('L', 'RGB', 'CMYK'):
                    img = img.convert('RGB')
                img.save(buffer, 'jpeg', quality=100)
            binary_data = buffer.getvalue()

        # NOTE(review): np.asarray on a bytes object yields a fixed-width bytes
        # scalar; kept as-is so the stored layout stays what existing readers
        # expect. NumPy presumably drops trailing NUL bytes of such values on
        # read -- confirm this is harmless for the stored JPEG streams.
        binary_data_np = np.asarray(binary_data)

        fname = os.path.splitext(os.path.basename(file))[0]
        f.create_dataset(fname, data=binary_data_np)

100%|██████████| 81725/81725 [02:52<00:00, 474.98it/s] 


images/metadata.csv does not have a supported extension. Skipping!!

images/attribution.txt does not have a supported extension. Skipping!!

images/licenses does not have a supported extension. Skipping!!


In [30]:
# Load the metadata table written by the download step; low_memory=False makes
# pandas read the file in one pass instead of chunk-wise.
metadata_csv = Path(direc) / "metadata.csv"
data = pd.read_csv(metadata_csv, low_memory=False)

In [31]:
# Preview the first five metadata rows.
data.head(5)

Unnamed: 0,isic_id,attribution,copyright_license,acquisition_day,age_approx,anatom_site_general,benign_malignant,clin_size_long_diam_mm,concomitant_biopsy,dermoscopic_type,diagnosis,diagnosis_confirm_type,family_hx_mm,fitzpatrick_skin_type,image_type,lesion_id,mel_class,mel_mitotic_index,mel_thick_mm,mel_type,mel_ulcer,melanocytic,nevus_type,patient_id,personal_hx_mm,pixels_x,pixels_y,sex
0,ISIC_7559201,Memorial Sloan Kettering Cancer Center,CC-BY,2497.0,55.0,anterior torso,benign,6.6,,contact non-polarized,nevus,histopathology,True,IV,dermoscopic,IL_2668505,,,,,,True,,IP_1238256,True,3264,2448,female
1,ISIC_0485014,Memorial Sloan Kettering Cancer Center,CC-BY,1.0,45.0,lower extremity,benign,,,contact non-polarized,,,True,I,dermoscopic,IL_2775050,,,,,,True,,IP_3227071,,6000,4000,female
2,ISIC_5257439,Memorial Sloan Kettering Cancer Center,CC-BY,2360.0,40.0,lateral torso,benign,4.2,,,nevus,histopathology,True,II,clinical: close-up,IL_8547824,,,,,,True,,IP_7407753,True,3264,2448,female
3,ISIC_2989732,Memorial Sloan Kettering Cancer Center,CC-BY,78.0,80.0,anterior torso,benign,,,non-contact polarized,,,False,II,dermoscopic,IL_4657752,,,,,,True,,IP_2597637,,6000,4000,male
4,ISIC_5638210,Memorial Sloan Kettering Cancer Center,CC-BY,78.0,80.0,anterior torso,benign,,,contact non-polarized,,,False,II,dermoscopic,IL_4657752,,,,,,True,,IP_2597637,,6000,4000,male


In [32]:
# Distribution of the benign/malignant label (missing labels are not counted).
label_counts = data["benign_malignant"].value_counts()
label_counts

benign_malignant
benign                     64047
malignant                   9239
indeterminate                150
indeterminate/malignant       85
indeterminate/benign          67
Name: count, dtype: int64

In [33]:
# Binary target: 1 only for rows labelled exactly "malignant"; every other value
# (benign, the indeterminate variants, missing labels) becomes 0.
is_malignant = data["benign_malignant"] == "malignant"
data["target"] = is_malignant.astype(int)

In [34]:
# Write the labelled metadata into the output folder; the row index is not saved.
metadata_out = volume_dir / "external_metadata.csv"
data.to_csv(metadata_out, index=False)