In [1]:
!pip install -q -U isic-cli
!isic image download -l 10 images/

If you have been granted special permissions, logging in with `isic user login` might return more data.

[2KDownloading images (and metadata) (10 total) [90m━━━━━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [36m0:00:00[0m
[?25h
[32mSuccessfully downloaded 10 images to images/.[0m
[32mSuccessfully wrote 10 metadata records to images/metadata.csv.[0m
[32mSuccessfully wrote attributions to images/attribution.txt.[0m
[32mSuccessfully wrote 1 license(s) to images/licenses.[0m


In [2]:
import h5py
from tqdm import tqdm
import numpy as np
import pandas as pd
from PIL import Image
import io,os
from glob import glob
from io import BytesIO

# Let wide metadata frames show (up to 1000) columns instead of being truncated.
pd.set_option("display.max_columns", 1000)

In [3]:
# Supported image extensions (extend the list if needed).
# Entries are written WITHOUT a leading dot, because the writer loop compares them
# against os.path.splitext(path)[1][1:], which has the dot stripped as well.
extensions = ['JPG','jpg','PNG','png','BMP','bmp']

# Location of the target folder where the images reside.
# This code assumes that the isic_id of an image is the basename of its file
# (without the extension).
direc = './images'

# Name of the HDF5 file the encoded images are written to.
HDF5_Dataset_name = 'external_images.hdf5'

In [4]:
# Collect every entry of the target folder. Entries that are not images
# (metadata.csv, licenses/, attribution.txt, ...) are filtered out below by extension.
flist = glob(os.path.join(direc,'*'))

# Normalise the configured extensions (case-insensitive, no leading dot) so that an
# entry spelled like '.PNG' still matches the dot-less extension extracted below.
supported_exts = {e.lower().lstrip('.') for e in extensions}

# Image modes Pillow's JPEG encoder can write directly; any other mode
# (e.g. RGBA or palette-based PNGs) is converted to RGB before encoding.
jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')

## Save the samples in an HDF5 file
# One dataset per image, keyed by the file's basename and holding the JPEG-encoded bytes.
# The context manager closes the HDF5 file even if a sample raises.
with h5py.File(HDF5_Dataset_name, 'w') as f:
    for file in tqdm(flist):
        ext = os.path.splitext(file)[1][1:]
        if ext.lower() not in supported_exts:
            print('%s does not have a supported extension. Skipping!!'%(file))
            continue

        if ext.lower() == 'jpg':
            # Already JPEG encoded: store the file's bytes untouched.
            with open(file, 'rb') as fin:
                binary_data = fin.read()
        else:
            # If the sample is not JPEG encoded, encode it first. This is done in
            # memory, so no temporary file is left behind in the working directory.
            print('JPEG Compression is applied to sample %s'%(file))
            buffer = BytesIO()
            with Image.open(file) as img:
                encodable = img if img.mode in jpeg_modes else img.convert('RGB')
                encodable.save(buffer, 'jpeg', quality=100)
            binary_data = buffer.getvalue()

        # Wrap the raw bytes in a NumPy array, which is the form handed to h5py.
        binary_data_np = np.asarray(binary_data)

        # Dataset name = basename without extension (assumed to be the isic_id).
        fname = os.path.splitext(os.path.basename(file))[0]
        f.create_dataset(fname, data=binary_data_np)

100%|██████████| 13/13 [00:00<00:00, 480.79it/s]

./images/metadata.csv does not have a supported extension. Skipping!!
./images/licenses does not have a supported extension. Skipping!!
./images/attribution.txt does not have a supported extension. Skipping!!





In [5]:
# Load the metadata table that was downloaded alongside the images.
data = pd.read_csv(f"{direc}/metadata.csv")

# Binary label: 1 only where the label is exactly "malignant"; every other
# value (including missing ones) becomes 0.
is_malignant = data["benign_malignant"] == "malignant"
data["target"] = np.where(is_malignant, 1, 0)

In [6]:
# Distribution of the raw benign/malignant labels in the downloaded sample.
label_counts = data["benign_malignant"].value_counts()
label_counts

benign_malignant
benign    10
Name: count, dtype: int64

In [7]:
# Distribution of the derived binary target (should mirror the label counts above it).
target_counts = data["target"].value_counts()
target_counts

target
0    10
Name: count, dtype: int64

In [8]:
# Persist the metadata (including the new `target` column) without the row index.
data.to_csv(path_or_buf="external_metadata.csv", index=False)

In [9]:
!rm -rf images/