# ISIC datasets in Kaggle

## Imports

In [1]:
from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
import mlcroissant as mlc
import plotly.express as px
import os
import re

# Custom util functions
import sys; sys.path.append("./libraries/")
from libraries.utils import *

## Kaggle API authentication

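**IMPORTANT:** This code was originally run in October 2024 to gather the metadata, and the results were saved to `../data/01_isic_datasets_metadata.csv`. Because the notebook now reads that saved file, a Kaggle API key is not needed to reproduce the analysis, and the authentication code below is commented out. Re-running the search today would return different results, since new (often duplicate) datasets containing the word “ISIC” are continually being created.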

In [2]:
# Authentication is intentionally disabled: the notebook works from the metadata
# snapshot saved in October 2024. Uncomment the two lines below only to query
# Kaggle again and rebuild that snapshot.
# api = KaggleApi()
# api.authenticate()

## Total number of Kaggle datasets with the word "ISIC" in the title

In [3]:
# Find every Kaggle dataset that mentions the search term.
#
# Two paths:
#   * cached   - the metadata CSV already exists, so it is loaded into `df` and
#                the counts are reported from it (no Kaggle access needed);
#   * uncached - the Kaggle API is paged through until it returns an empty page,
#                and matches are split into title matches (`datasets_list`) and
#                description-only matches (`datasets_list_description`).
search_term = 'ISIC'
METADATA_CSV = "../data/01_isic_datasets_metadata.csv"
datasets_list = []
datasets_list_description = []
page = 1

if os.path.exists(METADATA_CSV):
    df = pd.read_csv(METADATA_CSV)
    print(f"Total datasets found with 'ISIC' in the title: {len(df)}")
    # NOTE(review): this counts non-null `description` values in the cached file,
    # not descriptions that contain the search term -- confirm this is intended.
    print(f"Total datasets found with 'ISIC' in the description: {df.description.notnull().sum()}")
else:
    # The authentication cell is commented out by default, so fail with an
    # actionable message instead of a bare NameError inside the loop.
    if "api" not in globals():
        raise NameError(
            "name 'api' is not defined: uncomment the Kaggle API authentication "
            "cell to rebuild the metadata from scratch"
        )

    needle = search_term.lower()
    while True:
        datasets = api.dataset_list(search=search_term, page=page)
        if not datasets:
            # An empty page means the search results are exhausted.
            break
        for dataset in datasets:
            # Title/description may be missing or None; treat that as empty text.
            title = (getattr(dataset, "title", None) or "").lower()
            description = (getattr(dataset, "description", None) or "").lower()
            if needle in title:
                datasets_list.append(dataset)
            elif needle in description:
                datasets_list_description.append(dataset)
        page += 1

    print(f"Total datasets found with 'ISIC' in the title: {len(datasets_list)}")
    print(f"Total datasets found with 'ISIC' in the description: {len(datasets_list_description)}")

Total datasets found with 'ISIC' in the title: 860
Total datasets found with 'ISIC' in the description: 0


In [4]:
# Turn each collected Kaggle dataset into Croissant metadata, then pass every
# record through flatten_metadata (both helpers come from libraries.utils;
# presumably the latter yields one flat row per dataset -- verify there).
croissant_metadata_list = [
    convert_dataset_to_croissant(dataset) for dataset in datasets_list
]
data_rows = [flatten_metadata(metadata) for metadata in croissant_metadata_list]

# Build the table and give the first distribution's fields shorter names.
df = pd.DataFrame(data_rows).rename(
    columns={
        'distribution_0.contentSize': 'contentSize',
        'distribution_0.contentUrl': 'contentUrl',
        'distribution_0.encodingFormat': 'encodingFormat',
    }
)

# An existing snapshot is never overwritten; the file is written only once.
if os.path.exists('../data/01_isic_datasets_metadata.csv'):
    print("File already exists")
else:
    df.to_csv('../data/01_isic_datasets_metadata.csv', index=False)
    print("Metadata saved to '01_isic_datasets_metadata.csv'")

File already exists


In [5]:
# Work from the saved snapshot on disk rather than the in-memory frame, so the
# rest of the notebook does not depend on a live Kaggle query.
df = pd.read_csv(filepath_or_buffer='../data/01_isic_datasets_metadata.csv')
df

Unnamed: 0,@context.@language,@context.@vocab,@type,name,alternateName,description,url,identifier,creator.@type,creator.name,...,distribution_0.@type,contentUrl,contentSize,encodingFormat,isPrivate,downloadCount,viewCount,voteCount,usabilityRating,conformsTo
0,en,https://schema.org/,Dataset,Skin Cancer ISIC,The skin cancer data. Contains 9 classes of sk...,,https://www.kaggle.com/nodoubttome/skin-cancer...,319080,Person,Andrey Katanskiy,...,DataDownload,https://www.kaggle.com/datasets/nodoubttome/sk...,2048.0,application/zip,False,16375,132171,220,0.750000,http://mlcommons.org/croissant/1.0
1,en,https://schema.org/,Dataset,All ISIC Data 20240629,All images and metadata in ISIC archive.,,https://www.kaggle.com/tomooinubushi/all-isic-...,5302785,Person,tomoo inubushi,...,DataDownload,https://www.kaggle.com/datasets/tomooinubushi/...,75776.0,application/zip,False,376,3489,55,0.764706,http://mlcommons.org/croissant/1.0
2,en,https://schema.org/,Dataset,ISIC 2020 JPG 256x256 RESIZED,,,https://www.kaggle.com/nischaydnk/isic-2020-jp...,5295545,Person,Nischay Dhankhar,...,DataDownload,https://www.kaggle.com/datasets/nischaydnk/isi...,595.0,application/zip,False,709,2149,48,0.882353,http://mlcommons.org/croissant/1.0
3,en,https://schema.org/,Dataset,ISIC 2019 JPG 224x224 RESIZED,ISIC 2019 resized dataset,,https://www.kaggle.com/nischaydnk/isic-2019-jp...,5295517,Person,Nischay Dhankhar,...,DataDownload,https://www.kaggle.com/datasets/nischaydnk/isi...,355.0,application/zip,False,561,1930,40,0.941176,http://mlcommons.org/croissant/1.0
4,en,https://schema.org/,Dataset,JPEG ISIC 2019 512x512,,,https://www.kaggle.com/cdeotte/jpeg-isic2019-5...,762203,Person,Chris Deotte,...,DataDownload,https://www.kaggle.com/datasets/cdeotte/jpeg-i...,1024.0,application/zip,False,2445,7096,54,0.588235,http://mlcommons.org/croissant/1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,en,https://schema.org/,Dataset,melanoma_isic,,,https://www.kaggle.com/chitrapsg/melanoma-isic,3501446,Person,Chitra Govindasamy,...,DataDownload,https://www.kaggle.com/datasets/chitrapsg/mela...,786.0,application/zip,False,3,81,0,0.000000,http://mlcommons.org/croissant/1.0
856,en,https://schema.org/,Dataset,4000ISIC19Balanced,,,https://www.kaggle.com/manirujjamanmonir/4000i...,3374258,Person,Manirujjaman Monir,...,DataDownload,https://www.kaggle.com/datasets/manirujjamanmo...,6144.0,application/zip,False,2,73,0,0.000000,http://mlcommons.org/croissant/1.0
857,en,https://schema.org/,Dataset,siim_isic_2020_leukemia_dataset,,,https://www.kaggle.com/rajibbag1/siim-isic-202...,4911856,Person,RAJIB BAG_1,...,DataDownload,https://www.kaggle.com/datasets/rajibbag1/siim...,2048.0,application/zip,False,0,20,0,0.000000,http://mlcommons.org/croissant/1.0
858,en,https://schema.org/,Dataset,data_isic1718,,,https://www.kaggle.com/bugakakak/data-isic1718,4788617,Person,bugakakak,...,DataDownload,https://www.kaggle.com/datasets/bugakakak/data...,261.0,application/zip,False,0,17,0,0.000000,http://mlcommons.org/croissant/1.0


In [6]:
# List every column available in the metadata table.
print(list(df.columns))

['@context.@language', '@context.@vocab', '@type', 'name', 'alternateName', 'description', 'url', 'identifier', 'creator.@type', 'creator.name', 'creator.url', 'license.@type', 'license.name', 'keywords', 'dateModified', 'isAccessibleForFree', 'distribution_0.@type', 'contentUrl', 'contentSize', 'encodingFormat', 'isPrivate', 'downloadCount', 'viewCount', 'voteCount', 'usabilityRating', 'conformsTo']


In [7]:
# How often each dataset title occurs (repeated titles hint at duplicates).
df["name"].value_counts()

name
ISIC2018                                         6
ISIC2017                                         5
isic2016                                         4
isic2018                                         4
Skin Cancer ISIC                                 4
                                                ..
ISIC19AllAugmented                               1
ISIC_png_resized_456                             1
SIIM-ISIC Melanoma Classification NPY 224 CV2    1
ISIC_2019_Test_Input                             1
Linear_exact_ISIC_2017_test                      1
Name: count, Length: 821, dtype: int64

In [8]:
# Frequency of each distinct description value (NaN values are not counted).
df["description"].value_counts()

Series([], Name: count, dtype: int64)

In [9]:
# Distribution of the isAccessibleForFree flag across the datasets.
df["isAccessibleForFree"].value_counts()

isAccessibleForFree
True    860
Name: count, dtype: int64

In [10]:
# Number of datasets per usability rating, ordered by rating value.
df["usabilityRating"].value_counts().sort_index()

usabilityRating
0.000000     49
0.062500     19
0.117647    110
0.125000    152
0.176471     36
0.187500     87
0.205882      1
0.235294     68
0.250000     71
0.294118     51
0.312500     51
0.352941     12
0.375000     39
0.411765     16
0.437500     19
0.470588      5
0.500000     11
0.529412      7
0.562500      3
0.588235     17
0.625000      1
0.647059      2
0.687500      4
0.705882      6
0.750000      2
0.764706      3
0.812500      2
0.823529      3
0.875000      4
0.882353      4
0.911765      1
0.941176      1
1.000000      3
Name: count, dtype: int64

In [11]:
# Discard rows whose size is the literal string "Unknown", make the column
# numeric, and order the table from largest to smallest dataset.
df = (
    df.drop(index=df.index[df["contentSize"] == "Unknown"])
    .assign(contentSize=lambda frame: frame["contentSize"].astype(float))
    .sort_values(by="contentSize", ascending=False)
)
df

Unnamed: 0,@context.@language,@context.@vocab,@type,name,alternateName,description,url,identifier,creator.@type,creator.name,...,distribution_0.@type,contentUrl,contentSize,encodingFormat,isPrivate,downloadCount,viewCount,voteCount,usabilityRating,conformsTo
1,en,https://schema.org/,Dataset,All ISIC Data 20240629,All images and metadata in ISIC archive.,,https://www.kaggle.com/tomooinubushi/all-isic-...,5302785,Person,tomoo inubushi,...,DataDownload,https://www.kaggle.com/datasets/tomooinubushi/...,75776.000000,application/zip,False,376,3489,55,0.764706,http://mlcommons.org/croissant/1.0
792,en,https://schema.org/,Dataset,Augmentation Task ISIC2019,,,https://www.kaggle.com/shaheedanwarfahad/augme...,3947239,Person,Shaheed Anwar Fahad,...,DataDownload,https://www.kaggle.com/datasets/shaheedanwarfa...,52224.000000,application/zip,False,4,297,0,0.125000,http://mlcommons.org/croissant/1.0
656,en,https://schema.org/,Dataset,ISIC2019 Augmented data,,,https://www.kaggle.com/kishorebabunampalle/isi...,3544602,Person,Kishore Babu Nampalle,...,DataDownload,https://www.kaggle.com/datasets/kishorebabunam...,34816.000000,application/zip,False,4,65,0,0.250000,http://mlcommons.org/croissant/1.0
502,en,https://schema.org/,Dataset,ISIC 2020,,,https://www.kaggle.com/shakiburrahmanasif/isic...,4096026,Person,Shakib Ur Rahman Asif,...,DataDownload,https://www.kaggle.com/datasets/shakiburrahman...,32768.000000,application/zip,False,5,116,0,0.000000,http://mlcommons.org/croissant/1.0
790,en,https://schema.org/,Dataset,Augmentation Task ISIC2017,,,https://www.kaggle.com/shaheedanwarfahad/augme...,3941112,Person,Shaheed Anwar Fahad,...,DataDownload,https://www.kaggle.com/datasets/shaheedanwarfa...,30720.000000,application/zip,False,12,282,0,0.125000,http://mlcommons.org/croissant/1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,en,https://schema.org/,Dataset,submission code ISIC2024,,,https://www.kaggle.com/sylvesteriro/submission...,5622780,Person,Sylvester iro,...,DataDownload,https://www.kaggle.com/datasets/sylvesteriro/s...,0.001953,application/zip,False,0,4,0,0.250000,http://mlcommons.org/croissant/1.0
852,en,https://schema.org/,Dataset,ISBI2016_ISIC,,,https://www.kaggle.com/jainpriyal1412/isbi2016...,3808409,Person,JainPriyal1412,...,DataDownload,https://www.kaggle.com/datasets/jainpriyal1412...,0.001953,application/zip,False,1,91,0,0.117647,http://mlcommons.org/croissant/1.0
317,en,https://schema.org/,Dataset,SIIM-ISIC Melanoma Classification,,,https://www.kaggle.com/itsanjalichaudhary/skin...,3697802,Person,Anjali Chaudhary,...,DataDownload,https://www.kaggle.com/datasets/itsanjalichaud...,0.001953,application/zip,False,7,63,1,0.125000,http://mlcommons.org/croissant/1.0
555,en,https://schema.org/,Dataset,OneHot Encoder ISIC 2024,,,https://www.kaggle.com/greysky/onehot-encoder-...,5654530,Person,Farukcan Saglam,...,DataDownload,https://www.kaggle.com/datasets/greysky/onehot...,0.000977,application/zip,False,3,15,0,0.062500,http://mlcommons.org/croissant/1.0


In [12]:
# The five largest datasets, considering only rows with a known contentSize.
has_size = df["contentSize"].notna()
df.loc[has_size].sort_values(by="contentSize", ascending=False).head(5)

Unnamed: 0,@context.@language,@context.@vocab,@type,name,alternateName,description,url,identifier,creator.@type,creator.name,...,distribution_0.@type,contentUrl,contentSize,encodingFormat,isPrivate,downloadCount,viewCount,voteCount,usabilityRating,conformsTo
1,en,https://schema.org/,Dataset,All ISIC Data 20240629,All images and metadata in ISIC archive.,,https://www.kaggle.com/tomooinubushi/all-isic-...,5302785,Person,tomoo inubushi,...,DataDownload,https://www.kaggle.com/datasets/tomooinubushi/...,75776.0,application/zip,False,376,3489,55,0.764706,http://mlcommons.org/croissant/1.0
792,en,https://schema.org/,Dataset,Augmentation Task ISIC2019,,,https://www.kaggle.com/shaheedanwarfahad/augme...,3947239,Person,Shaheed Anwar Fahad,...,DataDownload,https://www.kaggle.com/datasets/shaheedanwarfa...,52224.0,application/zip,False,4,297,0,0.125,http://mlcommons.org/croissant/1.0
656,en,https://schema.org/,Dataset,ISIC2019 Augmented data,,,https://www.kaggle.com/kishorebabunampalle/isi...,3544602,Person,Kishore Babu Nampalle,...,DataDownload,https://www.kaggle.com/datasets/kishorebabunam...,34816.0,application/zip,False,4,65,0,0.25,http://mlcommons.org/croissant/1.0
502,en,https://schema.org/,Dataset,ISIC 2020,,,https://www.kaggle.com/shakiburrahmanasif/isic...,4096026,Person,Shakib Ur Rahman Asif,...,DataDownload,https://www.kaggle.com/datasets/shakiburrahman...,32768.0,application/zip,False,5,116,0,0.0,http://mlcommons.org/croissant/1.0
790,en,https://schema.org/,Dataset,Augmentation Task ISIC2017,,,https://www.kaggle.com/shaheedanwarfahad/augme...,3941112,Person,Shaheed Anwar Fahad,...,DataDownload,https://www.kaggle.com/datasets/shaheedanwarfa...,30720.0,application/zip,False,12,282,0,0.125,http://mlcommons.org/croissant/1.0


In [13]:
# Count rows with a missing contentSize.
df["contentSize"].isna().sum()

np.int64(0)

In [14]:
# Count rows with a missing usabilityRating.
df["usabilityRating"].isna().sum()

np.int64(0)

In [15]:
# Count rows with a missing downloadCount.
df["downloadCount"].isna().sum()

np.int64(0)

In [16]:
# Keep only rows where every field used by the plots below is present.
required_columns = ['contentSize', 'usabilityRating', 'downloadCount', 'dateModified']
df = df.dropna(subset=required_columns)

## Line plot of number of dataset modifications over years

In [17]:
# Parse the modification timestamps and bucket each dataset by calendar year.
df['dateModified'] = pd.to_datetime(df['dateModified'])
df['Period'] = df['dateModified'].dt.to_period("Y")

# One count per year: how many datasets were last modified in that year.
modification_counts = df.groupby('Period').size()

# x/y are passed as array-likes (the yearly index converted to timestamps and
# the counts), so the generic "x"/"y" labels are renamed for the axes.
fig = px.line(
    data_frame=df,
    x=modification_counts.index.to_timestamp(),
    y=modification_counts,
    markers=True,
    labels={"x": "Year", "y": "Number of Modifications"},
)
fig.update_traces(line=dict(color='#69b3a2'))

fig.show()

## Scatter plot for identifying any relationships between contentSize, downloadCount and usabilityRating

In [18]:
# Downloads (log scale) against dataset size, coloured by usability rating,
# with violin plots on both margins to show each variable's distribution.
fig = px.scatter(
    data_frame=df,
    x="downloadCount",
    y="contentSize",
    color='usabilityRating',
    marginal_x="violin",
    marginal_y='violin',
    color_continuous_scale=px.colors.diverging.RdYlGn,
    log_x=True,
    labels={"contentSize": "contentSize (MB)"},
)

# Explicit ticks at each power of ten from 1 to 10000 on the log axis.
fig.update_xaxes(tickvals=[10 ** exponent for exponent in range(5)])

fig.show()

## Total content size of the datasets in Kaggle with the word "ISIC" in the title

In [19]:
# Sum the per-dataset sizes (treated as megabytes by this notebook) and report
# the total in larger units, dividing by 1024 at each step.
total_size_mb = df["contentSize"].sum()
total_size_gb = total_size_mb / 1024
total_size_tb = total_size_gb / 1024

for size, unit in ((total_size_gb, "GB"), (total_size_tb, "TB")):
    print(f'Total size of datasets: {size:.2f} {unit}')

Total size of datasets: 2642.14 GB
Total size of datasets: 2.58 TB


## Total content size of the datasets on the official website of the ISIC Challenges

In [20]:
# Same total-size calculation as for the Kaggle datasets, but on the table of
# official ISIC challenge datasets (sizes treated as megabytes by this notebook).
df_official_isic = pd.read_csv(filepath_or_buffer="../data/isic_challange_datasets.csv")

total_size_mb = df_official_isic["contentSize"].sum()
total_size_gb = total_size_mb / 1024
total_size_tb = total_size_gb / 1024

for size, unit in ((total_size_gb, "GB"), (total_size_tb, "TB")):
    print(f'Total size of datasets: {size:.2f} {unit}')

Total size of datasets: 75.15 GB
Total size of datasets: 0.07 TB
