# Identifying Potential Duplicates among ISIC Datasets on Kaggle and Original ISIC Challenge Datasets Based on Corresponding Year Tags in Titles

## Imports

In [1]:
import pandas as pd
import re
import plotly.express as px
import numpy as np
import os

# Custom util functions
import sys; sys.path.append("./libraries/")
from libraries.utils import *

In [2]:
# Read the Kaggle ISIC metadata and the original ISIC challenge metadata from disk.
df_kaggle_isic_metadata, df_original_isic_metadata = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/01_isic_datasets_metadata.csv",
        "../data/isic_challange_datasets.csv",
    )
)

In [3]:
# Preview the Kaggle metadata frame read from 01_isic_datasets_metadata.csv.
df_kaggle_isic_metadata

Unnamed: 0,@context.@language,@context.@vocab,@type,name,alternateName,description,url,identifier,creator.@type,creator.name,...,distribution_0.@type,contentUrl,contentSize,encodingFormat,isPrivate,downloadCount,viewCount,voteCount,usabilityRating,conformsTo
0,en,https://schema.org/,Dataset,Skin Cancer ISIC,The skin cancer data. Contains 9 classes of sk...,,https://www.kaggle.com/nodoubttome/skin-cancer...,319080,Person,Andrey Katanskiy,...,DataDownload,https://www.kaggle.com/datasets/nodoubttome/sk...,2048.0,application/zip,False,16375,132171,220,0.750000,http://mlcommons.org/croissant/1.0
1,en,https://schema.org/,Dataset,All ISIC Data 20240629,All images and metadata in ISIC archive.,,https://www.kaggle.com/tomooinubushi/all-isic-...,5302785,Person,tomoo inubushi,...,DataDownload,https://www.kaggle.com/datasets/tomooinubushi/...,75776.0,application/zip,False,376,3489,55,0.764706,http://mlcommons.org/croissant/1.0
2,en,https://schema.org/,Dataset,ISIC 2020 JPG 256x256 RESIZED,,,https://www.kaggle.com/nischaydnk/isic-2020-jp...,5295545,Person,Nischay Dhankhar,...,DataDownload,https://www.kaggle.com/datasets/nischaydnk/isi...,595.0,application/zip,False,709,2149,48,0.882353,http://mlcommons.org/croissant/1.0
3,en,https://schema.org/,Dataset,ISIC 2019 JPG 224x224 RESIZED,ISIC 2019 resized dataset,,https://www.kaggle.com/nischaydnk/isic-2019-jp...,5295517,Person,Nischay Dhankhar,...,DataDownload,https://www.kaggle.com/datasets/nischaydnk/isi...,355.0,application/zip,False,561,1930,40,0.941176,http://mlcommons.org/croissant/1.0
4,en,https://schema.org/,Dataset,JPEG ISIC 2019 512x512,,,https://www.kaggle.com/cdeotte/jpeg-isic2019-5...,762203,Person,Chris Deotte,...,DataDownload,https://www.kaggle.com/datasets/cdeotte/jpeg-i...,1024.0,application/zip,False,2445,7096,54,0.588235,http://mlcommons.org/croissant/1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,en,https://schema.org/,Dataset,melanoma_isic,,,https://www.kaggle.com/chitrapsg/melanoma-isic,3501446,Person,Chitra Govindasamy,...,DataDownload,https://www.kaggle.com/datasets/chitrapsg/mela...,786.0,application/zip,False,3,81,0,0.000000,http://mlcommons.org/croissant/1.0
856,en,https://schema.org/,Dataset,4000ISIC19Balanced,,,https://www.kaggle.com/manirujjamanmonir/4000i...,3374258,Person,Manirujjaman Monir,...,DataDownload,https://www.kaggle.com/datasets/manirujjamanmo...,6144.0,application/zip,False,2,73,0,0.000000,http://mlcommons.org/croissant/1.0
857,en,https://schema.org/,Dataset,siim_isic_2020_leukemia_dataset,,,https://www.kaggle.com/rajibbag1/siim-isic-202...,4911856,Person,RAJIB BAG_1,...,DataDownload,https://www.kaggle.com/datasets/rajibbag1/siim...,2048.0,application/zip,False,0,20,0,0.000000,http://mlcommons.org/croissant/1.0
858,en,https://schema.org/,Dataset,data_isic1718,,,https://www.kaggle.com/bugakakak/data-isic1718,4788617,Person,bugakakak,...,DataDownload,https://www.kaggle.com/datasets/bugakakak/data...,261.0,application/zip,False,0,17,0,0.000000,http://mlcommons.org/croissant/1.0


In [4]:
# Preview the original ISIC challenge metadata frame (one row per challenge year in the output below).
df_original_isic_metadata

Unnamed: 0,name,url,trainSize,valSize,testSize,contentSize
0,ISIC 2016 Challenge,https://challenge.isic-archive.com/data/#2016,3563.0,Unknown,1199.0,4763.0
1,ISIC 2017 Challenge,https://challenge.isic-archive.com/data/#2017,5939.0,899.0,5520.0,12358.0
2,ISIC 2018 Challenge,https://challenge.isic-archive.com/data/#2018,13312.0,286.0,2663.0,16261.0
3,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,9318.0,Unknown,3686.0,13154.0
4,ISIC 2020 Challenge,https://challenge.isic-archive.com/data/#2020,23552.0,Unknown,6861.0,30413.0


In [5]:
# Keep only rows whose contentSize is not the literal string "Unknown",
# then store contentSize as float so it can be compared numerically later.
has_known_size = df_kaggle_isic_metadata['contentSize'] != "Unknown"
df_kaggle_isic_metadata = (
    df_kaggle_isic_metadata
    .loc[has_known_size]
    .assign(contentSize=lambda kept: kept['contentSize'].astype(float))
)

In [6]:
# Derive a 'Year' column from each dataset name with the project helper `extract_year`.
for metadata_frame in (df_original_isic_metadata, df_kaggle_isic_metadata):
    metadata_frame['Year'] = metadata_frame['name'].apply(extract_year)

In [7]:
# Build the candidate-duplicate table with the project helper `find_potential_duplicates`.
potential_duplicates_df_based_on_name = find_potential_duplicates(
    df_original_isic_metadata,
    df_kaggle_isic_metadata,
)

# Write the table only when no CSV is on disk yet; otherwise just report that it exists.
# NOTE(review): the following cell rewrites this same path unconditionally, so this guard
# does not actually protect an existing file — confirm which behaviour is intended.
if os.path.exists('../data/02_potential_isic_duplicates_based_on_name.csv'):
    print("File already exists")
else:
    potential_duplicates_df_based_on_name.to_csv("../data/02_potential_isic_duplicates_based_on_name.csv", index=False)

File already exists


In [8]:
# Label every row via the project helper `size_difference_category`,
# then (over)write the CSV so the file on disk includes the new column.
potential_duplicates_df_based_on_name['Size Difference Category (%)'] = (
    potential_duplicates_df_based_on_name['Size Difference (%)']
    .apply(size_difference_category)
)
potential_duplicates_df_based_on_name.to_csv(
    '../data/02_potential_isic_duplicates_based_on_name.csv',
    index=False,
)

In [9]:
# Inspect the candidate table, which now carries the 'Size Difference Category (%)' column.
potential_duplicates_df_based_on_name

Unnamed: 0,Kaggle Dataset Name,Kaggle Dataset URL,Kaggle Size (MB),Original Dataset Name,Orginal Dataset URL,Original Size (MB),Size Difference (%),Size Difference Category (%)
0,ISIC 2020 JPG 256x256 RESIZED,https://www.kaggle.com/nischaydnk/isic-2020-jp...,595.0,ISIC 2020 Challenge,https://challenge.isic-archive.com/data/#2020,30413.0,98.04,60-100%
1,ISIC 2019 JPG 224x224 RESIZED,https://www.kaggle.com/nischaydnk/isic-2019-jp...,355.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,13154.0,97.30,60-100%
2,JPEG ISIC 2019 512x512,https://www.kaggle.com/cdeotte/jpeg-isic2019-5...,1024.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,13154.0,92.22,60-100%
3,ISIC 2019 TFRecords 512x512,https://www.kaggle.com/cdeotte/isic2019-512x512,1024.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,13154.0,92.22,60-100%
4,ISIC 2019 JPG 256x256 RESIZED,https://www.kaggle.com/nischaydnk/isic-2019-jp...,445.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,13154.0,96.62,60-100%
...,...,...,...,...,...,...,...,...
133,ISIC2019-2018-ITA-Based-Grouped,https://www.kaggle.com/sanazmovahed/isic2019-2...,9216.0,ISIC 2018 Challenge,https://challenge.isic-archive.com/data/#2018,16261.0,43.32,30-60%
134,ham10000 skin cancer isic 2018 dataset,https://www.kaggle.com/dharjoy/ham10000-skin-c...,2048.0,ISIC 2018 Challenge,https://challenge.isic-archive.com/data/#2018,16261.0,87.41,60-100%
135,SIIM-ISIC-Melanoma-Classification-2020-Resized...,https://www.kaggle.com/chirag94/siimisicmelano...,3072.0,ISIC 2020 Challenge,https://challenge.isic-archive.com/data/#2020,30413.0,89.90,60-100%
136,ISIC 2016 ORIGINAL DATASET 5 FOLDED WITH LABEL,https://www.kaggle.com/mahmudulhasantasin/isic...,3072.0,ISIC 2016 Challenge,https://challenge.isic-archive.com/data/#2016,4763.0,35.50,30-60%


In [10]:
# Show the candidates ordered from smallest to largest size difference (ascending is the default).
potential_duplicates_df_based_on_name.sort_values('Size Difference (%)')

Unnamed: 0,Kaggle Dataset Name,Kaggle Dataset URL,Kaggle Size (MB),Original Dataset Name,Orginal Dataset URL,Original Size (MB),Size Difference (%),Size Difference Category (%)
40,ISIC 2017 Original Dataset,https://www.kaggle.com/mahmudulhasantasin/isic...,12288.000000,ISIC 2017 Challenge,https://challenge.isic-archive.com/data/#2017,12358.0,0.57,1-30%
21,ISIC-2017,https://www.kaggle.com/johnchfr/isic-2017,12288.000000,ISIC 2017 Challenge,https://challenge.isic-archive.com/data/#2017,12358.0,0.57,1-30%
81,ISIC 2017 Task 3 Dataset,https://www.kaggle.com/srijankundu/isic-2017-t...,12288.000000,ISIC 2017 Challenge,https://challenge.isic-archive.com/data/#2017,12358.0,0.57,1-30%
56,ISIC Challenge Dataset-2020,https://www.kaggle.com/sumaiyabinteshahid/isic...,30720.000000,ISIC 2020 Challenge,https://challenge.isic-archive.com/data/#2020,30413.0,1.01,1-30%
104,ISIC 2020,https://www.kaggle.com/shakiburrahmanasif/isic...,32768.000000,ISIC 2020 Challenge,https://challenge.isic-archive.com/data/#2020,30413.0,7.74,1-30%
...,...,...,...,...,...,...,...,...
39,Skin Cancer Dataset ISIC 2016,https://www.kaggle.com/mdforiduzzamanzihad/ski...,7.000000,ISIC 2016 Challenge,https://challenge.isic-archive.com/data/#2016,4763.0,99.85,60-100%
78,SIIM-ISIC-2020,https://www.kaggle.com/abdulrahmanamukhlif/sii...,14.000000,ISIC 2020 Challenge,https://challenge.isic-archive.com/data/#2020,30413.0,99.95,60-100%
115,Deep Features Skin Cancer ISIC 2017,https://www.kaggle.com/usmansadiq/deep-feature...,1.000000,ISIC 2017 Challenge,https://challenge.isic-archive.com/data/#2017,12358.0,99.99,60-100%
77,ISIC 2019 Pseudo Patient Labels,https://www.kaggle.com/feidawei/isic-2019-pseu...,0.165039,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,13154.0,100.00,60-100%


In [11]:
# Median of the 'Size Difference (%)' column over all candidate rows.
potential_duplicates_df_based_on_name['Size Difference (%)'].median()

np.float64(83.545)

## Matched datasets picked for further comparison

In [12]:
# Disabled one-off step: draws one row per 'Size Difference Category (%)' and saves it to CSV.
# The next cell loads that saved CSV instead of re-sampling.
# NOTE(review): sample() is called without random_state, so re-enabling these lines would
# pick different rows and overwrite the saved file — presumably why they are commented out; confirm.
# potential_duplicates_df_based_on_name_sample_for_comparison = potential_duplicates_df_based_on_name.groupby('Size Difference Category (%)').sample(n=1)
# potential_duplicates_df_based_on_name_sample_for_comparison
# potential_duplicates_df_based_on_name_sample_for_comparison.to_csv('../data/potential_duplicates_df_based_on_name_sample_for_comparison.csv', index=False)

In [13]:
# Load the previously saved comparison sample from disk and display it.
potential_duplicates_df_based_on_name_sample_for_comparison = pd.read_csv(
    "../data/potential_duplicates_df_based_on_name_sample_for_comparison.csv"
)
potential_duplicates_df_based_on_name_sample_for_comparison

Unnamed: 0,Kaggle Dataset Name,Kaggle Dataset URL,Kaggle Size (MB),Original Dataset Name,Orginal Dataset URL,Original Size (MB),Size Difference (%),Size Difference Category (%)
0,ISIC-2018,https://www.kaggle.com/trantoanthang/isic-2018,13312.0,ISIC 2018 Challenge,https://challenge.isic-archive.com/data/#2018,16261.0,18.14,1-30%
1,ISIC 2017 1 FOLD,https://www.kaggle.com/mahmudulhasantasin/isic...,6144.0,ISIC 2017 Challenge,https://challenge.isic-archive.com/data/#2017,12358.0,50.28,30-60%
2,ISIC 2019 Preprocessed Dataset,https://www.kaggle.com/segijaganath/isic-2019-...,3072.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,13154.0,76.65,60-100%


## Histogram of "Size Difference (%)"

In [14]:
# Bucket 'Size Difference (%)' into three ranges and plot how many rows fall into each.
# The first bucket spans 0-30 (include_lowest=True) even though its label reads '1-30%'.
# pd.cut maps values outside [0, 100] to NaN, so such rows would not be counted in any bucket.
bins = [0, 30, 60, 100]
labels = ['1-30%', '30-60%', '60-100%']
potential_duplicates_df_based_on_name['Binned Size Difference'] = pd.cut(
    potential_duplicates_df_based_on_name['Size Difference (%)'],
    bins=bins,
    labels=labels,
    include_lowest=True,  # an exact 0 lands in the first bucket instead of becoming NaN
)

fig = px.histogram(
    potential_duplicates_df_based_on_name,
    x='Binned Size Difference',
    text_auto=True,
    # Reuse `labels` so the axis order cannot drift from the bucket definitions above.
    category_orders={'Binned Size Difference': labels},
    color_discrete_sequence=['#2d3546'],
    # A title lets the figure stand alone when the notebook is skimmed.
    title="Size difference between name-matched Kaggle datasets and original ISIC challenge datasets",
)

fig.update_layout(
    xaxis_title="Size Difference (%)",
    yaxis_title="Count",
)

fig.show()

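Most of the Kaggle datasets that were matched to an original ISIC challenge dataset by the year in their title differ substantially in size from that original: the 60-100% "Size Difference (%)" bucket clearly dominates the histogram, which is consistent with the median size difference of about 83.5% computed above.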