# Comparison of dataset sizes to identify potential duplicates based on size of the dataset (contentSize) with name being a condition as well

In [76]:
import pandas as pd
import re

In [77]:
df_kaggle_isic_metadata = pd.read_csv("../data/01_isic_datasets_metadata.csv")
df_original_isic_metadata = pd.read_csv("../data/isic_challange_datasets.csv")

In [78]:
df_kaggle_isic_metadata

Unnamed: 0,@context.@language,@context.@vocab,@type,name,alternateName,description,url,identifier,creator.@type,creator.name,...,distribution_0.@type,contentUrl,contentSize,encodingFormat,isPrivate,downloadCount,viewCount,voteCount,usabilityRating,conformsTo
0,en,https://schema.org/,Dataset,Skin Cancer ISIC,The skin cancer data. Contains 9 classes of sk...,,https://www.kaggle.com/nodoubttome/skin-cancer...,319080,Person,Andrey Katanskiy,...,DataDownload,https://www.kaggle.com/datasets/nodoubttome/sk...,2048.0,application/zip,False,16045,130612,217,0.750000,http://mlcommons.org/croissant/1.0
1,en,https://schema.org/,Dataset,All ISIC Data 20240629,All images and metadata in ISIC archive.,,https://www.kaggle.com/tomooinubushi/all-isic-...,5302785,Person,tomoo inubushi,...,DataDownload,https://www.kaggle.com/datasets/tomooinubushi/...,75776.0,application/zip,False,365,3403,55,0.764706,http://mlcommons.org/croissant/1.0
2,en,https://schema.org/,Dataset,ISIC 2020 JPG 256x256 RESIZED,,,https://www.kaggle.com/nischaydnk/isic-2020-jp...,5295545,Person,Nischay Dhankhar,...,DataDownload,https://www.kaggle.com/datasets/nischaydnk/isi...,595.0,application/zip,False,677,1997,48,0.882353,http://mlcommons.org/croissant/1.0
3,en,https://schema.org/,Dataset,ISIC 2019 JPG 224x224 RESIZED,ISIC 2019 resized dataset,,https://www.kaggle.com/nischaydnk/isic-2019-jp...,5295517,Person,Nischay Dhankhar,...,DataDownload,https://www.kaggle.com/datasets/nischaydnk/isi...,355.0,application/zip,False,552,1865,39,0.941176,http://mlcommons.org/croissant/1.0
4,en,https://schema.org/,Dataset,JPEG ISIC 2019 512x512,,,https://www.kaggle.com/cdeotte/jpeg-isic2019-5...,762203,Person,Chris Deotte,...,DataDownload,https://www.kaggle.com/datasets/cdeotte/jpeg-i...,1024.0,application/zip,False,2423,7074,54,0.588235,http://mlcommons.org/croissant/1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851,en,https://schema.org/,Dataset,4000ISIC19Balanced,,,https://www.kaggle.com/manirujjamanmonir/4000i...,3374258,Person,Manirujjaman Monir,...,DataDownload,https://www.kaggle.com/datasets/manirujjamanmo...,6144.0,application/zip,False,2,73,0,0.000000,http://mlcommons.org/croissant/1.0
852,en,https://schema.org/,Dataset,melanoma_isic,,,https://www.kaggle.com/chitrapsg/melanoma-isic,3501446,Person,Chitra Govindasamy,...,DataDownload,https://www.kaggle.com/datasets/chitrapsg/mela...,786.0,application/zip,False,3,75,0,0.000000,http://mlcommons.org/croissant/1.0
853,en,https://schema.org/,Dataset,siim_isic_2020_leukemia_dataset,,,https://www.kaggle.com/rajibbag1/siim-isic-202...,4911856,Person,RAJIB BAG_1,...,DataDownload,https://www.kaggle.com/datasets/rajibbag1/siim...,2048.0,application/zip,False,0,18,0,0.000000,http://mlcommons.org/croissant/1.0
854,en,https://schema.org/,Dataset,data_isic1718,,,https://www.kaggle.com/bugakakak/data-isic1718,4788617,Person,bugakakak,...,DataDownload,https://www.kaggle.com/datasets/bugakakak/data...,261.0,application/zip,False,0,16,0,0.000000,http://mlcommons.org/croissant/1.0


In [79]:
df_original_isic_metadata

Unnamed: 0,name,url,contentSize
0,ISIC 2016 Challenge,https://challenge.isic-archive.com/data/#2016,4219.0
1,ISIC 2017 Challenge,https://challenge.isic-archive.com/data/#2017,12708.0
2,ISIC 2018 Challenge,https://challenge.isic-archive.com/data/#2018,15880.0
3,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,12700.0
4,ISIC 2020 Challenge,https://challenge.isic-archive.com/data/#2020,29700.0


In [80]:
df_kaggle_isic_metadata = df_kaggle_isic_metadata.drop(df_kaggle_isic_metadata[df_kaggle_isic_metadata.contentSize == "Unknown"].index)
df_kaggle_isic_metadata['contentSize'] = df_kaggle_isic_metadata['contentSize'].astype(float)

In [81]:
# extrating the year from the name of the dataset so it can be used to compare the datasets

def extract_year(name):
    match = re.search(r'\b(20\d{2})\b', name)
    return int(match.group(0)) if match else None

df_original_isic_metadata['Year'] = df_original_isic_metadata['name'].apply(extract_year)
df_kaggle_isic_metadata['Year'] = df_kaggle_isic_metadata['name'].apply(extract_year)

In [82]:
def find_potential_duplicates(original_df, kaggle_df, threshold=30):
    
    duplicates = []

    for index, kaggle_row in kaggle_df.iterrows():
        kaggle_content_size = kaggle_row['contentSize']
        kaggle_name = kaggle_row['name']
        kaggle_url = kaggle_row['url']
        kaggle_year = kaggle_row['Year']

        for _, original_row in original_df.iterrows():
            original_size = original_row['contentSize']
            original_name = original_row['name']
            orginal_url = original_row['url']
            original_year = original_row['Year']

            # percentage difference in contentSize
            percentage_difference = round(abs(kaggle_content_size - original_size) / original_size * 100, 2)

            # checking if the difference is within the threshold and years match
            if percentage_difference <= threshold and kaggle_year == original_year:
                duplicates.append({
                    'Kaggle Dataset Name': kaggle_name,
                    "Kaggle Dataset URL": kaggle_url,
                    'Kaggle Size (MB)': kaggle_content_size,
                    'Original Dataset Name': original_name,
                    'Orginal Dataset URL': orginal_url,
                    'Original Size (MB)': original_size,
                    'Size Difference (%)': percentage_difference,
                })

    return pd.DataFrame(duplicates)

In [83]:
potential_duplicates_df = find_potential_duplicates(df_original_isic_metadata, df_kaggle_isic_metadata)
potential_duplicates_df.to_csv('../data/02_potential_isic_duplicates_based_on_contentSize_and_name.csv', index=False)

In [84]:
potential_duplicates_df

Unnamed: 0,Kaggle Dataset Name,Kaggle Dataset URL,Kaggle Size (MB),Original Dataset Name,Orginal Dataset URL,Original Size (MB),Size Difference (%)
0,ISIC 2019 Skin Lesion images for classification,https://www.kaggle.com/salviohexia/isic-2019-s...,9216.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,12700.0,27.43
1,ISIC-2017,https://www.kaggle.com/johnchfr/isic-2017,12288.0,ISIC 2017 Challenge,https://challenge.isic-archive.com/data/#2017,12708.0,3.31
2,ISIC 2019,https://www.kaggle.com/graf10a/isic-2019,9216.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,12700.0,27.43
3,ISIC 2019 7 classes,https://www.kaggle.com/gabrielmv/isic-2019-7-c...,9216.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,12700.0,27.43
4,SIIM-ISIC-2020,https://www.kaggle.com/prashantjeswani/siimisi...,23552.0,ISIC 2020 Challenge,https://challenge.isic-archive.com/data/#2020,29700.0,20.7
5,ISIC 2017/2018,https://www.kaggle.com/rychardguedes/isic-2017...,14336.0,ISIC 2017 Challenge,https://challenge.isic-archive.com/data/#2017,12708.0,12.81
6,ISIC-2018,https://www.kaggle.com/trantoanthang/isic-2018,13312.0,ISIC 2018 Challenge,https://challenge.isic-archive.com/data/#2018,15880.0,16.17
7,SIIM-ISIC-2019,https://www.kaggle.com/prashantjeswani/siim-is...,9216.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,12700.0,27.43
8,"ISIC-2019 skin disease dataset (test ,train, val)",https://www.kaggle.com/mdefajalam/isic-2019-sk...,9216.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,12700.0,27.43
9,ISIC 2017 Original Dataset,https://www.kaggle.com/mahmudulhasantasin/isic...,12288.0,ISIC 2017 Challenge,https://challenge.isic-archive.com/data/#2017,12708.0,3.31


In [85]:
potential_duplicates_df_based_on_name = pd.read_csv("../data/02_potential_isic_duplicates_based_on_name.csv")
potential_duplicates_df_based_on_name

Unnamed: 0,Kaggle Dataset Name,Kaggle Dataset URL,Kaggle Size (MB),Original Dataset Name,Orginal Dataset URL,Original Size (MB),Size Difference (%)
0,ISIC 2020 JPG 256x256 RESIZED,https://www.kaggle.com/nischaydnk/isic-2020-jp...,595.0,ISIC 2020 Challenge,https://challenge.isic-archive.com/data/#2020,29700.0,98.00
1,ISIC 2019 JPG 224x224 RESIZED,https://www.kaggle.com/nischaydnk/isic-2019-jp...,355.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,12700.0,97.20
2,JPEG ISIC 2019 512x512,https://www.kaggle.com/cdeotte/jpeg-isic2019-5...,1024.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,12700.0,91.94
3,ISIC 2019 TFRecords 512x512,https://www.kaggle.com/cdeotte/isic2019-512x512,1024.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,12700.0,91.94
4,ISIC 2019 JPG 256x256 RESIZED,https://www.kaggle.com/nischaydnk/isic-2019-jp...,445.0,ISIC 2019 Challenge,https://challenge.isic-archive.com/data/#2019,12700.0,96.50
...,...,...,...,...,...,...,...
133,ISIC2019-2018-ITA-Based-Grouped,https://www.kaggle.com/sanazmovahed/isic2019-2...,9216.0,ISIC 2018 Challenge,https://challenge.isic-archive.com/data/#2018,15880.0,41.96
134,ham10000 skin cancer isic 2018 dataset,https://www.kaggle.com/dharjoy/ham10000-skin-c...,2048.0,ISIC 2018 Challenge,https://challenge.isic-archive.com/data/#2018,15880.0,87.10
135,SIIM-ISIC-Melanoma-Classification-2020-Resized...,https://www.kaggle.com/chirag94/siimisicmelano...,3072.0,ISIC 2020 Challenge,https://challenge.isic-archive.com/data/#2020,29700.0,89.66
136,ISIC 2016 ORIGINAL DATASET 5 FOLDED WITH LABEL,https://www.kaggle.com/mahmudulhasantasin/isic...,3072.0,ISIC 2016 Challenge,https://challenge.isic-archive.com/data/#2016,4219.0,27.19


In [86]:
# TO-DO A distribution plot of size difference