# Identifying potential duplicates of the official ISIC challenge datasets among Kaggle datasets by comparing dataset size and year

In [27]:
import pandas as pd
import re

In [28]:
# Metadata of the ISIC-related datasets found on Kaggle.
df_kaggle_isic_metadata = pd.read_csv(
    filepath_or_buffer="../data/01_isic_datasets_metadata.csv"
)
# Reference table of the official ISIC challenge datasets.
df_original_isic_metadata = pd.read_csv(
    filepath_or_buffer="../data/isic_challange_datasets.csv"
)

In [10]:
# Show the Kaggle metadata frame as the cell output (the rendering below is abbreviated).
df_kaggle_isic_metadata

Unnamed: 0,@context.@language,@context.@vocab,@type,name,alternateName,description,url,identifier,creator.@type,creator.name,...,distribution_0.@type,contentUrl,contentSize,encodingFormat,isPrivate,downloadCount,viewCount,voteCount,usabilityRating,conformsTo
0,en,https://schema.org/,Dataset,Skin Cancer ISIC,The skin cancer data. Contains 9 classes of sk...,,https://www.kaggle.com/nodoubttome/skin-cancer...,319080,Person,Andrey Katanskiy,...,DataDownload,https://www.kaggle.com/datasets/nodoubttome/sk...,2048.0,application/zip,False,16007,130425,216,0.750000,http://mlcommons.org/croissant/1.0
1,en,https://schema.org/,Dataset,All ISIC Data 20240629,All images and metadata in ISIC archive.,,https://www.kaggle.com/tomooinubushi/all-isic-...,5302785,Person,tomoo inubushi,...,DataDownload,https://www.kaggle.com/datasets/tomooinubushi/...,75776.0,application/zip,False,365,3392,55,0.764706,http://mlcommons.org/croissant/1.0
2,en,https://schema.org/,Dataset,ISIC 2020 JPG 256x256 RESIZED,,,https://www.kaggle.com/nischaydnk/isic-2020-jp...,5295545,Person,Nischay Dhankhar,...,DataDownload,https://www.kaggle.com/datasets/nischaydnk/isi...,595.0,application/zip,False,676,1981,48,0.882353,http://mlcommons.org/croissant/1.0
3,en,https://schema.org/,Dataset,ISIC 2019 JPG 224x224 RESIZED,ISIC 2019 resized dataset,,https://www.kaggle.com/nischaydnk/isic-2019-jp...,5295517,Person,Nischay Dhankhar,...,DataDownload,https://www.kaggle.com/datasets/nischaydnk/isi...,355.0,application/zip,False,551,1855,39,0.941176,http://mlcommons.org/croissant/1.0
4,en,https://schema.org/,Dataset,JPEG ISIC 2019 512x512,,,https://www.kaggle.com/cdeotte/jpeg-isic2019-5...,762203,Person,Chris Deotte,...,DataDownload,https://www.kaggle.com/datasets/cdeotte/jpeg-i...,1024.0,application/zip,False,2423,7074,54,0.588235,http://mlcommons.org/croissant/1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852,en,https://schema.org/,Dataset,4000ISIC19Balanced,,,https://www.kaggle.com/manirujjamanmonir/4000i...,3374258,Person,Manirujjaman Monir,...,DataDownload,https://www.kaggle.com/datasets/manirujjamanmo...,6144.0,application/zip,False,2,73,0,0.000000,http://mlcommons.org/croissant/1.0
853,en,https://schema.org/,Dataset,melanoma_isic,,,https://www.kaggle.com/chitrapsg/melanoma-isic,3501446,Person,Chitra Govindasamy,...,DataDownload,https://www.kaggle.com/datasets/chitrapsg/mela...,786.0,application/zip,False,3,75,0,0.000000,http://mlcommons.org/croissant/1.0
854,en,https://schema.org/,Dataset,siim_isic_2020_leukemia_dataset,,,https://www.kaggle.com/rajibbag1/siim-isic-202...,4911856,Person,RAJIB BAG_1,...,DataDownload,https://www.kaggle.com/datasets/rajibbag1/siim...,2048.0,application/zip,False,0,18,0,0.000000,http://mlcommons.org/croissant/1.0
855,en,https://schema.org/,Dataset,data_isic1718,,,https://www.kaggle.com/bugakakak/data-isic1718,4788617,Person,bugakakak,...,DataDownload,https://www.kaggle.com/datasets/bugakakak/data...,261.0,application/zip,False,0,16,0,0.000000,http://mlcommons.org/croissant/1.0


In [11]:
# Show the official ISIC challenge reference table (name and contentSize per challenge).
df_original_isic_metadata

Unnamed: 0,name,contentSize
0,ISIC 2016 Challenge,4219.0
1,ISIC 2017 Challenge,12708.0
2,ISIC 2018 Challenge,15880.0
3,ISIC 2019 Challenge,12700.0
4,ISIC 2020 Challenge,29700.0


In [12]:
# Keep only rows whose size is known, then make the size column numeric so it
# can be compared arithmetically against the official dataset sizes.
df_kaggle_isic_metadata = (
    df_kaggle_isic_metadata
    .loc[lambda frame: frame["contentSize"] != "Unknown"]
    .astype({"contentSize": float})
)

In [21]:
def extract_year(name):
    """Return the first four-digit year of the form 20XX found in a dataset name.

    The year must stand alone: it may not be directly preceded or followed by a
    letter or a digit. Underscores count as separators, so names such as
    ``"siim_isic_2020_leukemia_dataset"`` yield 2020 (a plain ``\\b`` boundary
    would miss them because ``_`` is a regex word character).

    Parameters
    ----------
    name : str
        Dataset name to search. Any non-string value (e.g. NaN for a missing
        name) is treated as having no year.

    Returns
    -------
    int or None
        The year as an integer, or None when no stand-alone 20XX is present.
    """
    # Missing names come through pandas as float NaN; re.search would raise on them.
    if not isinstance(name, str):
        return None

    # [^\W_] is "a word character other than underscore", i.e. a letter or digit.
    match = re.search(r'(?<![^\W_])(20\d{2})(?![^\W_])', name)
    return int(match.group(1)) if match else None

# Derive a 'Year' column from the dataset name in both tables so that the
# official and Kaggle entries can later be matched on year.
for metadata_frame in (df_original_isic_metadata, df_kaggle_isic_metadata):
    metadata_frame['Year'] = metadata_frame['name'].apply(extract_year)

In [22]:
def find_potential_duplicates(official_df, kaggle_df, threshold=5):
    """Pair Kaggle datasets with official datasets of the same year and similar size.

    Every Kaggle row is compared with every official row. A pair is reported
    when both rows carry the same ``Year`` and the Kaggle size deviates from the
    official size by at most ``threshold`` percent of the official size.

    Parameters
    ----------
    official_df : pandas.DataFrame
        Official datasets; must have the columns ``name``, ``contentSize`` and ``Year``.
    kaggle_df : pandas.DataFrame
        Kaggle datasets; must have the columns ``name``, ``contentSize`` and ``Year``.
    threshold : float, default 5
        Maximum allowed size difference, in percent of the official size.

    Returns
    -------
    pandas.DataFrame
        One row per matching pair, ordered by Kaggle row and then official row.
        The columns are always present, even when there are no matches. Sizes
        are copied through unchanged; the "MB" in the column labels is only a
        label, no unit conversion happens here.
    """
    result_columns = [
        'Kaggle Dataset Name',
        'Kaggle Size MB',
        'Official Dataset Name',
        'Official Size MB',
        'Size Difference %',
        'Year',
    ]
    compared_fields = ['name', 'contentSize', 'Year']

    # The official table is the same for every Kaggle row, so read it once.
    # Rows with a zero, negative or NaN size are left out: a relative
    # difference against them is undefined and they could never match.
    official_rows = [
        (original_name, original_size, original_year)
        for original_name, original_size, original_year
        in official_df[compared_fields].itertuples(index=False, name=None)
        if original_size > 0
    ]

    duplicates = []
    for kaggle_name, kaggle_content_size, kaggle_year in (
        kaggle_df[compared_fields].itertuples(index=False, name=None)
    ):
        # A dataset without a recognizable year cannot "match" a year.
        if pd.isna(kaggle_year):
            continue

        for original_name, original_size, original_year in official_rows:
            if kaggle_year != original_year:
                continue

            # Percentage difference in size, relative to the official dataset.
            percentage_difference = (
                abs(kaggle_content_size - original_size) / original_size * 100
            )

            if percentage_difference <= threshold:
                duplicates.append({
                    'Kaggle Dataset Name': kaggle_name,
                    'Kaggle Size MB': kaggle_content_size,
                    'Official Dataset Name': original_name,
                    'Official Size MB': original_size,
                    'Size Difference %': percentage_difference,
                    'Year': kaggle_year
                })

    return pd.DataFrame(duplicates, columns=result_columns)

In [25]:
# Find potential duplicates: Kaggle datasets whose year and size are close to
# those of an official ISIC challenge dataset.
potential_duplicates_df = find_potential_duplicates(
    official_df=df_original_isic_metadata,
    kaggle_df=df_kaggle_isic_metadata,
)

# Persist the matches (without the positional index) for later steps.
potential_duplicates_df.to_csv(
    '../data/02_potential_isic_duplicates_based_on_contentSize_and_name.csv',
    index=False,
)

In [26]:
# Show the matched Kaggle/official pairs as the cell output.
potential_duplicates_df

Unnamed: 0,Kaggle Dataset Name,Kaggle Size MB,Official Dataset Name,Official Size MB,Size Difference %,Year
0,ISIC-2017,12288.0,ISIC 2017 Challenge,12708.0,3.305005,2017.0
1,ISIC 2017 Original Dataset,12288.0,ISIC 2017 Challenge,12708.0,3.305005,2017.0
2,ISIC Challenge Dataset-2020,30720.0,ISIC 2020 Challenge,29700.0,3.434343,2020.0
3,ISIC 2017 Task 3 Dataset,12288.0,ISIC 2017 Challenge,12708.0,3.305005,2017.0
