# MD5 Hash Comparison of Images

## Imports

In [1]:
import os
import pandas as pd
import hashlib

In [2]:
def compute_file_hashes(image_folder, dataset_name, chunk_size=1024 * 1024):
    """Compute the MD5 hash of every image file found under a folder.

    The folder is walked recursively. Only files whose extension
    (case-insensitive) is ``.jpg``, ``.jpeg`` or ``.png`` are hashed. The hash
    is computed over the raw file bytes, so only byte-identical files share a
    hash.

    Args:
        image_folder: Root directory to search.
        dataset_name: Label stored in the ``"Dataset"`` field of every record.
        chunk_size: Number of bytes read per iteration while hashing. Files are
            streamed in chunks so large images are never loaded fully into
            memory.

    Returns:
        A list of dicts, one per hashed image, with the keys ``"Dataset"``,
        ``"Subfolder"`` (directory relative to ``image_folder``),
        ``"FilePath"``, ``"FileName"`` and ``"FileHash"`` (hex MD5 digest).
        Records are ordered deterministically: directories and file names are
        visited in sorted order.

    Files that cannot be opened or read are reported on stdout and skipped.
    """
    image_extensions = {".jpg", ".jpeg", ".png"}
    hashes = []
    for root, dirs, files in os.walk(image_folder):
        # Sorting in place makes os.walk descend in a stable order, so the
        # resulting table is reproducible across filesystems and runs.
        dirs.sort()
        subfolder = os.path.relpath(root, image_folder)
        for file_name in sorted(files):
            if os.path.splitext(file_name)[1].lower() not in image_extensions:
                continue
            file_path = os.path.join(root, file_name)
            try:
                # Stream the file in binary mode, feeding the hash chunk by chunk.
                digest = hashlib.md5()
                with open(file_path, "rb") as f:
                    for chunk in iter(lambda: f.read(chunk_size), b""):
                        digest.update(chunk)
            except OSError as e:
                # Best effort: one unreadable file must not abort the whole scan.
                print(f"Error processing {file_path}: {e}")
                continue
            hashes.append({
                "Dataset": dataset_name,
                "Subfolder": subfolder,
                "FilePath": file_path,
                "FileName": file_name,
                "FileHash": digest.hexdigest()
            })
    return hashes

In [3]:
# Each entry pairs a "Duplicate" dataset with the "Original" dataset it is
# compared against. Every side carries the folder "Path" and a display "Name".
dataset_pairs = [
    {
        "Duplicate": {"Path": duplicate_path, "Name": duplicate_name},
        "Original": {"Path": original_path, "Name": original_name},
    }
    for duplicate_path, duplicate_name, original_path, original_name in [
        (
            "../data/duplicate_data/ISIC-2017-1-FOLD",
            "ISIC-2017-1-FOLD",
            "../data/original_data/ISIC-2017-Challenge",
            "ISIC-2017-Challenge",
        ),
        (
            "../data/duplicate_data/ISIC-2018",
            "ISIC-2018",
            "../data/original_data/ISIC-2018-Challenge",
            "ISIC-2018-Challenge",
        ),
        (
            "../data/duplicate_data/ISIC-2019-Preprocessed-Dataset",
            "ISIC-2019-Preprocessed-Dataset",
            "../data/original_data/ISIC-2019-Challenge",
            "ISIC-2019-Challenge",
        ),
    ]
]

In [4]:
# Hash every image of every dataset, visiting the "Duplicate" side of each pair
# before its "Original" side. All records are collected into one flat list.
all_hashes = []
for pair in dataset_pairs:
    for dataset_type in ["Duplicate", "Original"]:
        dataset_info = pair[dataset_type]
        dataset_path = dataset_info["Path"]
        dataset_name = dataset_info["Name"]
        # os.walk ignores a missing directory by default and simply yields
        # nothing, which would later look like "0 duplicates found". Surface
        # the problem explicitly instead of hashing an empty dataset.
        if not os.path.isdir(dataset_path):
            print(f"WARNING: folder for {dataset_name} not found, skipping: {dataset_path}")
            continue
        print(f"Computing hashes for {dataset_name} dataset ...")
        hashes = compute_file_hashes(dataset_path, dataset_name)
        all_hashes.extend(hashes)

Computing hashes for ISIC-2017-1-FOLD dataset ...
Computing hashes for ISIC-2017-Challenge dataset ...
Computing hashes for ISIC-2018 dataset ...
Computing hashes for ISIC-2018-Challenge dataset ...
Computing hashes for ISIC-2019-Preprocessed-Dataset dataset ...
Computing hashes for ISIC-2019-Challenge dataset ...


In [5]:
# Collect every hash record into one table and persist it for later inspection.
hashes_df = pd.DataFrame(all_hashes)
hashes_csv_path = "../data/hashes/file_hashes_by_pair.csv"
# to_csv does not create missing parent directories; make sure the output
# folder exists so a fresh checkout does not fail after all hashing is done.
os.makedirs(os.path.dirname(hashes_csv_path), exist_ok=True)
hashes_df.to_csv(hashes_csv_path, index=False)
print(f"Total images hashed: {len(hashes_df)}")

Total images hashed: 84882


In [6]:
def compare_hashes_for_pair(hashes_df, duplicate_dataset_name, original_dataset_name):
    """Find files of two datasets that share the same hash.

    Rows of ``hashes_df`` are selected by their ``"Dataset"`` value, once for
    each of the two dataset names, and the two selections are merged on
    ``"FileHash"``. Every remaining column appears twice in the result, with
    the suffix ``_Duplicate`` for the first dataset and ``_Original`` for the
    second.

    Args:
        hashes_df: DataFrame with at least the columns ``"Dataset"`` and
            ``"FileHash"``.
        duplicate_dataset_name: ``"Dataset"`` value selecting the left side.
        original_dataset_name: ``"Dataset"`` value selecting the right side.

    Returns:
        A list of dicts, one per matching (duplicate, original) row pair.
    """
    in_duplicate_set = hashes_df["Dataset"] == duplicate_dataset_name
    in_original_set = hashes_df["Dataset"] == original_dataset_name

    left_side = hashes_df[in_duplicate_set].reset_index(drop=True)
    right_side = hashes_df[in_original_set].reset_index(drop=True)

    # Equal hashes on both sides mean byte-identical files.
    matches = left_side.merge(
        right_side,
        on="FileHash",
        suffixes=("_Duplicate", "_Original"),
    )
    return matches.to_dict("records")

In [7]:
# For each configured pair: find hash matches between the duplicate and the
# original dataset, save them to a per-pair CSV, and accumulate them for the
# combined report.
all_duplicate_pairs = []

for pair in dataset_pairs:
    duplicate_name, original_name = pair["Duplicate"]["Name"], pair["Original"]["Name"]

    duplicate_pairs = compare_hashes_for_pair(hashes_df, duplicate_name, original_name)
    all_duplicate_pairs += duplicate_pairs

    # One CSV per dataset pair, named after both datasets.
    csv_filename = f"../data/hashes/duplicate_images_{duplicate_name}_vs_{original_name}.csv"
    duplicates_df = pd.DataFrame(duplicate_pairs)
    duplicates_df.to_csv(csv_filename, index=False)
    print(f"Total duplicate pairs found between {duplicate_name} and {original_name}: {len(duplicates_df)}\n")

Total duplicate pairs found between ISIC-2017-1-FOLD and ISIC-2017-Challenge: 1381

Total duplicate pairs found between ISIC-2018 and ISIC-2018-Challenge: 3694

Total duplicate pairs found between ISIC-2019-Preprocessed-Dataset and ISIC-2019-Challenge: 0



In [8]:
# Combined report: the matches of every dataset pair in a single table.
all_duplicates_df = pd.DataFrame(all_duplicate_pairs)
all_duplicates_df.to_csv(
    path_or_buf="../data/hashes/all_duplicate_images_pairs.csv",
    index=False,
)
print(f"Total duplicate pairs found across all dataset pairs: {all_duplicates_df.shape[0]}")

Total duplicate pairs found across all dataset pairs: 5075
