# Image Size Comparison

In [36]:
import os
import csv

In [37]:
def count_images_and_get_filenames(directory):
    
    image_extensions = {'.jpg', '.jpeg', '.png'}
    image_filenames = set()
    
    for file_name in os.listdir(directory):
        if os.path.splitext(file_name)[1].lower() in image_extensions:
            image_filenames.add(file_name)
    return len(image_filenames), image_filenames

In [38]:
def compare_image_counts_and_filenames(folder_duplicate, folder_original):
    subfolders = ['train', 'test', 'val']
    results = {}

    for subfolder in subfolders:
        dir_duplicate = os.path.join(folder_duplicate, subfolder)
        dir_original = os.path.join(folder_original, subfolder)

        if os.path.exists(dir_duplicate) and os.path.exists(dir_original):
            
            count_duplicate, filenames_duplicate = count_images_and_get_filenames(dir_duplicate)
            count_original, filenames_original = count_images_and_get_filenames(dir_original)
            difference = count_original - count_duplicate
            missing_in_duplicate = filenames_original - filenames_duplicate 
            missing_in_original = filenames_duplicate - filenames_original 
            common_files = filenames_duplicate & filenames_original

            results[subfolder] = {
                'Duplicate Folder': count_duplicate,
                'Original Folder': count_original,
                'Match': count_duplicate == count_original and len(missing_in_original) == 0 and len(missing_in_duplicate) == 0,
                'Common Files': list(common_files),
                'Missing images in Original Folder': list(missing_in_original),
                'Missing images in Duplicate Folder': list(missing_in_duplicate),
                'Difference in Count': difference
            }
        elif os.path.exists(dir_duplicate) and not os.path.exists(dir_original):
            count_duplicate, filenames_duplicate = count_images_and_get_filenames(dir_duplicate)
            missing_in_original = filenames_duplicate - filenames_original 
            results[subfolder] = {
                'Duplicate Folder': count_duplicate,
                'Original Folder': 'Directory not found',
                'Match': False,
                'Common Files': [],
                'Missing images in Original Folder': list(missing_in_original),
                'Missing images in Duplicate Folder': [],
                'Difference in Count': count_duplicate
            }
        elif not os.path.exists(dir_duplicate) and os.path.exists(dir_original):
            count_original, filenames_original = count_images_and_get_filenames(dir_original)
            missing_in_duplicate = filenames_original - filenames_duplicate 
            results[subfolder] = {
                'Duplicate Folder': 'Directory not found',
                'Original Folder': count_original,
                'Match': False,
                'Common Files': [],
                'Missing images in Original Folder': [],
                'Missing images in Duplicate Folder': list(missing_in_duplicate),
                'Difference in Count': count_original
            }
        else:
            results[subfolder] = {
                'Duplicate Folder': 'Directory not found',
                'Original Folder': 'Directory not found',
                'Match': 'N/A',
                'Common Files': [],
                'Missing images in Original Folder': [],
                'Missing images in Duplicate Folder': [],
                'Difference in Count': 'N/A'
            }

    return results

In [39]:
def compare_multiple_datasets(folder_pairs):
    all_results = {}
    for pair_index, (folder_duplicate, folder_original) in enumerate(folder_pairs, start=1):
        comparison_results = compare_image_counts_and_filenames(folder_duplicate, folder_original)
        all_results[f"Pair {pair_index} ({folder_duplicate} vs {folder_original})"] = comparison_results
    return all_results

In [40]:
folder_pairs = [
    ("../data/duplicate_data/ISIC-2017-1-FOLD", "../data/original_data/ISIC-2017-Challenge"), 
    ("../data/duplicate_data/ISIC-2018", "../data/original_data/ISIC-2018-Challenge"),
    ("../data/duplicate_data/ISIC-2019-Preprocessed-Dataset", "../data/original_data/ISIC-2019-Challenge")
]

results = compare_multiple_datasets(folder_pairs)

for result_description, comparison_result in results.items():
    print(f"\nResults for {result_description}:")
    for subfolder, details in comparison_result.items():
        print(f"  {subfolder.capitalize()} -")
        print(f"    Duplicate Folder Count: {details['Duplicate Folder']}")
        print(f"    Original Folder Count: {details['Original Folder']}")
        print(f"    Difference in Count: {details['Difference in Count']}")
        print(f"    Match: {details['Match']}")
        print(f"    Common Files ({len(details['Common Files'])}): {details['Common Files']}")
        print(f"    Missing in Duplicate Folder ({len(details['Missing images in Duplicate Folder'])}): {details['Missing images in Duplicate Folder']}")
        print(f"    Missing in Original Folder ({len(details['Missing images in Original Folder'])}): {details['Missing images in Original Folder']}")


Results for Pair 1 (../data/duplicate_data/ISIC-2017-1-FOLD vs ../data/original_data/ISIC-2017-Challenge):
  Train -
    Duplicate Folder Count: 821
    Original Folder Count: 4000
    Difference in Count: 3179
    Match: False
    Common Files (599): ['ISIC_0000063.jpg', 'ISIC_0013346.jpg', 'ISIC_0011384.jpg', 'ISIC_0012360.jpg', 'ISIC_0007475.jpg', 'ISIC_0013163.jpg', 'ISIC_0000140.jpg', 'ISIC_0012854.jpg', 'ISIC_0000253.jpg', 'ISIC_0012888.jpg', 'ISIC_0013213.jpg', 'ISIC_0000500.jpg', 'ISIC_0011169.jpg', 'ISIC_0009919.jpg', 'ISIC_0012320.jpg', 'ISIC_0010064.jpg', 'ISIC_0008626.jpg', 'ISIC_0000483.jpg', 'ISIC_0012374.jpg', 'ISIC_0012952.jpg', 'ISIC_0011317.jpg', 'ISIC_0010481.jpg', 'ISIC_0009998.jpg', 'ISIC_0015079.jpg', 'ISIC_0012406.jpg', 'ISIC_0000077.jpg', 'ISIC_0013155.jpg', 'ISIC_0010213.jpg', 'ISIC_0011363.jpg', 'ISIC_0012675.jpg', 'ISIC_0013579.jpg', 'ISIC_0000103.jpg', 'ISIC_0012214.jpg', 'ISIC_0012982.jpg', 'ISIC_0013114.jpg', 'ISIC_0012962.jpg', 'ISIC_0010467.jpg', 'ISIC_