In [25]:
import os

In [26]:
def count_images(directory, recursive=False):
    """Count image files stored in ``directory``.

    An entry counts as an image when it is a regular file (or a symlink that
    resolves to one) whose extension, compared case-insensitively, is
    ``.jpg``, ``.jpeg`` or ``.png``.  Sub-directories are never counted
    themselves, even if their name happens to end in an image extension.

    Args:
        directory: Path of the directory to inspect.
        recursive: When true, images in nested sub-directories are counted
            as well.  Defaults to ``False`` (top level only).

    Returns:
        int: Number of image files found.

    Raises:
        OSError: If ``directory`` (or, when ``recursive`` is set, one of its
            sub-directories) cannot be listed, e.g. ``FileNotFoundError`` or
            ``NotADirectoryError``.
    """
    image_extensions = {'.jpg', '.jpeg', '.png'}

    image_count = 0
    pending = [directory]

    while pending:
        with os.scandir(pending.pop()) as entries:
            for entry in entries:
                if entry.is_file():
                    if os.path.splitext(entry.name)[1].lower() in image_extensions:
                        image_count += 1
                # Symlinked directories are not followed so that a link cycle
                # cannot make the traversal loop forever.
                elif recursive and entry.is_dir(follow_symlinks=False):
                    pending.append(entry.path)

    return image_count

In [27]:
def compare_image_counts(folder_a, folder_b, subfolders=('train', 'test', 'val')):
    """Compare the number of images in matching subfolders of two dataset roots.

    For every name in ``subfolders`` the directories ``folder_a/<name>`` and
    ``folder_b/<name>`` are looked up and their images counted with
    ``count_images``.

    Args:
        folder_a: Root directory of the first dataset.
        folder_b: Root directory of the second dataset.
        subfolders: Names of the subfolders to compare.  Defaults to the
            ``train`` / ``test`` / ``val`` splits.

    Returns:
        dict: Maps each subfolder name to a dict with the keys

        * ``'Folder A'`` / ``'Folder B'`` - the image count (``int``), or the
          string ``'Directory not found'`` when that path is not a directory;
        * ``'Match'`` - ``True``/``False`` whether the two counts are equal,
          ``False`` when exactly one directory is missing, or the string
          ``'N/A'`` when both are missing.
    """
    not_found = 'Directory not found'
    results = {}

    for subfolder in subfolders:
        dir_a = os.path.join(folder_a, subfolder)
        dir_b = os.path.join(folder_b, subfolder)

        # isdir (rather than exists) so that a regular file that happens to be
        # called e.g. "train" is reported as missing instead of crashing the
        # directory listing.
        count_a = count_images(dir_a) if os.path.isdir(dir_a) else not_found
        count_b = count_images(dir_b) if os.path.isdir(dir_b) else not_found

        if count_a is not_found and count_b is not_found:
            match = 'N/A'
        else:
            # An int never equals the sentinel string, so a single missing
            # directory naturally yields False here.
            match = count_a == count_b

        results[subfolder] = {
            'Folder A': count_a,
            'Folder B': count_b,
            'Match': match,
        }

    return results

In [28]:
def compare_multiple_datasets(folder_pairs):
    """Run ``compare_image_counts`` over several pairs of dataset roots.

    Args:
        folder_pairs: Iterable of ``(folder_a, folder_b)`` pairs.

    Returns:
        dict: One entry per pair, keyed by the label
        ``"Pair <n> (<folder_a> vs <folder_b>)"`` (``n`` starts at 1) and
        holding the result of ``compare_image_counts`` for that pair.
    """
    return {
        f"Pair {pair_number} ({first_root} vs {second_root})":
            compare_image_counts(first_root, second_root)
        for pair_number, (first_root, second_root) in enumerate(folder_pairs, start=1)
    }

In [29]:
# Each tuple is (folder A, folder B); the two roots are compared subfolder by
# subfolder.
# NOTE(review): the "Challange" spelling presumably mirrors the directory names
# on disk - confirm before "correcting" it.
folder_pairs = [
    (
        "../data/duplicate_data/ISIC-2017-1-FOLD",
        "../data/original_data/ISIC-2017-Challange",
    ),
    (
        "../data/duplicate_data/ISIC-2018",
        "../data/original_data/ISIC-2018-Challange",
    ),
    (
        "../data/duplicate_data/ISIC-2019-Preprocessed-Dataset",
        "../data/original_data/ISIC-2019-Challange",
    ),
]

results = compare_multiple_datasets(folder_pairs)

# Plain-text report: one header line per pair, then one line per subfolder.
for pair_label, per_split in results.items():
    print(f"\nResults for {pair_label}:")
    for split_name, row in per_split.items():
        print(f"  {split_name.capitalize()} - Folder A: {row['Folder A']}, Folder B: {row['Folder B']}, Match: {row['Match']}")


Results for Pair 1 (../data/duplicate_data/ISIC-2017-1-FOLD vs ../data/original_data/ISIC-2017-Challange):
  Train - Folder A: 0, Folder B: Directory not found, Match: False
  Test - Folder A: 0, Folder B: Directory not found, Match: False
  Val - Folder A: 0, Folder B: Directory not found, Match: False

Results for Pair 2 (../data/duplicate_data/ISIC-2018 vs ../data/original_data/ISIC-2018-Challange):
  Train - Folder A: 2594, Folder B: 12609, Match: False
  Test - Folder A: 1000, Folder B: 2512, Match: False
  Val - Folder A: 100, Folder B: 293, Match: False

Results for Pair 3 (../data/duplicate_data/ISIC-2019-Preprocessed-Dataset vs ../data/original_data/ISIC-2019-Challange):
  Train - Folder A: 25331, Folder B: 25331, Match: True
  Test - Folder A: Directory not found, Folder B: 8238, Match: False
  Val - Folder A: Directory not found, Folder B: Directory not found, Match: N/A
