In [2]:
!pip install imagehash


Collecting imagehash
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting PyWavelets (from imagehash)
  Downloading PyWavelets-1.4.1-cp38-cp38-win_amd64.whl.metadata (1.9 kB)
Downloading ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
Downloading PyWavelets-1.4.1-cp38-cp38-win_amd64.whl (4.2 MB)
   ---------------------------------------- 0.0/4.2 MB ? eta -:--:--
   ----------------------------------- ---- 3.7/4.2 MB 18.2 MB/s eta 0:00:01
   ---------------------------------------- 4.2/4.2 MB 17.9 MB/s eta 0:00:00
Installing collected packages: PyWavelets, imagehash
Successfully installed PyWavelets-1.4.1 imagehash-4.3.1


In [3]:
import os
from PIL import Image
import imagehash

def calculate_phash(image_path):
    """Return the perceptual hash (pHash) of an image file as a string.

    Args:
        image_path: Path of the image file, opened with ``PIL.Image.open``.

    Returns:
        str: ``str()`` of the value produced by ``imagehash.phash``.
    """
    # The context manager closes the image file once hashing has finished.
    with Image.open(image_path) as picture:
        perceptual_hash = imagehash.phash(picture)
    return str(perceptual_hash)

def find_duplicate_pairs(folder):
    """Find duplicate image pairs in the given folder and its subfolders.

    Two images count as duplicates when ``calculate_phash`` returns the same
    value for both. Only files whose name ends (case-insensitively) in
    .png, .jpg, .jpeg, .tif or .tiff are hashed.

    Directories and files are visited in sorted order, so the returned list
    is the same on every run and every file system.

    Args:
        folder: Root directory to scan recursively.

    Returns:
        list[tuple[str, str]]: One ``(path_a, path_b)`` tuple for every
        unordered pair of files sharing a hash. A group of ``k`` identical
        images contributes ``k * (k - 1) / 2`` pairs. Empty when there are
        no duplicates.
    """
    # Function-scope import keeps this cell runnable without touching the import lines.
    from itertools import combinations

    image_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
    hash_to_paths = {}

    for root, dirs, files in os.walk(folder):
        # Sorting in place fixes the order in which os.walk descends (top-down mode).
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(image_extensions):
                file_path = os.path.join(root, file)
                hash_to_paths.setdefault(calculate_phash(file_path), []).append(file_path)

    # Every 2-combination inside a hash bucket is a duplicate pair;
    # buckets holding a single path yield nothing.
    duplicate_pairs = []
    for paths in hash_to_paths.values():
        duplicate_pairs.extend(combinations(paths, 2))

    return duplicate_pairs

def generate_ground_truth(folder):
    """Build the ground-truth list of duplicate pairs as bare file names.

    Runs ``find_duplicate_pairs`` on ``folder`` and strips the directory part
    from both paths of every pair. Because only the base name is kept, two
    files with the same name in different subfolders show up as a pair of
    identical names.

    Args:
        folder: Root directory passed on to ``find_duplicate_pairs``.

    Returns:
        list[tuple[str, str]]: ``(file_name_a, file_name_b)`` per duplicate pair,
        in the order the pairs were found.
    """
    ground_truth = []
    for first_path, second_path in find_duplicate_pairs(folder):
        names = (os.path.basename(first_path), os.path.basename(second_path))
        ground_truth.append(names)
    return ground_truth

# Driver: scan one folder for exact-pHash duplicates and print every pair.
# `folder_path` and `ground_truth` are left in the kernel namespace for later cells.
if __name__ == "__main__":
    folder_path = r'C:\Users\uu1n23\Downloads\DeduplicationFinal'  # Machine-specific absolute path; replace with your own image folder
    ground_truth = generate_ground_truth(folder_path)

    # One (file_name_a, file_name_b) tuple per line.
    print("Ground Truth (Duplicate Image Pairs):")
    for pair in ground_truth:
        print(pair)




Ground Truth (Duplicate Image Pairs):
('Agincourt HMS 3_1956 W&L 1036P.jpg', 'RNM 2001_2_1036_P_o2.jpg')
('Agincourt HMS 3_1956 W&L 1036P.jpg', 'Agincourt HMS 3_1956 W&L 1036P.jpg')
('RNM 2001_2_1036_P_o2.jpg', 'Agincourt HMS 3_1956 W&L 1036P.jpg')
('hmp_rmmus_006_74.tif', 'RMM 1974_6.tif')
('hmp_rmmus_006_74.tif', 'hmp_rmmus_006_74.tif')
('RMM 1974_6.tif', 'hmp_rmmus_006_74.tif')
('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71 - Copy.tif')
('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71.tif')
('hmp_rmmus_008_71 - Copy.tif', 'hmp_rmmus_008_71.tif')
('hmp_rmmus_008_74.tif', 'RMM 1974_8.tif')
('hmp_rmmus_008_74.tif', 'hmp_rmmus_008_74.tif')
('RMM 1974_8.tif', 'hmp_rmmus_008_74.tif')
('hmp_rmmus_009_74.tif', 'hmp_rmmus_009_74.tif')
('hmp_rmmus_010_74.tif', 'RMM 1974_10_o4.jpg')
('hmp_rmmus_010_74.tif', 'hmp_rmmus_010_74.tif')
('RMM 1974_10_o4.jpg', 'hmp_rmmus_010_74.tif')
('hmp_rmmus_011_74.tif', 'RMM 1974_11.tif')
('hmp_rmmus_011_74.tif', 'hmp_rmmus_011_74.tif')
('RMM 1974_11.tif', 'hmp_rmmus_011_

In [4]:
# Number of duplicate pairs; relies on `ground_truth` already being defined in the kernel by a previous cell.
print(len(ground_truth))

32


Use the version below (exact pHash match; pairs are reported by file name only).

In [12]:
import imagehash
from PIL import Image
import os

def calculate_phash(image_path):
    """Calculate the perceptual hash (pHash) of an image.

    Args:
        image_path: Path of the image file, opened with ``PIL.Image.open``.

    Returns:
        The value returned by ``imagehash.phash`` for the image. It is used
        as a dictionary key by ``find_duplicates``.
    """
    # Open through a context manager so the file handle is released after
    # hashing instead of staying open for every image in the scan.
    with Image.open(image_path) as img:
        return imagehash.phash(img)

def find_duplicates(folder):
    """List images whose pHash equals that of an image seen earlier in the scan.

    Walks ``folder`` recursively and hashes every file whose name ends
    (case-insensitively) in .png, .jpg, .jpeg, .tif or .tiff. The first file
    met for each hash is remembered; every later file with the same hash is
    reported against that first file. Which file counts as "first" therefore
    depends on the order in which ``os.walk`` yields the files.

    Args:
        folder: Root directory to scan.

    Returns:
        list[tuple[str, str]]: ``(first_file_name, later_file_name)`` tuples
        holding file names only, not full paths.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
    first_path_by_hash = {}
    matches = []

    for current_dir, _subdirs, filenames in os.walk(folder):
        for filename in filenames:
            if not filename.lower().endswith(image_suffixes):
                continue

            full_path = os.path.join(current_dir, filename)
            phash = calculate_phash(full_path)

            if phash not in first_path_by_hash:
                # First image with this hash: remember where it lives.
                first_path_by_hash[phash] = full_path
                continue

            # Report file names only, not the full paths.
            earlier_name = os.path.basename(first_path_by_hash[phash])
            matches.append((earlier_name, filename))

    return matches

# Root folder to scan (machine-specific absolute path; change it to your own image folder)
folder_path = r'C:\Users\uu1n23\Downloads\DeduplicationFinal'

# Run the scan and print one (first_file_name, later_file_name) tuple per line.
# `duplicate_pairs` stays in the kernel namespace after this cell.
duplicate_pairs = find_duplicates(folder_path)
print("Found duplicates:")
for pair in duplicate_pairs:
    print(pair)


Found duplicates:
('hmp_rmmus_010_74.tif', 'RMM 1974_10_o4.jpg')
('hmp_rmmus_011_74.tif', 'RMM 1974_11.tif')
('RMM 1974_13.tif', 'RMM 1974_13_o2.jpg')
('hmp_rmmus_006_74.tif', 'RMM 1974_6.tif')
('RMM 1974_7.tif', 'RMM 1974_7_o4.jpg')
('hmp_rmmus_008_74.tif', 'RMM 1974_8.tif')
('hmp_rmmus_006_74.tif', 'hmp_rmmus_006_74.tif')
('RMM 1974_7.tif', 'hmp_rmmus_007_74.tif')
('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71 - Copy.tif')
('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71.tif')
('hmp_rmmus_008_74.tif', 'hmp_rmmus_008_74.tif')
('hmp_rmmus_009_74.tif', 'hmp_rmmus_009_74.tif')
('hmp_rmmus_010_74.tif', 'hmp_rmmus_010_74.tif')
('hmp_rmmus_011_74.tif', 'hmp_rmmus_011_74.tif')
('RMM 1974_13.tif', 'hmp_rmmus_13_74.tif')
('RMM 1973_170.tif', 'hmp_rmmus_170_73.tif')
('RMM 1974_19.tif', 'hmp_rmmus_19_74.tif')
('RMM 1974_20.tif', 'hmp_rmmus_20_74.tif')
('RMM 1971_278.tif', 'hmp_rmmus_278_71.tif')
('RMM 1967_304.tif', 'hmp_rmmus_304_67.tif')
('hmp_rmmus_88_02o - Copy.tif', 'hmp_rmmus_88_02o.tif')
('hmp_rmmu

In [11]:
import os
from PIL import Image
import imagehash

def calculate_phash(image_path):
    """Compute the perceptual hash (pHash) of the image stored at ``image_path``.

    Args:
        image_path: Path of the image file, opened with ``PIL.Image.open``.

    Returns:
        The object returned by ``imagehash.phash``. ``find_duplicate_pairs``
        subtracts two of these to get their Hamming distance.
    """
    # The file is closed on leaving the ``with`` block; the hash is already computed by then.
    with Image.open(image_path) as opened_image:
        result = imagehash.phash(opened_image)
    return result

def find_duplicate_pairs(folder, similarity_threshold=0.1):
    """Find near-duplicate image pairs under ``folder`` based on pHash similarity.

    Every file whose name ends (case-insensitively) in .png, .jpg, .jpeg,
    .tif or .tiff is hashed with ``calculate_phash``. Each unordered pair of
    images is then compared: the pair is reported when the fraction of
    matching hash bits is at least ``1 - similarity_threshold``.

    Directories and files are visited in sorted order, so the returned list
    is the same on every run and every file system. The comparison is
    all-pairs, so the work grows quadratically with the number of images.

    Args:
        folder: Root directory to scan recursively.
        similarity_threshold: Largest tolerated fraction of differing hash
            bits. The default 0.1 keeps pairs that are at least 90% similar;
            0 keeps only identical hashes.

    Returns:
        list[tuple[str, str]]: ``(path_a, path_b)`` for each matching pair,
        with ``path_a`` visited before ``path_b``.
    """
    # Function-scope import keeps this cell runnable without touching the import lines.
    from itertools import combinations

    image_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')

    # (path, hash) records in traversal order.
    hashed_images = []
    for root, dirs, files in os.walk(folder):
        # Sorting in place fixes the order in which os.walk descends (top-down mode).
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(image_extensions):
                file_path = os.path.join(root, file)
                hashed_images.append((file_path, calculate_phash(file_path)))

    duplicate_pairs = []
    for (path1, hash1), (path2, hash2) in combinations(hashed_images, 2):
        hash_difference = hash1 - hash2  # Hamming distance between the two hashes

        # Normalise by the bit count. len(hash1.hash)**2 assumes `.hash` is a
        # square 2-D array (rows * rows bits).
        similarity = 1 - (hash_difference / len(hash1.hash)**2)
        if similarity >= (1 - similarity_threshold):
            duplicate_pairs.append((path1, path2))

    return duplicate_pairs

def generate_ground_truth(folder):
    """Return the duplicate pairs found under ``folder`` as file-name tuples.

    Calls ``find_duplicate_pairs`` with its default similarity threshold and
    reduces both paths of each pair to their base name. Directory information
    is dropped, so same-named files from different subfolders become a pair
    of identical names.

    Args:
        folder: Root directory passed on to ``find_duplicate_pairs``.

    Returns:
        list[tuple[str, str]]: ``(file_name_a, file_name_b)`` per pair, in the
        order ``find_duplicate_pairs`` produced them.
    """
    name_of = os.path.basename
    return [
        (name_of(left_path), name_of(right_path))
        for left_path, right_path in find_duplicate_pairs(folder)
    ]

# Driver: scan one folder for near-duplicate images (pHash similarity, default threshold)
# and print every pair. `folder_path` and `ground_truth` stay in the kernel namespace.
if __name__ == "__main__":
    folder_path = r'C:\Users\uu1n23\Downloads\DeduplicationFinal'  # Machine-specific absolute path; replace with your own image folder
    ground_truth = generate_ground_truth(folder_path)

    # One (file_name_a, file_name_b) tuple per line.
    print("Ground Truth (Duplicate Image Pairs):")
    for pair in ground_truth:
        print(pair)


Ground Truth (Duplicate Image Pairs):
('hmp_rmmus_006_74.tif', 'RMM 1974_6.tif')
('hmp_rmmus_006_74.tif', 'hmp_rmmus_006_74.tif')
('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71 - Copy.tif')
('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71.tif')
('hmp_rmmus_008_74.tif', 'RMM 1974_8.tif')
('hmp_rmmus_008_74.tif', 'hmp_rmmus_008_74.tif')
('hmp_rmmus_009_74.tif', 'hmp_rmmus_009_74.tif')
('hmp_rmmus_010_74.tif', 'RMM 1974_10_o4.jpg')
('hmp_rmmus_010_74.tif', 'hmp_rmmus_010_74.tif')
('hmp_rmmus_011_74.tif', 'RMM 1974_11.tif')
('hmp_rmmus_011_74.tif', 'hmp_rmmus_011_74.tif')
('RMM 1967_304.tif', 'hmp_rmmus_304_67.tif')
('RMM 1971_278.tif', 'hmp_rmmus_278_71.tif')
('RMM 1973_170.tif', 'hmp_rmmus_170_73.tif')
('RMM 1974_10_o4.jpg', 'hmp_rmmus_010_74.tif')
('RMM 1974_11.tif', 'hmp_rmmus_011_74.tif')
('RMM 1974_13.tif', 'RMM 1974_13_o2.jpg')
('RMM 1974_13.tif', 'hmp_rmmus_13_74.tif')
('RMM 1974_13_o2.jpg', 'hmp_rmmus_13_74.tif')
('RMM 1974_19.tif', 'hmp_rmmus_19_74.tif')
('RMM 1974_20.tif', 'hmp_rmmus_20_7