In [2]:
import os
from PIL import Image
import imagehash
from itertools import combinations

def find_similar_images(folder_path, hash_size=8, threshold=5):
    """
    Finds similar images in a folder using perceptual hashing.

    Every file directly inside ``folder_path`` whose name ends with a
    supported image extension is hashed with ``imagehash.dhash``. Files are
    grouped by hash value, and two files are reported as similar when the
    difference between their hashes is at most ``threshold``. Files that
    share an identical hash are reported with a difference of 0.

    Args:
        folder_path (str): The path to the folder containing images.
        hash_size (int): The size of the hash. A larger size can be more
                         precise but slower.
        threshold (int): The maximum allowed difference between two hashes
                         to be considered similar. A lower value is stricter.

    Returns:
        A list of ``(file1, file2, difference)`` tuples, where ``file1`` and
        ``file2`` are the paths of two similar images and ``difference`` is
        the difference between their hashes (0 for identical hashes).
        Exact-duplicate pairs come first, followed by near-duplicate pairs.
        Files that cannot be opened or hashed are reported on stdout and
        skipped.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    # os.listdir returns names in arbitrary order; sorting keeps both the
    # printed messages and the returned pairs reproducible between runs.
    image_files = [os.path.join(folder_path, f)
                   for f in sorted(os.listdir(folder_path))
                   if f.lower().endswith(image_extensions)]

    # Dictionary to store hashes: {hash_value: [file_path1, file_path2, ...]}
    hashes = {}

    for image_path in image_files:
        try:
            with Image.open(image_path) as img:
                # Use a perceptual hash algorithm (e.g., dhash, phash)
                # dhash is often a good balance of speed and accuracy
                image_hash = imagehash.dhash(img, hash_size=hash_size)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"Could not process image '{image_path}': {e}")
            continue

        if image_hash in hashes:
            print(f"Exact duplicate found: '{image_path}' and '{hashes[image_hash][0]}'")
        hashes.setdefault(image_hash, []).append(image_path)

    similar_pairs = []

    # Files stored under the same hash never meet in the cross-hash loop
    # below, so pair them up here; their difference is 0 by definition.
    if threshold >= 0:
        for duplicate_group in hashes.values():
            for file1, file2 in combinations(duplicate_group, 2):
                similar_pairs.append((file1, file2, 0))

    # Compare every unique pair of distinct hashes.
    for hash1, hash2 in combinations(hashes, 2):
        # Calculate the Hamming distance (difference) between the hashes
        hash_difference = hash1 - hash2
        if hash_difference <= threshold:
            for file1 in hashes[hash1]:
                for file2 in hashes[hash2]:
                    similar_pairs.append((file1, file2, hash_difference))

    return similar_pairs


No similar images found.


In [4]:
if __name__ == "__main__":
    # Replace with the path to your image folder
    my_image_folder = "/Users/tawate/Documents/Architecture_Image_Modeling/architectural_style_images"

    # Branch instead of calling exit(): inside a notebook, exit() asks the
    # kernel to shut down, which would discard all other notebook state.
    if not os.path.isdir(my_image_folder):
        print(f"Error: The folder '{my_image_folder}' does not exist.")
    else:
        similar_images = find_similar_images(my_image_folder)

        if similar_images:
            print("\n--- Found Similar Image Pairs ---")
            # Each entry is (path_a, path_b, hash difference); show file names only.
            for file1, file2, diff in similar_images:
                print(f"'{os.path.basename(file1)}' and '{os.path.basename(file2)}' are similar (Difference: {diff})")
        else:
            print("\nNo similar images found.")


No similar images found.
