In [17]:
!pip install imagehash


Defaulting to user installation because normal site-packages is not writeable


In [3]:
import os
import hashlib
from PIL import Image

def _md5_of_file(filepath, chunk_size=1 << 20):
    """Return the hex MD5 digest of a file, reading it in fixed-size chunks."""
    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter(callable, sentinel) keeps reading until read() returns b'' (EOF),
        # so memory use is bounded by chunk_size regardless of file size.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def remove_duplicate_images(directory):
    """
    Remove byte-identical duplicate images from a directory.

    Only regular files directly inside ``directory`` are considered
    (subdirectories are not traversed), and only those whose name ends with
    one of the supported image extensions (case-insensitive). Two files are
    treated as duplicates when the MD5 digests of their raw bytes match, so
    visually similar images with different encodings are NOT detected.

    Files are processed in sorted filename order; for each group of
    duplicates the first name in that order is kept and the others are
    deleted from disk with ``os.remove`` (this cannot be undone).

    Parameters:
    -----------
    directory : str
        Path to the directory containing images

    Returns:
    --------
    dict
        Information about duplicates removed:
        ``'total_files_scanned'`` - number of image files that were hashed,
        ``'duplicates_removed'`` - number of files deleted,
        ``'removed_files'`` - filenames of the deleted files, in processing order.

    Raises:
    -------
    OSError
        If the directory cannot be listed or a file cannot be read or removed.
    """
    # A tuple lets str.endswith test every extension in a single call
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')

    # Maps content hash -> filename of the first file seen with that content
    seen_hashes = {}
    duplicates_removed = []
    files_scanned = 0

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # of which copy survives deterministic across runs and platforms.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        # Skip subdirectories and anything that is not a supported image
        if not (os.path.isfile(filepath) and filename.lower().endswith(image_extensions)):
            continue

        files_scanned += 1
        file_hash = _md5_of_file(filepath)

        if file_hash in seen_hashes:
            # Same bytes as an earlier file: delete this later copy
            os.remove(filepath)
            duplicates_removed.append(filename)
        else:
            # First occurrence of this content: remember it and keep the file
            seen_hashes[file_hash] = filename

    # Count the files actually examined during the scan, not the directory
    # entries left after deletion (which would also include non-image files
    # and subdirectories, and exclude the duplicates just removed).
    return {
        'total_files_scanned': files_scanned,
        'duplicates_removed': len(duplicates_removed),
        'removed_files': duplicates_removed
    }

# Location of the image dataset to de-duplicate (raw string keeps backslashes literal)
dataset_path = r'E:\Research\DataSet\copra'

# De-duplicate the dataset; this call deletes duplicate files on disk
results = remove_duplicate_images(dataset_path)

# Summary header and counts, one item per line
print(
    "Duplicate Removal Results:",
    f"Total files scanned: {results['total_files_scanned']}",
    f"Duplicates removed: {results['duplicates_removed']}",
    "\nRemoved files:",
    sep="\n",
)

# One removed filename per line (prints nothing when no file was removed)
for file in results['removed_files']:
    print(file)

Duplicate Removal Results:
Total files scanned: 6164
Duplicates removed: 0

Removed files:
