# Image Deduplication

In [2]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageFilter
import os

In [17]:
def compute_rmse(image1, image2):
    """Return the root-mean-square error between two images in grayscale.

    Each argument is converted with ``convert('L')`` and its pixel data is
    compared element-wise.

    Args:
        image1: First image; must provide ``convert('L')`` (e.g. a PIL image).
        image2: Second image; must have the same pixel dimensions as image1.

    Returns:
        The RMSE as a NumPy float. 0.0 means the grayscale pixels are equal.

    Raises:
        ValueError: If the two grayscale arrays do not have the same shape.
    """
    # Promote to float BEFORE subtracting. Mode 'L' pixel data is unsigned
    # 8-bit, so subtracting (and squaring) in that dtype wraps modulo 256 and
    # can make very different images look almost identical.
    pixels1 = np.asarray(image1.convert('L'), dtype=np.float64)
    pixels2 = np.asarray(image2.convert('L'), dtype=np.float64)

    # Fail loudly rather than letting NumPy broadcast mismatched arrays.
    if pixels1.shape != pixels2.shape:
        raise ValueError(
            f"Images must have the same size to compare: "
            f"{pixels1.shape} != {pixels2.shape}"
        )

    mse = np.mean((pixels1 - pixels2) ** 2)
    return np.sqrt(mse)

def resize_image(image, target_size):
    """Return a copy of ``image`` resized to ``target_size`` with LANCZOS.

    Args:
        image: Image object providing ``resize(size, resample)`` (PIL image).
        target_size: ``(width, height)`` passed straight to ``image.resize``.

    Returns:
        The value returned by ``image.resize``.
    """
    # ``Image.ANTIALIAS`` was an alias of LANCZOS; it is deprecated and no
    # longer exists in Pillow 10+. Newer Pillow exposes the filters on
    # ``Image.Resampling``; older releases only have them on ``Image``.
    filters = getattr(Image, "Resampling", Image)
    return image.resize(target_size, filters.LANCZOS)

def deduplicate_images(folder_path, rmse_threshold=7.0, target_size=(256, 256)):
    """Return the file names in ``folder_path`` that are not near-duplicates.

    Every image is converted to grayscale and resized to ``target_size``,
    then compared (via ``compute_rmse``) with the images kept so far. An
    image whose RMSE against any kept image is below ``rmse_threshold`` is
    treated as a duplicate and dropped.

    Args:
        folder_path: Directory to scan (not recursive).
        rmse_threshold: RMSE strictly below this value marks a duplicate.
        target_size: ``(width, height)`` every image is resized to before
            comparison.

    Returns:
        Sorted list of the file names (not full paths) that were kept.
    """
    extensions = ('.jpg', '.jpeg', '.png', '.gif')

    # Sort the listing: ``os.listdir`` order is arbitrary, and the first file
    # of a duplicate group is the one that is kept. Match the extension
    # case-insensitively so e.g. ``PHOTO.JPG`` is not silently skipped.
    image_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(extensions)
    )

    kept = []  # (normalised image, file name) pairs, in file-name order
    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)

        # ``convert`` produces an in-memory copy, so the file handle can be
        # released as soon as the normalised image has been built.
        with Image.open(image_path) as opened:
            candidate = resize_image(opened.convert('L'), target_size)

        is_duplicate = any(
            compute_rmse(candidate, kept_image) < rmse_threshold
            for kept_image, _ in kept
        )
        if not is_duplicate:
            kept.append((candidate, image_file))

    # ``kept`` preserves the sorted input order, so no further sort is needed.
    return [image_file for _, image_file in kept]

# Example usage: set `folder_path` to the directory that should be scanned,
# then print the names of the images that survive deduplication.
folder_path = 'Data/train/test_dups'
deduplicated_images = deduplicate_images(folder_path=folder_path)
print(deduplicated_images)


['11 - Copy.png', '11.png']


  resized_image = image.resize(target_size, Image.ANTIALIAS)
