In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
def remove_duplicate(dir_path):
    """
    Delete duplicate images found directly inside dir_path.

    Every entry of dir_path accepted by is_image() is read with OpenCV and
    hashed with pHash(). Paths are visited in sorted order; the first path
    seen for each hash value is kept and every later path with the same hash
    value is deleted from disk with os.remove(). Only exact hash matches are
    treated as duplicates. Files that cannot be read or hashed are reported
    with a 'Problem:' line and skipped.

    input:
        -- dir_path images folder path (sub-folders are not scanned)
    output:
        -- None (progress and problems are printed)
    """
    image_filenames = sorted(
        os.path.join(dir_path, name)
        for name in os.listdir(dir_path)
        if is_image(name)
    )

    # detect duplicate images: hash value -> paths that produced it, in sorted order
    images = {}
    for img in image_filenames:
        try:
            # cv2.imread signals an unreadable file by returning None instead of raising
            img_data = cv2.imread(img)
            if img_data is None:
                raise ValueError('image could not be read')
            # pHash converts with COLOR_BGR2GRAY itself, so the BGR array from
            # imread is passed unchanged (an RGB array would swap the red and
            # blue weights of the gray conversion).
            _, hash_val = pHash(img_data)
        except Exception as e:
            # best effort: report the file and carry on with the remaining ones
            print('Problem:', e, 'with', img)
            continue
        if hash_val in images:
            print(img, '  already exists as', ' '.join(images[hash_val]))
            if 'dupPictures' in img:
                print('rm -v', img)
        images.setdefault(hash_val, []).append(img)

    # remove duplicate images, keeping the first path recorded for each hash
    for hashed_path in images.values():
        for duplicated_file in hashed_path[1:]:
            os.remove(duplicated_file)
    print("Remove_duplicate is done")
    

def pHash(img):
    """
    Calculate the perceptual hash value of an image.

    The image is resized to 32x32, converted to gray when it has colour
    channels, and transformed with a 2-D DCT. The top-left 8x8 block of DCT
    coefficients is compared against its own mean (the mean includes the
    [0, 0] coefficient) to produce 64 bits.

    input:
        -- img image array: gray (2-D), or 3-channel / 4-channel in OpenCV's
           BGR / BGRA channel order
    output:
        -- hash_value_bits 64-character string of '0'/'1', row by row
           (this is the input for function Hamming_distance)
        -- hash_value the same 64 bits packed into a 16-character hex string
           (this value is for creating the hash table)
    """
    img_resize = cv2.resize(img, (32, 32))
    if img_resize.ndim == 2:
        # already single-channel: cvtColor with COLOR_BGR2GRAY would raise here
        gray_img = img_resize
    elif img_resize.shape[2] == 4:
        gray_img = cv2.cvtColor(img_resize, cv2.COLOR_BGRA2GRAY)
    else:
        gray_img = cv2.cvtColor(img_resize, cv2.COLOR_BGR2GRAY)

    dct_gray = cv2.dct(np.float32(gray_img))
    # the low-frequency coefficients sit in the top-left corner of the DCT
    dct_top_left = dct_gray[0:8, 0:8]
    average_value = np.mean(dct_top_left)
    # "+ 0" turns the boolean mask into an integer 0/1 array
    phash_binary = (dct_top_left > average_value) + 0

    phash_list = phash_binary.reshape(1, -1)[0].tolist()
    hash_value_bits = ''.join(str(bit) for bit in phash_list)

    # packbits flattens the 8x8 array into 8 bytes; each byte becomes two hex digits
    hash_value = ''.join('%0.2x' % x for x in np.packbits(phash_binary))
    return hash_value_bits, hash_value


def Hamming_distance(hash1, hash2):
    """
    Calculate the similarity of two images as the Hamming distance of their hashes.

    input:
        -- hash1 image1 hash_value_bits
        -- hash2 image2 hash_value_bits (must have the same length as hash1)
    output:
        -- num number of positions where the two hashes differ
           (0 to len(hash1), i.e. 0-64 for pHash bits);
           0 is same; the bigger the more different
    raises:
        -- ValueError if the two hashes have different lengths
    """
    # The Hamming distance is only defined for equal-length sequences; comparing
    # just the common prefix would silently under-count the differences.
    if len(hash1) != len(hash2):
        raise ValueError(
            "hashes must have the same length, got %d and %d"
            % (len(hash1), len(hash2))
        )
    return sum(1 for bit1, bit2 in zip(hash1, hash2) if bit1 != bit2)

def is_image(filename):
    """
    Tell whether a file name looks like an image file.

    The check is case-insensitive and looks at the name only, never at the
    file contents. A name qualifies when it ends with a known image
    extension, or when ".jpg" appears anywhere inside it.
    input:
        -- filename file name or path (str)
    output:
        -- Bool True when the name looks like an image, False otherwise
    """
    lowered = filename.lower()
    if ".jpg" in lowered:
        # the substring test also covers names that simply end with ".jpg"
        return True
    other_suffixes = (".png", ".jpeg", ".bmp", ".gif", ".svg")
    return lowered.endswith(other_suffixes)




In [3]:
# Folder (relative path) whose images are scanned for duplicates.
dir_path = "duplicatedimages_test"
# Destructive: remove_duplicate() deletes files in dir_path with os.remove().
remove_duplicate(dir_path)

Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/0.jpg
Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/0的副本.jpg
Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/1.jpg
Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/1的副本.jpg
Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/2.jpg
Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/2的副本.jpg
Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/3.jpg
Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/3的副本.jpg
Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/4.jpg
Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/4的副本.jpg
Problem: not enough values to unpack (expected 3, got 2) with duplicatedimages_test/5