In [None]:
import glob
import hashlib
import time
import tqdm
import os
import sys

import cv2
from matplotlib import pyplot as plt
import numpy as np

In [None]:
def print_status(text):
    """Write a self-overwriting status line to stdout.

        The text is followed by a carriage return instead of a newline, so
        the next write starts at the beginning of the same line. The stream
        is flushed right away so the update is visible immediately.

        Args:
            text: value to display (rendered through str.format)
        Returns:
            None
    """
    stream = sys.stdout
    message = "{}\r".format(text)
    stream.write(message)
    stream.flush()

In [None]:
def calculate_md5(file_name):
    return hashlib.md5(open(file_name,'rb').read()).hexdigest()

In [None]:
def show_duplicates(duplicates):
    """Function to show duplicates

        Every duplicate is resized to the reference image's size, placed to
        the right of the reference image and rendered with matplotlib.

        Args:
            duplicates: dict mapping a reference image path to its duplicates.
                Each duplicate entry is either the path itself or a
                list/tuple whose first item is the path, i.e.
                {'ref_image_path': ['similar_image_path']} or
                {'ref_image_path': [['similar_image_path']]}

        Returns:
            None. Images are displayed and a summary line is printed.
    """
    cntr = 0

    for key, vals in duplicates.items():
        # the reference image is the same for the whole group: read it once
        img1 = cv2.imread(key)
        if img1 is None:
            # cv2.imread signals failure by returning None instead of raising
            print("reference file could not be read: {}".format(key))

        for val in vals:
            cntr += 1

            if img1 is None:
                # nothing to compare against; the entry is still counted
                continue

            similar_path = val[0] if isinstance(val, (list, tuple)) else val
            img2 = cv2.imread(similar_path)  # read similar image

            if img2 is None:  # the similar file could not be read
                print("file not found!")
                plt.imshow(img1[:, :, ::-1])  # BGR -> RGB for matplotlib
                plt.show()
                continue

            # closing marker goes on its own line ("\*" was an invalid escape)
            print("**\nref: {} \nsimilars: {}\n**".format(key, val))

            # hconcat needs equal heights; cv2.resize takes (width, height)
            img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

            combined = cv2.hconcat([img1, img2])  # reference left, duplicate right

            # show images
            plt.imshow(combined[:, :, ::-1])  # BGR -> RGB for matplotlib
            plt.show()

            print("*" * 33)
    print("{} files detected!".format(cntr))

In [None]:
def remove_duplicates(duplicates, simulate=False):
    """Function to remove duplicates

        Reference images (the dict keys) are never touched; only the files
        listed as duplicates are deleted.

        Args:
            duplicates: dict mapping a reference image path to its duplicates.
                Each duplicate entry is either the path itself or a
                list/tuple whose first item is the path, i.e.
                {'ref_image_path': ['similar_image_path']} or
                {'ref_image_path': [['similar_image_path']]}
            simulate: bool. When True nothing is deleted and only the number
                of files that would be removed is printed.

        Returns:
            None. A summary line is printed.
    """
    cntr = 0

    for vals in duplicates.values():
        for val in vals:
            # indexing a plain string would yield its first character
            # (e.g. '/'), so only unwrap real list/tuple entries
            path = val[0] if isinstance(val, (list, tuple)) else val

            if simulate:
                cntr += 1
                continue

            try:
                os.remove(path)
            except OSError as err:
                # keep going so one missing/locked file does not abort the
                # run; this also makes re-running the cell safe
                print("could not remove {}: {}".format(path, err))
            else:
                cntr += 1  # count only files that were actually deleted

    if simulate:
        print("{} files WILL BE removed".format(cntr))
    else:
        print("{} files removed".format(cntr))

## Set the directory to scan (edit the path in the next cell)

In [None]:
####
# Glob pattern of the files to scan and the maximum number of files to analyze.
IMAGE_GLOB = '/home/xyz/*'
MAX_FILES = 100

# glob returns entries in arbitrary order and also matches directories:
# sort so the selected subset is reproducible, and keep regular files only
# because a directory cannot be opened for hashing.
file_list = sorted(path for path in glob.glob(IMAGE_GLOB) if os.path.isfile(path))[:MAX_FILES]
####
print("{} files will be analyzed".format(len(file_list)))

## calculate md5 for each file

In [None]:
start_time = time.time()
md5_list = []
# calculate md5 for each file; md5_list[i] is the digest of file_list[i].
# The list itself is passed to tqdm (not enumerate(...)): enumerate has no
# length, which leaves the progress bar without a total.
# NOTE(review): tqdm_notebook is a legacy alias — consider tqdm.notebook.tqdm.
for image_path in tqdm.tqdm_notebook(file_list):
    md5_list.append(calculate_md5(image_path))

print("it took {:.2f} seconds to build index for {} images".format(time.time()-start_time, len(file_list)))

## analyze and find duplicates

In [None]:
# Result: {reference_path: [[duplicate_path], ...]} where the reference is the
# first file (in file_list order) carrying a given MD5 digest.
duplicates = {}

# Single pass: bucket paths by digest. A set gives O(1) "already handled"
# checks, replacing the former list scans that made this step quadratic.
paths_by_md5 = {}
seen_paths = set()

# zip has no length, so the total is passed explicitly for the progress bar
pair_count = min(len(file_list), len(md5_list))

for image_path, tmp_md5 in tqdm.tqdm_notebook(zip(file_list, md5_list), total=pair_count):

    if image_path in seen_paths:
        # the same path listed twice is not a duplicate of itself
        continue
    seen_paths.add(image_path)

    paths_by_md5.setdefault(tmp_md5, []).append(image_path)

for same_md5_paths in paths_by_md5.values():
    if len(same_md5_paths) > 1:
        reference_path = same_md5_paths[0]
        # each duplicate is wrapped in its own list: the show/remove helpers
        # read the path as entry[0]
        duplicates[reference_path] = [[path] for path in same_md5_paths[1:]]
        
            

## show samples

In [None]:
# Plot every reference image next to each file that shares its MD5 digest.
show_duplicates(duplicates)

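## Delete duplicates
#### Warning: set `simulate=False` to actually delete the duplicate files (this cannot be undone); with `simulate=True` only the count is printed.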

In [None]:
# Dry run: with simulate=True no file is deleted, only the count is printed.
remove_duplicates(duplicates, simulate=True)