In [None]:
import glob
import time
import tqdm
import os

import cv2
from matplotlib import pyplot as plt
import numpy as np

from modules.AnnoyUtils import AnnoyUtils
from modules.FeatureExtractor import FeatureExtractor
from modules.utilities import print_status

In [None]:
def show_similar_items(similar_items):
    """Display every reference image next to each of its similar images.

        Args: 
            similar_items: dict mapping a reference image path to a list of
                [similar_image_path, confidence] pairs, i.e.
                {'ref_image_path': [['similar_image_path', confidence], ...]}

        Returns: 
            None. Images are rendered with matplotlib and a summary line
            with the number of listed pairs is printed.

    """

    cntr = 0

    for key, vals in similar_items.items():
        img1 = cv2.imread(key) #read reference image once per group (loop-invariant)

        if img1 is None: #cv2.imread returns None when the file cannot be read
            #nothing to compare against: still count the pairs, but skip rendering
            cntr += len(vals)
            print("reference file not found! {}".format(key))
            continue

        for val in vals:
            cntr += 1
            img2 = cv2.imread(val[0]) #read similar image

            if img2 is None: #if file could be not read!
                print("file not found!")
                plt.imshow(img1[:,:,::-1]) #show the reference alone (BGR -> RGB)
                plt.show()
                continue

            print("**\nref: {} \nsimilars: {}\n**".format(key, val))

            #hconcat needs equal heights, so bring the similar image to the reference size
            img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

            combined = cv2.hconcat([img1, img2]) #reference on the left, similar on the right

            #reverse the channel axis: OpenCV loads BGR, matplotlib expects RGB
            plt.imshow(combined[:,:,::-1])
            plt.show()

            print("*"*33)
    print("{} files detected!".format(cntr))

In [None]:
def remove_similar_items(similar_items, simulate=False):
    """Delete the similar images listed in similar_items from disk.

        Only the similar image paths (first element of each pair) are
        deleted; the reference image paths (dict keys) are left untouched.

        Args: 
            similar_items: dict mapping a reference image path to a list of
                [similar_image_path, confidence] pairs, i.e.
                {'ref_image_path': [['similar_image_path', confidence], ...]}
            simulate: bool. When True nothing is deleted; only the number of
                files that would be removed is printed.

        Returns: 
            None. A summary line is printed.

    """
    cntr = 0
    missing = 0

    for vals in similar_items.values():
        for val in vals:
            if simulate:
                cntr += 1
                continue
            try:
                os.remove(val[0])
            except FileNotFoundError:
                #already gone (e.g. the cell was re-run): keep deleting the rest
                missing += 1
                continue
            cntr += 1 #count only files that were really removed

    if simulate:
        print("{} files WILL BE removed".format(cntr))
    else:
        print("{} files removed".format(cntr))
        if missing:
            print("{} files were already missing and skipped".format(missing))

### set parameters

In [None]:
####
#images to analyze. NOTE: adjust this pattern to your own image directory.
#glob returns paths in arbitrary order; sort them so that the annoy item ids
#(positions in image_list) and the choice of reference image are reproducible.
image_list = sorted(glob.glob('/home/xyz/*'))

features_list = [] #filled by the batch-processing cell; features_list[i] belongs to image_list[i]

annoy_metric = "angular" # "euclidean", "manhattan", "hamming", "dot"
annoy_vector_length = 4096 #feature length (vgg16)
annoy_n_trees = 50 #More trees gives higher precision when querying -> https://github.com/spotify/annoy

similarity_threshold = 0.1 #0.0 exact match, <0.5 similar images

####

In [None]:
#Create the project's FeatureExtractor (imported from modules.FeatureExtractor).
#It is used further down to turn batches of image paths into feature vectors.
print("preparing featureExtractor")
feature_extractor = FeatureExtractor()

In [None]:
print("preparing annoy index for {} images".format(len(image_list)))

#init annoy: create the project wrapper and configure it with the feature
#vector length and the distance metric chosen in the parameter cell
annoy_utils = AnnoyUtils()
annoy_utils.prepare_annoy(annoy_vector_length, annoy_metric)

#set image_list: hand the image paths to the wrapper
#NOTE(review): presumably used to map annoy item ids back to paths - confirm in AnnoyUtils

annoy_utils.set_image_list(image_list)

###  extract features and update annoy index

In [None]:
#batch processing: extract features batch by batch and add them to the annoy index
start_time = time.time()

total_size = len(image_list)
batch_size = 64 #mind the memory usage!

#start from a clean list so that re-running this cell cannot leave stale
#features in front of the new ones (features_list[i] must belong to image_list[i])
features_list = []

ind = 0 #annoy item id == position of the image in image_list
for start_index in range(0, total_size, batch_size):
    end_index = min(start_index + batch_size, total_size) #last batch may be shorter
    batch = image_list[start_index:end_index]

    print_status("processing... {}/{}".format(start_index, total_size))

    batch_features = feature_extractor.extract_feature_for_batch_images(batch)

    for tmp_features in batch_features:
        features_list.append(tmp_features) #add features to list
        annoy_utils.ann_index.add_item(ind, tmp_features)
        ind+=1

#item ids are used later as indexes into image_list, so a count mismatch would
#silently pair images with the wrong features; fail loudly instead
if ind != total_size:
    raise ValueError(
        "extracted {} feature vectors for {} images; "
        "annoy ids would not match image_list".format(ind, total_size)
    )

#print("building index..")
annoy_utils.build_index(annoy_n_trees)

#print("saving index..")
annoy_utils.save_index('annoy.index')

print("it took {:.2f} seconds to build index for {} images".format(time.time()-start_time, total_size))

## analyze and find similar images

In [None]:
#Group near-duplicates: every image that is not yet part of a group becomes a
#reference, and all of its unprocessed neighbours below the threshold are
#attached to it as [similar_image_path, confidence] pairs.
similar_items = {}

processed_items = set() #set instead of list: O(1) membership tests inside the nested loops
retrieve_n_most_similar=9999

#zip stops at the shorter sequence, so that is the real number of iterations
n_items = min(len(image_list), len(features_list))

for image_path, tmp_features in tqdm.tqdm_notebook(zip(image_list, features_list), total=n_items):

    if image_path in processed_items: #already a reference or attached to one
        continue
    processed_items.add(image_path)

    most_similars = annoy_utils.query_similar_images_by_features(tmp_features, retrieve_n_most_similar)

    for sim_image_ind, confidence in most_similars:
        similar_image_path = image_list[sim_image_ind]

        #skip the query image itself and anything already assigned to a group
        if similar_image_path == image_path or similar_image_path in processed_items:
            continue

        if confidence < similarity_threshold: #0.0 exact match, <0.5 similar images
            similar_items.setdefault(image_path, []).append([similar_image_path, confidence])
            processed_items.add(similar_image_path)
        
            

## show samples

In [None]:
#Render every reference image next to each of its detected similar images.
show_similar_items(similar_items)

## delete samples
#### Warning: set `simulate=False` to actually delete the files!

In [None]:
#Dry run: with simulate=True nothing is deleted, only the number of files that would be removed is printed.
remove_similar_items(similar_items, simulate=True)