## Import statements

In [1]:
import os
import re 
import os
import sys 
import cv2
import json 
import time
import glob
import shutil
import pickle
import IPython
import hashlib
import fnmatch
import copyreg
import requests
import imagehash
import flickrapi
from math import exp
from PIL import Image
import multiprocessing
from pprint import pprint
import concurrent.futures
from tqdm.notebook import tqdm

## Helper functions

In [2]:
!nvidia-smi

Wed Sep 30 02:59:20 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 455.23.05    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 1660    On   | 00000000:01:00.0  On |                  N/A |
|  0%   36C    P8    11W / 130W |    899MiB /  5941MiB |      7%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices holding at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

def thread_it(thread_function, my_list, tq=True, WORKERS=None):
    """Call ``thread_function(item)`` for every item of ``my_list`` on a thread pool.

    Return values are discarded. A failing item does not stop the run: its
    exception is printed and the remaining items are still processed.

    Args:
        thread_function: Callable taking one item of ``my_list``.
        my_list: Sized iterable of work items.
        tq: When truthy, show a tqdm progress bar that advances as items finish.
        WORKERS: Number of worker threads; falsy means ``multiprocessing.cpu_count()``.
    """
    # Set worker number to CPU count
    if not WORKERS:
        WORKERS = multiprocessing.cpu_count()

    # Keep the flag and the bar object apart instead of overwriting ``tq``.
    progress = tqdm(total=len(my_list)) if tq else None

    try:
        # One pool for the whole run; max_workers already caps the concurrency,
        # so splitting the work into per-pool chunks is unnecessary.
        with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as executor:
            pending = [executor.submit(thread_function, item) for item in my_list]
            for future in concurrent.futures.as_completed(pending):
                error = future.exception()
                if error is not None:
                    # Report failures instead of letting them vanish with the future.
                    print("{} failed: {!r}".format(
                        getattr(thread_function, "__name__", thread_function), error))
                if progress is not None:
                    # Tick on completion so the bar reflects finished work.
                    progress.update(1)
    finally:
        if progress is not None:
            progress.close()
                
def thread_it_return(thread_function, my_list, tq=True, WORKERS=None):
    """Run ``thread_function`` over ``my_list`` concurrently and collect the results.

    Args:
        thread_function: Callable taking one item of ``my_list``.
        my_list: Sized iterable of work items.
        tq: When truthy, show a tqdm progress bar.
        WORKERS: Number of worker threads; falsy means ``multiprocessing.cpu_count()``.

    Returns:
        list: The non-``None`` return values, in the order of ``my_list``.

    Raises:
        Exception: Whatever ``thread_function`` raised for the first failing item
            (in input order); it is re-raised by ``future.result()``.
    """
    # Set worker number to CPU count
    if not WORKERS:
        WORKERS = multiprocessing.cpu_count()

    progress = tqdm(total=len(my_list)) if tq else None

    results = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as executor:
            # Submit everything BEFORE waiting on any result. Calling
            # future.result() right after each submit() blocks until that one
            # item finishes, which makes the pool run a single task at a time.
            pending = [executor.submit(thread_function, item) for item in my_list]
            for future in pending:
                return_value = future.result()
                # Identity test: "!= None" misbehaves for objects that overload
                # comparison (e.g. numpy arrays).
                if return_value is not None:
                    results.append(return_value)
                if progress is not None:
                    progress.update(1)
    finally:
        if progress is not None:
            progress.close()
    return results
    

def show_img(a):
    """Display the image array ``a`` inline by converting it to a PIL image."""
    picture = Image.fromarray(a)
    IPython.display.display(picture)
    
def show_img_by_path(a):
    """Open the image file at path ``a`` with PIL and display it inline."""
    # TODO: scale the image down before displaying; full-size images are heavy
    # when the notebook is accessed over the network.
    picture = Image.open(a)
    IPython.display.display(picture)

def create_folder(path):
    """Create directory ``path`` (including missing parents) if it does not exist.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race between isdir() and
    # makedirs() when several workers create the same folder.
    os.makedirs(path, exist_ok=True)

## Link scraping (DuckDuckGo)

In [4]:
def extract_json(objs, exts, min_pixels=307200):
    """Pick the usable image URLs out of one page of DuckDuckGo image results.

    Each result is expected to look like::

        {"width": 3840, "height": 2560,
         "thumbnail": "https://tse1.mm.bing.net/th?id=...",
         "url": "https://www.airantares.ro/cazare/in-Paris/...",
         "title": "Beaugrenelle Tour Eiffel, Paris, Franta",
         "image": "https://i.travelapi.com/hotels/.../c5a49732.jpg"}

    Args:
        objs: Iterable of result dicts.
        exts: Container of accepted lower-case file extensions (no dot).
        min_pixels: An image is kept only when ``width * height`` is strictly
            greater than this. The default 307200 equals 640 * 480.

    Returns:
        list: The ``"image"`` URLs that pass both the size and extension filter.
    """
    links = []
    for obj in objs:
        try:
            pixel_count = obj["width"] * obj["height"]
            image_url = obj["image"]
            extension = image_url.split(".")[-1].lower()
        except (KeyError, TypeError, AttributeError):
            # Scraped data: skip a malformed entry instead of losing the page.
            continue

        if pixel_count > min_pixels and extension in exts:
            links.append(image_url)

    return links

def links_from_ddg(topic, max_images=None, exts=["jpg", "png", "bmp", "jpeg"], max_retries=10):
    """Scrape image URLs for ``topic`` from DuckDuckGo image search.

    Args:
        topic: Search query.
        max_images: Maximum number of links to collect; ``None`` means follow
            result pages until there is no "next" page.
        exts: Accepted file extensions, forwarded to ``extract_json``.
            (The list default is never mutated here.)
        max_retries: How many times one page request is retried (5 s apart)
            when the response is not valid JSON before giving up.

    Returns:
        list of image URLs, or ``-1`` when the ``vqd`` search token could not be
        parsed from the landing page (kept for backward compatibility).
    """
    url = 'https://duckduckgo.com/'

    #   First make a request to above URL, and parse out the 'vqd'
    #   This is a special token, which should be used in the subsequent request
    res = requests.post(url, data={'q': topic})
    searchObj = re.search(r'vqd=([\d-]+)\&', res.text, re.M|re.I)

    if not searchObj:
        # Token parsing failed
        return -1

    headers = {
        'authority': 'duckduckgo.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'sec-fetch-dest': 'empty',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'referer': 'https://duckduckgo.com/',
        'accept-language': 'en-US,en;q=0.9',
    }

    params = (
        ('l', 'us-en'),
        ('o', 'json'),
        ('q', topic),
        ('vqd', searchObj.group(1)),
        ('f', ',,,'),
        ('p', '1'),
        ('v7exp', 'a'),
    )

    requestUrl = url + "i.js"

    def limit_reached():
        # "is not None" so that the default (no limit) does not end the scrape
        # on the very first link.
        return max_images is not None and len(link_list) >= max_images

    link_list = []
    print("Scraping links from DuckDuckGo")
    tq = tqdm(total=max_images)
    try:
        while True:
            data = None
            for _ in range(max_retries):
                try:
                    res = requests.get(requestUrl, headers=headers, params=params)
                    data = json.loads(res.text)
                    break
                except ValueError:
                    # Hitting Url Failure - Sleep and Retry
                    time.sleep(5)

            if data is None:
                # Bounded retries: return what we have instead of looping forever.
                print("DuckDuckGo kept returning invalid JSON; stopping early")
                return link_list

            for link in extract_json(data.get("results", []), exts):
                if limit_reached():
                    return link_list
                link_list.append(link)
                tq.update(1)

            # Stop before requesting another page that would not be used.
            if limit_reached() or "next" not in data:
                return link_list

            requestUrl = url + data["next"]
    finally:
        tq.close()

## Link scraping (Flickr)

In [5]:
def links_from_flickr(topic, max_images):
    """Collect image URLs for ``topic`` from the Flickr search API.

    Args:
        topic: Free-text query (matched against title and tags by Flickr).
        max_images: Maximum number of URLs to return; ``None`` means no limit.

    Returns:
        list: One URL per photo, using the first available size from ``SIZES``.
    """
    # NOTE(review): API credentials should not live in the notebook. They are
    # read from the environment first; the literals remain only as a fallback so
    # existing runs keep working and should be rotated and removed.
    KEY = os.environ.get("FLICKR_API_KEY", '88a8660edd2e770b1b00e878af174879')
    SECRET = os.environ.get("FLICKR_API_SECRET", 'f3063c276e3ad859')

    SIZES = ["url_o", "url_k", "url_h", "url_l", "url_c"]  # in order of preference

    # Flickr size keys, largest to smallest:
    # - url_o: Original (4520 x 3229)
    # - url_k: Large 2048 (2048 x 1463)
    # - url_h: Large 1600 (1600 x 1143)
    # - url_l: Large 1024 (1024 x 732)
    # - url_c: Medium 800 (800 x 572)
    # - url_z: Medium 640 (640 x 457)
    # - url_m: Medium 500 (500 x 357)
    # - url_n: Small 320 (320 x 229)
    # - url_s: Small 240 (240 x 171)
    # - url_t: Thumbnail (100 x 71)
    # - url_q: Square 150 (150 x 150)
    # - url_sq: Square 75 (75 x 75)

    extras = ','.join(SIZES)
    flickr = flickrapi.FlickrAPI(KEY, SECRET)
    photos = flickr.walk(text=topic,  # it will search by image title and image tags
                            extras=extras,  # get the urls for each size we want
                            privacy_filter=1,  # search only for public photos
                            per_page=50,
                            sort='relevance')  # we want what we are looking for to appear first
    urls = []

    print("Scraping links from Flickr")
    tq = tqdm(total=max_images)
    try:
        for photo in photos:
            if max_images is not None and len(urls) >= max_images:
                break
            # Walk the sizes in preference order and keep the first one present.
            for size in SIZES:
                url = photo.get(size)
                if url:
                    urls.append(url)
                    tq.update(1)
                    break
    finally:
        tq.close()

    return urls


## Download function

In [6]:
def thread_download(item):
    """Download one image unless it is already cached on disk.

    Args:
        item: Dict with keys ``"link"`` (URL), ``"folder"`` (target directory)
            and ``"service"`` (``"ddg"`` or ``"flickr"``).

    The file is named ``image_<md5 of link>.<ext>`` where ``<ext>`` is the text
    after the last dot of the link.

    Raises:
        ValueError: For an unknown ``service``.
        requests.RequestException: On network errors, timeouts or HTTP error
            status codes.
    """
    link = item["link"]
    folder = item["folder"]
    service = item["service"]
    link_hash = hashlib.md5(link.encode("utf-8")).hexdigest()
    ext = link.split(".")[-1].lower()
    fname = "image_{}.{}".format(link_hash, ext)

    path = os.path.join(folder, fname)

    if os.path.isfile(path):
        return

    if service == "ddg":
        response = requests.get(link, allow_redirects=True, timeout=30)
    elif service == "flickr":
        response = requests.get(link, stream=True, timeout=30)
    else:
        raise ValueError("Unknown download service: {!r}".format(service))

    # Do not store HTML error pages as images.
    response.raise_for_status()

    # Fetch the body BEFORE opening the target: opening first leaves an empty
    # file behind on failure, which later counts as "already downloaded".
    content = response.content

    with open(path, 'wb') as out_file:
        out_file.write(content)

def download(links, folder, service="flickr"):
    """Download every URL in ``links`` into ``folder`` via ``thread_download``.

    ``service`` is passed through unchanged in each work item.
    """
    jobs = [{"link": url, "folder": folder, "service": service} for url in links]
    thread_it(thread_download, jobs, WORKERS=None)

## Define various variables
#### This includes all paths for image folders.

In [9]:
# Task configuration.
# Working directories (relative paths; the trailing "/" matters because later
# cells build glob patterns by plain string concatenation, e.g. data_dir+"*.jpg"):
#   compare_dir  - reference images downloaded from DuckDuckGo
#   data_dir     - candidate images downloaded from Flickr
#   consider_dir - destination for images moved by move_threshold_items
compare_dir = "compare_set/"
data_dir = "images/"
consider_dir = "consider/"

# Make sure every working directory exists before anything is downloaded.
dirs = [compare_dir, data_dir, consider_dir]
for path in dirs:
    create_folder(path)

# NOTE(review): CPUs is not referenced by any cell visible in this notebook.
CPUs = multiprocessing.cpu_count()
data_num = 3000     # number of links requested from Flickr
compare_num = 100   # number of links requested from DuckDuckGo
topic = "eiffel tower"  # search query used for both services

## Download the sets
#### Download the comparison set and count number of downloaded files 

In [10]:
# Download comparison set
exts = ["jpg", "jpeg"]
links = links_from_ddg(topic, max_images = compare_num, exts = exts)
if links == -1:
    # links_from_ddg returns -1 (not a list) when the search token could not be
    # parsed; passing that on to download() would fail with a TypeError.
    print("Could not obtain a DuckDuckGo search token; no links scraped")
    links = []
download(links, compare_dir, service="ddg")
# Counts every file currently in the folder (including leftovers from earlier
# runs), so the number can exceed compare_num.
file_num = len(glob.glob(compare_dir+"*", recursive=True))
print("Downloaded {} images for the comparison set".format(file_num))

Scraping links from DuckDuckGo


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Downloaded 172 images for the comparison set


#### Download the data set and count number of downloaded files 

In [11]:
# Download data set
links = links_from_flickr(topic, max_images=data_num)
download(links, data_dir, service="flickr")
# Counts every file currently in the folder, not only the files fetched by this
# run; recursive=True has no effect here because the pattern contains no "**".
file_num = len(glob.glob(data_dir+"*", recursive=True))
print("Downloaded {} images for the data set".format(file_num))

Scraping links from Flickr


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))

Downloaded 2979 images for the data set


## Clean dataset formats
#### Rename all files to ".jpg" file format

In [None]:
def img_rename(image):
    """Rename ``image`` from ``<name>.jpeg`` to ``<name>.jpg``; other files are left alone.

    Args:
        image: Path of the file to rename.
    """
    # os.path.splitext only splits at the final extension. Splitting the whole
    # path on "." breaks as soon as a directory or file name contains a dot
    # (e.g. "./images/a.jpeg" or "set.v1/a.jpeg").
    base, ext = os.path.splitext(image)
    if ext == ".jpeg":
        shutil.move(image, base + ".jpg")
        
def images_rename(path):
    """Run ``img_rename`` on every file matching ``path + "*.jpeg"``."""
    pattern = path + "*.jpeg"
    jpeg_files = glob.glob(pattern, recursive=True)
    thread_it(img_rename, jpeg_files)

# Normalise extensions in both sets: later cells only glob for "*.jpg".
images_rename(compare_dir)
images_rename(data_dir)



#### Detect corruption

In [None]:
from skimage import io

def verify_thread(image):
    """Try to read ``image`` with ``io.imread``.

    Returns:
        ``None`` when the file could be read, otherwise ``(image, exception)``.
    """
    try:
        io.imread(image)
    except Exception as error:
        return (image, error)
    return None

def verify_images(path):
    """Return the ``(path, exception)`` pairs of unreadable files matching ``path + "*.jpg"``."""
    candidates = glob.glob(path + "*.jpg", recursive=True)
    return thread_it_return(verify_thread, candidates)

# Each list holds (path, exception) tuples for files that could not be read.
corrupt_compare = verify_images(compare_dir)
print("Compare directory:")
pprint(corrupt_compare)

corrupt_data = verify_images(data_dir)
print("Data directory:")
pprint(corrupt_data)

In [None]:
def img_remove_thread(item):
    """Delete the file whose path is the first element of ``item``."""
    target = item[0]
    os.remove(target)

def remove_corrupt(items):
    """Delete, in parallel, the file named by the first element of each entry in ``items``."""
    thread_it(thread_function=img_remove_thread, my_list=items)
    
# Delete the unreadable files found by the verification cell above.
remove_corrupt(corrupt_compare)
remove_corrupt(corrupt_data)

## Preprocessing

#### Get blur variance average for the comparison set

In [None]:
def blur_avg_thread(image):
    """Return the variance of the Laplacian of ``image`` loaded as grayscale."""
    gray = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
    laplacian = cv2.Laplacian(gray, cv2.CV_64F)
    return laplacian.var()

def get_blur_average(path, multiplier=1):
    """Average the Laplacian variance over all files matching ``path + "*.jpg"``.

    Args:
        path: Directory prefix (with trailing separator).
        multiplier: Factor applied to the average before it is returned.

    Returns:
        The mean of the per-image values from ``blur_avg_thread`` times ``multiplier``.

    Raises:
        ValueError: If no measurement is available (e.g. the folder is empty).
    """
    files = glob.glob(path+"*.jpg", recursive=True)
    thr = thread_it_return(blur_avg_thread, files)
    if not thr:
        # Fail with a readable message instead of a bare ZeroDivisionError.
        raise ValueError("No blur measurements for images matching {!r}".format(path + "*.jpg"))
    # Divide by the number of measurements actually returned:
    # thread_it_return drops None results, so len(files) can be larger.
    avg = sum(thr) / len(thr)
    return avg * multiplier

# Blur threshold: 75% of the comparison set's average Laplacian variance.
blur_avg = get_blur_average(compare_dir, multiplier=0.75)
print("The blur average is: ", blur_avg)


#### Get the images in the data set that are more blurry than a given threshold

In [None]:
def too_blurry_thread(item):
    """Return ``item["image"]`` when its Laplacian variance is below ``item["threshold"]``.

    Returns ``None`` for images at or above the threshold.
    """
    image_path, limit = item["image"], item["threshold"]
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    sharpness = cv2.Laplacian(gray, cv2.CV_64F).var()
    return image_path if sharpness < limit else None
                
def get_too_blurry(path, threshold):
    """List the files matching ``path + "*.jpg"`` that ``too_blurry_thread`` flags.

    Also prints how many of the inspected files were flagged.
    """
    files = glob.glob(path + "*.jpg")
    jobs = [{"image": image, "threshold": threshold} for image in files]
    too_blurry = thread_it_return(too_blurry_thread, jobs)

    summary = "{} out of {} images are blurry".format(len(too_blurry), len(files))
    print(summary)
    return too_blurry

# Paths of data-set images whose sharpness falls below the comparison-set threshold.
too_blurry = get_too_blurry(data_dir, blur_avg)

#### Remove the files referred to by the paths in the list

In [None]:
def remove_from_list(file_list):
    """Delete every path in ``file_list``, showing a progress bar.

    Paths that no longer exist are skipped, so re-running the cell after a
    partial or complete deletion does not fail.
    """
    tq = tqdm(total=len(file_list))
    try:
        for item in file_list:
            try:
                os.remove(item)
            except FileNotFoundError:
                # Already gone (e.g. the cell was run twice); nothing to do.
                pass
            tq.update(1)
    finally:
        tq.close()

In [None]:
# Destructive: permanently deletes the blurry data-set images from disk.
remove_from_list(too_blurry)

#### Get the maximum number of pixels in any image within the comparison and data sets

In [None]:
def max_resolution_thread(image):
    """Return the pixel count (width * height) of ``image``.

    Prints a warning when the image has fewer than 307200 pixels (640 * 480),
    the same minimum used when scraping links.
    """
    min_pixels = 307200  # 640 * 480

    img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)

    # Grayscale read, so the shape has exactly two entries: (height, width).
    height, width = img.shape

    # Count maximum resolution
    val = width * height

    # Show warning if problem
    if val < min_pixels:
        # Plain parentheses: "\(" is an invalid escape sequence that printed
        # literal backslashes.
        print("{} is too small ({})".format(image, val))

    return val

def get_max_resolution(path):
    """Return the largest pixel count among the files matching ``path + "*.jpg"``."""
    pixel_counts = thread_it_return(max_resolution_thread, glob.glob(path + "*.jpg"))
    return max(pixel_counts)


# Largest pixel count (width * height) found in each set.
max_resolution_compare = get_max_resolution(compare_dir)
print("Max resolution for compare: ", max_resolution_compare)
max_resolution_data = get_max_resolution(data_dir)
print("Max resolution for data: ", max_resolution_data)

## Data processing
#### Hashing images

In [None]:
def hash_thread(image):
    """Compute the difference hash of ``image`` and cache it next to the image.

    The hash object is pickled to ``<image without extension>.hash``. Nothing is
    computed when that file already exists.
    """
    # splitext instead of split("."): robust to dots elsewhere in the path.
    fname = os.path.splitext(image)[0] + ".hash"
    if os.path.isfile(fname):
        # Check the cache first so the image is not decoded and hashed for nothing.
        return

    with Image.open(image) as img:
        img_hash = imagehash.dhash(img)

    with open(fname, "wb") as out_file:
        pickle.dump(img_hash, out_file)

def compute_img_hashes(path):
    """Run ``hash_thread`` in parallel on every file matching ``path + "*.jpg"``."""
    jpg_files = glob.glob(path + "*.jpg")
    thread_it(hash_thread, jpg_files)

# Write a ".hash" file next to every ".jpg" in both sets.
compute_img_hashes(compare_dir)
compute_img_hashes(data_dir)


#### Compute hash distances for each image to each image

In [None]:
def hash_distance_thread(item):
    """Compute the distance from one image hash to every other hash and cache it.

    Args:
        item: Dict with ``"hashname"`` (path of this image's ``.hash`` file) and
            ``"files"`` (paths of all ``.hash`` files to compare against).

    The result, a dict ``{<other>.jpg: hash1 - hash2}``, is pickled to
    ``<hashname without extension>.dis``. Entries already present in an existing
    ``.dis`` file are kept and not recomputed.

    NOTE: the cache files are read with pickle; only use on files this notebook wrote.
    """
    hash_path = item["hashname"]
    # splitext instead of split("."): robust to dots elsewhere in the path.
    base = os.path.splitext(hash_path)[0]
    dis = base + ".dis"
    check = base + ".jpg"

    with open(hash_path, "rb") as fh:
        hash1 = pickle.load(fh)

    if os.path.isfile(dis):
        with open(dis, "rb") as fh:
            compute = pickle.load(fh)
    else:
        compute = {}

    for hashpath in item["files"]:
        try:
            image = os.path.splitext(hashpath)[0] + ".jpg"
            # Skip pairs already cached and the image itself.
            if image not in compute and image != check:
                with open(hashpath, "rb") as fh:
                    hash2 = pickle.load(fh)
                compute[image] = hash1 - hash2
        except Exception as e:
            # Best effort: one unreadable hash must not abort the others.
            print(e)

    with open(dis, "wb") as fh:
        pickle.dump(compute, fh)

def compute_hash_distance(path):
    """Run ``hash_distance_thread`` for every ``.hash`` file matching ``path + "*.hash"``.

    Every work item carries the complete list of hash files, so each image is
    compared against all others.
    """
    hash_files = glob.glob(path + "*.hash")
    jobs = [{"hashname": current, "files": hash_files} for current in hash_files]
    thread_it(hash_distance_thread, jobs)

# Write a ".dis" file (hash distances to every other image) per image in both sets.
compute_hash_distance(compare_dir)    
compute_hash_distance(data_dir)

#### Check for duplicates

In [None]:
def get_duplicate_images(path, threshold=10):
    """Find exact and near duplicates from the cached ``.dis`` distance files.

    Args:
        path: Directory prefix (with trailing separator) holding the ``.dis`` files.
        threshold: Distances strictly below this (and above 0) count as "close".

    Returns:
        tuple: ``(dup, close)`` lists of image paths. For a pair of matching
        images only one member is listed, so that removing the listed files
        keeps one copy.
    """
    dup, close = [], []

    # Sorted so the choice of which copy is listed does not depend on glob order.
    for dis_path in sorted(glob.glob(path + "*.dis")):
        with open(dis_path, "rb") as fh:
            distances = pickle.load(fh)
        # splitext instead of split("."): with a dot anywhere else in the path
        # the old form produced a name that never equals the stored keys, so
        # BOTH images of a duplicate pair ended up in the list.
        img = os.path.splitext(dis_path)[0] + ".jpg"

        for key, val in distances.items():
            if val == 0:
                if key not in dup and img not in dup:
                    dup.append(key)
            elif val < threshold:
                if key not in close and img not in close:
                    close.append(key)

    return dup, close

# "dup" lists hold images at hash distance 0; "close" lists hold images below
# the default threshold of get_duplicate_images.
compare_dup, compare_close = get_duplicate_images(compare_dir)
data_dup, data_close = get_duplicate_images(data_dir)

print("Compare duplicates:")
pprint(compare_dup)
print("Compare close:")
pprint(compare_close)

print("Data duplicates:")
pprint(data_dup)
print("Data close:")
pprint(data_close)


# The string below is disabled preview code; it is evaluated as a no-op expression.
"""
    for item in duplicates:
        show_img_by_path(item)
"""

#### Remove duplicates

In [None]:
# Destructive: deletes the exact duplicates (distance 0) from disk. Only the
# ".jpg" paths in the lists are removed; the "close" lists are left untouched.
remove_from_list(compare_dup)
remove_from_list(data_dup)

## Feature Extraction

In [None]:
# Register Pickle behaviour for feature points.
def _pickle_keypoints(point):
    """Reduce a keypoint to ``(cv2.KeyPoint, constructor_args)`` for ``copyreg``.

    The argument tuple is the coordinates from ``point.pt`` followed by size,
    angle, response, octave and class_id.
    """
    ctor_args = tuple(point.pt) + (
        point.size,
        point.angle,
        point.response,
        point.octave,
        point.class_id,
    )
    return cv2.KeyPoint, ctor_args

# Register pickle handler for KeyPoints
# Register the reducer for the concrete class of a KeyPoint instance, so that
# pickle.dump can serialise the keypoints stored in the ".pts" files.
copyreg.pickle(cv2.KeyPoint().__class__, _pickle_keypoints)

In [None]:
def extract_thread(item):
    """Detect keypoints and descriptors for one image and cache them on disk.

    Args:
        item: Dict with ``"image"`` (image path) and ``"detector"`` (an object
            with a ``detectAndCompute(img, mask)`` method).

    The result ``{"kp": ..., "des": ...}`` is pickled to
    ``<image without extension>.pts``; nothing happens when that file exists.
    Failures are printed, not raised.
    """
    image = item["image"]
    # splitext instead of split("."): robust to dots elsewhere in the path.
    points = os.path.splitext(image)[0] + ".pts"
    detector = item["detector"]

    if os.path.isfile(points):
        return

    try:
        img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
        kp, des = detector.detectAndCompute(img, None)
        data = {"kp": kp, "des": des}
        with open(points, "wb") as out_file:
            pickle.dump(data, out_file)
    except Exception as e:
        # Best effort: report and continue with the other images.
        print("{} failed because {}".format(image, e))

def feature_extraction(path, feature_matcher="ORB", points_num=8192):
    """Extract and cache features for every file matching ``path + "*.jpg"``.

    Args:
        path: Directory prefix (with trailing separator).
        feature_matcher: ``"SIFT"``, ``"SURF"`` or ``"ORB"``.
        points_num: Maximum number of features for SIFT and ORB. SURF has no
            such parameter, so it is ignored there.

    Raises:
        ValueError: For an unknown ``feature_matcher``.
    """
    # Get all images list.
    files = glob.glob(path+"*.jpg")

    if feature_matcher == "SIFT":
        detector = cv2.SIFT_create(nfeatures=points_num)
    elif feature_matcher == "SURF":
        # SURF lives in the contrib module and takes no "nfeatures" argument;
        # cv2.SURF_create(nfeatures=...) does not exist.
        detector = cv2.xfeatures2d.SURF_create()
    elif feature_matcher == "ORB":
        detector = cv2.ORB_create(nfeatures=points_num)
    else:
        # Fail early instead of handing None to every worker.
        raise ValueError("Unknown feature_matcher: {!r}".format(feature_matcher))

    print(detector)

    items = [{"image": image, "detector": detector} for image in files]

    thread_it(extract_thread, items)

In [None]:
# Detector settings; `features` is reused when the matcher is built.
points_num = 2048
features = "SIFT"
feature_extraction(compare_dir, feature_matcher=features, points_num=points_num)
feature_extraction(data_dir, feature_matcher=features, points_num=points_num)

## Feature Matching

In [None]:
def get_matcher(feature_matcher="ORB", bf_or_flann="BF"):
    """Build a descriptor matcher suited to the given feature type.

    ORB uses the Hamming norm, SIFT and SURF use L2; any other name leaves the
    norm as ``None``. ``bf_or_flann == "FLANN"`` selects a FLANN matcher
    (KD-tree index for L2, LSH index otherwise); every other value yields a
    brute-force matcher.
    """
    #https://www.programcreek.com/python/?code=NetEase%2Fairtest%2Fairtest-master%2Fairtest%2Ftrash%2Ffind_obj.py
    norm_by_feature = {
        "ORB": cv2.NORM_HAMMING,
        "SIFT": cv2.NORM_L2,
        "SURF": cv2.NORM_L2,
    }
    norm = norm_by_feature.get(feature_matcher)

    if bf_or_flann != "FLANN":
        return cv2.BFMatcher(norm)

    if norm == cv2.NORM_L2:
        FLANN_INDEX_KDTREE = 1
        flann_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
    else:
        FLANN_INDEX_LSH = 6
        flann_params = dict(
            algorithm=FLANN_INDEX_LSH,
            table_number=6,       # 12
            key_size=12,          # 20
            multi_probe_level=1,  # 2
        )
    # bug : need to pass empty dict (#1329)
    return cv2.FlannBasedMatcher(flann_params, {})


def match_within_path(path, matcher, ratio_test=False):
    """Match every image in ``path`` against every other image in ``path``.

    For each image ``<base>.jpg`` the descriptors are read from ``<base>.pts``
    and the match counts are stored in ``<base>.mch`` as ``{other.jpg: count}``.
    A pair is matched only once; the count is mirrored into both files.

    Args:
        path: Directory prefix (with trailing separator).
        matcher: Object with a ``knnMatch(des1, des2, k=2)`` method.
        ratio_test: When true, count only matches passing the 0.75 ratio test;
            otherwise count every kNN result row.

    NOTE: the cache files are read with pickle; only use on files this notebook wrote.
    """
    files = glob.glob(path+"*.jpg")

    tq = tqdm(total=len(files))
    try:
        for img1 in files:
            # splitext instead of split("."): robust to dots elsewhere in the path.
            base1 = os.path.splitext(img1)[0]
            jpg1 = base1+".jpg"
            with open(base1+".pts", "rb") as fh:
                features1 = pickle.load(fh)

            if os.path.isfile(base1+".mch"):
                with open(base1+".mch", "rb") as fh:
                    matches1 = pickle.load(fh)
            else:
                matches1 = {}

            for img2 in files:
                # Skip if same image (compare by value, not identity)
                if img1 == img2:
                    continue

                base2 = os.path.splitext(img2)[0]
                jpg2 = base2+".jpg"

                if os.path.isfile(base2+".mch"):
                    with open(base2+".mch", "rb") as fh:
                        matches2 = pickle.load(fh)
                else:
                    matches2 = {}

                # If either were matched against the other, fill out and skip
                if jpg1 in matches2:
                    matches1[jpg2] = matches2[jpg1]
                    continue
                if jpg2 in matches1:
                    matches2[jpg1] = matches1[jpg2]
                    # Persist the mirrored entry; otherwise it is lost.
                    with open(base2+".mch", "wb") as fh:
                        pickle.dump(matches2, fh)
                    continue

                # Only load the second descriptor set when it is really needed.
                with open(base2+".pts", "rb") as fh:
                    features2 = pickle.load(fh)

                # Read computed data.
                des1 = features1["des"]
                des2 = features2["des"]
                try:
                    matches_data = matcher.knnMatch(des1, des2, k=2)
                except Exception as e:
                    print(e)
                    matches_data = []

                if ratio_test:
                    # Apply ratio test; rows with fewer than two neighbours
                    # cannot be tested and are not counted.
                    count = sum(
                        1 for pair in matches_data
                        if len(pair) == 2 and pair[0].distance < 0.75*pair[1].distance
                    )
                else:
                    count = len(matches_data)

                matches1[jpg2] = count
                matches2[jpg1] = count

                with open(base2+".mch", "wb") as fh:
                    pickle.dump(matches2, fh)

            with open(base1+".mch", "wb") as fh:
                pickle.dump(matches1, fh)
            tq.update(1)
    finally:
        tq.close()


def match_features_other_path(path, other_path, matcher, ratio_test=False):
    """Match every image with features in ``path`` against those in ``other_path``.

    For each ``<base>.pts`` in ``path`` the match counts against every
    ``.pts`` file in ``other_path`` are stored in ``<base>.mch`` as
    ``{other.jpg: count}``. Pairs already present in an existing ``.mch`` file
    are not recomputed.

    Args:
        path: Directory prefix of the images to rate (with trailing separator).
        other_path: Directory prefix of the images to match against.
        matcher: Object with a ``knnMatch(des1, des2, k=2)`` method.
        ratio_test: When true, count only matches passing the 0.75 ratio test.

    NOTE: the cache files are read with pickle; only use on files this notebook wrote.
    """
    # Get pre-computed images list.
    files = glob.glob(path+"*.pts")
    other = glob.glob(other_path+"*.pts")

    tq = tqdm(total=len(files))
    try:
        for img1 in files:
            # splitext instead of split("."): robust to dots elsewhere in the path.
            base1 = os.path.splitext(img1)[0]

            with open(base1+".pts", "rb") as fh:
                features1 = pickle.load(fh)
            if os.path.isfile(base1+".mch"):
                with open(base1+".mch", "rb") as fh:
                    matches = pickle.load(fh)
            else:
                matches = {}

            for img2 in other:
                # Skip if same image (compare by value, not identity)
                if img1 == img2:
                    continue

                base2 = os.path.splitext(img2)[0]
                jpg2 = base2+".jpg"

                if jpg2 in matches:
                    continue

                with open(base2+".pts", "rb") as fh:
                    features2 = pickle.load(fh)

                # Read computed data.
                des1 = features1["des"]  # Actual set image
                des2 = features2["des"]  # Compare set image
                try:
                    matches_data = matcher.knnMatch(des1, des2, k=2)
                except Exception as e:
                    # Report the failure like match_within_path does.
                    print(e)
                    matches_data = []

                if ratio_test:
                    # Apply ratio test; rows with fewer than two neighbours
                    # cannot be tested and are not counted.
                    matches[jpg2] = sum(
                        1 for pair in matches_data
                        if len(pair) == 2 and pair[0].distance < 0.75*pair[1].distance
                    )
                else:
                    matches[jpg2] = len(matches_data)

            with open(base1+".mch", "wb") as fh:
                pickle.dump(matches, fh)
            tq.update(1)
    finally:
        tq.close()

In [None]:
# Brute-force matcher for the feature type chosen in the extraction cell.
matcher = get_matcher(feature_matcher=features, bf_or_flann="BF")
print(matcher)

# Comparison images against each other, then data images against the comparison set.
match_within_path(compare_dir, matcher, ratio_test=True)
match_features_other_path(data_dir, compare_dir, matcher, ratio_test=True)

In [None]:
def load_and_total_matches(path):
    """Load the cached match counts of every image matching ``path + "*.jpg"``.

    Returns:
        dict: ``{image_path: contents of <image without extension>.mch}``.

    Raises:
        FileNotFoundError: If an image has no ``.mch`` file.

    NOTE: the cache files are read with pickle; only use on files this notebook wrote.
    """
    matches = {}
    for image_path in glob.glob(path+"*.jpg"):
        # splitext instead of split("."): robust to dots elsewhere in the path.
        match_path = os.path.splitext(image_path)[0] + ".mch"
        with open(match_path, "rb") as fh:
            matches[image_path] = pickle.load(fh)
    return matches

# {image path: {other image path: match count}} for each set.
compare_matches = load_and_total_matches(compare_dir)
data_matches = load_and_total_matches(data_dir)

# NOTE(review): prints one entry per image pair, which can be very long.
pprint(data_matches)

In [None]:
def total_matches(matches, do_print=False):
    """Sum the match counts of every image.

    Args:
        matches: ``{image: {other_image: count}}``.
        do_print: Pretty-print the totals when true.

    Returns:
        dict: ``{image: sum of its counts}`` (0 for an image without entries).
    """
    totals = {image: sum(counts.values()) for image, counts in matches.items()}

    if do_print:
        pprint(totals)
    return totals

# Total number of matches per image, summed over all its counterparts.
compare_total_matches = total_matches(compare_matches)
data_total_matches = total_matches(data_matches)

pprint(compare_total_matches)
pprint(data_total_matches)

## Calculate threshold from comparison set 

In [None]:
def get_thr_from_compare(path, totals, multipler=1):
    """Derive a per-image match threshold from the comparison set.

    For every image that appears as a counterpart in ``totals``
    (``{image: {other_image: count}}``), the threshold is the mean of
    ``count * multipler`` over all entries naming it. ``path`` is accepted but
    not used.

    Actual resolution differences are assumed not to matter with
    scale-invariant descriptors; the multiplier exists because the reference
    images are the best case and real data will score lower.
    """
    scaled_counts = {}

    tq = tqdm(total=len(totals))
    for _, per_image in totals.items():
        for other, count in per_image.items():
            scaled_counts.setdefault(other, []).append(count * multipler)
        tq.update(1)

    # Average the collected values for each counterpart image.
    return {
        other: sum(values) / len(values)
        for other, values in scaled_counts.items()
    }



In [None]:
# Open question: does the Lowe ratio apply here?
#thr = thr*0.75

# Thresholds are taken from the comparison set, scaled to 85% of the average.
thr_per_image = get_thr_from_compare(compare_dir, compare_matches, multipler=0.85)
print(thr_per_image)


## Apply threshold to data directory set

In [None]:
def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of ``x``.

    Evaluated in a numerically stable way so that large negative inputs return
    0.0 instead of raising ``OverflowError`` from ``exp(-x)``.
    """
    if x >= 0:
        return 1 / (1 + exp(-x))
    # For negative x use the equivalent e**x / (1 + e**x); exp(x) cannot overflow here.
    e = exp(x)
    return e / (1 + e)


def get_threshold_items(totals, thr_per_image, show=False):
    """Rate every image by the share of comparison images it matches well.

    Args:
        totals: ``{image: {comparison_image: match_count}}``.
        thr_per_image: ``{comparison_image: threshold}``.
        show: Unused; kept for backward compatibility.

    Returns:
        dict: ``{image: rating}`` where the rating is the fraction of comparison
        images whose threshold the image's match count exceeds (0.0 when the
        image has no match entries).

    Raises:
        KeyError: If a comparison image has no entry in ``thr_per_image``.
    """
    values = {}
    tq = tqdm(total=len(totals))
    try:
        for img1, item in totals.items():
            # The image itself is not needed: its resolution was computed but
            # never used, at the cost of decoding every file.
            if item:
                # Count the comparison images whose threshold is exceeded.
                above = sum(1 for img2, val in item.items() if val > thr_per_image[img2])
                values[img1] = above / len(item)
            else:
                # No comparisons available: rate as 0 instead of dividing by zero.
                values[img1] = 0.0
            tq.update(1)
    finally:
        tq.close()

    return values

# Rate every data image against the per-comparison-image thresholds.
ratings = get_threshold_items(data_matches, thr_per_image, show=False)
#pprint(ratings)

# The cut-off is the mean rating; images rated below it are collected.
# NOTE(review): raises ZeroDivisionError when `ratings` is empty.
under_confidence = []
confidence = sum(ratings.values()) / len(ratings)
print("CONFIDENCE: {}".format(confidence))
for key, val in ratings.items():
    if val < confidence:
        print(key, "@", val, ":")
        under_confidence.append(key)
        #show_img_by_path(key)

In [None]:
def move_threshold_items(under, consider_folder, do_print=False):
    """Move every existing file listed in ``under`` into ``consider_folder``.

    Missing files are reported with a printed message; errors raised while
    moving are printed and do not stop the loop. ``do_print`` is accepted but
    not used.
    """
    for source in under:
        if not os.path.isfile(source):
            print(source, "doesn't exist")
            continue

        try:
            shutil.move(source, consider_folder)
        except Exception as error:
            print(error)

# Move the below-average images out of the data set for manual review.
move_threshold_items(under_confidence, consider_dir)
