## Import statements

In [1]:
import os
import sys 
import cv2
import time
import glob
import shutil
import pickle
import sqlite3
import IPython
import fnmatch
import copyreg
import imagehash
import subprocess
from math import exp
from PIL import Image
import multiprocessing
from pprint import pprint
import concurrent.futures
from tqdm.notebook import tqdm

## Helper functions

In [2]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in offsets)


def thread_it(thread_function, my_list, tq=True, WORKERS=None):
    """Run ``thread_function(item)`` for every item of ``my_list`` in a thread pool.

    Return values of ``thread_function`` are discarded. An exception raised by
    a worker is printed and does not stop the remaining items.

    Args:
        thread_function: Callable taking one item.
        my_list: Sized iterable of items to process.
        tq: Show a tqdm progress bar when truthy.
        WORKERS: Number of threads; defaults to the CPU count.
    """
    # Set worker number to CPU count
    if not WORKERS:
        WORKERS = multiprocessing.cpu_count()

    # Keep the bar in its own name so that tq=False never reaches .close().
    progress = tqdm(total=len(my_list)) if tq else None

    try:
        # One pool for the whole list: max_workers already bounds concurrency,
        # so splitting into chunks only added a barrier after every chunk.
        with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as executor:
            futures = [executor.submit(thread_function, item) for item in my_list]
            for future in concurrent.futures.as_completed(futures):
                error = future.exception()
                if error is not None:
                    # Surface worker failures instead of dropping them silently.
                    print(error)
                if progress is not None:
                    # Advance on completion, not on submission.
                    progress.update(1)
    finally:
        if progress is not None:
            progress.close()



def thread_it_return(thread_function, my_list, tq=True, WORKERS=None):
    """Run ``thread_function`` over ``my_list`` in a thread pool and collect results.

    Args:
        thread_function: Callable taking one item.
        my_list: Sized iterable of items to process.
        tq: Show a tqdm progress bar when truthy.
        WORKERS: Number of threads; defaults to the CPU count.

    Returns:
        List of the non-``None`` return values, in the order of ``my_list``.

    Raises:
        Whatever ``thread_function`` raises; the first failing item (in input
        order) propagates to the caller.
    """
    # Set worker number to CPU count
    if not WORKERS:
        WORKERS = multiprocessing.cpu_count()

    # Keep the bar in its own name so that tq=False never reaches .close().
    progress = tqdm(total=len(my_list)) if tq else None

    results = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as executor:
            # Submit everything first. Calling result() right after submit()
            # would block on each item and run the whole list sequentially.
            futures = [executor.submit(thread_function, item) for item in my_list]
            for future in futures:
                return_value = future.result()
                if return_value is not None:
                    results.append(return_value)
                if progress is not None:
                    progress.update(1)
    finally:
        if progress is not None:
            progress.close()

    return results
    

def show_img_by_path(a, resize=True, size=(320, 240)):
    """Open the image at path ``a`` and display it inline via IPython.

    When ``resize`` is truthy the image is resized to ``size`` before display.
    """
    # TODO: scale images before this point to reduce data sent over the network.
    picture = Image.open(a)
    to_show = picture.resize(size=size) if resize else picture
    IPython.display.display(to_show)


def create_folder(path):
    """Create directory ``path`` (including missing parents) if it does not exist.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(path, exist_ok=True)

## Define various variables
#### This includes all paths for image folders.

In [3]:
# Input folders; both are expected to exist already (from the "Download" step).
compare_dir = "compare_set/"
data_dir = "images/"


# Output folders that images are moved into by the filtering steps below.
consider_dir = "consider/"
problems = "problems/"
blurry = "blurry/"

# Make sure every output folder exists before anything is moved into it.
dirs = [consider_dir, blurry, problems]
for path in dirs:
    create_folder(path)

## Preprocessing

#### Get blur variance average for the comparison set

In [None]:
def blur_avg_thread(image):
    """Return the variance of the Laplacian of the image at path ``image``.

    A lower variance means fewer sharp edges, i.e. a blurrier image.

    Returns:
        The variance, or ``None`` when the file cannot be decoded.
    """
    img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals an unreadable file by returning None; passing that
        # on to cv2.Laplacian would abort the whole batch.
        print("Could not read image:", image)
        return None
    return cv2.Laplacian(img, cv2.CV_64F).var()

def get_blur_average(path, multiplier=1):
    """Return the mean blur variance of the ``*.jpg`` files in ``path``, scaled.

    Args:
        path: Folder prefix, including the trailing separator.
        multiplier: Factor applied to the mean before returning it.

    Returns:
        ``mean(variance) * multiplier``.

    Raises:
        ValueError: If no image produced a variance (empty or unreadable set).
    """
    # recursive=True only matters for patterns containing "**", so it is dropped.
    files = glob.glob(path + "*.jpg")
    variances = thread_it_return(blur_avg_thread, files)
    if not variances:
        raise ValueError("No readable .jpg images found in {!r}".format(path))
    # Average over the values actually returned; thread_it_return drops None.
    avg = sum(variances) / len(variances)
    return avg * multiplier

# Candidate blur threshold: 75% of the comparison set's mean Laplacian variance.
blur_avg = get_blur_average(compare_dir, multiplier=0.75)
print("The blur average is: ", blur_avg)


#### Get the images in the data set that are more blurry than a given threshold

In [None]:
def too_blurry_thread(item):
    """Return the image path when its blur variance is below the threshold.

    Args:
        item: Dict with keys ``"image"`` (file path) and ``"threshold"``.

    Returns:
        The image path if the variance of its Laplacian is below the
        threshold, otherwise ``None``. Unreadable files are reported and also
        yield ``None``.
    """
    image = item["image"]
    threshold = item["threshold"]
    img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals an unreadable file by returning None; passing that
        # on to cv2.Laplacian would abort the whole batch.
        print("Could not read image:", image)
        return None
    val = cv2.Laplacian(img, cv2.CV_64F).var()
    if val < threshold:
        return image
    return None
                
def get_too_blurry(path, threshold):
    """Return the ``*.jpg`` paths under ``path`` that ``too_blurry_thread`` flags.

    Each file is paired with ``threshold`` and evaluated via
    ``thread_it_return``; a summary line is printed.
    """
    files = glob.glob(path + "*.jpg")
    work_items = [{"image": jpg, "threshold": threshold} for jpg in files]
    flagged = thread_it_return(too_blurry_thread, work_items)

    print("{} out of {} images are blurry".format(len(flagged), len(files)))
    return flagged

# Background on variance-of-Laplacian blur detection:
# https://www.pyimagesearch.com/2015/09/07/blur-detection-with-opencv/
# The threshold derived from the comparison set is currently disabled:
#too_blurry = get_too_blurry(data_dir, blur_avg)
# Temporary override to check how a constant threshold of 200 performs.
too_blurry = get_too_blurry(data_dir, 200)
# Observation: the constant threshold works well on this data set.

#### Move the files referred to by the paths in the list out of the data set

In [None]:
def move_to(file_list, dest):
    """Move every path in ``file_list`` to ``dest``, showing a progress bar.

    A failing move is printed and skipped.

    Returns:
        ``True`` if at least one move raised an exception, else ``False``.
    """
    progress = tqdm(total=len(file_list))
    had_error = False
    for source_path in file_list:
        try:
            shutil.move(source_path, dest)
        except Exception as err:
            had_error = True
            print(err)
        finally:
            progress.update(1)
    progress.close()
    return had_error

In [None]:
# move_to() returns True when at least one move FAILED, so the list is only
# dropped once every file has been moved; otherwise it is kept for inspection.
if not move_to(too_blurry, blurry):
    del too_blurry

## Data processing
#### Hashing images

In [None]:
def hash_thread(image):
    """Compute the dHash of ``image`` and cache it next to the file as ``.hsh``.

    Nothing happens when the ``.hsh`` file already exists.

    Args:
        image: Path of the image file.
    """
    # splitext only strips the final extension; split(".")[-2] broke on paths
    # such as "./images/a.jpg" or "images/a.b.jpg".
    fname = os.path.splitext(image)[0] + ".hsh"
    if os.path.isfile(fname):
        return
    with Image.open(image) as img:
        img_hash = imagehash.dhash(img)
    with open(fname, "wb") as handle:
        pickle.dump(img_hash, handle)

def compute_img_hashes(path):
    """Run ``hash_thread`` on every ``*.jpg`` file under the ``path`` prefix."""
    thread_it(hash_thread, glob.glob(path + "*.jpg"))

# Hash both sets; hash_thread skips images whose .hsh file already exists.
compute_img_hashes(compare_dir)
compute_img_hashes(data_dir)


#### Compute the pairwise hash distance from each image to every other image in the same folder

In [None]:
def hash_distance_thread(item):
    """Compute the distances from one image hash to all others and cache them.

    The result is a dict ``{other_image_jpg_path: hash1 - hash2}`` pickled to
    a ``.dis`` file next to the ``.hsh`` file. An existing ``.dis`` file is
    loaded first and only missing entries are added.

    Args:
        item: Dict with ``"hashname"`` (path of this image's ``.hsh`` file)
            and ``"files"`` (paths of all ``.hsh`` files to compare against).
    """
    hashname = item["hashname"]
    # splitext only strips the final extension; split(".")[-2] broke on paths
    # that contain any other dot.
    stem = os.path.splitext(hashname)[0]
    dis = stem + ".dis"
    check = stem + ".jpg"

    # NOTE: pickle must only be used on files written by this notebook itself.
    with open(hashname, "rb") as handle:
        hash1 = pickle.load(handle)

    if os.path.isfile(dis):
        with open(dis, "rb") as handle:
            compute = pickle.load(handle)
    else:
        compute = {}

    for hashpath in item["files"]:
        image = os.path.splitext(hashpath)[0] + ".jpg"
        # Skip pairs that are already cached and the image itself.
        if image in compute or image == check:
            continue
        try:
            with open(hashpath, "rb") as handle:
                hash2 = pickle.load(handle)
            compute[image] = hash1 - hash2
        except Exception as e:
            # Best effort: one unreadable hash must not lose the other results.
            print(e)

    with open(dis, "wb") as handle:
        pickle.dump(compute, handle)

def compute_hash_distance(path):
    """Run ``hash_distance_thread`` for every ``*.hsh`` file under ``path``.

    Each work item carries one hash file plus the full list of hash files.
    """
    hash_files = glob.glob(path + "*.hsh")
    work_items = [
        {"hashname": hash_file, "files": hash_files} for hash_file in hash_files
    ]
    thread_it(hash_distance_thread, work_items)

# Distances are computed within each folder separately (compare vs. data).
compute_hash_distance(compare_dir)
compute_hash_distance(data_dir)

#### Check for duplicates

In [None]:
def get_duplicate_images(path, threshold=10):
    """Collect duplicate and near-duplicate images from the ``.dis`` files.

    For every pair with distance 0 only one of the two images is reported, so
    that moving the reported images away keeps one copy. The same rule is
    applied to pairs with ``0 < distance < threshold``.

    Args:
        path: Folder prefix, including the trailing separator.
        threshold: Exclusive upper bound of the distance for "close" images.

    Returns:
        Tuple ``(dup, close)`` of lists of image paths, in discovery order.
    """
    dup, close = [], []
    # Shadow sets give O(1) membership tests; the lists keep the order.
    dup_seen, close_seen = set(), set()

    # Sorted so the choice of which image of a pair is reported is reproducible.
    for dis_path in sorted(glob.glob(path + "*.dis")):
        # NOTE: pickle must only be used on files written by this notebook itself.
        with open(dis_path, "rb") as handle:
            distances = pickle.load(handle)
        img = os.path.splitext(dis_path)[0] + ".jpg"

        for key, val in distances.items():
            if val == 0:
                if key not in dup_seen and img not in dup_seen:
                    dup.append(key)
                    dup_seen.add(key)
            elif val < threshold:
                if key not in close_seen and img not in close_seen:
                    close.append(key)
                    close_seen.add(key)

    return dup, close

# Exact duplicates (distance 0) and near-duplicates (distance < 5) per folder.
compare_dup, compare_close = get_duplicate_images(compare_dir, threshold=5)
data_dup, data_close = get_duplicate_images(data_dir, threshold=5)

print("Compare duplicates:")
pprint(compare_dup)
print("Compare close:")
pprint(compare_close)

print("Data duplicates:")
pprint(data_dup)
print("Data close:")
pprint(data_close)

# Set to True to display the images inline for visual inspection.
show = False
if show:
    print("DUPLICATES IN COMPARE:")
    for item in compare_dup:
        show_img_by_path(item)
    print("DUPLICATES IN DATA:")
    for item in data_dup:
        show_img_by_path(item)
    print("CLOSE IN COMPARE:")
    for item in compare_close:
        show_img_by_path(item)
    print("CLOSE IN DATA:")
    for item in data_close:
        show_img_by_path(item)

#### Remove duplicates

In [None]:
# move_to() returns True when at least one move FAILED, so each list is only
# dropped once all of its files have been moved; otherwise it is kept.
if not move_to(compare_dup, problems):
    del compare_dup
if not move_to(data_dup, problems):
    del data_dup

## Feature Extraction (Comparison set)

In [None]:
# Extract SIFT features for the comparison set into the COLMAP database.
cmd = r"colmap feature_extractor --database_path ./colmap_folder/colmap.sqlite3 --image_path compare_set/ --SiftExtraction.max_num_features 2048"
process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
# Only stdout is piped, so `error` is always None here.
output, error = process.communicate()
# Guard against empty output; indexing [-1] on no lines raises IndexError.
if output.strip():
    print(output.splitlines()[-1])
# Do not claim success when colmap exited with an error.
if process.returncode == 0:
    print("Finished extracting features from Comparison set")
else:
    print("colmap feature_extractor failed with exit code", process.returncode)

## Feature Matching (Comparison set)

In [None]:
# Match every comparison image against every other comparison image.
cmd = r"colmap exhaustive_matcher --database_path ./colmap_folder/colmap.sqlite3"
process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
# Only stdout is piped, so `error` is always None here.
output, error = process.communicate()
# Guard against empty output; indexing [-1] on no lines raises IndexError.
if output.strip():
    print(output.splitlines()[-1])
# Do not claim success when colmap exited with an error.
if process.returncode == 0:
    print("Finished matching features from Comparison set")
else:
    print("colmap exhaustive_matcher failed with exit code", process.returncode)

## Database Functions

In [4]:
def create_connection(db_file):
    """Return an in-memory copy of the SQLite database stored at ``db_file``.

    The file is opened, backed up into a ``:memory:`` database and closed
    again, so the returned connection is a snapshot: later changes to the file
    are not visible through it, and changes made through it are not written
    back to the file.

    Args:
        db_file: Path of the SQLite database file.

    Returns:
        Connection to the in-memory copy, or ``None`` if a ``sqlite3.Error``
        occurred (the error is printed).
    """
    source = None
    dest = None
    try:
        # NOTE: sqlite3's timeout is given in seconds.
        source = sqlite3.connect(db_file, timeout=30000)
        dest = sqlite3.connect(':memory:')
        source.backup(dest)
    except sqlite3.Error as e:
        # `Error` alone is not defined in this file; the sqlite3 base class is
        # what is meant.
        print(e)
        if dest is not None:
            dest.close()
            dest = None
    finally:
        # The on-disk connection is no longer needed once the copy exists.
        if source is not None:
            source.close()

    return dest

def delete_from(conn, table, column_name, where_value):
    """Delete all rows of ``table`` where ``column_name`` equals ``where_value``.

    Args:
        conn: Open sqlite3 connection.
        table: Table name (inserted into the SQL text; must be trusted).
        column_name: Column name (inserted into the SQL text; must be trusted).
        where_value: Value to match; passed as a bound parameter.
    """
    # Identifiers cannot be bound, but the value can. Binding it keeps values
    # containing quotes (e.g. file names) from breaking the statement.
    sql = "DELETE FROM {} WHERE {}=?".format(table, column_name)
    cur = conn.cursor()
    cur.execute(sql, (where_value,))
    conn.commit()

def select_what_from_where(conn, what, table, where_name, where_value):
    """Return the first row of ``SELECT what FROM table WHERE where_name = value``.

    Args:
        conn: Open sqlite3 connection.
        what: Column list (inserted into the SQL text; must be trusted).
        table: Table name (inserted into the SQL text; must be trusted).
        where_name: Column to filter on (inserted into the SQL text).
        where_value: Value to match; passed as a bound parameter.

    Returns:
        The first matching row as a tuple, or ``None`` if nothing matches.
    """
    # The body previously referenced an undefined ``column_name`` instead of
    # the ``where_name`` parameter and therefore always raised NameError.
    sql = "SELECT {} FROM {} WHERE {}=?".format(what, table, where_name)
    cur = conn.cursor()
    row = cur.execute(sql, (where_value,)).fetchone()
    conn.commit()
    return row

def select_what_from(conn, what, table):
    """Return every row of ``SELECT what FROM table`` as a list of tuples.

    ``what`` and ``table`` are inserted into the SQL text as given. The
    connection is committed after executing and again after fetching.
    """
    query = r"SELECT {} FROM {}".format(what, table)
    cursor = conn.cursor()
    cursor.execute(query)
    conn.commit()
    fetched = cursor.fetchall()
    conn.commit()
    return fetched

def decrement_cameras(conn):
    """Lower the ``sqlite_sequence`` counter of the ``cameras`` table by one."""
    cursor = conn.cursor()
    cursor.execute(r"UPDATE sqlite_sequence SET seq = seq - 1 WHERE name='cameras'")
    conn.commit()

def decrement_images(conn):
    """Lower the ``sqlite_sequence`` counter of the ``images`` table by one."""
    cursor = conn.cursor()
    cursor.execute(r"UPDATE sqlite_sequence SET seq = seq - 1 WHERE name='images'")
    conn.commit()


In [5]:
# https://colmap.github.io/database.html
def pair_id_to_image_ids(pair_id):
    """Split a COLMAP ``pair_id`` into its two image ids.

    Inverse of ``pair_id = 2147483647 * image_id1 + image_id2``.

    Returns:
        Tuple ``(image_id1, image_id2)`` of integers for an integer input.
    """
    # divmod keeps the ids as ints; true division ("/") returned image_id1 as
    # a float.
    image_id1, image_id2 = divmod(pair_id, 2147483647)
    return image_id1, image_id2

def load_matches(conn):
    """Build a symmetric ``{image_name: {other_name: num_matches}}`` mapping.

    Image names come from the ``images`` table and match counts from the
    ``rows`` column of the ``matches`` table. The connection is closed before
    returning.
    """
    id_to_img = {
        img_id: name
        for img_id, name in select_what_from(conn, "image_id, name", "images")
    }
    matches = {name: {} for name in id_to_img.values()}

    for pair_id, num_matches in select_what_from(conn, "pair_id, rows", "matches"):
        first_id, second_id = pair_id_to_image_ids(pair_id)
        first_name = id_to_img[first_id]
        second_name = id_to_img[second_id]
        # Store the count in both directions.
        matches[first_name][second_name] = num_matches
        matches[second_name][first_name] = num_matches

    conn.close()

    return matches

# Load the pairwise match counts of the comparison set.
conn = create_connection("./colmap_folder/colmap.sqlite3")
compare_matches = load_matches(conn)
# load_matches() already closes the connection; this second close is redundant.
conn.close()

In [6]:
def total_matches(matches):
    """Return the summed match count per image.

    Args:
        matches: Mapping ``{image: {other_image: num_matches}}``.

    Returns:
        Dict ``{image: sum of its num_matches}``; an image without any matches
        maps to 0.
    """
    return {
        image: sum(per_image.values()) for image, per_image in matches.items()
    }

compare_total_matches = total_matches(compare_matches)
# If an image does not match at all, either remove it from the comparison set
# (and rerun) or add more images to the set.
pprint(compare_total_matches)

{'image_010d209e7868c373442bd62dc19551a8.jpg': 4018,
 'image_02edbd010eac8a8076cb4e9fbfe40b0d.jpg': 4737,
 'image_03f2ce4031203591db026ea0b7f7a164.jpg': 229,
 'image_048203bccf979cc3bbc3a9cb525486d2.jpg': 151,
 'image_0647bf1c96468264221555ad19df45af.jpg': 4183,
 'image_075a0568bb3549140aeb743239f9fc50.jpg': 4114,
 'image_07ef018ed0ecd5d5b8970c5db570d2a8.jpg': 427,
 'image_0a267444ea542710f6d1261aec4cdede.jpg': 44,
 'image_0b74808c6f93d3eaa36d973a8aadb336.jpg': 6269,
 'image_0b89967e0767c7fa910b9f570612206d.jpg': 3802,
 'image_0b91b2443d738c7c17d5c0ef0214440d.jpg': 3438,
 'image_0b9815e25c221742e410ac401231c1f7.jpg': 2355,
 'image_0de2fbea71c4ef65b5b2b3209fa287e3.jpg': 435,
 'image_0ef489424c1de58a34d126b3780deb5a.jpg': 4986,
 'image_1041982b1a050e369bdcdef421ec7294.jpg': 298,
 'image_1706e115f358603dc116d5e4137e6092.jpg': 1507,
 'image_17128ba882609a83e64

## Calculate threshold from comparison set 

In [7]:
def get_thr_from_compare(totals, multipler=1):
    """Return a per-image threshold derived from the comparison set's matches.

    For every image that appears as a match partner, the threshold is the mean
    of its match counts, each scaled by ``multipler``.

    Args:
        totals: Mapping ``{image: {other_image: num_matches}}``.
        multipler: Factor applied to every match count before averaging.
            (The spelling is kept because callers pass it by keyword.)

    Returns:
        Dict ``{other_image: mean scaled match count}``.
    """
    scaled_per_image = {}

    tq = tqdm(total=len(totals))
    try:
        for per_image in totals.values():
            for img2, val in per_image.items():
                scaled_per_image.setdefault(img2, []).append(val * multipler)
            tq.update(1)
    finally:
        # The bar was previously never closed.
        tq.close()

    # Every list holds at least one value, so the division is safe.
    return {
        image: sum(values) / len(values)
        for image, values in scaled_per_image.items()
    }

In [8]:
# Per-image threshold: 85% of the mean match count within the comparison set.
thr_per_image = get_thr_from_compare(compare_matches, multipler=0.85)
pprint(thr_per_image)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=143.0), HTML(value='')))

{'image_010d209e7868c373442bd62dc19551a8.jpg': 24.051408450704216,
 'image_02edbd010eac8a8076cb4e9fbfe40b0d.jpg': 28.355281690140846,
 'image_03f2ce4031203591db026ea0b7f7a164.jpg': 1.370774647887324,
 'image_048203bccf979cc3bbc3a9cb525486d2.jpg': 0.9038732394366197,
 'image_0647bf1c96468264221555ad19df45af.jpg': 25.039084507042254,
 'image_075a0568bb3549140aeb743239f9fc50.jpg': 24.626056338028167,
 'image_07ef018ed0ecd5d5b8970c5db570d2a8.jpg': 2.555985915492957,
 'image_0a267444ea542710f6d1261aec4cdede.jpg': 0.2633802816901408,
 'image_0b74808c6f93d3eaa36d973a8aadb336.jpg': 37.52570422535211,
 'image_0b89967e0767c7fa910b9f570612206d.jpg': 22.758450704225353,
 'image_0b91b2443d738c7c17d5c0ef0214440d.jpg': 20.579577464788734,
 'image_0b9815e25c221742e410ac401231c1f7.jpg': 14.096830985915494,
 'image_0de2fbea71c4ef65b5b2b3209fa287e3.jpg': 2.6038732394366195,
 'image_0ef489424c1de58a

## Calculating number of matches for each image (Data set) to entire comparison set

In [53]:
# https://colmap.github.io/database.html
def image_ids_to_pair_id(image_id1, image_id2):
    """Combine two image ids into one pair id, independent of argument order.

    The smaller id is multiplied by 2147483647 and the larger id is added.
    """
    if image_id1 > image_id2:
        low, high = image_id2, image_id1
    else:
        low, high = image_id1, image_id2
    return 2147483647 * low + high

def pair_id_to_image_ids(pair_id):
    """Split a COLMAP ``pair_id`` into its two image ids.

    Inverse of ``pair_id = 2147483647 * image_id1 + image_id2``.

    Returns:
        Tuple ``(image_id1, image_id2)`` of integers for an integer input.
    """
    # divmod keeps the ids as ints; true division ("/") returned image_id1 as
    # a float.
    image_id1, image_id2 = divmod(pair_id, 2147483647)
    return image_id1, image_id2

def remove_img_from_db(conn, filename, compare_images, delete=False):
    """Remove one data image and everything that references it from the database.

    Deletes the image's rows in ``images``, ``descriptors`` and ``keypoints``,
    its camera when no other image uses it, and its ``matches`` /
    ``two_view_geometries`` rows against every comparison image.

    NOTE(review): ``select_from_where`` and ``select_all`` are not defined in
    the visible part of this file - confirm they exist before using this.

    Args:
        conn: Open sqlite3 connection.
        filename: Value of ``images.name`` for the image to remove.
        compare_images: Paths of the comparison images.
        delete: Currently unused.
    """
    # Retrieve img_id and cam_id for image to delete
    # (assumes columns 0 and 2 of an images row are image_id and camera_id).
    images_row = select_from_where(conn, "images", "name", filename)
    img_id = images_row[0]
    cam_id = images_row[2]

    # The camera may only be deleted if no other image references it.
    images_rows = select_all(conn, "images")
    only_cam_ref = not any(
        row[2] == cam_id and row[0] != img_id for row in images_rows
    )

    if only_cam_ref:
        delete_from(conn, "cameras", "camera_id", cam_id)
        decrement_cameras(conn)

    # Delete any images, descriptors and keypoints for one data image record.
    delete_from(conn, "images", "name", filename)
    delete_from(conn, "descriptors", "image_id", img_id)
    delete_from(conn, "keypoints", "image_id", img_id)

    # Delete all matches and two_view_geometries for one data image to all comparison images
    for compare_img in compare_images:
        compare_filename = os.path.basename(compare_img)
        try:
            return_val = select_from_where(conn, "images", "name", compare_filename)
            compare_img_id = return_val[0]
        except Exception:
            # No row for this comparison image: assume there are no matches
            # and skip it. Falling through would reuse the id of the previous
            # iteration (or hit an unbound name on the first one).
            continue

        pair_id = image_ids_to_pair_id(img_id, compare_img_id)

        delete_from(conn, "matches", "pair_id", pair_id)
        delete_from(conn, "two_view_geometries", "pair_id", pair_id)

    decrement_images(conn)

data_matches = {}
# NOTE: create_connection() returns an in-memory snapshot of the database file,
# so the connection has to be re-created after every colmap run below.
conn = create_connection("./colmap_folder/colmap.sqlite3")
data_images = glob.glob(data_dir+"*.jpg")
data_pair_matching = "./colmap_folder/pairs_to_match.txt"

# Retrieve all img_ids and filenames for comparison set.
# The caches must have been written while the database only contained the
# comparison set; delete the .p files to rebuild them.
if os.path.isfile("./compare_ids_to_img.p"):
    # The pickle was previously loaded into comparison_only_pair_ids, which
    # left compare_ids_to_img undefined in a fresh session.
    with open("./compare_ids_to_img.p", "rb") as f:
        compare_ids_to_img = pickle.load(f)
    print("LOADED compare_ids_to_img FROM PICKLE")
else:
    compare_ids_to_img = {}
    return_val = select_what_from(conn, "image_id, name", "images")
    for row in return_val:
        compare_img_id = row[0]
        compare_img_filename = row[1]
        compare_ids_to_img[compare_img_id] = compare_img_filename
    with open("./compare_ids_to_img.p", "wb") as f:
        pickle.dump(compare_ids_to_img, f)
print("Length of \"compare_ids_to_img\":", len(compare_ids_to_img.keys()))

# Get all comparison only pair_ids
if os.path.isfile("./comparison_pair_ids.p"):
    with open("./comparison_pair_ids.p", "rb") as f:
        comparison_only_pair_ids = pickle.load(f)
    print("LOADED comparison_only_pair_ids FROM PICKLE")
else:
    comparison_only_pair_ids = []
    return_val = select_what_from(conn, "pair_id", "matches")
    for row in return_val:
        comparison_only_pair_ids.append(str(row[0]))
    with open("./comparison_pair_ids.p", "wb") as f:
        pickle.dump(comparison_only_pair_ids, f)
print("Length of \"comparison_only_pair_ids\":",len(comparison_only_pair_ids))

# Write all image pairings for each data image to each comparison image but not to other data images.
# The file is opened once instead of once per line (and is closed properly).
with open(data_pair_matching, "a") as f:
    for data_img in data_images:
        data_filename = os.path.basename(data_img)
        for compare_img in compare_ids_to_img.values():
            f.write(data_filename + " " + compare_img + "\n")

cmd = r"colmap feature_extractor --database_path ./colmap_folder/colmap.sqlite3 --image_path ./images --SiftExtraction.max_num_features 2048"
process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
output, error = process.communicate()
p_status = process.wait()

# Refresh the snapshot so the newly extracted data images are visible.
if conn is not None:
    conn.close()
conn = create_connection("./colmap_folder/colmap.sqlite3")

# Retrieve all img_ids and filenames for data set
data_ids_to_img = {}
return_val = select_what_from(conn, "image_id, name", "images")
for row in return_val:
    data_img_id = row[0]
    data_img_filename = row[1]
    # If not a comparison image
    if data_img_id not in compare_ids_to_img:
        data_ids_to_img[data_img_id] = data_img_filename

# Match all image pairs for data set.
cmd = r"colmap matches_importer --database_path ./colmap_folder/colmap.sqlite3 --match_list_path ./colmap_folder/pairs_to_match.txt"
process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
output, error = process.communicate()
p_status = process.wait()
#print(output)

# Refresh the snapshot again so the imported matches are visible.
conn.close()
conn = create_connection("./colmap_folder/colmap.sqlite3")

# The pairs file is appended to, so it has to be removed before a repeat run.
os.remove(data_pair_matching)

# Get all match numbers
matches = {}
return_val = select_what_from(conn, "pair_id, rows", "matches")
#print(comparison_only_pair_ids)

# Set lookup instead of scanning the list for every row.
comparison_pair_id_set = set(comparison_only_pair_ids)

for row in return_val:
    pair_id = str(row[0])
    num_matches = row[1]

    if pair_id not in comparison_pair_id_set:
        img1, img2 = pair_id_to_image_ids(int(pair_id))
        img1, img2 = int(img1), int(img2)
        # Store as matches[data image][comparison image] = number of matches.
        if img1 in compare_ids_to_img:
            img1 = compare_ids_to_img[img1]
            img2 = data_ids_to_img[img2]
            if img2 not in matches:
                matches[img2] = {}
            matches[img2][img1] = num_matches
        elif img2 in compare_ids_to_img:
            img2 = compare_ids_to_img[img2]
            img1 = data_ids_to_img[img1]
            if img1 not in matches:
                matches[img1] = {}
            matches[img1][img2] = num_matches


#pprint(matches)

LOADED compare_ids_to_img FROM PICKLE
Length of "compare_ids_to_img": 143
LOADED comparison_only_pair_ids FROM PICKLE
Length of "comparison_only_pair_ids": 10153


KeyError: 'image_00a6536bd2115e634b95c14f5aefbc66.jpg'

## Apply threshold to data directory set

In [None]:
def get_threshold_items(totals, thr_per_image, show=False):
    """Rate every data image by how often it beats the comparison thresholds.

    Args:
        totals: Mapping ``{data_image: {comparison_image: num_matches}}``.
        thr_per_image: Mapping ``{comparison_image: threshold}``.
        show: Currently unused.

    Returns:
        Dict ``{data_image: rating}`` where the rating is the fraction of
        comparison images whose threshold the match count exceeds; 0 when no
        threshold is exceeded.

    Raises:
        KeyError: If a comparison image has no entry in ``thr_per_image``.
    """
    values = {}
    tq = tqdm(total=len(totals))
    try:
        for img1, item in totals.items():
            # Count the comparison images for which this data image has MORE
            # matches than that comparison image's threshold.
            above = sum(
                1 for img2, val in item.items() if val > thr_per_image[img2]
            )
            values[img1] = above / len(item) if above else 0
            tq.update(1)
    finally:
        # The bar was previously never closed.
        tq.close()

    return values

In [None]:
#print(matches)
ratings = get_threshold_items(matches, thr_per_image, show=False)
#pprint(ratings)
under_confidence = []
# The mean rating is the cut-off; images rated below it are flagged.
# NOTE(review): this division raises ZeroDivisionError when `ratings` is empty.
confidence = sum(ratings.values()) / len(ratings)
print("CONFIDENCE: {}".format(confidence))
for key, val in ratings.items():
    if val < confidence:
        print(key, "@", val)
        under_confidence.append(key)
        #show_img_by_path(key, size=(75,75))

print("{} out of {} images are under confident".format(len(under_confidence), len(ratings)))

In [None]:
def move_threshold_items(under, consider_folder, do_print=False):
    """Move the named files from ``data_dir`` into ``consider_folder``.

    ``under`` holds file names relative to the module-level ``data_dir``.
    Missing files and failed moves are printed and skipped. ``do_print`` is
    not used.
    """
    for name in under:
        source = data_dir + name
        if not os.path.isfile(source):
            print(source, "doesn't exist")
            continue
        try:
            shutil.move(source, consider_folder)
        except Exception as err:
            print(err)

move_threshold_items(under_confidence, consider_dir)

# Idea: merge the feature data of all comparison images into a single object;
# matching against it would yield one overall match percentage per image.