In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import cv2

In [2]:
# CNN feature matches
dist = pd.read_csv('./resnet101_matches_distance.csv').values
fname = pd.read_csv('./resnet101_matches_filename.csv').values

In [3]:
train_xy = pd.read_csv('validate_train.csv', index_col=0)
validate_xy =  pd.read_csv('validate.csv')
validate_path = validate_xy['id'].values
# Train dataset
# train_xy = pd.read_csv('train.csv', index_col=0)
# test_path = pd.read_csv('imagenames.csv')['id'].values

In [4]:
# Limit the candidate pictures to a limited decrease only, comparing to the top one
extract_match = lambda i, thresh: fname[i,np.argwhere(dist[i] < dist[i,0] + thresh)]

In [5]:
# Clustered Online Cumulative K-Means (CLOCK) 
def onl_kmeans(data, fnames, max_clusters, max_range, min_size = 2):
    '''Cluster the given data and pick the biggest cluster

    Params
    ---
    - data: A list of images' coordinates
    - fnames: Corresponding list of the image names
    - max_clusters: Max number of cluster to export. Putting 0 or -1 will get all available clusters
    - max_range: Max distance from centroid to be considered part of a cluster
    - min_size: The clustering will run until at least one cluster reach the specified minimum size

    Returns
    ---
    - cluster_elements: Point coordinates of the chosen cluster
    - cluster_filenames: Image filename of the chosen cluster
    - cluster_central: The centroid of the chosen cluster
    '''
    cluster_centrals = None
    cluster_elems = []
    cluster_filename = []
    cluster_count = []
    for i,coord in enumerate(data):
        # Adding the first point as the first cluster central
        if cluster_centrals is None:
            cluster_centrals = np.array([coord])
            cluster_elems.append([coord])
            cluster_filename.append([fnames[i]])
            cluster_count.append(1)
            continue
        # Get distance from point to each cluster
        distances = np.sum((cluster_centrals - coord)**2, axis=1)**0.5
        nearest = np.argmin(distances)
        # If point is far away from clusters, it's on its own cluster
        if distances[nearest] > max_range:
            # Stop when max number of clusters reached and have a big enough cluster
            if cluster_centrals.shape[0] >= max_clusters \
                and np.max(cluster_count) >= min_size \
                and max_clusters > 0 : break
            cluster_centrals = np.append(cluster_centrals,[coord], axis=0)
            cluster_elems.append([coord])
            cluster_filename.append([fnames[i]])
            cluster_count.append(1)
        # If not, it belongs to cluster with nearest centeal. Update that one
        else:
            cluster_centrals[nearest] = (cluster_centrals[nearest] 
                                       * cluster_count[nearest] 
                                       + coord) / (cluster_count[nearest]+1)
            cluster_elems[nearest].append(coord)
            cluster_filename[nearest].append(fnames[i])
            cluster_count[nearest] += 1
    # Return the coordinates, filenames, and center of the largest cluster
    biggest_cluster = np.argmax(cluster_count)
    return cluster_elems[biggest_cluster], \
           cluster_filename[biggest_cluster], \
           cluster_centrals[biggest_cluster]

In [6]:
# Processing parameters
threshold = 5
max_clusters = 5
max_radius = 8
min_size = 2


def CLOCK(threshold,max_clusters,max_radius,min_size):
    centroids = []
    for i,_ in enumerate(tqdm(validate_path)):
        img_idx = extract_match(i,threshold).flatten()
        coords = train_xy.loc[img_idx].values
        loc, f, centroid = onl_kmeans(coords, img_idx, max_clusters, max_radius, min_size)
        centroids.append(centroid)
    return centroids

In [7]:
# If export cluster image specs for SIFT, run this
centroids = CLOCK(5,5,8,2)

# If taking centroid as output, run this
out = pd.DataFrame(centroids,index=validate_path)
out.to_csv('CLOCK_5_5_8_2.csv',index_label='id',header=['x','y'])

100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1274.35it/s]


In [8]:
CLOCK_predict = pd.read_csv('CLOCK_5_5_8_2.csv')
# calculate MAE
MAE = np.abs(CLOCK_predict['x']-validate_xy['x']) +  np.abs(CLOCK_predict['y']-validate_xy['y'])
MAE = np.sum(MAE)/1300
MAE

9.429872038096999

In [10]:
def multiple_grid_search(func, params: dict, verbose: bool = True):
    '''Conduct grid search for a function given a list of parameters to try
    
    Parameters
    ----------
    func : callable
        Function to run grid search from.
    
    params : dictionary of list
        Non-empty dictionary of list, containing the parameter lists to be tried.

    verbose : bool
        True if best result is to be printed.
    
    Returns
    -------
    param_sets : ndarray
        The array of possible combinations that is run on.

    result_list: list
        List of MAE received through each round for each run.
    '''
    # Input error handling
    if not callable(func): raise ValueError("`func` is not a callable function")
    if bool(params) == False: raise ValueError("`params` must be non-empty")
    if any([bool(params.get(p)) == False for p in params]):raise ValueError("All elements of `params` must be non-empty")

    result_list= []
    # Getting a ndarray of all possible combinations
    param_sets = np.array(np.meshgrid(*(p for _, p in sorted(params.items())))).T.reshape(-1,len(params.keys()))
    # Invalid combinations that returned None
    invalid_param_index = []
    # Iterating through each combination
    for i,ps in enumerate(param_sets):
        threshold,max_clusters, max_radius,min_size = ps
        centroids = func(threshold,max_clusters, max_radius,min_size)
        out = pd.DataFrame(centroids,index=validate_path)
        out.columns = ['x','y']
        MAE = np.abs(out['x'].values-validate_xy['x']) + np.abs(out['y'].values-validate_xy['y'])
        MAE = np.sum(MAE)/1300
        result = MAE
        if result is None:
            invalid_param_index.append(i)
        else:
            result_list.append(result)
    # Filter out invalid combinations
    param_sets = param_sets[[i for i in range(param_sets.shape[0]) 
                             if i not in invalid_param_index]]
    # Get the best result
    index_max = np.argmin(result_list)
    if verbose:
        print(f'\rFinished. Best result: {result_list[index_max]} '
              f'at params = {param_sets[index_max]}')
    return param_sets, result_list

In [113]:
params = {'threshold': [1,3,5,7,9],'max_clusters' : [3,5,7], 'max_radius': [2,4,6,8],'min_size':[1,2,3]}
p,r = multiple_grid_search(CLOCK, params)

100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1856.71it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1667.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1575.73it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1155.64it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 899.65it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1335.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1392.91it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1340.53it/s]
100%|███████████████████████████████████

100%|███████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 714.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 717.29it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 725.96it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 706.32it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 600/600 [00:01<00:00, 443.53it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 600/600 [00:01<00:00, 450.74it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 600/600 [00:01<00:00, 448.96it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 600/600 [00:01<00:00, 453.58it/s]
100%|███████████████████████████████████

Finished. Best result: 8.385771795089475 at params = [3 2 3 1]





In [12]:
params = {'threshold': [2,3,4],'max_clusters' : [2,3,4], 'max_radius': [3,5,7],'min_size':[1]}
p,r = multiple_grid_search(CLOCK, params)

100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1493.55it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1674.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1633.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1506.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1501.58it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1240.08it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1440.85it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1354.24it/s]
100%|███████████████████████████████████

Finished. Best result: 8.497397436218206 at params = [2 3 1 3]



