# Basic implementation of Clustered Online Cumulative K-Means (CLOCK)

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm

## Importing data

In [17]:
# Pre-computed match tables, loaded as plain NumPy arrays (header row dropped by read_csv).
# `dist` and `fname` are indexed with the same [row, column] positions further down.
# NOTE(review): row i presumably belongs to query image i and column 0 to its best
# (smallest-distance) match -- confirm against the script that produced these CSVs.
dist = pd.read_csv('./resnet101_matches_distance.csv').values
fname = pd.read_csv('./resnet101_matches_filename.csv').values

In [38]:
# Training table indexed by its first CSV column; rows are looked up later with
# `.loc` using the matched filenames, and its values are fed to the clustering.
train_xy = pd.read_csv('train.csv', index_col=0)
# The 'id' column of imagenames.csv: one entry per query image. It drives the main
# loop and becomes the index of the exported result.
test_path = pd.read_csv('imagenames.csv')['id'].values

## Main functions

In [19]:
# Keep only the candidates whose distance stays close to that of the top match
def extract_match(i, thresh):
    '''Select the matched filenames of row `i` that are nearly as good as the top one

    Reads the module-level `dist` and `fname` arrays.

    Params
    ---
    - i: Row index into `dist` / `fname`
    - thresh: Allowed distance increase over `dist[i, 0]`; a candidate is kept
      when its distance is strictly below `dist[i, 0] + thresh`

    Returns
    ---
    - Array of shape (k, 1) with the kept filenames (np.argwhere yields a column
      of indices, so callers flatten the result)
    '''
    cutoff = dist[i, 0] + thresh
    keep = np.argwhere(dist[i] < cutoff)
    return fname[i, keep]

In [31]:
# Clustered Online Cumulative K-Means (CLOCK)
def onl_kmeans(data, fnames, max_clusters, max_range, min_size=2):
    '''Cluster the given points in a single online pass and pick the biggest cluster

    Points are visited in the order given. A point joins the cluster with the
    nearest centroid when it lies within `max_range` of it (the centroid is then
    updated as the running mean of its members); otherwise it starts a new cluster.

    Params
    ---
    - data: A list/array of images' coordinates, one point per entry
    - fnames: Corresponding list of the image names (same order as `data`)
    - max_clusters: Cluster limit for early stopping. Once this many clusters exist
      AND the largest one has at least `min_size` members, the next point that would
      open a new cluster ends the pass instead (remaining points are ignored).
      Putting 0 or -1 disables the early stop, so every point is processed
    - max_range: Max Euclidean distance from a centroid to be considered part of
      that cluster
    - min_size: Early stopping is only allowed once at least one cluster has reached
      this size; until then new clusters keep being created past `max_clusters`

    Returns
    ---
    - cluster_elements: Point coordinates of the chosen (largest) cluster, as given
    - cluster_filenames: Image filenames of the chosen cluster
    - cluster_central: The centroid of the chosen cluster (float array)

    On a tie in size, the cluster created first is returned.

    Raises
    ---
    - ValueError: If `data` is empty
    '''
    centroids = None        # (n_clusters, n_dims) float array of running means
    members = []            # per-cluster list of member coordinates
    member_names = []       # per-cluster list of member filenames
    sizes = []              # per-cluster member count
    for i, coord in enumerate(data):
        # Work on a float copy: with integer input the centroid array would
        # otherwise be integer too and silently truncate every running mean.
        point = np.array(coord, dtype=float)
        # Adding the first point as the first cluster central
        if centroids is None:
            centroids = np.array([point])
            members.append([coord])
            member_names.append([fnames[i]])
            sizes.append(1)
            continue
        # Euclidean distance from the point to each centroid
        distances = np.sum((centroids - point) ** 2, axis=1) ** 0.5
        nearest = int(np.argmin(distances))
        if distances[nearest] > max_range:
            # Too far from every cluster: either stop (limit reached and a big
            # enough cluster exists) or open a new cluster with this point.
            limit_reached = max_clusters > 0 and len(sizes) >= max_clusters
            if limit_reached and max(sizes) >= min_size:
                break
            centroids = np.append(centroids, [point], axis=0)
            members.append([coord])
            member_names.append([fnames[i]])
            sizes.append(1)
        else:
            # Belongs to the nearest cluster: fold the point into its running mean
            centroids[nearest] = (centroids[nearest] * sizes[nearest]
                                  + point) / (sizes[nearest] + 1)
            members[nearest].append(coord)
            member_names[nearest].append(fnames[i])
            sizes[nearest] += 1
    if centroids is None:
        raise ValueError('onl_kmeans needs at least one data point')
    # Return the coordinates, filenames, and center of the largest cluster
    biggest = int(np.argmax(sizes))
    return members[biggest], member_names[biggest], centroids[biggest]

## Running the algorithm and exporting the results

In [42]:
# Processing parameters based on the tuning method
threshold = 2       # allowed distance increase over the top match when picking candidates
max_clusters = 3    # cluster limit handed to onl_kmeans
max_radius = 1      # handed to onl_kmeans as max_range
min_size = 3        # handed to onl_kmeans as min_size

# One entry per query image: cluster coordinates, cluster filenames, cluster centroid
locs, fnames, centroids = [], [], []
for query_idx, _ in enumerate(tqdm(test_path)):
    # Candidate filenames for this query, flattened from (k, 1) to (k,)
    matched_names = extract_match(query_idx, threshold).flatten()
    # Look the candidates up in the training table to get their coordinates
    matched_xy = train_xy.loc[matched_names].values
    cluster_points, cluster_names, cluster_center = onl_kmeans(
        matched_xy, matched_names, max_clusters, max_radius, min_size)
    locs.append(cluster_points)
    fnames.append(cluster_names)
    centroids.append(cluster_center)

100%|██████████| 1200/1200 [00:01<00:00, 1042.02it/s]


In [15]:
# If export cluster image specs for SIFT, run this
# The context managers close each file even when pickling raises, so no
# handle is leaked and no half-written file stays open.
with open('./kmeans_coords.pckl', 'wb') as f:
    pickle.dump(locs, f)       # per-query coordinates of the chosen cluster

with open('./fnames_coords.pckl', 'wb') as f:
    pickle.dump(fnames, f)     # per-query filenames of the chosen cluster

In [43]:
# If taking centroid as output, run this
# One row per query image (indexed by its id), columns are the centroid coordinates
out = pd.DataFrame(data=centroids, index=test_path)
out.to_csv('CLOCK_2_3_1_3.csv', header=['x', 'y'], index_label='id')