In [121]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

In [7]:
# Load the precomputed match tables as plain NumPy arrays.
# Later cells index both by query row `i`; `fname[i, k]` values are used as
# `train_xy` index labels and `dist[i, 0]` is treated as the top match.
# NOTE(review): presumably both files share the same row/column layout — confirm.
dist, fname = (
    pd.read_csv(csv_path).values
    for csv_path in (
        './resnet101_matches_distance.csv',
        './resnet101_matches_filename.csv',
    )
)

In [18]:
# Training table: the first CSV column becomes the index, so rows can be
# looked up by label with `train_xy.loc[...]`.
train_xy = pd.read_csv(
    'train.csv',
    index_col=0,
)
# Identifiers of the query images, taken from the `id` column, as a NumPy array.
test_path = (
    pd.read_csv('imagenames.csv')
    .loc[:, 'id']
    .values
)

In [132]:
def extract_match(i, thresh):
    """Select the candidate matches of query row ``i`` that are close to the top one.

    A candidate is kept when its distance is strictly below the distance of
    the first candidate (``dist[i, 0]``) plus ``thresh``.

    Reads the module-level arrays ``dist`` and ``fname``.

    Parameters
    ----------
    i : int
        Row index into ``dist`` / ``fname``.
    thresh : number
        Allowed distance increase relative to the first candidate.

    Returns
    -------
    numpy.ndarray
        Entries of ``fname[i]`` at the selected positions. The positions come
        from ``np.argwhere``, so the result keeps a trailing axis of length 1
        when ``dist[i]`` is one-dimensional; callers flatten it as needed.
    """
    cutoff = dist[i, 0] + thresh
    selected = np.argwhere(dist[i] < cutoff)
    return fname[i, selected]

In [123]:
# Clustered Online Cumulative K-Means (COCK)
def onl_kmeans(data, max_clusters, max_range):
    """Cluster points in a single pass and return the centre of the largest cluster.

    Points are consumed in order. The first point seeds the first cluster.
    Each following point is either merged into the nearest cluster (when its
    Euclidean distance to that cluster's centre is at most ``max_range``),
    updating the centre as a running mean, or it opens a new cluster.

    Parameters
    ----------
    data : iterable of 1-D array-like
        Sequence of points, all with the same number of coordinates.
    max_clusters : int
        Maximum number of clusters. When a point would require one more
        cluster than this, processing stops and the remaining points are
        ignored.
    max_range : number
        Largest distance from a cluster centre at which a point still joins
        that cluster.

    Returns
    -------
    numpy.ndarray
        Float coordinates of the centre of the cluster with the most points
        (the earliest such cluster on ties, per ``np.argmax``).

    Raises
    ------
    ValueError
        If ``data`` contains no points.
    """
    cluster_centrals = None
    cluster_count = []
    for coord in data:
        # Work in floating point: with integer input the running-mean update
        # below would otherwise be truncated when written back into an
        # integer-typed centre array.
        coord = np.asarray(coord, dtype=float)
        # Adding the first point as the first cluster central
        if cluster_centrals is None:
            cluster_centrals = np.array([coord])
            cluster_count.append(1)
            continue
        # Get distance from point to each cluster
        distances = np.sum((cluster_centrals - coord) ** 2, axis=1) ** 0.5
        nearest = np.argmin(distances)
        # If point is far away from clusters, it's on its own cluster
        if distances[nearest] > max_range:
            # Stop when max number of clusters reached
            if cluster_centrals.shape[0] == max_clusters:
                break
            cluster_centrals = np.append(cluster_centrals, [coord], axis=0)
            cluster_count.append(1)
        # If not, it belongs to the cluster with the nearest central. Update that one
        else:
            members = cluster_count[nearest]
            cluster_centrals[nearest] = (
                cluster_centrals[nearest] * members + coord
            ) / (members + 1)
            cluster_count[nearest] = members + 1
    if cluster_centrals is None:
        # Fail with a clear message instead of the opaque argmax error.
        raise ValueError("onl_kmeans requires at least one data point")
    # Return the center of the largest cluster
    biggest_cluster = np.argmax(cluster_count)
    return cluster_centrals[biggest_cluster]

In [135]:
# Tunable parameters for the localisation run.
threshold = 5      # allowed distance increase over the top match (see extract_match)
max_clusters = 5   # cluster cap passed to onl_kmeans
max_radius = 7     # passed to onl_kmeans as max_range

# For every query image, look up the coordinates of its retained matches
# and keep the centre of the largest cluster among them.
locs = []
for i in tqdm(range(len(test_path))):
    matched_ids = extract_match(i, threshold).flatten()
    coords = train_xy.loc[matched_ids].values
    loc = onl_kmeans(coords, max_clusters, max_radius)
    locs.append(loc)

100%|██████████| 1200/1200 [00:01<00:00, 1146.78it/s]


In [136]:
# One row per query image, indexed by its identifier.
out = pd.DataFrame(locs, index=test_path)
# Write the result; the columns are labelled x and y in the file only,
# and the index column is written under the name `id`.
out.to_csv(
    'COCK_5_5_7.csv',
    index_label='id',
    header=['x', 'y'],
)