In [1]:
import os
import pickle

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# ResNet-101 retrieval results: one table of match distances and one of the
# matching filenames, loaded as plain NumPy arrays.
dist, fname = (
    pd.read_csv(csv_path).values
    for csv_path in ('./resnet101_matches_distance.csv',
                     './resnet101_matches_filename.csv')
)

In [20]:
# Pre-computed SIFT results: match scores and the matching filenames, loaded
# as plain NumPy arrays. `good` / `fgood` are read by the final clustering
# cell, so keep these names reserved for the loaded arrays.
good, fgood = (
    pd.read_csv(csv_path).values
    for csv_path in ('./sift_matches_distance.csv',
                     './sift_matches_filename.csv')
)

In [3]:
# Training coordinates, indexed by the first CSV column (used for .loc lookups below)
train_xy = pd.read_csv(filepath_or_buffer='train.csv', index_col=0)
# Identifiers of the test images, in the order they are processed
test_path = pd.read_csv(filepath_or_buffer='imagenames.csv').loc[:, 'id'].values

In [4]:
# Keep only the candidates whose distance is within `thresh` of the first (top) one
def extract_match(i, thresh):
    """Return the filenames of row ``i`` whose distance is below ``dist[i, 0] + thresh``.

    Reads the module-level ``dist`` and ``fname`` arrays. Because the column
    selection is built with ``np.argwhere``, the result has shape ``(k, 1)``;
    callers flatten it.
    """
    within_margin = dist[i] < dist[i, 0] + thresh
    return fname[i, np.argwhere(within_margin)]

In [5]:
# Clustered Online Cumulative K-Means (COCK)
def onl_kmeans(data, fnames, max_clusters, max_range, min_size = 2):
    """Cluster points in a single ordered pass and return the largest cluster.

    Each point either joins the cluster with the nearest centre (when it lies
    within ``max_range`` of it, Euclidean distance) or starts a new cluster.
    Cluster centres are maintained as running means of their members.

    Parameters
    ----------
    data : sequence of coordinate vectors, processed in order.
    fnames : sequence aligned with ``data``; ``fnames[i]`` labels ``data[i]``.
    max_clusters : maximum number of clusters that may be created.
    max_range : maximum distance to a centre for a point to join that cluster.
    min_size : size a cluster must have for the result to be accepted once
        ``max_clusters`` has been reached and another far-away point shows up.

    Returns
    -------
    (members, member_filenames, centre) of the most populated cluster, or
    ``(None, None, None)`` when ``data`` is empty or when the cluster limit is
    hit while no cluster has reached ``min_size``.
    """
    cluster_centrals = None
    cluster_elems = []
    cluster_filename = []
    cluster_count = []
    for i,coord in enumerate(data):
        # Adding the first point as the first cluster central
        if cluster_centrals is None:
            # Store centres as floats: with integer input the running-mean
            # update below would otherwise be truncated on assignment.
            cluster_centrals = np.array([coord], dtype=float)
            cluster_elems.append([coord])
            cluster_filename.append([fnames[i]])
            cluster_count.append(1)
            continue
        # Get distance from point to each cluster
        distances = np.sum((cluster_centrals - coord)**2, axis=1)**0.5
        nearest = np.argmin(distances)
        # If point is far away from clusters, it's on its own cluster
        if distances[nearest] > max_range:
            # Stop when max number of clusters reached and have a big enough cluster
            if cluster_centrals.shape[0] >= max_clusters:
                if np.max(cluster_count) >= min_size: break
                # Not big enough clusters means that the CNN is messed up
                return None,None,None
            cluster_centrals = np.append(cluster_centrals,[coord], axis=0)
            cluster_elems.append([coord])
            cluster_filename.append([fnames[i]])
            cluster_count.append(1)
        # If not, it belongs to the cluster with the nearest central. Update that one
        else:
            cluster_centrals[nearest] = (cluster_centrals[nearest]
                                       * cluster_count[nearest]
                                       + coord) / (cluster_count[nearest]+1)
            cluster_elems[nearest].append(coord)
            cluster_filename[nearest].append(fnames[i])
            cluster_count[nearest] += 1
    # No points at all: nothing to cluster, report it like any other failure
    if cluster_centrals is None:
        return None,None,None
    # Return the coordinates, filenames, and center of the largest cluster
    biggest_cluster = np.argmax(cluster_count)
    return cluster_elems[biggest_cluster], \
           cluster_filename[biggest_cluster], \
           cluster_centrals[biggest_cluster]

## Partial SIFT Implementation: Only Match on sparse clusters

In [19]:
# Processing
threshold = 5      # distance margin passed to extract_match
max_clusters = 5   # onl_kmeans: cluster limit
max_radius = 7     # onl_kmeans: max distance to a centre to join its cluster
min_size = 3       # onl_kmeans: required cluster size once the limit is hit
LOWE_RATIO = 0.7   # Lowe's ratio-test threshold for accepting a SIFT match

# FLANN specs (algorithm 1 is OpenCV's KD-tree index)
FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
search_params = dict(checks=50)
flann = cv2.FlannBasedMatcher(index_params,search_params)

#locs = []
#fnames = []
centroids = []
for i,test in enumerate(tqdm(test_path)):
    img_idx = extract_match(i,threshold).flatten()
    coords = train_xy.loc[img_idx].values
    _, _, centroid = onl_kmeans(coords, img_idx, max_clusters, max_radius, min_size)

    # Clustering succeeded: the cluster centre is the prediction
    if centroid is not None:
        #locs.append(loc)
        #fnames.append(f)
        centroids.append(centroid)
        continue

    # Clustering failed: fall back to SIFT matching against every candidate.
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # descriptor files that were produced locally.
    with open(f'./test_kp/test_kp{test}.pckl', 'rb') as test_sift_file:
        des_test = pickle.load(test_sift_file)

    # Without test descriptors nothing can be matched. Use the first
    # candidate, which is what argmax picks when every score is -1.
    if des_test is None or len(des_test) == 0:
        centroids.append(train_xy.loc[img_idx[0]].values)
        continue

    goods = []
    for train in img_idx:
        with open(f'./train_kp/train_kp{train}.pckl', 'rb') as train_sift_file:
            des_train = pickle.load(train_sift_file)

        # k=2 matching needs at least two train descriptors
        if des_train is None or len(des_train) < 2:
            goods.append(-1)
            continue
        # Matching descriptor using KNN algorithm
        matches = flann.knnMatch(des_test,des_train,k=2)

        # Count good matches as per Lowe's ratio test. Pairs with fewer than
        # two neighbours are skipped instead of breaking the unpacking.
        # `n_good` (not `good`) so the loaded SIFT score array is not overwritten.
        n_good = sum(
            1 for pair in matches
            if len(pair) == 2 and pair[0].distance < LOWE_RATIO*pair[1].distance
        )
        goods.append(n_good)

    # Predict the location of the candidate with the most good matches
    max_idx = np.argmax(goods)
    centroids.append(train_xy.loc[img_idx[max_idx]].values)

100%|██████████| 1200/1200 [07:46<00:00,  2.57it/s]


## Exhaustive SIFT implementation: Do feature matching on the whole candidate pool

In [22]:
# COCK params for images with few features
MIN_MATCHES = 5    # minimum number of test descriptors required to attempt SIFT matching
threshold = 5      # distance margin passed to extract_match
max_clusters = 5   # onl_kmeans: cluster limit
max_radius = 7     # onl_kmeans: max distance to a centre to join its cluster
min_size = 1       # onl_kmeans: required cluster size once the limit is hit
LOWE_RATIO = 0.7   # Lowe's ratio-test threshold for accepting a SIFT match

# FLANN specs (algorithm 1 is OpenCV's KD-tree index)
FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
search_params = dict(checks=50)
flann = cv2.FlannBasedMatcher(index_params,search_params)

centroids = []
for i,test in enumerate(tqdm(test_path)):
    img_idx = extract_match(i,threshold).flatten()

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # descriptor files that were produced locally.
    with open(f'./test_kp/test_kp{test}.pckl', 'rb') as test_sift_file:
        des_test = pickle.load(test_sift_file)
    goods = []

    # Weak finding: Do COCK instead
    if des_test is None or len(des_test) < MIN_MATCHES:
        coords = train_xy.loc[img_idx].values
        _, _, centroid = onl_kmeans(coords, img_idx, max_clusters, max_radius, min_size)
        centroids.append(centroid)
        continue

    for train in img_idx:
        with open(f'./train_kp/train_kp{train}.pckl', 'rb') as train_sift_file:
            des_train = pickle.load(train_sift_file)

        # k=2 matching needs at least two train descriptors
        if des_train is None or len(des_train) < 2:
            goods.append(-1)
            continue
        # Matching descriptor using KNN algorithm
        matches = flann.knnMatch(des_test,des_train,k=2)

        # Count good matches as per Lowe's ratio test. Pairs with fewer than
        # two neighbours are skipped instead of breaking the unpacking.
        # `n_good` (not `good`) so the loaded SIFT score array is not overwritten.
        n_good = sum(
            1 for pair in matches
            if len(pair) == 2 and pair[0].distance < LOWE_RATIO*pair[1].distance
        )
        goods.append(n_good)

    # Predict the location of the candidate with the most good matches
    max_idx = np.argmax(goods)
    centroids.append(train_xy.loc[img_idx[max_idx]].values)

100%|██████████| 1200/1200 [20:45<00:00,  1.04s/it]


## Exhaustive SIFT implementation: Do feature matching on the CNN candidate pool, then cluster to weed out outliers

In [41]:
# COCK params for images with few features
MIN_MATCHES = 5             # minimum best SIFT score required to trust the SIFT ranking
threshold = 5               # distance margin passed to extract_match
max_clusters = 3            # cluster limit when clustering the SIFT-filtered candidates
FALLBACK_MAX_CLUSTERS = 5   # cluster limit when clustering the full CNN pool instead
max_radius = 7              # onl_kmeans: max distance to a centre to join its cluster
min_size = 1                # onl_kmeans: required cluster size once the limit is hit
max_match_keep = 0.3        # keep candidates scoring above this fraction of the best score

# FLANN specs (not used by the loop below, which reads pre-computed SIFT results)
FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
search_params = dict(checks=50)
flann = cv2.FlannBasedMatcher(index_params,search_params)

centroids = []
for i,test in enumerate(tqdm(test_path)):
    img_idx = extract_match(i,threshold).flatten()

    sift_fname = fgood[i,:]
    sift_match = good[i,:]

    # Get the CNN candidates that also appear in the SIFT results, in SIFT
    # order (presumably best match first - TODO confirm against the CSV).
    matchings = []
    good_m = []
    # pd.isna instead of `is np.nan`: an identity test misses NaNs that are
    # not the np.nan singleton (e.g. values read from a float array).
    if not pd.isna(sift_fname[0]):
        for match_idx,m in enumerate(sift_fname):
            if m in img_idx:
                matchings.append(m)
                good_m.append(sift_match[match_idx])

    # Weak finding (no SIFT result, no overlap with the CNN pool, or too few
    # matches): Do COCK on the whole CNN pool instead
    if not good_m or good_m[0] < MIN_MATCHES:
        coords = train_xy.loc[img_idx].values
        _, _, centroid = onl_kmeans(coords, img_idx, FALLBACK_MAX_CLUSTERS,
                                    max_radius, min_size)
        centroids.append(centroid)
        continue

    # Once again do thresholding, relative to the first retained score
    good_match = [m for m,score in zip(matchings, good_m)
                  if score > good_m[0]*max_match_keep]

    # Cluster the surviving candidates; labels must line up with coords
    coords = train_xy.loc[good_match].values
    _, _, centroid = onl_kmeans(coords, good_match, max_clusters, max_radius, min_size)

    centroids.append(centroid)

100%|██████████| 1200/1200 [01:03<00:00, 18.95it/s]


## File export

In [30]:
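# To export the per-image cluster coordinates and filenames for SIFT, uncomment and run this.
# Requires `locs` and `fnames` to be collected in the clustering loop first.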
# f = open(f'./kmeans_coords.pckl','wb')
# pickle.dump(locs,f)
# f.close()

# f = open(f'./fnames_coords.pckl','wb')
# pickle.dump(fnames,f)
# f.close()

In [34]:
# If taking centroid as output, run this
# One row per test image, indexed by test id; written with columns x, y.
out = pd.DataFrame(centroids,index=test_path)
# Create the output folder first so a missing directory does not fail the
# export after the long matching run.
os.makedirs('Results', exist_ok=True)
out.to_csv('Results/COCK_SIFT_exhaustive_5_3_7_1.csv',index_label='id',header=['x','y'])