# SIFT Matches Extraction

For each test image, this script exports the number of SIFT matches against every training image, together with the corresponding training image filenames.

In [1]:
import glob
import os
import pickle

import cv2
import numpy as np
import pandas as pd
import tqdm
from matplotlib import pyplot as plt

## Read in data

In [6]:
# Image ids for the two splits come from the `id` column of each CSV.
train_xy = pd.read_csv('validate_train.csv')
test_xy = pd.read_csv('validate.csv')
train_path = train_xy['id'].values
test_path = test_xy['id'].values

# Load every image up front. Both splits are read from the ./train/ folder.
# NOTE(review): cv2.imread is documented to return None (rather than raise)
# for a file it cannot read, so a wrong id only surfaces later - confirm.
train_i = [cv2.imread('./train/' + name + '.jpg') for name in train_path]
test_i = [cv2.imread('./train/' + name + '.jpg') for name in test_path]

## Converting to SIFT descriptors
Since the dataset can be heavy (up to 5GB), the SIFT descriptors are exported to pickle files (one per image), so that:
* The data can be stored on disk rather than in RAM
* The process can be resumed from any point, so there is no need to worry if something breaks halfway

In [8]:
sift = cv2.SIFT_create()

# Compute SIFT descriptors for every image and store them as one pickle per
# image, named after the image id. The keypoints themselves are discarded;
# only the descriptor array `des` is kept.
# `total=` is passed explicitly because tqdm cannot infer the length of a
# zip() iterator, and without it no percentage / ETA is displayed.
for img_id, img in tqdm.tqdm(zip(train_path, train_i), total=len(train_i)):
    _, des = sift.detectAndCompute(img, None)
    # `with` guarantees the file is closed even if pickling fails part-way.
    with open(f'./train_kp/train_kp{img_id}.pckl', 'wb') as f:
        pickle.dump(des, f)

for img_id, img in tqdm.tqdm(zip(test_path, test_i), total=len(test_i)):
    _, des = sift.detectAndCompute(img, None)
    with open(f'./test_kp/test_kp{img_id}.pckl', 'wb') as f:
        pickle.dump(des, f)


2035it [03:32,  9.56it/s]


KeyboardInterrupt: 

## Do the extraction

* Read in the previously exported data
* Then just let it run: each batch of 100 images takes around 8 hours

In [9]:
# FLANN matcher
FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
search_params = dict(checks=50)
flann = cv2.FlannBasedMatcher(index_params,search_params)

# For every test image, count the "good" SIFT matches against every training
# image and export two pickles per test image: the training ids and the match
# counts, both ordered from the fewest to the most matches.
for test in tqdm.tqdm(test_path):
    # Test descriptors are stored under ./test_kp/ by the descriptor export
    # step (only training descriptors live under ./train_kp/).
    with open(f'./test_kp/test_kp{test}.pckl', 'rb') as f:
        des_test = pickle.load(f)

    if des_test is None or len(des_test) < 2:
        # Not matchable: leave a dummy marker file for this test image.
        # The file must be opened for writing ('wb') for pickle.dump to work.
        with open(f'./not_match/sift_not_match_{test}.pckl', 'wb') as f:
            pickle.dump(None, f)
        # Export None as the result as well, so the files written below never
        # contain the previous test image's (stale) results.
        sorted_path = None
        sorted_goods = None
    else:
        goods = []
        # Training descriptors are re-read from disk for every test image;
        # they are deliberately not kept in RAM because the set is large.
        for train in train_path:
            with open(f'./train_kp/train_kp{train}.pckl', 'rb') as f:
                des_train = pickle.load(f)

            # A training image with too few descriptors cannot be matched
            # with k=2. Record -1 so it sorts below every real match count.
            if des_train is None or len(des_train) < 2:
                goods.append(-1)
                continue
            matches = flann.knnMatch(des_test, des_train, k=2)
            # Count good matches based on Lowe's ratio test. Entries with
            # fewer than two neighbours are skipped instead of crashing the
            # tuple unpacking.
            good = sum(
                1
                for pair in matches
                if len(pair) == 2 and pair[0].distance < 0.7 * pair[1].distance
            )
            goods.append(good)

        # Sorting. The order starts from the least matches
        sorted_idx = np.argsort(goods)
        sorted_path = [train_path[idx] for idx in sorted_idx]
        sorted_goods = [goods[idx] for idx in sorted_idx]

    # Export to pickle again
    with open(f'./filename/sift_name_{test}.pckl', 'wb') as f:
        pickle.dump(sorted_path, f)
    with open(f'./goodmatch/sift_good_n_{test}.pckl', 'wb') as f:
        pickle.dump(sorted_goods, f)

  0%|                                                                                          | 0/600 [00:17<?, ?it/s]


KeyboardInterrupt: 

## Convert the match files into complete `.csv` files
Also flip the order back (most matches first) to be more consistent with the CNN feature extraction

In [None]:
n_trains = len(train_xy)

# Get the ids of weak test instances from the marker files in not_match/.
# os.path.basename strips the directory using the platform's own separator,
# so this no longer depends on Windows-style '\\' paths. A set gives O(1)
# membership tests inside the loop below.
marker_prefix = 'sift_not_match_'
marker_suffix = '.pckl'
weak_test = set()
for path in glob.glob('not_match/*'):
    base = os.path.basename(path)
    if base.startswith(marker_prefix) and base.endswith(marker_suffix):
        weak_test.add(base[len(marker_prefix):-len(marker_suffix)])

# Rows are collected in lists and stacked once at the end; np.append inside
# the loop would copy the whole array on every iteration.
nan_row = np.full(n_trains, np.nan)
fname_rows = []
match_rows = []
for test in tqdm.tqdm(test_path):
    if test in weak_test:
        fname_rows.append(nan_row)
        match_rows.append(nan_row)
        continue
    with open(f'./filename/sift_name_{test}.pckl','rb') as f:
        fn = pickle.load(f)
    with open(f'./goodmatch/sift_good_n_{test}.pckl','rb') as f:
        goods = pickle.load(f)
    # Assign nan row for instances with weak matches (exported as None)
    if fn is None or goods is None:
        fname_rows.append(nan_row)
        match_rows.append(nan_row)
        continue
    # The pickles are ordered from fewest to most matches; flip them so each
    # row starts with the best-matching training image.
    fname_rows.append(np.flip(fn))
    match_rows.append(np.flip(goods))

# One row per test image, one column per training image. Object dtype keeps
# NaN as a real missing value next to the filename strings.
fname = np.array(fname_rows, dtype=object).reshape(len(fname_rows), n_trains)
match = np.array(match_rows, dtype=float).reshape(len(match_rows), n_trains)

100%|██████████| 1200/1200 [04:37<00:00,  4.32it/s]


In [None]:
# Write both tables to CSV without the DataFrame index column.
fname_table = pd.DataFrame(fname)
fname_table.to_csv('sift_matches_filename.csv', index=False)

# NOTE: despite the "distance" in the file name, this table is built from
# `match`, i.e. the per-image match counts.
match_table = pd.DataFrame(match)
match_table.to_csv('sift_matches_distance.csv', index=False)