<a href="https://colab.research.google.com/github/urjits25/visual-search/blob/master/CSE534-VisualSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Importing required libraries
import numpy as np
import cv2
from google.colab import drive
from matplotlib import pyplot as plt
from sklearn.cluster import MiniBatchKMeans
import time
import os
from google.colab import drive
import pickle
from sklearn.neighbors import NearestNeighbors
from scipy import stats



Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# SET THE INPUT ARGUMENTS HERE

# Dataset stored in Google Drive
drive.mount('/content/gdrive')
path = "./gdrive/My Drive/paris/"

# os.listdir() returns entries in arbitrary order. Later cells map result
# indices back to file names through img_list, so the order is fixed here to
# keep that mapping identical between runs and sessions.
img_list = sorted(os.listdir(path))

# Position (within the sorted img_list) of the image used as the search query
query_index = 100

# Flag 0 loads the image as single-channel grayscale
query_img = cv2.imread(path + img_list[query_index], 0)
if query_img is None:
    # cv2.imread signals an unreadable file by returning None instead of raising
    raise IOError("Could not read query image: " + path + img_list[query_index])

## Pre-processing the dataset

### Feature Detection

In [0]:
# For each image in the dataset
# Detect Features and Descriptors using ORB
# Cluster the descriptors for all the images
# Create a visual vocabulary for each image (vector for each cluster associated with the frequency of each cluster)
# Compute frequency for each cluster, and compute Histogram
#
# This cell performs the first step only: it extracts the ORB descriptors of
# every image and stores them, one descriptor per line, in final_des.txt.

# time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start = time.perf_counter()

# The detector is configured identically for every image, so build it once
orb = cv2.ORB_create()

# Open the output once in 'w' mode so that re-running the cell replaces the
# file instead of appending a second copy of every descriptor.
with open('final_des.txt', 'w') as f1:
    # For every image in the list
    for image in img_list:
        img = cv2.imread(path+image, 0)
        if img is None:
            # Not a readable image file; nothing to describe
            continue

        # Detect and describe keypoints with ORB
        kp = orb.detect(img, None)
        kp, des = orb.compute(img, kp)

        # des is None when no keypoints were found in the image
        if des is not None:
            # One line per descriptor, values separated by single spaces
            for descriptor in des:
                f1.write(" ".join(str(value) for value in descriptor) + "\n")
timer = time.perf_counter()-start

print("--- Creating Descriptors File: %.5f s ---" % timer)

### Feature Clustering - MiniBatchKMeans

In [0]:
# Creating Descriptors' List for Clustering
# Each non-empty line of final_des.txt is parsed as one row of integers.
final_des = []

with open('final_des.txt', 'r') as f2:
    for line in f2:
        # str.split() without arguments never yields whitespace-only tokens,
        # so no additional filtering of the tokens is required.
        values = [int(ele) for ele in line.split()]
        if values:
            # Skip blank lines so that every stored row has content
            final_des.append(values)

In [0]:
# Convert the list of descriptor rows into a NumPy array for the cells below
final_des = np.asarray(final_des)

In [0]:
# Display scipy.stats summary statistics of the descriptor array as the cell output
stats.describe(final_des)

In [6]:
# time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start = time.perf_counter()

# TODO: Change number of clusters to sqrt(len(descriptors))
# TODO: iterations to 10-30

# Cluster all descriptors; the cluster count is tied to the number of images
# and each mini-batch covers a tenth of the descriptors.
# random_state is fixed so that repeated runs produce the same model.
clusters = MiniBatchKMeans(
    n_clusters=len(img_list)//5,
    random_state=0,
    max_iter=5,
    batch_size=len(final_des)//10,
).fit(final_des)
timer = time.perf_counter()-start
print("--- Clustering descriptors: %.5f s ---" % timer)

--- Clustering descriptors: 2192.48754 s ---


In [0]:
# Persist the fitted clustering model to disk so that it can be reloaded
# later without repeating the clustering step.
serialized_model = pickle.dumps(clusters)
with open('MBKmeans_v2.sav', 'wb') as model_file:
    model_file.write(serialized_model)

In [0]:
# Display scipy.stats summary statistics of the fitted cluster centres as the cell output
stats.describe(clusters.cluster_centers_)

### Visual Vocabulary

In [0]:
# Reload the clustering model from 'MBKmeans_v2.sav', the file name this
# notebook writes when it saves the fitted model ('MBKmeans.sav' is never
# created here, so loading it fails or picks up a stale model).
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('MBKmeans_v2.sav', 'rb') as f3:
    clusters = pickle.load(f3)

In [0]:
def build_histogram(descriptor_list, cluster_alg):
    """Count how many descriptors are assigned to each cluster.

    Parameters
    ----------
    descriptor_list : array-like
        Either a single descriptor (1-D) or several descriptors (2-D, one
        descriptor per row).
    cluster_alg : object
        Fitted clustering model exposing ``cluster_centers_`` and
        ``predict``.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per cluster centre, holding the number of
        descriptors that ``cluster_alg.predict`` assigned to that cluster.
    """
    n_clusters = len(cluster_alg.cluster_centers_)

    # atleast_2d turns a single descriptor into a one-row matrix and leaves a
    # descriptor matrix untouched; wrapping the input in a list would turn a
    # matrix into a 3-D array that predict() cannot handle.
    descriptors = np.atleast_2d(np.asarray(descriptor_list))
    if descriptors.size == 0:
        # Nothing to assign: every cluster has a count of zero
        return np.zeros(n_clusters)

    labels = np.asarray(cluster_alg.predict(descriptors), dtype=np.intp)
    # minlength keeps one bin per cluster even when trailing clusters are unused
    return np.bincount(labels, minlength=n_clusters).astype(float)

In [0]:
# For every image in the dataset, do a frequency count of the descriptors
start = time.clock()
vocab = []

with open('vocab.txt', 'a+') as f1:
    for des in final_des:
        hist = build_histogram(des, clusters)
        for count in hist:
            f1.write(str(count) + " ")
    f1.write("\n")
f1.close()

In [0]:
with open('vocab.txt', 'r') as f2:
    for line in f2: 
        vocab.append( [int(ele) for ele in line.split() if ele is not " "] )
f2.close()

timer = time.clock() - start
print("--- Computing Visual Vocabulary: %.5f s ---" % timer)

In [0]:
# Convert the list of histogram rows into a NumPy array for the nearest-neighbour search
vocab = np.asarray(vocab)

## Querying the Model

In [0]:
# query_img was loaded as a single-channel grayscale image; without an
# explicit colormap matplotlib renders 2-D data in false colours.
plt.imshow(query_img, cmap='gray')
plt.title("Query image")
plt.show()

In [0]:
# Initiate ORB detector
orb1 = cv2.ORB_create()

# Detect and describe keypoints with ORB
kp = orb1.detect(query_img, None)
kp, des = orb1.compute(query_img, kp)

# Start from empty results so that `dist` and `result` are always defined,
# even when the query image yields no descriptors.
dist = np.empty((1, 0))
result = np.empty((1, 0), dtype=int)

# Map all descriptors to their centroids
if des is not None:
    hist1 = build_histogram(des, clusters)
    # kneighbors() cannot return more neighbours than there are indexed rows
    n_results = min(10, len(vocab))
    neighbor = NearestNeighbors(n_neighbors=n_results).fit(vocab)
    dist, result = neighbor.kneighbors([hist1])
else:
    print("Corrupted image")

In [0]:
# Display the retrieved images, nearest match first
for rank, index in enumerate(result.flatten(), start=1):
    match_img = cv2.imread(path+img_list[index], 0)
    if match_img is None:
        # cv2.imread returns None for files it cannot decode
        print("Could not read image: " + img_list[index])
        continue
    # Images are loaded as grayscale; show them with a gray colormap rather
    # than matplotlib's default false-colour map for 2-D data.
    plt.imshow(match_img, cmap='gray')
    plt.title("Match %d: %s" % (rank, img_list[index]))
    plt.show()

## References: 

* Python3 Documentation: https://docs.python.org/3/
* Bag of Words Example: https://medium.freecodecamp.org/an-introduction-to-bag-of-words-and-how-to-code-it-in-python-for-nlp-282e87a9da04<br>
* Bag of Visual Words In a Nutshell: https://towardsdatascience.com/bag-of-visual-words-in-a-nutshell-9ceea97ce0fb<br>
* Importing Data from Google Drive into Colab: https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92<br>
* Reading all files from a directory: https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory<br>
* Comparison of Feature Detectors: https://computer-vision-talks.com/2011-07-13-comparison-of-the-opencv-feature-detection-algorithms/ <br>
* ORB Feature Detector: https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_feature2d/py_orb/py_orb.html#orb<br>
* Read line-by-line into a list: https://stackoverflow.com/questions/3277503/how-to-read-a-file-line-by-line-into-a-list<br>
* MiniBatchKMeans Clustering: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html<br>
* Saving trained model: https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/ <br>
* KNN for Ranking Image results: https://scikit-learn.org/stable/modules/neighbors.html
