In [1]:
import numpy 
import sys 
import nmslib 
import time 
import math 
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
print(sys.version)
print("NMSLIB version:", nmslib.__version__)


Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


3.9.13 (main, May 23 2022, 21:57:12) 
[GCC 11.2.0]
NMSLIB version: 2.1.1


In [4]:
# Just read the data
all_data_matrix = numpy.loadtxt('../../sample_data/final8_10K.txt')

print(all_data_matrix.shape)

(10000, 8)


In [5]:
# Create a held-out query data set
(data_matrix, query_matrix) = train_test_split(all_data_matrix, test_size = 0.1)

In [6]:
print("# of queries %d, # of data points %d"  % (query_matrix.shape[0], data_matrix.shape[0]) )

# of queries 1000, # of data points 9000


In [7]:
# Set index parameters
# These are the most important onese
NN = 15
efC = 100

num_threads = 4
index_time_params = {'NN': NN, 'indexThreadQty': num_threads, 'efConstruction': efC}

In [8]:
# Number of neighbors 
K=100

In [9]:
# Space name should correspond to the space name 
# used for brute-force search
space_name='kldivgenfast'

In [10]:
# Intitialize the library, specify the space, the type of the vector and add data points 
index = nmslib.init(method='sw-graph', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(data_matrix) 
# https://nmslib.github.io/nmslib/index.html


9000

In [11]:
# Create an index
start = time.time()
index.createIndex(index_time_params) 
end = time.time() 
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end-start))

Index-time parameters {'NN': 15, 'indexThreadQty': 4, 'efConstruction': 100}
Indexing time = 0.244212


In [12]:
# Setting query-time parameters
efS = 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [23]:
# Querying
query_qty = query_matrix.shape[0]
start = time.time() 
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.time() 
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) 

kNN time total=0.056333 (sec), per query=0.000056 (sec), per query adjusted for thread number=0.000225 (sec)


In [21]:
# query single
q = query_matrix[0]
ret = index.knnQuery(q,k=K)
print(ret)

(array([8482, 3344, 7421, 1293, 5378, 1536, 7607, 6453, 1411, 1981, 3393,
       7694, 3608, 1158, 5541, 6963, 3026, 5127,  109, 7625, 6731, 7887,
       2546, 7647, 8900, 8434, 7524,  736, 2510, 4368, 8130, 2383, 5925,
       7689, 4366, 1162, 3681, 5414, 5037, 7821, 4362, 7470, 7416,  393,
       1646, 8196, 3231, 8682, 5960, 6015, 2175, 1660, 5758, 4455, 5383,
       4834, 7737, 6678, 7660, 7307, 3867, 2256, 3911, 4258, 8848, 7254,
       5606, 7664, 5987, 8768, 1761, 8411, 1587, 8473,  714, 8329, 7740,
       8290, 2267, 2719, 5755, 8525, 1777, 6685, 2836, 8052, 8373,  362,
       2566, 2228, 7650, 2914, 6614, 4632, 4071, 4221, 6039, 8464, 7979,
        577], dtype=int32), array([0.01575311, 0.02636963, 0.03595767, 0.06196811, 0.0679979 ,
       0.09825444, 0.10086484, 0.10179234, 0.11319474, 0.11628333,
       0.1191566 , 0.12235996, 0.12307923, 0.12399974, 0.13194562,
       0.13399309, 0.13961558, 0.14494188, 0.14676481, 0.14676481,
       0.14786555, 0.14796309, 0.15174966, 0.1

In [14]:
# Computing gold-standard data 
print('Computing gold-standard data')

start = time.time()

gs = []


query_qty = query_matrix.shape[0]
data_qty = data_matrix.shape[0]

for i in range(query_qty):
    q = query_matrix[i,:]
    d = numpy.log(data_matrix * (1.0/q) )
    dist_vals = numpy.sum(data_matrix * d, axis=1)
    tmp = [ (dist_vals[i], i) for i in range(data_qty)]
    tmp.sort()
    gs.append( [tmp[i][1] for i in range(K)] )
    
end = time.time()

print('brute-force kNN time total=%f (sec), per query=%f (sec)' % 
      (end-start, float(end-start)/query_qty) )

Computing gold-standard data
brute-force kNN time total=9.965226 (sec), per query=0.009965 (sec)


In [13]:
# Finally computing recall
recall=0.0
for i in range(0, query_qty):
  correct_set = set(gs[i])
  ret_set = set(nbrs[i][0])
  recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)
recall = recall / query_qty
print('kNN recall %f' % recall)

kNN recall 0.999300


In [15]:
# Save a meta index and the data
index.saveIndex('dense_index_kldiv.txt', save_data=True)

In [16]:
# Re-intitialize the library, specify the space, the type of the vector.
newIndex = nmslib.init(method='sw-graph', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 

In [17]:
# Re-load the index and the data
newIndex.loadIndex('dense_index_kldiv.txt', load_data=True)

In [18]:
# Setting query-time parameters and querying
print('Setting query-time parameters', query_time_params)
newIndex.setQueryTimeParams(query_time_params)

query_qty = query_matrix.shape[0]
start = time.time() 
new_nbrs = newIndex.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.time() 
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) 

Setting query-time parameters {'efSearch': 100}
kNN time total=0.051589 (sec), per query=0.000052 (sec), per query adjusted for thread number=0.000206 (sec)


In [19]:
# Finally computing recall
recall=0.0
for i in range(0, query_qty):
  correct_set = set(gs[i])
  ret_set = set(nbrs[i][0])
  recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)
recall = recall / query_qty
print('kNN recall %f' % recall)

kNN recall 0.999100
