In [2]:
import numpy 
import sys 
import nmslib 
import time 
import math 
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
print(sys.version)
print("NMSLIB version:", nmslib.__version__)


3.9.13 (main, May 23 2022, 21:57:12) 
[GCC 11.2.0]
NMSLIB version: 2.1.1


In [3]:
# Just read the data
all_data_matrix = numpy.loadtxt('../../sample_data/final128_10K.txt')

In [4]:
# Create a held-out query data set
(data_matrix, query_matrix) = train_test_split(all_data_matrix, test_size = 0.1)

In [5]:
print("# of queries %d, # of data points %d"  % (query_matrix.shape[0], data_matrix.shape[0]) )

# of queries 1000, # of data points 9000


In [6]:
# Set index parameters
# These are the most important onese
M = 15
efC = 100

num_threads = 4
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 15, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [7]:
# Number of neighbors 
K=100

In [8]:
# Space name should correspond to the space name 
# used for brute-force search
space_name='l2'

In [9]:
# Intitialize the library, specify the space, the type of the vector and add data points 
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(data_matrix) 

9000

In [10]:
# Create an index
start = time.time()
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 
end = time.time() 
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end-start))

Index-time parameters {'M': 15, 'indexThreadQty': 4, 'efConstruction': 100}
Indexing time = 0.328718


In [11]:
# Setting query-time parameters
efS = 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [12]:
# Querying
query_qty = query_matrix.shape[0]
start = time.time() 
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.time() 
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) 

kNN time total=0.037016 (sec), per query=0.000037 (sec), per query adjusted for thread number=0.000148 (sec)


In [13]:
# Computing gold-standard data 
print('Computing gold-standard data')

start = time.time()
sindx = NearestNeighbors(n_neighbors=K, metric='l2', algorithm='brute').fit(data_matrix)
end = time.time()

print('Brute-force preparation time %f' % (end - start))

start = time.time() 
gs = sindx.kneighbors(query_matrix)
end = time.time()

print('brute-force kNN time total=%f (sec), per query=%f (sec)' % 
      (end-start, float(end-start)/query_qty) )

Computing gold-standard data
Brute-force preparation time 0.001652
brute-force kNN time total=0.695983 (sec), per query=0.000696 (sec)


In [14]:
# Finally computing recall
recall=0.0
for i in range(0, query_qty):
  correct_set = set(gs[1][i])
  ret_set = set(nbrs[i][0])
  recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)
recall = recall / query_qty
print('kNN recall %f' % recall)

kNN recall 0.992520


In [15]:
# Save a meta index, but no data!
index.saveIndex('dense_index_optim.bin', save_data=False)

In [16]:
# Re-intitialize the library, specify the space, the type of the vector.
newIndex = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
# For an optimized L2 index, there's no need to re-load data points, but this would be required for
# non-optimized index or any other methods different from HNSW (other methods can save only meta indices)
#newIndex.addDataPointBatch(data_matrix) 

In [17]:
# Re-load the index and re-run queries
newIndex.loadIndex('dense_index_optim.bin')

In [23]:
# Setting query-time parameters and querying
print('Setting query-time parameters', query_time_params)
newIndex.setQueryTimeParams(query_time_params)

query_qty = query_matrix.shape[0]
start = time.time() 
new_nbrs = newIndex.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.time() 
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) 

print(new_nbrs[0][1])      

Setting query-time parameters {'efSearch': 100}
kNN time total=0.039916 (sec), per query=0.000040 (sec), per query adjusted for thread number=0.000160 (sec)
[0.00270098 0.01801973 0.02218262 0.06074424 0.06086846 0.06361715
 0.06782023 0.06879169 0.0713936  0.07616136 0.08065243 0.08119275
 0.08251369 0.08296454 0.08621208 0.08669227 0.08720806 0.08797651
 0.08798739 0.08999468 0.0906682  0.09100393 0.0930972  0.09348261
 0.09350233 0.09391692 0.09601999 0.09789868 0.09929319 0.10020437
 0.10059278 0.10071577 0.10189154 0.10201125 0.1021492  0.10290202
 0.10317906 0.10332359 0.10399057 0.10442822 0.10629443 0.10637057
 0.10797795 0.11080971 0.11121096 0.11195159 0.11204608 0.11248356
 0.11297312 0.11298401 0.11405149 0.11416586 0.11469842 0.115043
 0.11571665 0.1158088  0.11657865 0.11695337 0.11932322 0.1211203
 0.12164541 0.12225268 0.12242275 0.12351011 0.12460133 0.12594494
 0.12604609 0.12639461 0.12661618 0.12696955 0.12888691 0.13031843
 0.13039014 0.13042888 0.13189721 0.132283

In [19]:
# Finally computing recall for the new result set
recall=0.0
for i in range(0, query_qty):
  correct_set = set(gs[1][i])
  ret_set = set(new_nbrs[i][0])
  recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)
recall = recall / query_qty
print('kNN recall %f' % recall)

kNN recall 0.992520
