In [1]:
import hnswlib
import numpy as np

In [2]:
def sample_vector(distribution, shape, **kwargs):
    # shape = (num_element, dim)
    # Example usage:
    # vec1 = sample_vector('normal', (5,), mean=0, std=1)
    # vec2 = sample_vector('uniform', (5,), low=0, high=1)
    if distribution == 'normal':
        return np.random.normal(kwargs.get('mean', 0), kwargs.get('std', 1), size=shape)
    elif distribution == 'uniform':
        return np.random.uniform(kwargs.get('low', 0), kwargs.get('high', 1), size=shape)
    else:
        return "Unsupported distribution"

def find_KNN(dataset, query_vector, k):
    # find kNN of the given query in the dataset. query_vector should be one vector whith shape (1, ndim) or a ndim array
    
    # Calculate distances from query_vector to all vectors in dataset
    distances = np.linalg.norm(np.array(dataset) - np.array(query_vector).flatten(), axis=1)

    # Get the indices of the k smallest distances
    k_indices = np.argsort(distances)[:k]
    
    # Get k nearest vectors and their distances
    k_nearest_vectors = [dataset[i] for i in k_indices]
    k_nearest_distances = [distances[i] for i in k_indices]
    
    return k_indices, k_nearest_vectors, k_nearest_distances

def topk_vector_accuracy(topk_vectors, retrieved_vectors):
    # return accuracy score numbers of match vector / k
    # Count matching vectors
    count_matches = 0
    for gt, est in zip(topk_vectors, retrieved_vectors):
        match = gt==est
        if match.all(): count_matches+=1 
    
    # Calculate accuracy
    k = len(topk_vectors)
    accuracy = count_matches / k
    return accuracy

def topk_idx_accuracy(topk_idx, retrieved_idx):
    # return accuracy score numbers of match vector / k
    pro_topk_idx = np.int64(topk_idx.flatten())
    pro_retrieved_idx = np.int64(retrieved_idx.flatten())
    
    # Count matching vectors
    matches = pro_topk_idx==pro_retrieved_idx
    num_matches = sum(matches.flatten())
    # print(num_matches)
    
    # Calculate accuracy
    k = len(pro_topk_idx)
    accuracy = num_matches / k
    return accuracy

## Create index for a vector dataset

In [50]:
dim = 32
num_elements = 10000
k = 10

# Generating sample data
dataset = np.float32(sample_vector('normal', (num_elements, dim), mean=0, std=1))
ids = np.arange(num_elements)

# Declaring index
p = hnswlib.Index(space = 'l2', dim = dim) # possible options are l2, cosine or ip

# Initializing index - the maximum number of elements should be known beforehand
p.init_index(max_elements = num_elements*10, ef_construction = 200, M = 16)

# Element insertion (can be called several times):
p.add_items(dataset, ids)

# Controlling the recall by setting ef:
p.set_ef(100) # ef should always be > k

print("current DB size:", p.get_current_count())

current DB size: 10000


In [51]:
vec1 = sample_vector('normal', (1, dim), mean=0, std=1)
est_idx, distances = p.knn_query(vec1, k=k)
gt_idx, gt_vecs, gt_distances = find_KNN(dataset, vec1, k=k)
acc = topk_idx_accuracy(est_idx, gt_idx)
print("Top_{} Acc:".format(k), acc)

Top_10 Acc: 1.0


In [52]:
num_runs = 1000
accs = []
for i in range(num_runs):
    vec1 = sample_vector('normal', (1, dim), mean=0, std=1)
    est_idx, distances = p.knn_query(vec1, k=k)
    gt_idx, gt_vecs, gt_distances = find_KNN(dataset, vec1, k=k)
    acc = topk_idx_accuracy(est_idx, gt_idx)
    accs.append(acc)

print("{} runs; top_{} Acc:".format(num_runs, k), np.average(accs))

1000 runs; top_10 Acc: 0.9448000000000001


## Add out-of-distribution data

In [53]:
new_data_size = num_elements
new_mean = 0.1
new_std = 1.1

newDataset = sample_vector('normal', (new_data_size, dim), mean=new_mean, std=new_std)
dataset = np.concatenate([dataset, newDataset])

p.add_items(newDataset)

print("current DB size:", p.get_current_count())

current DB size: 20000


In [54]:
vec1 = sample_vector('normal', (1, dim), mean=new_mean, std=new_std)
est_idx, distances = p.knn_query(vec1, k=k)
gt_idx, gt_vecs, gt_distances = find_KNN(dataset, vec1, k=k)
acc = topk_idx_accuracy(est_idx, gt_idx)
print("Top_{} Acc:".format(k), acc)

Top_10 Acc: 1.0


In [58]:
num_runs = 1000
accs = []
for i in range(num_runs):
    vec1 = sample_vector('normal', (1, dim), mean=new_mean, std=new_std)
    est_idx, distances = p.knn_query(vec1, k=k)
    gt_idx, gt_vecs, gt_distances = find_KNN(dataset, vec1, k=k)
    acc = topk_idx_accuracy(est_idx, gt_idx)
    accs.append(acc)

print("{} runs; top_{} Acc:".format(num_runs, k), np.average(accs))

1000 runs; top_10 Acc: 0.8842


## Re-index (NOT SURE)

In [59]:
# Generating sample data
ids = np.arange(p.get_current_count())

# Declaring index
p = hnswlib.Index(space = 'l2', dim = dim) # possible options are l2, cosine or ip

# Initializing index - the maximum number of elements should be known beforehand
p.init_index(max_elements = num_elements*10, ef_construction = 200, M = 16)

# Element insertion (can be called several times):
p.add_items(dataset, ids)

# Controlling the recall by setting ef:
p.set_ef(100) # ef should always be > k

print("current DB size:", p.get_current_count())

current DB size: 20000


In [60]:
num_runs = 1000
accs = []
for i in range(num_runs):
    vec1 = sample_vector('normal', (1, dim), mean=new_mean, std=new_std)
    est_idx, distances = p.knn_query(vec1, k=k)
    gt_idx, gt_vecs, gt_distances = find_KNN(dataset, vec1, k=k)
    acc = topk_idx_accuracy(est_idx, gt_idx)
    accs.append(acc)

print("{} runs; top_{} Acc:".format(num_runs, k), np.average(accs))

1000 runs; top_10 Acc: 0.8819000000000001
