In [1]:
import heapq
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Introduction
This notebook implements and tests several nearest-neighbor search algorithms.

# Brute Force

In [16]:
class brutal_force_nearest_neighbor:
    """Exhaustive nearest-neighbor search.

    Every query computes the Euclidean distance from the query point to each
    stored point, sorts all of the distances, and returns the closest points.
    """

    def __init__(self, data_lst):
        """Store ``data_lst``, an indexable collection of points."""
        self.data_lst = data_lst

    def reset_datalst(self, data_lst):
        """Replace the stored points with ``data_lst``."""
        self.data_lst = data_lst

    def find_top_k(self, x, k = 5):
        """Return the ``k`` stored points closest to ``x``, nearest first.

        Args:
            x: Query point; ``x - point`` must be a valid operation for
                every stored point.
            k: Number of neighbors wanted. Values larger than the number of
                stored points return every point; ``k <= 0`` returns ``[]``.

        Returns:
            A list of elements of ``data_lst`` ordered by increasing distance
            to ``x``. Equal distances are ordered by position in ``data_lst``.
        """
        # (distance, index) pairs sort by distance first and fall back to the
        # index on ties, so the result is deterministic.
        ranked = sorted(
            (np.linalg.norm(x - point), idx)
            for idx, point in enumerate(self.data_lst)
        )
        # Clamp at zero: a negative k would otherwise slice from the end and
        # silently return all but the |k| farthest points.
        return [self.data_lst[idx] for _, idx in ranked[:max(k, 0)]]

In [22]:
# Draw the sample from a privately seeded generator so that re-running the
# notebook yields the same points (and therefore the same neighbors) without
# touching NumPy's global random state.
raw_data = np.random.RandomState(0).normal(size = (10000, 3))
bf_nn = brutal_force_nearest_neighbor(raw_data)
# Five nearest points to the origin (default k).
bf_nn.find_top_k(np.zeros(3))

[array([ 0.00545283, -0.04924882, -0.01622512]),
 array([-0.07629304, -0.0001987 ,  0.00593705]),
 array([-0.01549295, -0.00033424, -0.08512716]),
 array([0.07244523, 0.03610154, 0.07886056]),
 array([ 0.047736  , -0.10353047,  0.01099387])]

# Smarter Brute Force with a Heap

In [29]:
class heap_nearest_neighbor:
    """Exhaustive nearest-neighbor search that selects results with a heap.

    Distances to all stored points are still computed for every query, but
    the ``k`` smallest are picked with ``heapq.nsmallest`` rather than by
    sorting the whole list.
    """

    def __init__(self, data_lst):
        """Keep a reference to ``data_lst``, an indexable collection of points."""
        self.data_lst = data_lst

    def reset_datalst(self, data_lst):
        """Swap the stored points for ``data_lst``."""
        self.data_lst = data_lst

    def find_top_k(self, x, k = 5):
        """Return the ``k`` stored points nearest to ``x``, closest first.

        Args:
            x: Query point; subtracted from each stored point.
            k: Number of neighbors to return.

        Returns:
            A list of elements of ``data_lst`` in order of increasing
            Euclidean distance to ``x``; ties are broken by position.
        """
        scored = []
        for position, candidate in enumerate(self.data_lst):
            # Pair each distance with its position so the point can be
            # looked up again after selection.
            scored.append((np.linalg.norm(x - candidate), position))

        closest = heapq.nsmallest(k, scored)
        return [self.data_lst[position] for _, position in closest]

In [30]:
# Query the heap-based searcher for the five points nearest the origin.
h_nn = heap_nearest_neighbor(data_lst=raw_data)
h_nn.find_top_k(x=np.zeros(3), k=5)

[array([ 0.00545283, -0.04924882, -0.01622512]),
 array([-0.07629304, -0.0001987 ,  0.00593705]),
 array([-0.01549295, -0.00033424, -0.08512716]),
 array([0.07244523, 0.03610154, 0.07886056]),
 array([ 0.047736  , -0.10353047,  0.01099387])]

# K-d Tree

In [31]:
from scipy.spatial import KDTree

In [36]:
class kdtree_nearest_neighbor:
    """Nearest-neighbor search backed by ``scipy.spatial.KDTree``.

    The tree is built once from the data; each query then asks the tree for
    the closest stored points instead of scanning every point.
    """

    def __init__(self, data_lst):
        """Build a k-d tree over ``data_lst`` (passed straight to ``KDTree``)."""
        self.kdtree = KDTree(data_lst)

    def reset_datalst(self, data_lst):
        """Discard the current tree and build a new one over ``data_lst``."""
        self.kdtree = KDTree(data_lst)

    def find_top_k(self, x, k = 5):
        """Return the ``k`` stored points closest to ``x``, nearest first.

        Args:
            x: Query point, forwarded to ``KDTree.query``.
            k: Number of neighbors wanted. An integer larger than the number
                of stored points returns every point; ``k <= 0`` returns
                ``[]``. A sequence is forwarded to ``KDTree.query`` unchanged.

        Returns:
            A list of rows of the tree's data, ordered as returned by
            ``KDTree.query``.
        """
        if np.isscalar(k):
            # KDTree.query pads missing neighbors with index ``n``, which is
            # out of bounds for ``data``; never request more than exist.
            k = min(k, self.kdtree.n)
            if k <= 0:
                return []
        _, indices = self.kdtree.query(x, k)
        # With k == 1 the query result is squeezed to a scalar index;
        # atleast_1d makes it iterable again.
        return [self.kdtree.data[i] for i in np.atleast_1d(indices)]

In [37]:
# Query the k-d tree searcher for the five points nearest the origin.
kdt_nn = kdtree_nearest_neighbor(data_lst=raw_data)
kdt_nn.find_top_k(x=np.zeros(3), k=5)

[array([ 0.00545283, -0.04924882, -0.01622512]),
 array([-0.07629304, -0.0001987 ,  0.00593705]),
 array([-0.01549295, -0.00033424, -0.08512716]),
 array([0.07244523, 0.03610154, 0.07886056]),
 array([ 0.047736  , -0.10353047,  0.01099387])]

# Locality sensitive hashing