# Nearest neighbors    
Here we demonstrate how to solve nearest-neighbor search with a few technologies, using both the Vaex and the scikit-learn approaches.

## Vaex hnswlib

In [2]:
import vaex

# Load the Vaex example dataset and keep only the first 10,000 rows.
df = vaex.example().head(10000)
# Print the first two rows as a quick look at the available columns.
print(df.head(2))

  #    id          x          y          z        vx       vy        vz        E        L       Lz       FeH
  0     0   1.23187   -0.396929  -0.598058   301.155  174.059   27.4275  -149431  407.389  333.956  -1.00539
  1    23  -0.163701   3.65422   -0.254906  -195      170.472  142.53    -124248  890.241  684.668  -1.70867


In [3]:
from hnswlib import Index
import numpy as np
from goldilox import Pipeline

# Feature columns: every column except the "id" label.
# NOTE: a regex such as "[^id]" is a single-character class ("one character
# that is neither 'i' nor 'd'"), not "anything but the name id"; it only
# excluded "id" by coincidence of the current column names, so the column is
# filtered out by name instead.
features = [col for col in df.get_column_names() if col != "id"]

# Build index
# The index dimensionality is tied to the feature list so the two cannot drift.
index = Index(space="l2", dim=len(features))  # possible options are l2, cosine or ip
index.init_index(max_elements=len(df), ef_construction=200, M=16)

# Insert the data chunk by chunk; the "id" column provides the labels that
# knn_query will later return.
for _, _, chunk in df.to_pandas_df(chunk_size=1000):
    X = chunk[features]
    y = chunk["id"]
    index.add_items(X, y)

index.set_ef(50)  # ef should always be > k (Controlling the recall by setting ef)

# Add to Dataframe
@vaex.register_function(on_expression=False)
def topk(*columns, k=3):
    """Return the labels of the ``k`` nearest indexed items for every row.

    Parameters
    ----------
    *columns
        Feature arrays (presumably one 1-D array per feature column, in the
        same order used to build the index). They are stacked and transposed
        so that each row is one query point.
    k : int, default 3
        Number of neighbours to retrieve per row.

    Returns
    -------
    numpy.ndarray
        The labels returned by ``index.knn_query``; the distances are dropped.
    """
    points = np.array(columns).T
    neighbour_labels, _distances = index.knn_query(points, k=k)
    return np.array(neighbour_labels)

# Attach the function to the DataFrame first (same order as the KDTree
# section), then expose the neighbours as a "knn" column.
df.add_function("topk", topk)
df["knn"] = df.func.topk(*[df[col] for col in features], k=3)

# build pipeline for production
pipeline = Pipeline.from_vaex(df)
# validate must be *called*: asserting the bare bound method is always truthy
# and would never actually run the validation.
assert pipeline.validate()
pipeline.inference(pipeline.raw)

#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,"array([24, 31, 13], dtype=uint64)"


## [KDTree](https://scikit-learn.org/stable/auto_examples/neighbors/approximate_nearest_neighbors.html)


### Vaex

In [4]:
import vaex
from sklearn.neighbors import KDTree

df = vaex.example().head(10000)

# Derive the feature columns in this cell instead of relying on a `features`
# variable left over from an earlier cell: every column except "id".
features = [col for col in df.get_column_names() if col != "id"]

# KDTree built over the feature columns of the example data.
model = KDTree(df[features], leaf_size=2)

@vaex.register_function(on_expression=False)
def query(*columns):
    """Return the indices of the 3 nearest tree points for every row.

    Parameters
    ----------
    *columns
        Feature arrays (presumably one 1-D array per feature column, in the
        order the tree was built with). They are stacked and transposed so
        that each row is one query point.

    Returns
    -------
    The index array produced by ``model.query(..., k=3)``; the distances
    are discarded.
    """
    points = np.array(columns).T
    _distances, neighbour_indices = model.query(points, k=3)
    return neighbour_indices

# Attach the KDTree lookup to the DataFrame and expose its result as the
# "predictions" column.
df.add_function("query", query)
df["predictions"] = df.func.query(*(df[name] for name in features))

# Wrap the DataFrame in a pipeline, check it, and run it on the stored raw example.
pipeline = Pipeline.from_vaex(df)

assert pipeline.validate()
pipeline.inference(pipeline.raw)

#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,predictions
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,"array([ 0, 7713, 1744])"


### Sklearn

In [43]:
import vaex
from sklearn.neighbors import KDTree
import sklearn.pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

df = vaex.example().head(10000)
# Feature columns: every column except "id". The column is filtered by name
# because a regex such as "[^id]" is a single-character class and only
# excluded "id" by coincidence of the current column names.
features = [col for col in df.get_column_names() if col != "id"]

# Materialise the feature columns as a pandas DataFrame.
X = df[features].to_pandas_df()

class KDTreePredictor(TransformerMixin, BaseEstimator):
    """Transformer that adds each row's nearest training rows as a column.

    ``fit`` builds a ``KDTree`` over the training data; ``transform`` queries
    that tree and stores, for every input row, the indices returned by the
    tree in ``output_column``.

    Parameters
    ----------
    features : list of str, optional
        Columns to use. When given as a non-empty list, only these columns
        are used, and missing values at transform time are filled with the
        per-column mean computed in ``fit``.
    leaf_size : int, default 2
        Forwarded to ``KDTree``.
    k : int, default 3
        Number of neighbours queried per row.
    output_column : str, default "results"
        Name of the column that receives the neighbour indices.
    """

    def __init__(self, features=None, leaf_size=2, k=3, output_column="results"):
        self.index = None  # KDTree, set by fit()
        self.ids = None  # position -> id mapping, set by fit() when y is given
        self.features = features
        self.k = k
        self.leaf_size = leaf_size
        self.output_column = output_column
        self.means = {}  # per-feature training means used to fill missing values

    def fit(self, X, y=None):
        """Build the KDTree from ``X``.

        Parameters
        ----------
        X
            Training data; indexed by column name when ``features`` is a
            list (assumes a pandas DataFrame in that case).
        y : optional
            Identifiers, one per row of ``X``. When given they are stored in
            ``self.ids`` as a ``{position: id}`` dict.

        Returns
        -------
        self
        """
        if y is not None:
            assert len(X) == len(y)
            self.ids = dict(enumerate(y))
        if self.features and isinstance(self.features, list):
            # Remember the training means so transform() can fill gaps.
            self.means = {feature: X[feature].mean() for feature in self.features}
            X = X[self.features]

        self.index = KDTree(X, leaf_size=self.leaf_size)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the neighbour indices appended.

        Parameters
        ----------
        X
            Data to query (assumes a pandas DataFrame: columns are selected,
            filled and assigned by name).

        Returns
        -------
        A copy of ``X`` (restricted to ``features`` when those are set) with
        one extra column, ``output_column``, holding the ``k`` indices
        returned by the tree for each row. The caller's ``X`` is not modified.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        # Fail fast before doing any copying work.
        if self.index is None:
            raise RuntimeError("model was not trained")
        if self.features and isinstance(self.features, list):
            # Take an explicit copy of the selection: assigning into the bare
            # X[self.features] result is a chained assignment and triggers
            # pandas' SettingWithCopyWarning.
            copy = X[self.features].copy()
            for feature in self.features:
                copy[feature] = copy[feature].fillna(self.means.get(feature))
        else:
            copy = X.copy()
        _, ind = self.index.query(copy, k=self.k)
        copy[self.output_column] = list(ind)
        return copy
    
# Nearest-neighbour transformer restricted to the feature columns.
model = KDTreePredictor(features=features)

# Wrap the estimator in a pipeline, then fit it on the pandas features.
pipeline = Pipeline.from_sklearn(model)
pipeline = pipeline.fit(X)

# Check the pipeline and run it on its stored raw example.
assert pipeline.validate()
pipeline.inference(pipeline.raw)

Unnamed: 0,x,y,z,vx,vy,vz,E,L,Lz,FeH,results
0,1.231868,-0.396929,-0.598058,301.155273,174.059479,27.427546,-149431.40625,407.388977,333.955536,-1.005385,"[0, 7713, 1744]"
