# Nearest neighbors    
We will demonstrate here how to solve nearest neighbours with a few technologies, in both the Vaex and the SKlearn way.

## [KDTree (SKlearn)](https://scikit-learn.org/stable/auto_examples/neighbors/approximate_nearest_neighbors.html)


### Vaex

In [10]:
import vaex
from sklearn.neighbors import KDTree
from goldilox import Pipeline
import numpy as np

df = vaex.example().head(10000)
features = df.get_column_names(regex="[^id]")  # not the ida
model = KDTree(df[features], leaf_size=2)

@vaex.register_function(on_expression=False)
def query(*columns):
    data = np.array(columns).T
    _, ind = model.query(data, k=3)
    return ind

df.add_function("query", query)
df["knn"] = df.func.query(*features)

pipeline = Pipeline.from_vaex(df)

assert pipeline.validate()
pipeline.inference(pipeline.raw)

#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,"array([ 0, 7713, 1744])"


### Sklearn

In [47]:
import vaex
from sklearn.neighbors import KDTree
import sklearn.pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

df = vaex.example().head(10000)
features = df.get_column_names(regex="[^id]")  # not the ida

X = df[features].to_pandas_df()

class KDTreeTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, features=None, leaf_size=2, k=3, output_column="knn"):
        self.index = None
        self.ids = None
        self.features = features
        self.k = k
        self.leaf_size = leaf_size
        self.output_column = output_column
        self.means = {}

    def fit(self, X, y=None):        
        if y is not None:
            assert len(X) == len(y)
            self.ids = {i: j for i, j in enumerate(y)}
        if self.features and isinstance(self.features, list):
            self.means = {feature:X[feature].mean() for feature in self.features}
            X = X[self.features]        
            
        self.index = KDTree(X, leaf_size=self.leaf_size)
        return self

    def transform(self, X):
        copy = X.copy()               
        if self.index is None:
            raise RuntimeError("model was not trained")
        if self.features and isinstance(self.features, list):            
            copy = X[self.features]        
            for feature in self.features:
                copy[feature] = copy[feature].fillna(self.means.get(feature))
        _, ind = self.index.query(copy, k=self.k)                
        copy[self.output_column] = list(ind)
        return copy
    
model = KDTreePredictor(features=features)

pipeline = Pipeline.from_sklearn(model).fit(X)

assert pipeline.validate()
pipeline.inference(pipeline.raw)

Unnamed: 0,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn
0,1.231868,-0.396929,-0.598058,301.155273,174.059479,27.427546,-149431.40625,407.388977,333.955536,-1.005385,"[0, 7713, 1744]"


# [hnswlib](https://github.com/nmslib/hnswlib) (Recommended)
## Vaex

In [11]:
import vaex

df = vaex.example().head(10000)
features = df.get_column_names(regex="[^id]")  # not the ida
print(df.head(2))

  #    id          x          y          z        vx       vy        vz        E        L       Lz       FeH
  0     0   1.23187   -0.396929  -0.598058   301.155  174.059   27.4275  -149431  407.389  333.956  -1.00539
  1    23  -0.163701   3.65422   -0.254906  -195      170.472  142.53    -124248  890.241  684.668  -1.70867


In [12]:
from hnswlib import Index
import numpy as np
from goldilox import Pipeline


# Build index
index = Index(
    space="l2", dim=df.shape[1] - 1
)  # possible options are l2, cosine or ip
index.init_index(max_elements=len(df), ef_construction=200, M=16)

for i1, i2, chunk in df.to_pandas_df(chunk_size=1000):
    X = chunk[features]
    y = chunk["id"]
    index.add_items(X, y)

index.set_ef(50)  # ef should always be > k (Controlling the recall by setting ef)

# Add to Dataframe
@vaex.register_function(on_expression=False)
def topk(*columns, k=3):
    labels, _ = index.knn_query(np.array(columns).T, k=k)
    return np.array(labels)

df.add_function("topk", topk)
df["knn"] = df.func.topk(*features)

# build pipeline for production
pipeline = Pipeline.from_vaex(df)
assert pipeline.validate
pipeline.inference(pipeline.raw)

#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,"array([24, 31, 13], dtype=uint64)"


# [nmslib](https://github.com/nmslib/nmslib/blob/master/python_bindings/README.md)
Unfortunately nmslib is not pickable, but we can get over it by creating a class and implement *\_\_reduce\_\_()* for it and implementing the serialization.

## Vaex

In [4]:
import vaex
import nmslib 

df = vaex.example().head(1000)

ids = {index: _id for index, _id in enumerate(df["id"].tolist())}
df.variables["id_map"] = ids  # good practice

features = df.get_column_names(regex="[^id]")  # not the ida

method = "hnsw"
space = "l2"
index = nmslib.init(method=method, space=space)
index.addDataPointBatch(df[features])
index.createIndex()

Your CPU supports instructions that this binary was not compiled to use: AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [5]:
import traitlets
from tempfile import NamedTemporaryFile

class NMSLibModel(traitlets.HasTraits):

        # This should work with the reduce's arguments
        def __init__(self, index=None, method="hnsw", metric="cosinesimil"):

            self.method = method
            self.metric = metric
            self.index = self.decode(index)

        # This is how you make a class pickalbe
        def __reduce__(self):
            return (self.__class__, (self.encode(), self.method, self.metric))

        # how nmslib implemented serialization
        def decode(self, encoding):
            import nmslib

            if isinstance(encoding, bytes):
                index = nmslib.init(method=self.method, space=self.metric)
                path = NamedTemporaryFile().name
                with open(path, "wb") as outfile:
                    outfile.write(encoding)
                index.loadIndex(path)
                return index
            else:
                return encoding

        # how nmslib implemented serialization
        def encode(self):
            if isinstance(self.index, bytes):
                return self.index
            path = NamedTemporaryFile().name
            self.index.saveIndex(path, save_data=True)
            with open(path, "rb") as outfile:
                encoding = outfile.read()
            return encoding

        def predict(self, data, k=3):
            neighbours = self.index.knnQueryBatch(data, k=k)
            return np.array(neighbours)[:, 0]

model = NMSLibModel(index, method, space)

In [6]:
# Add a nearest neighbours index column
@vaex.register_function(on_expression=False)
def topk(*columns, k=3):
    data = np.array(columns).T
    return model.predict(data, k)
df.add_function("topk", topk)
df["knn"] = df.func.topk(*features)

# Add a nearest neighbours actual ids
@vaex.register_function(on_expression=True)
def results(ar):
    return np.vectorize(ids.get)(ar)

df.add_function("results", results)
df["neighbours"] = df["knn"].results()

pipeline = Pipeline.from_vaex(df)
assert pipeline.validate()
pipeline.inference(pipeline.raw)

#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn,neighbours
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,"array([ 0., 704., 561.])","array([ 0, 21, 22])"


## Sklearn version
We need to implement *fit* and  *transform* with a class that inherent TransformerMixin, BaseEstimator for sklearn.base.    
Not forget the *\_\_reduce\_\_()* with the serialization.

In [130]:
import vaex 
import nmslib
from sklearn.base import TransformerMixin, BaseEstimator

class NMSlibTransformer(TransformerMixin, BaseEstimator):
        """Wrapper for using nmslib as sklearn's KNeighborsTransformer"""

        def __init__(
            self,
            n_neighbors=5,
            output_column="knn",
            method="hnsw",
            metric="cosinesimil",
            n_jobs=1,
            index=None,
            ids = {}
        ):

            self.n_neighbors = n_neighbors
            self.method = method
            self.metric = metric
            self.n_jobs = n_jobs
            self.output_column = output_column
            self.n_samples_fit_ = None
            self.index = self._create_index(index)
            self.ids = ids

        def __reduce__(self):
            return (
                self.__class__,
                (
                    self.n_neighbors,
                    self.output_column,
                    self.method,
                    self.metric,
                    self.n_jobs,
                    self._encode(),
                    self.ids
                ),
            )

        def _create_index(self, encoding):
            import nmslib

            if encoding is None:
                return nmslib.init(method=self.method, space=self.metric)
            if isinstance(encoding, bytes):
                index = nmslib.init(method=self.method, space=self.metric)
                path = NamedTemporaryFile().name
                with open(path, "wb") as outfile:
                    outfile.write(encoding)
                index.loadIndex(path)
                return index
            else:
                return encoding

        def _encode(self):
            if self.index is None:
                return None
            if isinstance(self.index, bytes):
                return self.index
            path = NamedTemporaryFile().name
            self.index.saveIndex(path, save_data=True)
            with open(path, "rb") as outfile:
                encoding = outfile.read()
            return encoding

        def __sklearn_is_fitted__(self):
            return self.n_samples_fit_ is not None

        def fit(self, X, y=None):
            self.ids = {index: _id for index, _id in enumerate(y)}            
            self.n_samples_fit_ = X.shape[0]
            self.index.addDataPointBatch(X)
            self.index.createIndex()
            return self

        def transform(self, X):
            results = self.index.knnQueryBatch(
                X, k=self.n_neighbors, num_threads=self.n_jobs
            )
            indices, distances = zip(*results)
            indices = np.vstack(indices)
            X[self.output_column] = tuple(indices)                        
            X[self.output_column] = X[self.output_column].apply(lambda x: [self.ids.get(i) for i in x])
            return X

df = vaex.example().head(100)
features = df.get_column_names(regex="[^id]")  # not the ida
X = df[features].to_pandas_df() 
y = df['id'].values

pipeline = Pipeline.from_sklearn(NMSlibTransformer()).fit(X, y)

assert pipeline.validate()
pipeline.inference(pipeline.raw)

Unnamed: 0,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn
0,1.231868,-0.396929,-0.598058,301.155273,174.059479,27.427546,-149431.40625,407.388977,333.955536,-1.005385,"[0, 21, 13, 13, 0]"


# [Faiss](https://github.com/facebookresearch/faiss)

* Faiss is not pickable - so we implement a class with *\_\_reduce\_\_()* for it.

## Vaex

In [65]:
import vaex
from faiss import IndexFlatL2
from goldilox import Pipeline
import numpy as np
import traitlets
from tempfile import NamedTemporaryFile
from faiss import write_index, read_index

df = vaex.example().head(1000)
features = df.get_column_names(regex="[^id]")  # not the ida
d = len(features)
X = np.float32(np.ascontiguousarray(df[features]))
index = IndexFlatL2(d)
index.add(X)

class FiassModel(traitlets.HasTraits):

        # This should work with the reduce's arguments
        def __init__(self, index=None):
            self.index = self._decode(index)

        # This is how you make a class pickalbe
        def __reduce__(self):
            return (self.__class__, (self._encode(),))

        # how nmslib implemented serialization
        def _decode(self, encoding):            
            if isinstance(encoding, bytes):                
                path = NamedTemporaryFile().name
                with open(path, "wb") as outfile:
                    outfile.write(encoding)
                return read_index(path)
            else:
                return encoding

        # how nmslib implemented serialization
        def _encode(self):
            if isinstance(self.index, bytes):
                return self.index
            path = NamedTemporaryFile().name
            write_index(self.index, path)            
            with open(path, "rb") as outfile:
                encoding = outfile.read()
            return encoding

        def predict(self, data, k=3):
            data = np.float32(np.ascontiguousarray(data))
            _ , ind = model.index.search(data, k)
            return ind

model = FiassModel(index)

@vaex.register_function(on_expression=False)
def search(*columns):
    k=3
    data = np.float32(np.ascontiguousarray(np.array(columns).T))
    _ , ind = model.index.search(data, k)
    return ind

df.add_function("search", search)
df['neighbors'] = df.func.search(*features)
pipeline = Pipeline.from_vaex(df)
pipeline.validate()
pipeline.inference(pipeline.raw)


#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,neighbors
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,"array([ 0, 704, 561])"


## Sklearn

In [9]:
from faiss import IndexFlatL2
import vaex
from sklearn.base import TransformerMixin, BaseEstimator
from goldilox import Pipeline
import numpy as np
from tempfile import NamedTemporaryFile
from faiss import write_index, read_index

class FaissTransformer(TransformerMixin, BaseEstimator):
        """Wrapper for using nmslib as sklearn's KNeighborsTransformer"""

        def __init__(
            self,
            features=None,
            n_neighbors=5,
            output_column="prediction",
            index=None,
        ):
            self.features = features
            self.n_neighbors = n_neighbors
            self.output_column = output_column
            self.index = self._decode(index)

        def __reduce__(self):
            return (
                self.__class__,
                (
                    self.features,
                    self.n_neighbors,
                    self.output_column,                    
                    self._encode(),
                ),
            )

        def _decode(self, encoding):              
            if isinstance(encoding, bytes):                
                path = NamedTemporaryFile().name
                with open(path, "wb") as outfile:
                    outfile.write(encoding)
                return read_index(path)
            else:
                return encoding

        # how nmslib implemented serialization
        def _encode(self):
            if isinstance(self.index, bytes):
                return self.index
            path = NamedTemporaryFile().name
            write_index(self.index, path)            
            with open(path, "rb") as outfile:
                encoding = outfile.read()
            return encoding

        def __sklearn_is_fitted__(self):
            return self.index is not None

        
        @property
        def dim(self):
            return len(self.features)

        def fit(self, X, y=None):
            if self.features is None:
                self.features = list(X.columns)             
            if self.index is None:                
                self.index = IndexFlatL2(len(self.features))                     
            X = np.float32(np.ascontiguousarray(X)) 
            self.index.add(X)
            return self

        def transform(self, X, y=None):            
            data = np.float32(np.ascontiguousarray(X[self.features]))
            _ , indices = self.index.search(data, k=self.n_neighbors)            
            indices = np.vstack(indices)
            X[self.output_column] = tuple(indices)                        
            return X

df = vaex.example().head(1000)
features = df.get_column_names(regex="[^id]")  # not the ida
X = df[features].to_pandas_df() 

pipeline = Pipeline.from_sklearn(FaissTransformer()).fit(X)

assert pipeline.validate()
pipeline.inference(pipeline.raw)

Unnamed: 0,x,y,z,vx,vy,vz,E,L,Lz,FeH,prediction
0,1.231868,-0.396929,-0.598058,301.155273,174.059479,27.427546,-149431.40625,407.388977,333.955536,-1.005385,"[0, 704, 561, 126, 14]"


# Deploy (any pipeline)

In [None]:
print(f"Saved to: {pipeline.save('pipeline.pkl')}")
print(f"Check out the docs: http://127.0.0.1:5000/docs\n")

!gl serve pipeline.pkl