# [NMSLIB](https://github.com/nmslib/nmslib/blob/master/python_bindings/README.md)
Non-Metric Space Library (NMSLIB) is an efficient cross-platform similarity search library and a toolkit for evaluation of similarity search methods.     
The core-library does not have any third-party dependencies.      
It has been gaining popularity recently.      
In particular, it has become a part of Amazon Elasticsearch Service.


Notes:   
* Unfortunately, an nmslib index is not picklable, but we can work around that by wrapping it in a class that implements *\_\_reduce\_\_()* and handles the serialization itself.

## Vaex

In [1]:
import vaex
import nmslib 

# Work on a small sample of the example dataset that ships with vaex.
df = vaex.example().head(1000)

# Map each row position to the row's real id, so that positional neighbour
# results can later be translated back into ids.
ids = dict(enumerate(df["id"].tolist()))
df.variables["id_map"] = ids  # good practice when the ids differ from the indices

# Use every column except "id" as a feature. The anchored negative lookahead
# excludes exactly the "id" column; a character class such as "[^id]" would
# also drop any other column whose name starts with "i" or "d".
features = df.get_column_names(regex=r"^(?!id$)")

# Build the nearest-neighbour index over the feature columns.
method = "hnsw"
space = "l2"
index = nmslib.init(method=method, space=space)
index.addDataPointBatch(df[features])
index.createIndex()

Your CPU supports instructions that this binary was not compiled to use: AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [5]:
import traitlets
from tempfile import NamedTemporaryFile
from goldilox import Pipeline
import numpy as np

class NMSLibModel(traitlets.HasTraits):
    """Picklable wrapper around an nmslib index.

    An nmslib index cannot be pickled directly, so this class round-trips it
    through nmslib's own ``saveIndex`` / ``loadIndex`` file format and hands
    the resulting bytes to pickle via ``__reduce__``.

    Args:
        index: An nmslib index object, the ``bytes`` produced by ``_encode``
            (as passed back in when unpickling), or ``None``.
        method: nmslib method name used when an index has to be re-created.
        metric: nmslib space name used when an index has to be re-created.
    """

    # The signature must stay compatible with the argument tuple returned by
    # __reduce__, because unpickling calls ``NMSLibModel(*args)``.
    def __init__(self, index=None, method="hnsw", metric="cosinesimil"):
        super().__init__()
        self.method = method
        self.metric = metric
        self.index = self._decode(index)

    # This is how you make a class picklable: pickle stores the class together
    # with the constructor arguments needed to rebuild an equivalent instance.
    def __reduce__(self):
        return (self.__class__, (self._encode(), self.method, self.metric))

    def _decode(self, encoding):
        """Turn serialized index bytes back into an nmslib index.

        Anything that is not ``bytes`` (an index object or ``None``) is
        returned unchanged.
        """
        import os
        from tempfile import TemporaryDirectory

        import nmslib

        if not isinstance(encoding, bytes):
            return encoding

        index = nmslib.init(method=self.method, space=self.metric)
        # nmslib only loads from a file path, so the bytes are written to a
        # private temporary directory that is removed once loading is done.
        with TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "index.bin")
            with open(path, "wb") as outfile:
                outfile.write(encoding)
            index.loadIndex(path)
        return index

    def _encode(self):
        """Return the index serialized as ``bytes``.

        ``None`` and already-encoded ``bytes`` are returned unchanged.
        """
        import os
        from tempfile import TemporaryDirectory

        if self.index is None or isinstance(self.index, bytes):
            return self.index

        # nmslib only saves to a file path. Everything it writes lands inside
        # the temporary directory, which is removed when the block exits.
        with TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "index.bin")
            self.index.saveIndex(path, save_data=True)
            with open(path, "rb") as infile:
                return infile.read()

    def predict(self, data, k=3):
        """Query the ``k`` nearest neighbours for every row of ``data``.

        Returns the first element of each per-row query result as a numpy
        array (presumably the neighbour positions, with distances being the
        second element; confirm against the nmslib documentation).
        """
        neighbours = self.index.knnQueryBatch(data, k=k)
        return np.array(neighbours)[:, 0]

# Wrap the index built above so it can be pickled together with the pipeline.
model = NMSLibModel(index=index, method=method, metric=space)


# Add a nearest neighbours index column
@vaex.register_function(on_expression=False)
def topk(*columns, k=3):
    """Return the model's k-nearest-neighbour prediction for the given columns."""
    # Transpose so that each row holds the values of one sample
    # (assumes vaex passes one array per column; confirm).
    stacked = np.array(columns).T
    return model.predict(stacked, k)


df.add_function("topk", topk)
df["knn"] = df.func.topk(*features)


# Add a nearest neighbours actual ids
@vaex.register_function(on_expression=True)
def results(ar):
    """Translate every value in ``ar`` through the ``ids`` lookup table."""
    lookup = np.vectorize(ids.get)
    return lookup(ar)


df.add_function("results", results)
df["neighbours"] = df["knn"].results()

# Build the pipeline from the dataframe state, check it, and run it on the
# pipeline's stored raw example.
pipeline = Pipeline.from_vaex(df)
assert pipeline.validate()
pipeline.inference(pipeline.raw)

#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn,neighbours
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,"array([ 0., 704., 561.])","array([ 0, 21, 22])"


## Sklearn version
For sklearn, we need a class that inherits from *TransformerMixin* and *BaseEstimator* (both from `sklearn.base`) and implements *fit* and *transform*.    
Do not forget to implement *\_\_reduce\_\_()* with the serialization as well.

In [6]:
import vaex 
import nmslib
from sklearn.base import TransformerMixin, BaseEstimator

class NMSlibTransformer(TransformerMixin, BaseEstimator):
    """Wrapper for using nmslib as sklearn's KNeighborsTransformer.

    ``fit`` builds an nmslib index over ``X`` and remembers the id of every
    row; ``transform`` appends a column holding, for each row, the ids of its
    nearest neighbours. The nmslib index is not picklable, so ``__reduce__``
    serializes it through nmslib's own file format.

    Args:
        n_neighbors: Number of neighbours to return per row.
        output_column: Name of the column added by ``transform``.
        method: nmslib method name.
        metric: nmslib space name.
        n_jobs: Number of threads used for querying.
        index: An nmslib index object, the ``bytes`` produced by ``_encode``
            (as passed back in when unpickling), or ``None`` for a new index.
        ids: Mapping from row position to id; ``None`` means an empty mapping.
    """

    def __init__(
        self,
        n_neighbors=5,
        output_column="knn",
        method="hnsw",
        metric="cosinesimil",
        n_jobs=1,
        index=None,
        ids=None,
    ):
        self.n_neighbors = n_neighbors
        self.method = method
        self.metric = metric
        self.n_jobs = n_jobs
        self.output_column = output_column
        self.n_samples_fit_ = None
        self.index = self._create_index(index)
        # A fresh dict per instance; a ``{}`` default would be shared by all.
        self.ids = {} if ids is None else ids

    def __reduce__(self):
        # The argument tuple mirrors the positional order of __init__. The
        # third element restores the fitted marker, which is not a constructor
        # argument and would otherwise be lost when unpickling.
        return (
            self.__class__,
            (
                self.n_neighbors,
                self.output_column,
                self.method,
                self.metric,
                self.n_jobs,
                self._encode(),
                self.ids,
            ),
            {"n_samples_fit_": self.n_samples_fit_},
        )

    def _create_index(self, encoding):
        """Return an nmslib index for ``encoding``.

        ``None`` yields a new empty index, ``bytes`` are loaded as a
        serialized index, and anything else is returned unchanged.
        """
        import os
        from tempfile import TemporaryDirectory

        import nmslib

        if encoding is None:
            return nmslib.init(method=self.method, space=self.metric)
        if not isinstance(encoding, bytes):
            return encoding

        index = nmslib.init(method=self.method, space=self.metric)
        # nmslib only loads from a file path, so the bytes are written to a
        # private temporary directory that is removed once loading is done.
        with TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "index.bin")
            with open(path, "wb") as outfile:
                outfile.write(encoding)
            index.loadIndex(path)
        return index

    def _encode(self):
        """Return the index serialized as ``bytes``.

        ``None`` and already-encoded ``bytes`` are returned unchanged.
        """
        import os
        from tempfile import TemporaryDirectory

        if self.index is None or isinstance(self.index, bytes):
            return self.index

        # nmslib only saves to a file path. Everything it writes lands inside
        # the temporary directory, which is removed when the block exits.
        with TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "index.bin")
            self.index.saveIndex(path, save_data=True)
            with open(path, "rb") as infile:
                return infile.read()

    def __sklearn_is_fitted__(self):
        return self.n_samples_fit_ is not None

    def fit(self, X, y=None):
        """Index the rows of ``X`` and remember their ids.

        Args:
            X: Data to index; must expose ``shape``.
            y: Ids of the rows of ``X``, in order. When omitted, the row
                positions themselves are used as ids.

        Returns:
            self
        """
        n_samples = X.shape[0]
        labels = range(n_samples) if y is None else y
        self.ids = dict(enumerate(labels))
        self.n_samples_fit_ = n_samples
        self.index.addDataPointBatch(X)
        self.index.createIndex()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the neighbour ids added as a column.

        Each value of the new ``output_column`` is a list with the ids of the
        row's nearest neighbours; positions without a known id map to ``None``.
        """
        results = self.index.knnQueryBatch(
            X, k=self.n_neighbors, num_threads=self.n_jobs
        )
        # Work on a copy so the caller's frame is not modified; otherwise a
        # second call on the same frame would feed the output column back
        # into the query.
        X = X.copy()
        # Each query result is a (positions, distances) pair; only the
        # positions are needed to look up the ids.
        X[self.output_column] = [
            [self.ids.get(position) for position in positions]
            for positions, _distances in results
        ]
        return X

# Work on a small sample of the example dataset that ships with vaex.
df = vaex.example().head(100)

# Use every column except "id" as a feature. The anchored negative lookahead
# excludes exactly the "id" column; a character class such as "[^id]" would
# also drop any other column whose name starts with "i" or "d".
features = df.get_column_names(regex=r"^(?!id$)")
X = df[features].to_pandas_df()
y = df['id'].values  # the row ids, mapped back onto the neighbours by the transformer

pipeline = Pipeline.from_sklearn(NMSlibTransformer()).fit(X, y)

assert pipeline.validate()
pipeline.inference(pipeline.raw)

Unnamed: 0,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn
0,1.231868,-0.396929,-0.598058,301.155273,174.059479,27.427546,-149431.40625,407.388977,333.955536,-1.005385,"[0, 21, 13, 13, 0]"


## Serve (any pipeline)

In [None]:
# Persist the pipeline, then tell the user where it was saved and where the
# served API documentation can be found.
saved_to = pipeline.save('pipeline.pkl')
print(f"Saved to: {saved_to}")
print("Check out the docs: http://127.0.0.1:5000/docs\n")

!gl serve pipeline.pkl