# [Faiss](https://github.com/facebookresearch/faiss)
Faiss is a library for efficient similarity search and clustering of dense vectors.     
It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. 
It also contains supporting code for evaluation and parameter tuning.      
Faiss is written in C++ with complete wrappers for Python/numpy. Some of the most useful algorithms are implemented on the GPU.     
It is developed by [Facebook AI Research](https://ai.facebook.com/).

Notes:
* A Faiss index is not picklable, so we wrap it in a class that implements *\_\_reduce\_\_()*.

## Vaex

In [1]:
import vaex
from faiss import IndexFlatL2
from goldilox import Pipeline
import numpy as np
import traitlets
from tempfile import NamedTemporaryFile
from faiss import write_index, read_index

# Small sample of the vaex example dataset.
df = vaex.example().head(1000)
# Every column except the row identifier "id". A bare character class such as
# "[^id]" would instead reject any name *starting* with "i" or "d"; the
# anchored negative lookahead rejects exactly the name "id" and nothing else.
features = df.get_column_names(regex="^(?!id$)")
d = len(features)  # dimensionality of the indexed vectors
# Convert the feature columns to a contiguous float32 matrix before indexing.
X = np.float32(np.ascontiguousarray(df[features]))
index = IndexFlatL2(d)
index.add(X)

class FiassModel(traitlets.HasTraits):
    """Picklable holder for a Faiss index.

    The wrapped index is exposed as ``self.index``. Pickling goes through
    ``__reduce__``, which serializes the index to bytes with Faiss' own
    file format (``write_index``) and rebuilds it with ``read_index``.
    """

    # The signature must accept exactly what ``__reduce__`` hands back.
    def __init__(self, index=None):
        """Store *index*, which may be an index object, serialized bytes, or None."""
        super().__init__()
        self.index = self._decode(index)

    # This is how you make a class picklable.
    def __reduce__(self):
        """Return ``(class, (serialized_index,))`` so unpickling calls the constructor."""
        return (self.__class__, (self._encode(),))

    def _decode(self, encoding):
        """Turn serialized bytes back into an index; pass any other value through."""
        if not isinstance(encoding, bytes):
            return encoding

        import os  # local import keeps this class self-contained

        # ``read_index`` works on a path, so spill the bytes to a temp file.
        # ``delete=False`` keeps the file alive after it is closed (so it can
        # be reopened by name on every platform); it is removed explicitly
        # below so nothing is left behind on disk.
        with NamedTemporaryFile(delete=False) as tmp:
            tmp.write(encoding)
        try:
            return read_index(tmp.name)
        finally:
            os.remove(tmp.name)

    def _encode(self):
        """Return the index as bytes (or None / the bytes themselves if already so)."""
        # Nothing to serialize for an empty wrapper or an already-encoded index.
        if self.index is None or isinstance(self.index, bytes):
            return self.index

        import os  # local import keeps this class self-contained

        # Reserve a unique path, let Faiss write to it, read it back, clean up.
        with NamedTemporaryFile(delete=False) as tmp:
            path = tmp.name
        try:
            write_index(self.index, path)
            with open(path, "rb") as infile:
                return infile.read()
        finally:
            os.remove(path)

# Module-level wrapper around the populated index; `search` reads `model.index`.
model = FiassModel(index)

@vaex.register_function(on_expression=False)
def search(*columns):
    """Return the indices of the 3 nearest indexed vectors for each input row.

    Each positional argument is one feature column; the columns are stacked
    so that every row becomes one float32 query vector, which is then looked
    up in the module-level ``model.index``.
    """
    n_neighbors = 3
    # Columns arrive as separate arrays: stack them, then transpose to rows.
    row_major = np.array(columns).T
    queries = np.float32(np.ascontiguousarray(row_major))
    # The search result is a pair; only the second element (indices) is returned.
    neighbor_ids = model.index.search(queries, n_neighbors)[1]
    return neighbor_ids

# Attach the function to this DataFrame under the name "search".
df.add_function("search", search)
# New column: result of `search` applied to all feature columns.
df['neighbors'] = df.func.search(*features)
# Build a goldilox pipeline from the DataFrame, validate it, and run it on
# the pipeline's stored raw example (`pipeline.raw`).
pipeline = Pipeline.from_vaex(df)
pipeline.validate()
pipeline.inference(pipeline.raw)


#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,neighbors
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,"array([ 0, 704, 561])"


## Sklearn
For sklearn, we need to implement the Transformer methods: *fit()* and *transform()*.    
Because a Faiss index is not picklable, we will also implement *\_\_reduce\_\_()*.

In [2]:
from faiss import IndexFlatL2
import vaex
from sklearn.base import TransformerMixin, BaseEstimator
from goldilox import Pipeline
import numpy as np
from tempfile import NamedTemporaryFile
from faiss import write_index, read_index

class FaissTransformer(TransformerMixin, BaseEstimator):
    """Sklearn-style transformer that adds nearest-neighbour indices from Faiss.

    ``fit`` adds the feature columns of a DataFrame to a Faiss ``IndexFlatL2``;
    ``transform`` searches that index and writes, for every row, the indices
    of its ``n_neighbors`` nearest indexed vectors into ``output_column``.

    Because a Faiss index cannot be pickled directly, ``__reduce__``
    serializes it to bytes via ``write_index`` / ``read_index``.

    Parameters
    ----------
    features : list of column names, or None
        Columns used as the vector components. When None, ``fit`` uses all
        columns of the frame it is given.
    n_neighbors : int
        Number of neighbours returned per row.
    output_column : str
        Name of the column that receives the neighbour indices.
    index : Faiss index, bytes, or None
        Existing index, its serialized form, or None to create one in ``fit``.
    """

    def __init__(
        self,
        features=None,
        n_neighbors=5,
        output_column="prediction",
        index=None,
    ):
        self.features = features
        self.n_neighbors = n_neighbors
        self.output_column = output_column
        # Accepts serialized bytes too, which is what ``__reduce__`` passes.
        self.index = self._decode(index)

    def __reduce__(self):
        """Return ``(class, constructor_args)`` with the index serialized to bytes."""
        return (
            self.__class__,
            (
                self.features,
                self.n_neighbors,
                self.output_column,
                self._encode(),
            ),
        )

    def _decode(self, encoding):
        """Turn serialized bytes back into an index; pass any other value through."""
        if not isinstance(encoding, bytes):
            return encoding

        import os  # local import keeps this class self-contained

        # ``read_index`` works on a path, so spill the bytes to a temp file.
        # ``delete=False`` lets the file be reopened by name after closing;
        # it is removed explicitly so no temp file is left behind.
        with NamedTemporaryFile(delete=False) as tmp:
            tmp.write(encoding)
        try:
            return read_index(tmp.name)
        finally:
            os.remove(tmp.name)

    def _encode(self):
        """Return the index as bytes (or None / the bytes themselves if already so)."""
        # An unfitted transformer has no index; it must still be picklable.
        if self.index is None or isinstance(self.index, bytes):
            return self.index

        import os  # local import keeps this class self-contained

        # Reserve a unique path, let Faiss write to it, read it back, clean up.
        with NamedTemporaryFile(delete=False) as tmp:
            path = tmp.name
        try:
            write_index(self.index, path)
            with open(path, "rb") as infile:
                return infile.read()
        finally:
            os.remove(path)

    def __sklearn_is_fitted__(self):
        """Report the transformer as fitted once it holds an index."""
        return self.index is not None

    @property
    def dim(self):
        """Number of feature columns, i.e. the vector dimensionality."""
        return len(self.features)

    def fit(self, X, **kwargs):
        """Add the feature columns of *X* to the index and return ``self``.

        An index passed to the constructor is reused, so vectors are appended
        to whatever it already contains.
        """
        if self.features is None:
            self.features = list(X.columns)
        if self.index is None:
            self.index = IndexFlatL2(self.dim)
        # Select the same columns, in the same order, that ``transform``
        # queries with, so the indexed vectors match the index dimension.
        data = np.float32(np.ascontiguousarray(X[self.features]))
        self.index.add(data)
        return self

    def transform(self, X, **kwargs):
        """Return a copy of *X* with the neighbour indices in ``output_column``."""
        data = np.float32(np.ascontiguousarray(X[self.features]))
        _, indices = self.index.search(data, k=self.n_neighbors)
        # Work on a copy so the caller's frame is not modified in place.
        X = X.copy()
        # One array of neighbour indices per row.
        X[self.output_column] = tuple(indices)
        return X

# Small sample of the vaex example dataset.
df = vaex.example().head(1000)
# Every column except the row identifier "id". A bare character class such as
# "[^id]" would instead reject any name *starting* with "i" or "d"; the
# anchored negative lookahead rejects exactly the name "id" and nothing else.
features = df.get_column_names(regex="^(?!id$)")
X = df[features].to_pandas_df()

# Wrap the transformer in a goldilox pipeline and fit it on the pandas frame.
pipeline = Pipeline.from_sklearn(FaissTransformer()).fit(X)

assert pipeline.validate()
pipeline.inference(pipeline.raw)

Unnamed: 0,x,y,z,vx,vy,vz,E,L,Lz,FeH,prediction
0,1.231868,-0.396929,-0.598058,301.155273,174.059479,27.427546,-149431.40625,407.388977,333.955536,-1.005385,"[0, 704, 561, 126, 14]"


# Deploy (any pipeline)

In [None]:
# Save the pipeline to disk and print whatever `save` returns.
print(f"Saved to: {pipeline.save('pipeline.pkl')}")
print(f"Check out the docs: http://127.0.0.1:5000/docs\n")

# IPython shell escape (notebook only): serve the saved pipeline with the `gl` CLI.
!gl serve pipeline.pkl