# [KDTree (SKlearn)](https://scikit-learn.org/stable/auto_examples/neighbors/approximate_nearest_neighbors.html)


## Vaex

In [10]:
import vaex
from sklearn.neighbors import KDTree
from goldilox import Pipeline
import numpy as np

df = vaex.example().head(10000)
# Use every column except the row identifier "id". An explicit name filter is
# used because a regex such as "[^id]" is a character class: it would drop any
# column whose name merely starts with "i" or "d", not just "id".
features = [name for name in df.get_column_names() if name != "id"]
# Build the KD-tree over the selected feature columns.
model = KDTree(df[features], leaf_size=2)

@vaex.register_function(on_expression=False)
def query(*columns):
    """Look up the 3 nearest neighbours of every row in the KD-tree.

    The positional column arrays are stacked and transposed so that each row
    of the resulting matrix is one sample, which is then queried against the
    module-level ``model``. Only the neighbour indices are returned; the
    distances are discarded.
    """
    samples = np.array(columns).transpose()
    distances_and_indices = model.query(samples, k=3)
    return distances_and_indices[1]

# Attach the KD-tree lookup to this DataFrame under the name "query" so that
# it can be called through `df.func`.
df.add_function("query", query)
# Add a "knn" column holding the neighbour indices computed from the feature
# columns (presumably a lazily evaluated virtual column -- confirm).
df["knn"] = df.func.query(*features)

# Wrap the DataFrame, including the "knn" column, in a goldilox pipeline.
pipeline = Pipeline.from_vaex(df)

# Fail loudly if the pipeline does not validate, then run inference on
# `pipeline.raw` (presumably the stored example row -- confirm); the result is
# the cell's displayed output.
assert pipeline.validate()
pipeline.inference(pipeline.raw)

#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,"array([ 0, 7713, 1744])"


## Sklearn
For sklearn we must implement a transformer that inherits from `TransformerMixin` and `BaseEstimator` and defines the *fit()* and *transform()* methods.  
We will deal with the missing values ourselves by filling them with the column means.

In [7]:
import vaex
from sklearn.neighbors import KDTree
import sklearn.pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from goldilox import Pipeline

df = vaex.example().head(10000)
# Use every column except the row identifier "id". An explicit name filter is
# used because a regex such as "[^id]" is a character class: it would drop any
# column whose name merely starts with "i" or "d", not just "id".
features = [name for name in df.get_column_names() if name != "id"]

# The sklearn transformer below works on pandas data, so materialise the
# selected feature columns as a pandas DataFrame.
X = df[features].to_pandas_df()

class KDTreeTransformer(TransformerMixin, BaseEstimator):
    """Transformer that appends the indices of each row's nearest neighbours.

    ``fit`` builds a ``KDTree`` over the training rows; ``transform`` queries
    that tree and stores, for every input row, the indices of its ``k``
    nearest training rows in ``output_column``.

    Parameters
    ----------
    features : list of str, optional
        Columns used to build and query the tree. When this is a non-empty
        list, the input is restricted to these columns and their missing
        values are replaced by the column means learned in ``fit``. Otherwise
        the input is handed to the tree unchanged.
    leaf_size : int, default=2
        Forwarded to ``KDTree``.
    k : int, default=3
        Number of neighbours requested per row.
    output_column : str, default="knn"
        Name of the column that receives the neighbour indices.
    """

    def __init__(self, features=None, leaf_size=2, k=3, output_column="knn"):
        # Fitted state; `index` stays None until `fit` has been called.
        self.index = None
        self.ids = None
        self.features = features
        self.k = k
        self.leaf_size = leaf_size
        self.output_column = output_column
        self.means = {}

    def _has_feature_list(self):
        """Return True when ``features`` is a non-empty list of column names."""
        return bool(self.features) and isinstance(self.features, list)

    def fit(self, X, y=None):
        """Learn the feature means and build the KD-tree.

        Parameters
        ----------
        X : pandas.DataFrame when ``features`` is a list; otherwise anything
            ``KDTree`` accepts.
        y : sequence, optional
            One value per row of ``X``; stored in ``self.ids`` as a mapping
            from row position to that value.

        Returns
        -------
        self
        """
        if y is not None:
            assert len(X) == len(y)
            self.ids = dict(enumerate(y))
        if self._has_feature_list():
            self.means = {feature: X[feature].mean() for feature in self.features}
            # Impute before building the tree: KDTree rejects NaN input, so
            # training data with missing values would otherwise fail here even
            # though `transform` handles them.
            X = self._fillna(X[self.features])

        self.index = KDTree(X, leaf_size=self.leaf_size)
        return self

    def _fillna(self, X):
        """Return a copy of ``X`` with missing values replaced by the fitted means.

        The input frame is left untouched; only columns that have an entry in
        ``self.means`` are filled.
        """
        return X.fillna(value=self.means)

    def transform(self, X):
        """Return the (imputed) features plus the neighbour-index column.

        Raises
        ------
        RuntimeError
            If the transformer has not been fitted.
        """
        # Check the fitted state first so no work is done on an untrained model.
        if self.index is None:
            raise RuntimeError("model was not trained")
        if self._has_feature_list():
            # `_fillna` returns a new frame, so the caller's X is never mutated.
            result = self._fillna(X[self.features])
        else:
            result = X.copy()
        _, ind = self.index.query(result, k=self.k)
        # One array of `k` neighbour indices per row.
        result[self.output_column] = list(ind)
        return result
    
# Instantiate the transformer on the selected feature columns; the remaining
# constructor arguments keep their defaults (leaf_size=2, k=3, output "knn").
model = KDTreeTransformer(features=features)

# Wrap the transformer in a goldilox pipeline and fit it on the pandas data.
pipeline = Pipeline.from_sklearn(model).fit(X)

# Fail loudly if the pipeline does not validate, then run inference on
# `pipeline.raw` (presumably the stored example row -- confirm); the result is
# the cell's displayed output.
assert pipeline.validate()
pipeline.inference(pipeline.raw)

Unnamed: 0,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn
0,1.231868,-0.396929,-0.598058,301.155273,174.059479,27.427546,-149431.40625,407.388977,333.955536,-1.005385,"[0, 7713, 1744]"


# Deploy (any pipeline)

In [None]:
print(f"Saved to: {pipeline.save('pipeline.pkl')}")
print(f"Check out the docs: http://127.0.0.1:5000/docs\n")

!gl serve pipeline.pkl