# [Hnswlib](https://github.com/nmslib/hnswlib)
Fast approximate nearest neighbor search


## Vaex

In [1]:
import vaex

df = vaex.example().head(10000)
# Every column except the "id" label column is used as a feature.
# An explicit name comparison is used here because a pattern such as "[^id]"
# is a character class (any single character other than "i" or "d"), not
# "any name other than id", and so only excludes the right column by accident.
features = [name for name in df.get_column_names() if name != "id"]
print(df.head(2))

  #    id          x          y          z        vx       vy        vz        E        L       Lz       FeH
  0     0   1.23187   -0.396929  -0.598058   301.155  174.059   27.4275  -149431  407.389  333.956  -1.00539
  1    23  -0.163701   3.65422   -0.254906  -195      170.472  142.53    -124248  890.241  684.668  -1.70867


In [2]:
from hnswlib import Index
import numpy as np
from goldilox import Pipeline


# Build index
# The index dimensionality must equal the number of feature columns inserted
# below, so it is derived from `features` instead of from the frame width.
index = Index(
    space="l2", dim=len(features)
)  # possible options are l2, cosine or ip
index.init_index(max_elements=len(df), ef_construction=200, M=16)

# Insert the rows chunk by chunk; the "id" column supplies the labels that
# knn_query hands back later.
# NOTE(review): presumably each "id" should be unique per row - confirm.
for i1, i2, chunk in df.to_pandas_df(chunk_size=1000):
    # Pass plain NumPy arrays to hnswlib rather than pandas objects.
    X = chunk[features].to_numpy()
    y = chunk["id"].to_numpy()
    index.add_items(X, y)

index.set_ef(50)  # ef should always be > k (Controlling the recall by setting ef)

# Add to Dataframe
@vaex.register_function(on_expression=False)
def topk(*columns, k=3):
    """Return the labels of the k nearest indexed items for every row.

    Parameters
    ----------
    *columns
        One array per feature column, all of the same length; they are
        stacked so that each row becomes one query vector.
    k : int, default 3
        Number of neighbours requested from the index per query vector.

    Returns
    -------
    numpy.ndarray
        The labels returned by ``index.knn_query``; the distances it also
        returns are discarded.
    """
    query_vectors = np.array(columns).T  # (features, rows) -> (rows, features)
    neighbour_labels, _distances = index.knn_query(query_vectors, k=k)
    return np.array(neighbour_labels)

# Make the function available on this frame and expose it as the "knn" column.
df.add_function("topk", topk)
df["knn"] = df.func.topk(*features)

# build pipeline for production
pipeline = Pipeline.from_vaex(df)
# `validate` has to be called: a bare `pipeline.validate` is a bound method
# object, which is always truthy, so the assertion could never fail.
assert pipeline.validate()
# Run the pipeline on its stored raw example; as the last expression of the
# cell the result is displayed.
pipeline.inference(pipeline.raw)

#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,knn
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,"array([24, 31, 13], dtype=uint64)"


## Serve 

In [None]:
print(f"Saved to: {pipeline.save('pipeline.pkl')}")
print(f"Check out the docs: http://127.0.0.1:5000/docs\n")

!gl serve pipeline.pkl