# Vaex

In [56]:
import vaex
from sklearn.cluster import KMeans
from vaex.ml.sklearn import Predictor

df = vaex.example()

# Use every column except the row identifier as a model feature.
# The previous pattern '[^id]' was a character class ("one character that is
# neither 'i' nor 'd'"), not "anything but the name 'id'": it would also drop
# any column whose name merely starts with 'i' or 'd'. Filtering on the exact
# name selects the same columns for this dataset without that trap.
features = [name for name in df.get_column_names() if name != 'id']

# Handle missing values: replace them with the column mean.
for feature in features:
    df[feature] = df[feature].fillna(df[feature].mean())

# Wrap the scikit-learn estimator so its prediction is added to the dataframe
# under the name 'cluster'.
# NOTE(review): KMeans() is unseeded, so cluster labels can differ between
# runs; consider passing random_state if reproducible labels matter.
kmeans = Predictor(model=KMeans(), features=features, prediction_name='cluster')
kmeans.fit(df)
df = kmeans.transform(df)
df.head(2)

#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,cluster
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,5
1,23,-0.163701,3.65422,-0.254906,-195.0,170.472,142.53,-124248,890.241,684.668,-1.70867,1


Let's build a production pipeline

In [57]:
from goldilox import Pipeline
import json

# Build the goldilox pipeline from the vaex dataframe prepared above.
pipeline = Pipeline.from_vaex(df)

# I/O example: show the example record stored on the pipeline, then run it
# through inference to see the resulting row (including the 'cluster' column).
sample = pipeline.raw
print(f"predict for {json.dumps(sample, indent=4)}")
pipeline.inference(sample)

predict for {
    "id": 0,
    "x": 1.2318683862686157,
    "y": -0.39692866802215576,
    "z": -0.598057746887207,
    "vx": 301.1552734375,
    "vy": 174.05947875976562,
    "vz": 27.42754554748535,
    "E": -149431.40625,
    "L": 407.38897705078125,
    "Lz": 333.9555358886719,
    "FeH": -1.0053852796554565
}


#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,cluster
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,5


## Sklearn

In [58]:
from goldilox.datasets import load_iris

# Get the data. load_iris() returns three values: the dataframe, the list of
# feature column names (used below to configure the imputer and the pipeline),
# and the target (presumably the target column name; unused in the cells shown).
# Note: this rebinds `df`, which held the vaex dataframe in the section above.
df, features, target = load_iris()
df.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


In [59]:
from sklearn.cluster import KMeans
from goldilox import Pipeline

# Wrap a plain KMeans estimator in a goldilox pipeline, then fit it on the
# iris dataframe; `pipeline` ends up bound to whatever fit() returns.
pipeline = Pipeline.from_sklearn(KMeans())
pipeline = pipeline.fit(df)



The pipeline is ready to go, but it does not handle missing values, which we may encounter in production.   
Let's fix that!

In [60]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import sklearn.pipeline

# Step 1: fill missing values in the feature columns with the column mean;
# any other columns are passed through untouched.
imputer = ColumnTransformer(
    transformers=[('features_mean', SimpleImputer(strategy='mean'), features)],
    remainder='passthrough',
)

# Step 2: chain the imputer with the clustering model.
sklearn_pipeline = sklearn.pipeline.Pipeline(
    steps=[
        ('imputer', imputer),
        ('kmean', KMeans()),
    ]
)

# Wrap the scikit-learn pipeline for production use and fit it on the data;
# the prediction is exposed as the 'cluster' column.
pipeline = Pipeline.from_sklearn(
    sklearn_pipeline,
    features=features,
    output_columns=['cluster'],
)
pipeline = pipeline.fit(df)

# I/O example: print the example record stored on the pipeline, then run it
# through inference.
sample = pipeline.raw
print(f"predict for {json.dumps(sample, indent=4)}")
pipeline.inference(sample)

predict for {
    "sepal_length": 5.1,
    "sepal_width": 3.5,
    "petal_length": 1.4,
    "petal_width": 0.2
}


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,cluster
0,5.1,3.5,1.4,0.2,1


In [7]:
# Save the pipeline to 'pipeline.pkl' so the server command below can load it.
pipeline.save('pipeline.pkl')
# Print the docs URL *before* starting the server: the recorded output shows
# the server running until it is interrupted (^C), so nothing after the shell
# command would be printed while it is up.
print('Go to the fastapi docs here: http://127.0.0.1:5000/docs')
# IPython shell escape: serve the saved pipeline (`gl` is presumably the
# goldilox CLI). The recorded output shows gunicorn listening on
# http://127.0.0.1:5000.
!gl serve 'pipeline.pkl'

Go to the fastapi docs here: http://127.0.0.1:5000/docs
[2021-11-26 12:10:04 +0100] [39494] [INFO] Starting gunicorn 20.1.0
[2021-11-26 12:10:04 +0100] [39494] [INFO] Listening at: http://127.0.0.1:5000 (39494)
[2021-11-26 12:10:04 +0100] [39494] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2021-11-26 12:10:04 +0100] [39527] [INFO] Booting worker with pid: 39527
[2021-11-26 12:10:04 +0100] [39527] [INFO] Started server process [39527]
[2021-11-26 12:10:04 +0100] [39527] [INFO] Waiting for application startup.
[2021-11-26 12:10:04 +0100] [39527] [INFO] Application startup complete.
^C
[2021-11-26 12:10:21 +0100] [39494] [INFO] Handling signal: int
