# Get data

## Vaex

In [27]:
import vaex
from vaex.ml.sklearn import Predictor
from sklearn.cluster import KMeans

# Load the dataset returned by vaex.example().
df = vaex.example()
# Cluster on every column except the row identifier. The name is compared
# exactly: the previous regex '[^id]' is a character class matched against the
# start of each name, so it would also drop any column that merely begins with
# 'i' or 'd' (and only excluded 'id' by coincidence).
features = [name for name in df.get_column_names() if name != 'id']
kmeans = Predictor(model=KMeans(), features=features, prediction_name='cluster')
kmeans.fit(df)
# transform() yields a dataframe that also carries the 'cluster' prediction column.
df = kmeans.transform(df)
df.head(2)

#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,cluster
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,6
1,23,-0.163701,3.65422,-0.254906,-195.0,170.472,142.53,-124248,890.241,684.668,-1.70867,1


Let's build a production pipeline

In [28]:
from goldilox import Pipeline
import json

# Build a goldilox Pipeline from the vaex dataframe (which now includes the
# 'cluster' prediction) and fail fast if it does not validate.
pipeline = Pipeline.from_vaex(df)
assert pipeline.validate()

# I/O Example: print the pipeline's stored raw record as JSON, then run
# inference on that same record so input and output can be compared.
raw = pipeline.raw
print(f"predict for {json.dumps(raw, indent=4)}")
pipeline.inference(raw)

predict for {
    "id": 0,
    "x": 1.2318683862686157,
    "y": -0.39692866802215576,
    "z": -0.598057746887207,
    "vx": 301.1552734375,
    "vy": 174.05947875976562,
    "vz": 27.42754554748535,
    "E": -149431.40625,
    "L": 407.38897705078125,
    "Lz": 333.9555358886719,
    "FeH": -1.0053852796554565
}


#,id,x,y,z,vx,vy,vz,E,L,Lz,FeH,cluster
0,0,1.23187,-0.396929,-0.598058,301.155,174.059,27.4275,-149431,407.389,333.956,-1.00539,6


## Sklearn

In [29]:
import pandas as pd
from sklearn.datasets import load_iris

# Load the iris dataset into a DataFrame, one column per feature name
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [20]:
from sklearn.cluster import KMeans
from goldilox import Pipeline

# Wrap a default KMeans estimator in a goldilox pipeline, then fit it on the
# iris features; the fitted result is what the name `pipeline` ends up bound to.
pipeline = Pipeline.from_sklearn(KMeans())
pipeline = pipeline.fit(df)
assert pipeline.validate()




The pipeline is ready to go, but it does not handle missing values, which we might encounter in production.   
Let's fix that!

In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import sklearn.pipeline 
# Mean-impute every feature column ahead of clustering so that records with
# missing values can still be scored.
imputer = ColumnTransformer(
    transformers=[('features_mean', SimpleImputer(strategy='mean'), df.columns)],
    remainder='passthrough',
)
sklearn_pipeline = sklearn.pipeline.Pipeline(
    steps=[
        ('imputer', imputer),
        ('clustering', KMeans()),
    ]
)
pipeline = Pipeline.from_sklearn(sklearn_pipeline).fit(df)
assert pipeline.validate()

# I/O Example: show the stored raw record, then the prediction for it.
raw = pipeline.raw
print(f"predict for {json.dumps(raw, indent=4)}")
pipeline.inference(raw)

predict for {
    "sepal length (cm)": 5.1,
    "sepal width (cm)": 3.5,
    "petal length (cm)": 1.4,
    "petal width (cm)": 0.2
}


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),prediction
0,5.1,3.5,1.4,0.2,4


In [7]:
# Persist the fitted pipeline to 'pipeline.pkl' so it can be served from the
# command line below.
pipeline.save('pipeline.pkl')
print('Go to the fastapi docs here: http://127.0.0.1:5000/docs')
# Start the server for the saved pipeline (`gl` is presumably the goldilox
# CLI — confirm). This blocks the cell until interrupted; the recorded run
# was stopped with Ctrl-C.
!gl serve 'pipeline.pkl'

Go to the fastapi docs here: http://127.0.0.1:5000/docs
[2021-11-26 12:10:04 +0100] [39494] [INFO] Starting gunicorn 20.1.0
[2021-11-26 12:10:04 +0100] [39494] [INFO] Listening at: http://127.0.0.1:5000 (39494)
[2021-11-26 12:10:04 +0100] [39494] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2021-11-26 12:10:04 +0100] [39527] [INFO] Booting worker with pid: 39527
[2021-11-26 12:10:04 +0100] [39527] [INFO] Started server process [39527]
[2021-11-26 12:10:04 +0100] [39527] [INFO] Waiting for application startup.
[2021-11-26 12:10:04 +0100] [39527] [INFO] Application startup complete.
^C
[2021-11-26 12:10:21 +0100] [39494] [INFO] Handling signal: int
