In [48]:
import vaex
import warnings
from vaex.ml.datasets import load_iris_1e5
from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost.sklearn import XGBClassifier
from goldilox import Pipeline
import numpy as np
import json
from vaex.ml.sklearn import Predictor
warnings.filterwarnings('ignore')

# Load the iris dataset shipped with vaex.ml; the label column is 'class_'.
df = load_iris_1e5()
target = 'class_'

# Feature engineering example: ratio of petal length to petal width.
df['petal_ratio'] = df.petal_length / df.petal_width
features = [
    'petal_length',
    'petal_width',
    'sepal_length',
    'sepal_width',
    'petal_ratio',
]


## Modelling

In [49]:
# One vaex Predictor per gradient-boosting library. Each one writes its
# prediction into a column named after `prediction_name`.
lgm = Predictor(
    model=LGBMClassifier(verbosity=-1, objective='multiclass', num_class=3),
    features=features,
    target=target,
    prediction_name='lgm',
)

xgb = Predictor(
    model=XGBClassifier(verbosity=0, objective='multi:softmax', num_class=3),
    features=features,
    target=target,
    prediction_name='xgb',
)

cb = Predictor(
    model=CatBoostClassifier(verbose=0, iterations=10, objective='MultiClass'),
    features=features,
    target=target,
    prediction_name='cb',
)

# Fit each predictor in turn and add its prediction column to the DataFrame.
for model in (lgm, xgb, cb):
    model.fit(df)
    df = model.transform(df)

# The CatBoost prediction for a row is indexable rather than a plain scalar
# (presumably a length-1 array -- TODO confirm); keep only its first element.
df['cb'] = df['cb'].apply(lambda pred: pred[0])

### Crazy ensemble logic example
This is not efficient for big data, but it works for most use cases.

In [50]:
from itertools import combinations

def ensmble(lgm_result, xgb_result, cb_results):
    """Combine three model predictions for one row by majority vote.

    Pairs are checked in the order (lgm, xgb), (lgm, cb), (xgb, cb); the
    value shared by the first agreeing pair is returned.

    Args:
        lgm_result: prediction of the LightGBM model.
        xgb_result: prediction of the XGBoost model.
        cb_results: prediction of the CatBoost model.

    Returns:
        The value at least two models agree on, or ``lgm_result`` when all
        three predictions differ.
    """
    # Any custom voting logic could go here.
    votes = (lgm_result, xgb_result, cb_results)
    agreed = (first for first, second in combinations(votes, 2) if first == second)
    return next(agreed, lgm_result)

# Run the voting function over the three prediction columns.
df['prediction'] = df.apply(ensmble, arguments=[df['lgm'], df['xgb'], df['cb']])

In [51]:
# Add a human-readable label for the frontend.
df['label'] = df['prediction'].map({
    0: 'setosa',
    1: 'versicolor',
    2: 'virginica',
})
df

#,sepal_length,sepal_width,petal_length,petal_width,class_,petal_ratio,lgm,xgb,cb,prediction,label
0,5.9,3.0,4.2,1.5,1,2.8000000000000003,1,1,1,1,versicolor
1,6.1,3.0,4.6,1.4,1,3.2857142857142856,1,1,1,1,versicolor
2,6.6,2.9,4.6,1.3,1,3.538461538461538,1,1,1,1,versicolor
3,6.7,3.3,5.7,2.1,2,2.7142857142857144,2,2,2,2,virginica
4,5.5,4.2,1.4,0.2,0,6.999999999999999,0,0,0,0,setosa
...,...,...,...,...,...,...,...,...,...,...,...
100495,5.2,3.4,1.4,0.2,0,6.999999999999999,0,0,0,0,setosa
100496,5.1,3.8,1.6,0.2,0,8.0,0,0,0,0,setosa
100497,5.8,2.6,4.0,1.2,1,3.3333333333333335,1,1,1,1,versicolor
100498,5.7,3.8,1.7,0.3,0,5.666666666666667,0,0,0,0,setosa


## Deployment

In [53]:
from goldilox import Pipeline

# Build a goldilox pipeline from the vaex DataFrame.
pipeline = Pipeline.from_vaex(df)

# Good practice: check the pipeline validates before saving it.
assert pipeline.validate()

saved_path = pipeline.save('../tests/models/server.pkl')
print(f"Saved to: {saved_path}")

Saved to: ../tests/models/server.pkl


### Serve

In [54]:
print(f"Check out the docs: http://127.0.0.1:5000\n")
!gl serve ../tests/models/server.pkl

Check out the docs: http://127.0.0.1:5000

[2021-11-16 18:47:57 +0100] [74627] [INFO] Starting gunicorn 20.1.0
[2021-11-16 18:47:57 +0100] [74627] [INFO] Listening at: http://127.0.0.1:5000 (74627)
[2021-11-16 18:47:57 +0100] [74627] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2021-11-16 18:47:57 +0100] [74635] [INFO] Booting worker with pid: 74635
[2021-11-16 18:47:57 +0100] [74635] [INFO] Started server process [74635]
[2021-11-16 18:47:57 +0100] [74635] [INFO] Waiting for application startup.
[2021-11-16 18:47:57 +0100] [74635] [INFO] Application startup complete.
^C
[2021-11-16 18:48:40 +0100] [74627] [INFO] Handling signal: int
[2021-11-16 18:48:40 +0100] [74627] [INFO] Shutting down: Master
