In [2]:
import vaex

from goldilox.datasets import load_iris

# load_iris() hands back the data plus the feature and target names;
# the frame is converted to vaex so later column assignments stay on the dataframe.
iris_frame, features, target = load_iris()
df = vaex.from_pandas(iris_frame)

df.head(2)

#,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


## Feature engineering
Here we can do any feature engineering we want, knowing it will work in the pipeline automatically.

In [3]:
from vaex.ml.transformations import StandardScaler

# Derived feature, added as a dataframe column alongside the raw measurements.
df['petal_ratio'] = df['petal_length'] / df['petal_width']

features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width', 'petal_ratio']
for name in features:
    column = df[name]
    # Missing values are replaced with the mean of that column as computed on this data.
    df[name] = column.fillna(column.mean())

# prefix='' keeps the original column names, so the scaled values replace the raw ones.
scaler = StandardScaler(features=features, prefix='')
df = scaler.fit_transform(df)
df.head(2)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio
0,-0.900681,1.019,-1.34023,-1.31544,0,1.08389
1,-1.14302,-0.131979,-1.34023,-1.31544,0,1.08389


## Modeling 
Let's build two models. In production we can query any of them we want.   
* This is great if you have an experimental model you want to evaluate.

### LightGBM

In [4]:
from vaex.ml.lightgbm import LightGBMModel

# Three-class LightGBM model; transform() adds its output as the 'lgbm' column
# (one array of three class scores per row, as the preview below shows).
lgbm_params = {
    'verbosity': -1,
    'objective': 'multiclass',
    'num_class': 3,
}
booster = LightGBMModel(
    features=features,
    target=target,
    prediction_name='lgbm',
    num_boost_round=500,
    params=lgbm_params,
)
booster.fit(df)
df = booster.transform(df)
df.head(2)



#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,lgbm
0,-0.900681,1.019,-1.34023,-1.31544,0,1.08389,"'array([9.99999924e-01, 7.50109907e-08, 5.347495..."
1,-1.14302,-0.131979,-1.34023,-1.31544,0,1.08389,"'array([9.99998853e-01, 1.05726411e-06, 8.970361..."


### XGBoost

In [5]:
from vaex.ml.xgboost import XGBoostModel

# Three-class XGBoost model; transform() adds its output as the 'xgb' column
# (a single class id per row, as the preview below shows).
# Note: this rebinds `booster`, so from here on the name refers to the XGBoost model.
xgb_params = {
    "eval_metric": "mlogloss",
    "objective": "multi:softmax",
    "num_class": 3,
}
booster = XGBoostModel(
    features=features,
    target=target,
    prediction_name="xgb",
    num_boost_round=500,
    params=xgb_params,
)
booster.fit(df)
df = booster.transform(df)
df.head(2)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,lgbm,xgb
0,-0.900681,1.019,-1.34023,-1.31544,0,1.08389,"'array([9.99999924e-01, 7.50109907e-08, 5.347495...",0
1,-1.14302,-0.131979,-1.34023,-1.31544,0,1.08389,"'array([9.99998853e-01, 1.05726411e-06, 8.970361...",0


## Post model processing
We can do more processing, making sure the frontend/backend is happy with the response.
* This option can save a lot of friction within a team.

In [6]:
import numpy as np
import vaex


@vaex.register_function()
def argmax(ar, axis=1):
    """Return the index of the largest value of `ar` along `axis`.

    With the default axis=1 and a 2-D input this yields one index per row.
    """
    winning_index = np.argmax(ar, axis=axis)
    return winning_index


# Attach the function to this dataframe, then take the arg-max of the LightGBM
# output as the predicted class id.
df.add_function('argmax', argmax)
df['prediction'] = df['lgbm'].argmax()

# Human-readable species name for each class id.
label_by_class = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df['label'] = df['prediction'].map(label_by_class)
df.head(2)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,lgbm,xgb,prediction,label
0,-0.900681,1.019,-1.34023,-1.31544,0,1.08389,"'array([9.99999924e-01, 7.50109907e-08, 5.347495...",0,0,setosa
1,-1.14302,-0.131979,-1.34023,-1.31544,0,1.08389,"'array([9.99998853e-01, 1.05726411e-06, 8.970361...",0,0,setosa


# Explainability
Let's say we want to know, as part of our product, why we made one decision or another.   
Check out [SHAP](https://github.com/slundberg/shap) for that.
* Great for communication - add it to your models as "metadata" on the prediction.

In [7]:
import shap
import pyarrow as pa
import pandas as pd

# NOTE(review): `booster` is whichever model was fitted last -- the XGBoost model in this
# notebook -- while the `prediction` column passed to `explain` is derived from the
# LightGBM output. Confirm that explaining one model with the other's class is intended.
explainer = shap.TreeExplainer(booster.booster)
targets = df[target].unique()
feature_count = len(features)


@vaex.register_function(on_expression=False)
def explain(*columns):
    """Return, per row, the SHAP effect of each feature for that row's predicted class.

    Args:
        *columns: the feature columns in `features` order, followed by the
            predicted-class column, which must be last.

    Returns:
        A pyarrow array holding one ``{feature_name: effect}`` dict per row. A feature
        that is absent from the force-plot data for a row is absent from that row's dict.
    """
    data = np.array(columns).T
    X, y = pd.DataFrame(data[:, :feature_count], columns=features), data[:, -1]
    shap_values = explainer.shap_values(X)
    explanation = []

    for i, c in enumerate(y):
        c = int(c)  # the class id arrives as a float because all columns share one array
        e = shap.force_plot(explainer.expected_value[c], shap_values[c][i, :], X.iloc[i]).data
        names = e['featureNames']
        # `e['features']` is keyed by feature position and leaves out zero-effect features,
        # so the name is looked up by key. Zipping it against `featureNames` would attach
        # effects to the wrong feature names whenever an earlier feature is missing.
        explanation.append(
            {names[int(position)]: effects['effect'] for position, effects in e['features'].items()})
    return pa.array(explanation)


df.add_function('explain', explain)
explain_columns = features + ['prediction']
df['explanation'] = df.func.explain(*explain_columns)
df.head(2)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,lgbm,xgb,prediction,label,explanation
0,-0.900681,1.019,-1.34023,-1.31544,0,1.08389,"'array([9.99999924e-01, 7.50109907e-08, 5.347495...",0,0,setosa,{'petal_length': 3.226044178009033}
1,-1.14302,-0.131979,-1.34023,-1.31544,0,1.08389,"'array([9.99998853e-01, 1.05726411e-06, 8.970361...",0,0,setosa,{'petal_length': 3.226044178009033}


# Confidence
No matter how good your model is, it won't do as well if it meets data that it doesn't know.  
Knowing how similar the data in production is to what you trained on lets you build safeguards into your apps. 

* Here we will use a simple nearest neighbours model, and normalize the results.
* If you get very "weird" data, it will be far from normal.

In [8]:
from sklearn.neighbors import KDTree
from vaex.ml.transformations import MinMaxScaler

# Nearest-neighbour index over the (scaled) training features.
model = KDTree(df[features], leaf_size=2)


@vaex.register_function(on_expression=False)
def dist(*columns):
    """Score each row by closeness to the indexed data: 1 - mean distance to its 5 nearest neighbours.

    Args:
        *columns: one array per feature, in the order used to build `model`.

    Returns:
        One score per row; larger means closer to the indexed points.
    """
    points = np.array(columns).T
    neighbour_distances, _ = model.query(points, k=5)
    return 1 - neighbour_distances.mean(axis=1)


df.add_function("dist", dist)
df['distance'] = df.func.dist(*features)
# Rescale the score with the min/max seen on this (training) data.
df = MinMaxScaler(features=['distance'], prefix='').fit_transform(df)
# `dist` returns 1 - mean neighbour distance, so a HIGH score means the row sits close to
# the training data. Only the low end therefore signals unfamiliar input; there is no
# upper bound, since scores near (or above) 1 are the most typical rows, not outliers.
df['outlier'] = df.func.where(df.distance > 0.01, "ok", "outlier")
df.head(2)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,lgbm,xgb,prediction,label,explanation,distance,outlier
0,-0.900681,1.019,-1.34023,-1.31544,0,1.08389,"'array([9.99999924e-01, 7.50109907e-08, 5.347495...",0,0,setosa,{'petal_length': 3.226044178009033},0.98279,ok
1,-1.14302,-0.131979,-1.34023,-1.31544,0,1.08389,"'array([9.99998853e-01, 1.05726411e-06, 8.970361...",0,0,setosa,{'petal_length': 3.226044178009033},0.902893,ok


In [9]:
from goldilox import Pipeline

# Build a goldilox Pipeline from the dataframe produced by the cells above.
pipeline = Pipeline.from_vaex(df)
# Smoke test: run inference on `pipeline.raw`.
# NOTE(review): `pipeline.raw` looks like a stored sample input row (one row comes back) -- confirm.
pipeline.inference(pipeline.raw)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,lgbm,xgb,prediction,label,explanation,distance,outlier
0,-0.900681,1.019,-1.34023,-1.31544,0,1.08389,"'array([9.99999924e-01, 7.50109907e-08, 5.347495...",0,0,setosa,{'petal_length': 3.226044178009033},0.98279,ok


In [10]:
# Query with a deliberately odd row: `sepal_width` is missing and the other values are
# far from the sample rows shown earlier. The result below is labelled 'outlier'.
pipeline.inference({"sepal_length": 100,
                    "sepal_width": None,
                    "petal_length": -5,
                    "petal_width": 5})

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,lgbm,xgb,prediction,label,explanation,distance,outlier
0,114.088,0,-4.97782,5.0029,--,-2.14018,"array([0.36681711, 0.00107121, 0.63211168])",0,2,virginica,"""{'petal_length': -2.0075998306274414, 'petal_ra...",-68.2714,outlier


### Great features
* simple and fast
* validate missing values
* validate serialization
* variables 
* description
* example

In [11]:
from sklearn.metrics import accuracy_score

# Let's say test is new data.
# Only the frame (first item returned by load_iris) is kept: unpacking into `features`
# and `target` again would overwrite the notebook-wide `features` list, which the
# feature-engineering cell replaced with a version that includes 'petal_ratio'.
test = vaex.from_pandas(load_iris()[0])

predictions = pipeline.inference(test)
predictions.head(2)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,lgbm,xgb,prediction,label,explanation,distance,outlier
0,-0.900681,1.019,-1.34023,-1.31544,0,1.08389,"'array([9.99999924e-01, 7.50109907e-08, 5.347495...",0,0,setosa,{'petal_length': 3.226044178009033},0.98279,ok
1,-1.14302,-0.131979,-1.34023,-1.31544,0,1.08389,"'array([9.99998853e-01, 1.05726411e-06, 8.970361...",0,0,setosa,{'petal_length': 3.226044178009033},0.902893,ok


In [28]:
# Metadata stored with the pipeline.
pipeline.set_variable('branch', "demo")
pipeline.set_variable('version', "v1")
pipeline.set_variable('xgboost_params', booster.params)

# Accuracy of the LightGBM-derived `prediction` column against `target`
# on the frame scored in the previous cell.
predicted_classes = predictions['prediction'].values
actual_classes = predictions['target'].values
pipeline.set_variable('training_accuracy', float(accuracy_score(predicted_classes, actual_classes)))

pipeline.set_variable('confidance', "when donfidance close to 1 is good, 0 or less is new data")
pipeline.set_variable('description', "I did demo and I liked it")
pipeline.save('pipeline.pkl')

'pipeline.pkl'

In [None]:
# Serve the saved pipeline over HTTP; the log below shows it listening on 127.0.0.1:8000.
# NOTE(review): this appears to run in the foreground and keep the cell busy -- confirm.
!glx serve pipeline.pkl

[2022-01-11 20:29:50 +0200] [3971] [INFO] Starting gunicorn 20.1.0
[2022-01-11 20:29:50 +0200] [3971] [INFO] Listening at: http://127.0.0.1:8000 (3971)
[2022-01-11 20:29:50 +0200] [3971] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2022-01-11 20:29:50 +0200] [3976] [INFO] Booting worker with pid: 3976
[2022-01-11 20:29:50 +0200] [3976] [INFO] Started server process [3976]
[2022-01-11 20:29:50 +0200] [3976] [INFO] Waiting for application startup.
[2022-01-11 20:29:50 +0200] [3976] [INFO] Application startup complete.
[2022-01-11 20:44:18 +0200] [3971] [CRITICAL] WORKER TIMEOUT (pid:3976)
[2022-01-11 20:44:18 +0200] [4192] [INFO] Booting worker with pid: 4192
[2022-01-11 20:44:18 +0200] [4192] [INFO] Started server process [4192]
[2022-01-11 20:44:18 +0200] [4192] [INFO] Waiting for application startup.
[2022-01-11 20:44:18 +0200] [4192] [INFO] Application startup complete.


# Cloud?

In [23]:
from IPython.display import HTML

# An .mp4 is a video, which most browsers will not render inside an <img> tag,
# so it is embedded with a <video> element (with playback controls) instead.
HTML('<video src="../assets/goldilox_platform.mp4" controls></video>')

In [19]:
%%bash
# The API key is read from the GOLDILOX_API_KEY environment variable so the secret is
# never stored in the notebook; the cell fails with a clear message if it is not set.
curl -X POST 'https://75bd7e73-f285-4d50-b0f9-a6b915cb76cd.ai.goldilox.cloud/inference' \
	-H "x-api-key: ${GOLDILOX_API_KEY:?set GOLDILOX_API_KEY before running this cell}" \
	-d '[{"sepal_length":6.7,"sepal_width":3.1,"petal_length":5.6,"petal_width":2.4,"target":2}]'

[{"sepal_length":1.0380047568006336,"sepal_width":0.09821728693702417,"petal_length":1.0469454037485713,"petal_width":1.5804637593788793,"target":2,"petal_ratio":-0.7968154346110106,"lgbm":[5.144535866516098e-09,9.126372971557861e-09,0.9999999857290911],"xgb":2.0,"prediction":2,"label":"virginica","explanation":{"petal_length":2.3492918014526367,"petal_ratio":0.7434390783309937,"petal_width":1.508973240852356,"sepal_length":0.591556966304779,"sepal_width":-0.5662230253219604},"distance":0.9339118676939868,"outlier":"ok"}]

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   615  100   527  100    88    408     68  0:00:01  0:00:01 --:--:--   480


In [23]:
%%bash
# The API key is read from the GOLDILOX_API_KEY environment variable so the secret is
# never stored in the notebook; the cell fails with a clear message if it is not set.
curl -X GET 'https://75bd7e73-f285-4d50-b0f9-a6b915cb76cd.ai.goldilox.cloud/variables' \
	-H "x-api-key: ${GOLDILOX_API_KEY:?set GOLDILOX_API_KEY before running this cell}"

{"map_choices":[null,"setosa","versicolor","virginica"],"branch":"demo","version":"v1","training_accuracy":1.0,"confidance":"when donfidance close to 1 is good, 0 or less is new data"}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   184  100   184    0     0    475      0 --:--:-- --:--:-- --:--:--   486


In [24]:
%%bash
# The API key is read from the GOLDILOX_API_KEY environment variable so the secret is
# never stored in the notebook; the cell fails with a clear message if it is not set.
curl -X GET 'https://75bd7e73-f285-4d50-b0f9-a6b915cb76cd.ai.goldilox.cloud/description' \
	-H "x-api-key: ${GOLDILOX_API_KEY:?set GOLDILOX_API_KEY before running this cell}"

"I did demo and I liked it"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100    27  100    27    0     0     62      0 --:--:-- --:--:-- --:--:--    63


In [25]:
%%bash
# The API key is read from the GOLDILOX_API_KEY environment variable so the secret is
# never stored in the notebook; the cell fails with a clear message if it is not set.
curl -X GET 'https://75bd7e73-f285-4d50-b0f9-a6b915cb76cd.ai.goldilox.cloud/example' \
	-H "x-api-key: ${GOLDILOX_API_KEY:?set GOLDILOX_API_KEY before running this cell}"

[{"sepal_length":1.0380047568006336,"sepal_width":0.09821728693702417,"petal_length":1.0469454037485713,"petal_width":1.5804637593788793,"target":2,"petal_ratio":-0.7968154346110106,"lgbm":[5.144535866516098e-09,9.126372971557861e-09,0.9999999857290911],"xgb":2.0,"prediction":2,"label":"virginica","explanation":{"petal_length":2.3492918014526367,"petal_ratio":0.7434390783309937,"petal_width":1.508973240852356,"sepal_length":0.591556966304779,"sepal_width":-0.5662230253219604},"distance":0.9339118676939868,"outlier":"ok"}]

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   527  100   527    0     0    438      0  0:00:01  0:00:01 --:--:--   443100   527  100   527    0     0    438      0  0:00:01  0:00:01 --:--:--   443


# Raw -> Production

* [x] Classification
* [x] Regression
* [x] Clustering
* [x] Nearest neighbors
* [x] Recommendation systems
* [x] Explainability
* [x] Ensembles
* [x] Domain logic

In [27]:
from IPython.display import HTML

# Render the GIF from the assets folder inline as the cell output.
HTML('<img src="../assets/mind_blown.gif">')