In [1]:
import sys
# Add the parent directory to the module search path; presumably so the local
# `goldilox` checkout is importable from this notebook -- TODO confirm.
sys.path.append("..")

### Dev

In [3]:
import vaex
import warnings
from vaex.ml.datasets import load_iris
from vaex.ml.lightgbm import LightGBMModel
from goldilox import Pipeline
import numpy as np
import json

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')


df = load_iris()
target = 'class_'

# Feature engineering example: a derived column built from two raw columns.
df['petal_ratio'] = df['petal_length'] / df['petal_width']

# Model inputs and LightGBM settings, named separately from the constructor call.
features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width', 'petal_ratio']
lgbm_params = {'verbosity': -1, 'objective': 'multiclass', 'num_class': 3}

booster = LightGBMModel(
    features=features,
    target=target,
    prediction_name='predictions',
    num_boost_round=500,
    params=lgbm_params,
)
booster.fit(df)
# Adds the model output to the dataframe under the 'predictions' column.
df = booster.transform(df)
# post model processing example
@vaex.register_function()
def argmax(ar, axis=1):
    """Return the indices of the maximum values of ``ar`` along ``axis``.

    Thin wrapper around ``numpy.argmax``; ``axis`` defaults to 1.
    """
    winners = np.argmax(ar, axis=axis)
    return winners

# Make `argmax` available on this dataframe, then derive the predicted class
# index and a human-readable label from the model output.
df.add_function('argmax', argmax)
class_names = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df['prediction'] = df['predictions'].argmax()
df['label'] = df['prediction'].map(class_names)

pipeline = Pipeline.from_vaex(df)
# (optional) we don't expect to get the class_ in queries
pipeline.raw.pop(target)
assert pipeline.validate()


# Vaex solution

## Simple

In [11]:
import vaex
import warnings
from vaex.ml.datasets import load_iris_1e5
from vaex.ml.lightgbm import LightGBMModel
from goldilox import Pipeline
import numpy as np
import json

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')


df = load_iris_1e5()
target = 'class_'

# Feature engineering example: a derived column built from two raw columns.
df['petal_ratio'] = df['petal_length'] / df['petal_width']

# Model inputs and LightGBM settings, named separately from the constructor call.
features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width', 'petal_ratio']
lgbm_params = {'verbosity': -1, 'objective': 'multiclass', 'num_class': 3}

booster = LightGBMModel(
    features=features,
    target=target,
    prediction_name='predictions',
    num_boost_round=500,
    params=lgbm_params,
)
booster.fit(df)
# Adds the model output to the dataframe under the 'predictions' column.
df = booster.transform(df)

# post model processing example
@vaex.register_function()
def argmax(ar, axis=1):
    """Return the indices of the maximum values of ``ar`` along ``axis``.

    Thin wrapper around ``numpy.argmax``; ``axis`` defaults to 1.
    """
    winners = np.argmax(ar, axis=axis)
    return winners

# Make `argmax` available on this dataframe, then derive the predicted class
# index and a human-readable label from the model output.
df.add_function('argmax', argmax)
class_names = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df['prediction'] = df['predictions'].argmax()
df['label'] = df['prediction'].map(class_names)

pipeline = Pipeline.from_vaex(df, description='simple lightGBM')
pipeline.variables['var'] = '1'
# (optional) we don't expect to get the class_ in queries
pipeline.raw.pop(target)
assert pipeline.validate()
df.head(2)

#,sepal_length,sepal_width,petal_length,petal_width,class_,petal_ratio,predictions,prediction,label
0,5.9,3,4.2,1.5,1,2.8,"'array([9.24152357e-10, 9.99999998e-01, 9.388205...",1,versicolor
1,6.1,3,4.6,1.4,1,3.28571,"'array([3.48585748e-09, 9.99999995e-01, 1.845006...",1,versicolor


### Serve

In [16]:
# Show the example input the pipeline stores, then the records produced by
# running inference on that same example.
raw_example = pipeline.raw
print("Pipeline raw data example:")
print(json.dumps(raw_example, indent=4))
print('')
print("Pipeline output example:")
pipeline.inference(raw_example).to_records()

Pipeline raw data example:
{
    "sepal_length": 5.9,
    "sepal_width": 3.0,
    "petal_length": 4.2,
    "petal_width": 1.5
}

Pipeline output example:


[{'sepal_length': 5.9,
  'sepal_width': 3.0,
  'petal_length': 4.2,
  'petal_width': 1.5,
  'class_': None,
  'petal_ratio': 2.8000000000000003,
  'predictions': [9.241523574551371e-10,
   0.9999999981370271,
   9.388205713472021e-10],
  'prediction': 1,
  'label': 'versicolor'}]

In [24]:
# Save the pipeline to disk and print whatever `save` returns (the saved path in the output below).
print(f"Saved to: {pipeline.save('../tests/models/server.pkl')}")
# The bare string below is never executed; it only documents the equivalent
# way of starting the server from Python instead of the command line.
"""
# From python directly

from goldilox.app import Server
Server('../tests/models/server.pkl', options={'port':5000}).serve()
"""
# Serve the saved pipeline with the `gl` command-line tool. The cell stays busy
# while the server runs; interrupt the kernel to stop it.
!gl serve ../tests/models/server.pkl

Saved to: ../tests/models/server.pkl
[2021-11-16 16:44:55 +0100] [69656] [INFO] Starting gunicorn 20.1.0
[2021-11-16 16:44:55 +0100] [69656] [INFO] Listening at: http://127.0.0.1:5000 (69656)
[2021-11-16 16:44:55 +0100] [69656] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2021-11-16 16:44:55 +0100] [69662] [INFO] Booting worker with pid: 69662
[2021-11-16 16:44:55 +0100] [69662] [INFO] Started server process [69662]
[2021-11-16 16:44:55 +0100] [69662] [INFO] Waiting for application startup.
[2021-11-16 16:44:55 +0100] [69662] [INFO] Application startup complete.
^C
[2021-11-16 16:45:00 +0100] [69656] [INFO] Handling signal: int
[2021-11-16 16:45:01 +0100] [69656] [INFO] Shutting down: Master




ExceptionPexpect: isalive() encountered condition where "terminated" is 0, but there was no child process. Did someone else call waitpid() on our process?

## Advanced

Here we first run a train/test split experiment and save the results.    
Next, we train the model on the entire dataset, adding the evaluation as a variable so we can recall how good the model was.

We put everything inside a function which receives a DataFrame and returns a Vaex DataFrame, so we can redo it when we get more data.   

* This way we can change the pipeline training and outputs without changing our infrastructure at all.

In [29]:
from vaex.ml.datasets import load_iris

def fit(df):
    """Evaluate a LightGBM iris classifier on a hold-out split, then retrain on all of ``df``.

    Steps:
      1. Split ``df`` into train/test parts (``test_size=0.2``), fit a model on
         the train part only and measure its accuracy on the test part.
      2. Fit a fresh model on the whole of ``df`` and add the ``predictions``,
         ``prediction`` and ``label`` columns.
      3. Store the hold-out accuracy in ``df.variables['accuracy']``.

    Parameters
    ----------
    df : vaex DataFrame
        Must provide the four iris feature columns and the ``class_`` target.

    Returns
    -------
    vaex DataFrame
        The transformed dataframe produced by the model trained on all of ``df``.
    """
    # Imports are kept local; presumably so the function is self-contained when
    # it is stored alongside the pipeline -- TODO confirm.
    import vaex
    import numpy as np
    from vaex.ml.lightgbm import LightGBMModel
    from sklearn.metrics import accuracy_score
    from goldilox import Pipeline

    train, test = df.ml.train_test_split(test_size=0.2, verbose=False)

    features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']
    target = 'class_'

    def _new_booster():
        # Build an unfitted model; a fresh params dict is created on every call
        # so the evaluation model and the final model share no state.
        return LightGBMModel(features=features,
                             target=target,
                             prediction_name='predictions',
                             num_boost_round=500, params={'verbose': -1,
                                                          'objective': 'multiclass',
                                                          'num_class': 3})

    @vaex.register_function()
    def argmax(ar, axis=1):
        return np.argmax(ar, axis=axis)

    # --- Hold-out experiment -------------------------------------------------
    # Fit on the train part ONLY. Fitting on `df` here would let the model see
    # the test rows and make the reported accuracy meaningless.
    booster = _new_booster()
    booster.fit(train)

    train = booster.transform(train)
    train.add_function('argmax', argmax)
    train['prediction'] = train['predictions'].argmax()

    # Using the pipeline is a convenient way to get predictions on a new dataset.
    # This is very helpful if we did many feature engineering transformations.
    pipeline = Pipeline.from_vaex(train)
    predicted = pipeline.inference(test[features])['prediction'].values
    accuracy = accuracy_score(test[target].values, predicted)

    # --- Final model -----------------------------------------------------------
    # Re-train on the entire dataset
    booster = _new_booster()
    booster.fit(df)
    df = booster.transform(df)
    df.add_function('argmax', argmax)
    df['prediction'] = df['predictions'].argmax()
    # The 'label' is to help the Frontend app to understand what actually was the result
    df['label'] = df['prediction'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
    # Keep the hold-out accuracy with the dataframe so it can be recalled later.
    df.variables['accuracy'] = accuracy
    return df

df = load_iris()
# Build the pipeline with `fit` attached, then run the fit on the data.
pipeline = Pipeline.from_vaex(df, fit=fit)
pipeline = pipeline.fit(df)
pipeline.validate()

True

### Re-train

In [22]:
from vaex.ml.datasets import load_iris_1e5
# Load the `load_iris_1e5` variant of the iris data to stand in for "more data".
df = load_iris_1e5()
# Fit the existing pipeline again on the new dataframe; presumably this re-runs
# the `fit` function the pipeline was created with -- TODO confirm.
pipeline.fit(df)
pipeline.validate()

True

### Persistence

In [23]:
from tempfile import TemporaryDirectory

# Keep a reference to the TemporaryDirectory object. Calling
# `TemporaryDirectory().name` directly drops the only reference at once, and the
# directory is removed when the object is finalized, so the path would point
# into a directory that no longer exists.
tmp_dir = TemporaryDirectory()
path = tmp_dir.name + '/model.pkl'

print("Pipeline raw data example:")
print(json.dumps(pipeline.raw, indent=4))
# Round-trip: save the pipeline, load it back from the returned path, and run
# inference on its stored example to check the reloaded pipeline works.
pipeline = Pipeline.from_file(pipeline.save(path))
pipeline.inference(pipeline.raw)

Pipeline raw data example:
{
    "sepal_length": 5.9,
    "sepal_width": 3.0,
    "petal_length": 4.2,
    "petal_width": 1.5,
    "class_": 1
}


#,sepal_length,sepal_width,petal_length,petal_width,class_,predictions,prediction,label
0,5.9,3,4.2,1.5,1,"'array([5.20356629e-09, 9.99999994e-01, 1.033663...",1,versicolor


# Serve