In [1]:
import sys
# Put the parent directory on the import path so that `goldilox` is imported
# from the local checkout (the traceback further down shows it loading from
# "../goldilox/...").
sys.path.append("..")

### Dev

In [3]:
import vaex
import warnings
from vaex.ml.datasets import load_iris
from vaex.ml.lightgbm import LightGBMModel
from goldilox import Pipeline
import numpy as np
import json

# Silence all warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# 'class_' is the column the model is trained to predict.
target = 'class_'
df = load_iris()

# feature engineering example: petal length relative to petal width
df['petal_ratio'] = df['petal_length'] / df['petal_width']

# Columns fed to the model, including the engineered ratio.
feature_names = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width', 'petal_ratio']
# Three-class objective, with LightGBM's own logging turned off.
lightgbm_params = {'verbosity': -1,
                   'objective': 'multiclass',
                   'num_class': 3}

booster = LightGBMModel(
    features=feature_names,
    target=target,
    prediction_name='predictions',
    num_boost_round=500,
    params=lightgbm_params,
)
booster.fit(df)
# The transformed frame carries the model output in the 'predictions' column.
df = booster.transform(df)

# post model processing example
@vaex.register_function()
def argmax(ar, axis=1):
    """Return the index of the largest value of ``ar`` along ``axis``.

    With the default ``axis=1`` a 2-D input yields one index per row.
    """
    winning_index = np.argmax(ar, axis=axis)
    return winning_index

# Attach argmax to this DataFrame under the name 'argmax'.
df.add_function('argmax', argmax)

# Reduce the per-class model output to a single predicted class index.
df['prediction'] = df['predictions'].argmax()

# Human-readable name for every class index.
class_names = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df['label'] = df['prediction'].map(class_names)

pipeline = Pipeline.from_vaex(df)
# (optional) we don't expect to get the class_ in queries
pipeline.raw.pop(target)
is_valid = pipeline.validate()
assert is_valid


# Vaex solution

## Simple

In [4]:
import vaex
import warnings
from vaex.ml.datasets import load_iris_1e5
from vaex.ml.lightgbm import LightGBMModel
from goldilox import Pipeline
import numpy as np
import json

# Silence all warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# 'class_' is the column the model is trained to predict.
target = 'class_'
df = load_iris_1e5()

# feature engineering example: petal length relative to petal width
df['petal_ratio'] = df['petal_length'] / df['petal_width']

# Columns fed to the model, including the engineered ratio.
model_features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width', 'petal_ratio']
# Three-class objective, with LightGBM's own logging turned off.
model_params = {'verbosity': -1,
                'objective': 'multiclass',
                'num_class': 3}

booster = LightGBMModel(
    features=model_features,
    target=target,
    prediction_name='predictions',
    num_boost_round=500,
    params=model_params,
)
booster.fit(df)
# The transformed frame carries the model output in the 'predictions' column.
df = booster.transform(df)

# post model processing example
@vaex.register_function()
def argmax(ar, axis=1):
    """Index of the maximum of ``ar`` along ``axis`` (one index per row by default)."""
    best = np.argmax(ar, axis=axis)
    return best

# Attach argmax to this DataFrame under the name 'argmax'.
df.add_function('argmax', argmax)

# Reduce the per-class model output to a single predicted class index.
df['prediction'] = df['predictions'].argmax()

# Human-readable name for every class index.
label_by_class = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df['label'] = df['prediction'].map(label_by_class)

pipeline = Pipeline.from_vaex(df)
# (optional) we don't expect to get the class_ in queries
pipeline.raw.pop(target)
is_valid = pipeline.validate()
assert is_valid
# Show the first two rows, including the new prediction columns.
df.head(2)

#,sepal_length,sepal_width,petal_length,petal_width,class_,petal_ratio,predictions,prediction,label
0,5.9,3,4.2,1.5,1,2.8,"'array([9.24152357e-10, 9.99999998e-01, 9.388205...",1,versicolor
1,6.1,3,4.6,1.4,1,3.28571,"'array([3.48585748e-09, 9.99999995e-01, 1.845006...",1,versicolor


### Serve

In [5]:
# Save the pipeline to the file that the server cell below loads.
path = '../tests/models/server_pipeline.pkl'
pipeline.save(path)

# Print the example raw input stored on the pipeline, then run it through inference.
print("Pipeline raw data example:")
raw_as_json = json.dumps(pipeline.raw, indent=4)
print(raw_as_json)
inferred = pipeline.inference(pipeline.raw)
inferred.to_records()

Pipeline raw data example:
{
    "sepal_length": 5.9,
    "sepal_width": 3.0,
    "petal_length": 4.2,
    "petal_width": 1.5
}


In [2]:
import sys
sys.path.append("..")
from goldilox.server import Server

# Pipeline file written by the "Serve" cell above.
path = '../tests/models/server_pipeline.pkl'
print("go to http://127.0.0.1:8000/docs")
# NOTE: judging by the log output below, serve() keeps running until the
# kernel is interrupted.
server = Server(path)
server.serve()


go to http://127.0.0.1:8000/docs


[2021-11-16 15:04:17 +0100] [62396] [INFO] Starting gunicorn 20.1.0
[2021-11-16 15:04:17 +0100] [62396] [INFO] Listening at: http://127.0.0.1:8000 (62396)
[2021-11-16 15:04:17 +0100] [62396] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2021-11-16 15:04:17 +0100] [62415] [INFO] Booting worker with pid: 62415
[2021-11-16 15:04:17 +0100] [62415] [INFO] Started server process [62415]
[2021-11-16 15:04:17 +0100] [62415] [INFO] Waiting for application startup.
[2021-11-16 15:04:17 +0100] [62415] [INFO] Application startup complete.
[2021-11-16 15:25:57 +0100] [62396] [INFO] Handling signal: int
[2021-11-16 15:25:57 +0100] [62396] [INFO] Shutting down: Master
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/gunicorn/arbiter.py", line 224, in run
    handler()
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/gunicorn/arbiter.py", line 262, in handle_int
    raise StopIteration
StopIteration

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/gl/cklpy5415rzd6vb8y29rccpr0000gn/T/ipykernel_62396/473552721.py", line 7, in <module>
    Server(path).serve()
  File "../goldilox/server/__init__.py", line 143, in serve
    WSGIApplication(self.app, options).run()
  File "/Users/yonatanalexander/Dr

TypeError: object of type 'NoneType' has no len()

In [None]:
# NOTE(review): stray scratch cell that was never executed ("In [None]") and
# looks like an unfinished pip command; remove it or complete it.
pip

## Advanced

First, we run a train/test split experiment and save the results.  
Next, we train the model on the entire dataset, adding the evaluation as a variable so we can recall how good the model was.

We wrap everything in a function which receives a DataFrame and returns a Vaex DataFrame, so we can re-run it when we get more data.  

* This way we can change the pipeline training and outputs without changing our infrastructure at all.

In [29]:
from vaex.ml.datasets import load_iris

def fit(df):
    """Train a LightGBM iris classifier and return a Vaex DataFrame holding the pipeline.

    The function works in two stages:

    1. Hold-out experiment: ``df`` is split into train/test (``test_size=0.2``),
       a model is fitted on the train split only, and its accuracy is measured
       on the test split.
    2. Final model: a fresh model is fitted on all of ``df``; the ``predictions``,
       ``prediction`` and ``label`` columns are added to the returned frame.

    The accuracy from stage 1 is stored in ``df.variables['accuracy']`` so it
    travels with the returned DataFrame.

    Parameters
    ----------
    df : vaex DataFrame
        Must contain the columns ``petal_length``, ``petal_width``,
        ``sepal_length``, ``sepal_width`` and the target ``class_``.

    Returns
    -------
    vaex DataFrame
        ``df`` transformed by the final model, with the extra columns and the
        ``accuracy`` variable described above.
    """
    # Imports live inside the function, presumably so that it stays
    # self-contained when the Pipeline re-runs it later (TODO confirm).
    import vaex
    import numpy as np
    from vaex.ml.lightgbm import LightGBMModel
    from sklearn.metrics import accuracy_score
    from goldilox import Pipeline

    features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']
    target = 'class_'

    def new_booster():
        # One place for the model configuration used by both stages.
        return LightGBMModel(features=features,
                             target=target,
                             prediction_name='predictions',
                             num_boost_round=500, params={'verbose': -1,
                                                          'objective': 'multiclass',
                                                          'num_class': 3})

    @vaex.register_function()
    def argmax(ar, axis=1):
        return np.argmax(ar, axis=axis)

    def add_prediction(frame, model):
        # Apply the fitted model and reduce its per-class output to a class index.
        frame = model.transform(frame)
        frame.add_function('argmax', argmax)
        frame['prediction'] = frame['predictions'].argmax()
        return frame

    # Stage 1: hold-out experiment.
    train, test = df.ml.train_test_split(test_size=0.2, verbose=False)

    booster = new_booster()
    # Fit on the train split only; fitting on the whole `df` here would let the
    # model see the test rows and make the accuracy below meaningless.
    booster.fit(train)
    train = add_prediction(train, booster)

    # Build a Pipeline from the training frame and use it to get predictions on
    # the held-out rows. This replays every transformation recorded on `train`,
    # which is very helpful when many feature-engineering steps are involved.
    pipeline = Pipeline.from_vaex(train)
    predicted = pipeline.inference(test[features])['prediction'].values
    accuracy = accuracy_score(test[target].values, predicted)

    # Stage 2: re-train on the entire dataset.
    booster = new_booster()
    booster.fit(df)
    df = add_prediction(df, booster)
    # The 'label' is to help the Frontend app to understand what actually was the result
    df['label'] = df['prediction'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
    df.variables['accuracy'] = accuracy
    return df

# Build the pipeline with the custom `fit` function, then run it on the data.
df = load_iris()
pipeline = Pipeline.from_vaex(df, fit=fit)
pipeline = pipeline.fit(df)
pipeline.validate()

True

### Re-train

In [22]:
from vaex.ml.datasets import load_iris_1e5
# Re-run the pipeline's fit on the larger dataset, then check that it is still valid.
df = load_iris_1e5()
pipeline.fit(df)
still_valid = pipeline.validate()
still_valid

True

### Persistence

In [23]:
import os
from tempfile import TemporaryDirectory

# Keep a reference to the TemporaryDirectory object. Calling
# `TemporaryDirectory().name` without one lets the object be finalized at
# once, which deletes the directory before anything is saved into it.
# Call tmp_dir.cleanup() when the saved model is no longer needed.
tmp_dir = TemporaryDirectory()
path = os.path.join(tmp_dir.name, 'model.pkl')

print("Pipeline raw data example:")
print(json.dumps(pipeline.raw, indent=4))
# Round-trip: save the pipeline, reload it from disk, and run inference with
# the reloaded copy. The return value of save() is passed straight to from_file().
pipeline = Pipeline.from_file(pipeline.save(path))
pipeline.inference(pipeline.raw)

Pipeline raw data example:
{
    "sepal_length": 5.9,
    "sepal_width": 3.0,
    "petal_length": 4.2,
    "petal_width": 1.5,
    "class_": 1
}


#,sepal_length,sepal_width,petal_length,petal_width,class_,predictions,prediction,label
0,5.9,3,4.2,1.5,1,"'array([5.20356629e-09, 9.99999994e-01, 1.033663...",1,versicolor


# Serve