# [LightGBM](https://lightgbm.readthedocs.io/en/latest/)

In [1]:
from lightgbm.sklearn import LGBMClassifier

from goldilox.datasets import load_iris

# Get the data: `df` holds the rows, while `features` and `target` are used
# further down to select the input columns and the label column from it.
df, features, target = load_iris()
df.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


# Sklearn

In [2]:
from goldilox import Pipeline

# Separate the input columns from the label column
X = df[features]
y = df[target]

# Wrap a default LightGBM classifier in a goldilox pipeline, then train it
model = LGBMClassifier()
pipeline = Pipeline.from_sklearn(model)
pipeline = pipeline.fit(X, y)

# I/O Example: run inference on the example row stored on the pipeline
pipeline.inference(pipeline.raw)



Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,prediction
0,5.1,3.5,1.4,0.2,0


Missing values are not handled. Let's fix this!

In [3]:
# NOTE(review): the module path is spelled `tranformers` (not `transformers`) — confirm it
# matches the installed goldilox package before changing it.
from goldilox.sklearn.tranformers import Imputer
import sklearn.pipeline

# Chain an imputation step in front of the classifier so rows with missing
# values can still be scored.
sklearn_pipeline = sklearn.pipeline.Pipeline([('imputer', Imputer()),
                                              ('classifier', LGBMClassifier())])
pipeline = Pipeline.from_sklearn(sklearn_pipeline).fit(X, y)

# I/O Example: `sepal_length` is deliberately missing
pipeline.inference({'sepal_length': None,
                    'sepal_width': 3.5,
                    'petal_length': 1.4,
                    'petal_width': 0.2})

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,prediction
0,,3.5,1.4,0.2,0


### Variables and description
We can add variables which we want to associate with the pipeline, and a description.
* A great place to put the training params, evaluation results, version, branch, etc.

In [4]:
from sklearn.metrics import accuracy_score

# Score the fitted pipeline on the training rows, then store the score and a
# human-readable description on the pipeline itself.
train_predictions = pipeline.inference(X)['prediction']
train_accuracy = accuracy_score(y, train_predictions)
pipeline.set_variable('training_accuracy', train_accuracy)
pipeline.description = "LightGBM on the iris dataset with sklearn"

* In this example I run the evaluation on the same data used for training; in practice you would split the data into train/test sets, use k-fold cross-validation, etc.

# Vaex solution

Vaex solutions are much more **powerful** and allow for easier feature engineering and scale.    
In this example we do some simple feature engineering and post-process the results into labels, so they are easier to consume on the frontend side.

* We do not need to implement transformers for each feature engineering step or estimator. Instead, we create simple functions which do what we want.
* It's good to remember that whenever we train a model which loads **all data to memory**, Vaex obviously doesn't prevent that, and it should be taken into account - maybe use [online learning](https://docs.goldilox.io/reference/data-science-examples/online-learning)?

In [30]:
import vaex
import warnings
from vaex.ml.lightgbm import LightGBMModel
import numpy as np

# Suppress all warnings from here on
warnings.filterwarnings('ignore')

# Convert the DataFrame loaded earlier into a Vaex DataFrame.
# NOTE: `df` is rebound here (and again after `booster.transform` below), so
# re-running this cell on the same kernel starts from the already-converted frame.
df = vaex.from_pandas(df)

# feature engineering example
df['petal_ratio'] = df['petal_length'] / df['petal_width']

# Train on the original features plus the engineered one.
# NOTE: `features` is extended from itself, so every execution of this line
# appends another 'petal_ratio' entry.
features = features + ['petal_ratio']
booster = LightGBMModel(features=features,
                        target=target,
                        prediction_name='predictions',
                        num_boost_round=500, params={'verbosity': -1,
                                                     'objective': 'multiclass',
                                                     'num_class': 3})
booster.fit(df)
# Add the model output to the frame as the 'predictions' column
df = booster.transform(df)


# post model processing example
@vaex.register_function()
def argmax(ar, axis=1):
    """Return the index of the largest value of `ar` along `axis`.

    Thin wrapper around `np.argmax`, registered with Vaex so it can be called
    on an expression.

    Args:
        ar: Array-like input handed to `np.argmax`.
        axis: Axis to reduce over; defaults to 1.

    Returns:
        Whatever `np.argmax(ar, axis=axis)` returns.
    """
    return np.argmax(ar, axis=axis)


# Attach the function to this DataFrame under the name 'argmax'
df.add_function('argmax', argmax)
# Reduce the 'predictions' column to a single class index in 'prediction'
df['prediction'] = df['predictions'].argmax()

# Translate class indices into species names
classes = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df['label'] = df['prediction'].map(classes)

# Vaex remembers all the transformations; this is a sklearn.pipeline alternative
pipeline = Pipeline.from_vaex(df, description='simple lightGBM')

# I/O example: run inference on the example row stored on the pipeline
pipeline.inference(pipeline.raw)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,predictions,prediction,label
0,5.1,3.5,1.4,0.2,0,7,"'array([9.99999943e-01, 5.70021072e-08, 4.053823...",0,setosa


# Vaex + sklearn  -> vaex.ml.sklearn.Predictor
Another way is to wrap any sklearn model into Vaex with the *Predictor* wrapper class.

In [31]:
from vaex.ml.sklearn import Predictor
from lightgbm.sklearn import LGBMClassifier

# Wrap an sklearn-style LightGBM classifier so it fits on, and adds its
# predictions to, the Vaex DataFrame under the 'lgbm2' column.
model = Predictor(
    model=LGBMClassifier(),
    features=features,
    target=target,
    prediction_name='lgbm2',
)
model.fit(df)
df = model.transform(df)

pipeline = Pipeline.from_vaex(df, description='simple lightGBM')
# Remove the target entry from the stored example before using it as input
pipeline.raw.pop(target)

pipeline.inference(pipeline.raw)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,predictions,prediction,label,lgbm2
0,5.1,3.5,1.4,0.2,--,7,"'array([9.99999943e-01, 5.70021072e-08, 4.053823...",0,setosa,0


# Need even more flexibility? -> @vaex.register_function()
This is the Vaex Swiss Army knife.
* Works with any picklable model.

In [34]:
import pyarrow as pa

# Fit a plain LGBMClassifier on the `.values` of the feature and target columns.
# This rebinds `model`, which `predict_proba` below reads as a global.
model = LGBMClassifier().fit(df[features].values, df[target].values)


@vaex.register_function()
def predict_proba(*columns):
    """Score a batch of feature columns and return per-class probabilities.

    Args:
        *columns: One array per feature, in the order the model was trained on.

    Returns:
        A pyarrow array with one entry per row, each mapping a class label
        (looked up in the global `classes`) to its predicted probability.
    """
    # Vaex retrieves the batches efficiently, but transposed: one array per column
    feature_matrix = np.array(columns).T
    labelled_rows = []
    for row in model.predict_proba(feature_matrix):
        labelled_rows.append({classes.get(i): probability for i, probability in enumerate(row)})
    return pa.array(labelled_rows)


# Attach the function to this DataFrame under the name 'predict_proba'
df.add_function('predict_proba', predict_proba)
# Call it with every feature column and store the result as 'probabilities'
df['probabilities'] = df.func.predict_proba(*features)

# Build the pipeline from the frame and run it on its stored example row
pipeline = Pipeline.from_vaex(df)
pipeline.inference(pipeline.raw)

#,sepal_length,sepal_width,petal_length,petal_width,target,petal_ratio,predictions,prediction,label,lgbm2,probabilities
0,5.1,3.5,1.4,0.2,0,7,"'array([9.99999943e-01, 5.70021072e-08, 4.053823...",0,setosa,0,"""{'setosa': 0.9999970258541964, 'versicolor': 2...."


# [Serve](https://docs.goldilox.io/reference/api-reference/cli/serve)
All pipelines get a prediction server in the same way.

In [11]:
print(f"Saved to: {pipeline.save('pipeline.pkl')}")
print(f"Check out the docs: http://127.0.0.1:8000/docs\n")
!glx serve pipeline.pkl

Saved to: pipeline.pkl
Check out the docs: http://127.0.0.1:8000/docs

[2022-01-13 13:21:43 +0200] [20124] [INFO] Starting gunicorn 20.1.0
[2022-01-13 13:21:43 +0200] [20124] [INFO] Listening at: http://127.0.0.1:8000 (20124)
[2022-01-13 13:21:43 +0200] [20124] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2022-01-13 13:21:43 +0200] [20137] [INFO] Booting worker with pid: 20137
[2022-01-13 13:21:44 +0200] [20137] [INFO] Started server process [20137]
[2022-01-13 13:21:44 +0200] [20137] [INFO] Waiting for application startup.
[2022-01-13 13:21:44 +0200] [20137] [INFO] Application startup complete.
^C
[2022-01-13 13:22:23 +0200] [20124] [INFO] Handling signal: int
[2022-01-13 13:22:23 +0200] [20137] [INFO] Shutting down
[2022-01-13 13:22:23 +0200] [20137] [INFO] Waiting for application shutdown.
[2022-01-13 13:22:23 +0200] [20137] [INFO] Application shutdown complete.
[2022-01-13 13:22:23 +0200] [20137] [INFO] Finished server process [20137]
[2022-01-13 13:22:23 +0200] [20137] [