# Online learning benchmarks

* [LightGBM](https://lightgbm.readthedocs.io/en/latest/)
* [XGBoost](https://xgboost.readthedocs.io/en/stable/)
* [Catboost](https://catboost.ai)
* [River](https://riverml.xyz/latest/)
* [Sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier.partial_fit)
* [Vopal Wabbit](https://vowpalwabbit.org)

In [1]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000_000, n_informative=10, n_classes=3)

In [9]:
import vaex
from sklearn.metrics import accuracy_score


target = 'target'
features = [f"feature_{i}" for i in range(X.shape[1])]

df = vaex.from_dict({feature: X[:,i] for i,feature in enumerate(features)})
df['target'] = y+1

train, test = df.ml.train_test_split()

validation = train.head(10)

df.head(2)



#,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,-0.134619,-0.210785,-1.27125,0.319068,-0.506557,-0.903006,1.43086,0.354004,1.44165,2.37896,0.621935,-0.235586,0.971325,-3.17172,-1.69216,1.64932,-1.33266,0.131078,-1.84336,0.775879,3
1,-1.09678,1.11236,-0.124019,1.20753,0.754969,-1.32046,0.075942,0.0368741,-0.923697,-0.138596,2.05469,-2.11743,1.14415,2.20503,1.74582,-3.21001,0.381363,-0.108705,0.32164,1.94967,3


## River

In [41]:
from vaex.ml.incubator.river import RiverModel
import vaex.ml.metrics
from river.linear_model import LogisticRegression
from river import optim
from river.multiclass import OneVsRestClassifier

import warnings
warnings.filterwarnings("ignore")

# Set up the model
model = RiverModel(model = OneVsRestClassifier(LogisticRegression()),
                   batch_size=11_000_000,
                   features=features, 
                   target=target, 
                   prediction_name='river')

# Fit the model
model.fit(train, progress='widget')
train = model.transform(train)

HBox(children=(FloatProgress(value=0.0, max=1.0), Label(value='In progress...')))

Accuracy River: 0.60005625


# VW
* https://mlcourse.ai/articles/topic8-sgd-vw/

In [21]:
import traitlets
import tempfile
import base64
import pandas as pd
import numpy as np
from vowpalwabbit.DFtoVW import DFtoVW
from vowpalwabbit.pyvw import vw
from tqdm import tqdm

params = {"enable_logging": True, 'oaa':3}
model = vw(**params)

for _,_,d in tqdm(train.head(1000).to_pandas_df(chunk_size=10)):
    for ex in DFtoVW.from_colnames(df=d, y='target', x=features).convert_df():
        model.learn(ex)

model.finish()    

class VWModell(traitlets.HasTraits):

        # This should work with the reduce's arguments
        def __init__(self, model=None, features=None, target=None, params=None):
            self.params = params or {}
            self.features = features
            self.target = target            
            self.model = self._decode_model(model)

        # This is how you make a class pickalbe
        def __reduce__(self):
            return (self.__class__, (self._encode(), self.features, self.target, self.params))

        # How vw implemented serialization
        def _decode_model(self, encoding):       
            if encoding is None:
                return vw(**self.params)                
            if isinstance(encoding, str):                
                model_data = base64.decodebytes(encoding.encode('ascii'))
                openfilename = tempfile.mktemp()
                with open(openfilename, 'wb') as f:
                    f.write(model_data)
                params = self.params.copy()
                params['i']= openfilename
                return vw(**params)
            else:
                return encoding

        # How vw implemented serialization
        def _encode(self):
            if isinstance(self.model, bytes):
                return self.model
            filename = tempfile.mktemp()
            self.model.save(filename)
            with open(filename, 'rb') as f:
                model_data = f.read()
            encoding =  base64.encodebytes(model_data).decode('ascii')
            return encoding   
        
        def predict(self, data):   
            if isinstance(data, vaex.dataframe.DataFrame):
                data = data.to_pandas_df()
            elif isinstance(data, np.ndarray):
                data = pd.DataFrame(data, columns=features)  
            if self.target not in data:                
                data[self.target] = 1
            examples = DFtoVW.from_colnames(df=data, y=target, x=features).convert_df()            
            return np.array([self.model.predict(ex) for ex in examples])

vw_model = VWModell(model=model, features=features, target=target, params=params)

@vaex.register_function(on_expression=False)
def predict_vw(*columns):
    data = np.array(columns).T                
    return np.array(vw_model.predict(data))

train.add_function('predict_vw',predict_vw)
train['vw'] = train.func.predict_vw(*features)
print(f"Accuracy VW: {accuracy_score(vw_model.predict(validation), validation[target].values)}")
train.head(2)

#,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target,vw
0,1.09626,-1.67528,-0.916187,-0.671703,-4.26103,0.730177,3.24956,-0.0408016,-1.74192,1.78474,-4.34423,-1.54608,0.524216,5.84992,2.12973,-1.25836,3.78808,1.56126,-1.92343,2.44382,2,2
1,-1.62342,-1.51648,0.199182,0.922495,-0.707596,0.757213,3.11861,1.07286,-1.1791,1.18565,1.9924,0.525271,1.13078,-0.99008,-1.85412,1.94633,-0.109579,1.44296,-1.049,-0.0227364,1,2


In [112]:
train = train.head(5000)

# XGBoost

In [114]:
from xgboost.sklearn import XGBClassifier

n_samples = len(df)
progressbar = vaex.utils.progressbars(True, title="fit(xgboost)")
num_epochs = 2
batch_size = 1000

model = None
for epoch in range(num_epochs):
    for i1, i2, chunks in train.evaluate_iterator(features+[target], chunk_size=batch_size, array_type='numpy'):
        progressbar((n_samples * epoch + i1) / (num_epochs * n_samples))
        X = np.array(chunks[:-1]).T  # the most efficient way depends on the algorithm (row of column based access)
        y = np.array(chunks[-1], copy=False)
        xgb_params = {
            'update':'refresh',
            'refresh_leaf': True,
            'xgb_model':model,
            'objective':'multi:softmax',
            'num_class': 3,
            'verbosity': 0}
        model = XGBClassifier(**xgb_params)
        model.fit(X, y)


progressbar(1.0)

xgb = Predictor(model=model,
                features=features, 
                target=target,
                prediction_name='xgb')
train = xgb.transform(train)        

fit(xgboost) [########################################] 100.00% elapsed time  :    13.70s =  0.2m =  0.0h 
 

True

# LightGBM

In [129]:
from lightgbm.sklearn import LGBMClassifier

n_samples = len(df)
progressbar = vaex.utils.progressbars(True, title="fit(lgm)")
num_epochs = 2
batch_size = 1000

model = None

params = {'verbosity': -1,
        'objective': 'multiclass',
        'num_class': 3,
         'num_iterations':1}

for epoch in range(num_epochs):
    for i1, i2, chunks in train.evaluate_iterator(features+[target], chunk_size=batch_size, array_type='numpy'):
        progressbar((n_samples * epoch + i1) / (num_epochs * n_samples))
        X = np.array(chunks[:-1]).T  # the most efficient way depends on the algorithm (row of column based access)
        y = np.array(chunks[-1], copy=False)

        if model is None:
            model = LGBMClassifier(**params)
            model.fit(X, y)
        else:            
            model.fit(X, y, init_model=model) # TODO test


progressbar(1.0)

lgb = Predictor(model=model,
                features=features, 
                target=target,
                prediction_name='lgm')
train = lgb.transform(train)        

fit(lgm) [########################################] 100.00% elapsed time  :     0.51s =  0.0m =  0.0h
 

# Catboost 

In [130]:
from catboost import CatBoostClassifier

n_samples = len(df)
progressbar = vaex.utils.progressbars(True, title="fit(catboost)")
num_epochs = 2
batch_size = 1000

model = None


for epoch in range(num_epochs):
    for i1, i2, chunks in train.evaluate_iterator(features+[target], chunk_size=batch_size, array_type='numpy'):
        progressbar((n_samples * epoch + i1) / (num_epochs * n_samples))
        X = np.array(chunks[:-1]).T  # the most efficient way depends on the algorithm (row of column based access)
        y = np.array(chunks[-1], copy=False)
        params = {'verbose': 0, 'iterations': 1,'objective': 'MultiClass'}
        if model is None:
            model = CatBoostClassifier(**params)
            model.fit(X, y)
        else:            
            model.fit(X, y, init_model=model) # TODO test


progressbar(1.0)

cb = Predictor(model=model,
                features=features, 
                target=target,
                prediction_name='cb')
train = cb.transform(train)   
train['cb'] = train['cb'].apply(lambda x: x[0]) # catboost beeing annoying

fit(catboost) [########################################] 100.00% elapsed time  :     0.28s =  0.0m =  0.0h
 

# SGD classifier

In [144]:
from sklearn.linear_model import SGDClassifier

n_samples = len(df)
progressbar = vaex.utils.progressbars(True, title="fit(SGD)")
num_epochs = 2
batch_size = 1000

model = None

model = SGDClassifier()
for epoch in range(num_epochs):
    for i1, i2, chunks in train.evaluate_iterator(features+[target], chunk_size=batch_size, array_type='numpy'):
        progressbar((n_samples * epoch + i1) / (num_epochs * n_samples))
        X = np.array(chunks[:-1]).T  # the most efficient way depends on the algorithm (row of column based access)
        y = np.array(chunks[-1], copy=False)
        if epoch==0 and i1==0:
            model.fit(X, y)
        else:
            model.partial_fit(X, y) 


progressbar(1.0)

sgd = Predictor(model=model,
                features=features, 
                target=target,
                prediction_name='sgd')
train = sgd.transform(train)   

fit(SGD) [########################################] 100.00% elapsed time  :     0.06s =  0.0m =  0.0h
 

In [145]:
y_true = train[target].values
print(f"Accuracy River: {accuracy_score(train['river'].values, y_true)}")
print(f"Accuracy XGBoost: {accuracy_score(train['xgb'].values, y_true)}")
print(f"Accuracy LightGBM: {accuracy_score(train['lgm'].values, y_true)}")
print(f"Accuracy Catboost: {accuracy_score(train['cb'].values, y_true)}")
print(f"Accuracy Vopal Wabbit: {accuracy_score(train['vw'].values, y_true)}")
print(f"Accuracy SGD: {accuracy_score(train['sgd'].values, y_true)}")

Accuracy River: 0.5942
Accuracy XGBoost: 0.8628
Accuracy LightGBM: 0.856
Accuracy Catboost: 0.7844
Accuracy Vopal Wabbit: 0.6676
Accuracy SGD: 0.5676
