# LightGBM - batch learning

## [Add a vaex accessor](https://vaex.io/docs/tutorial.html#Adding-DataFrame-accessors) (optional)
* The numpy version is about 2X faster than the pandas version

In [2]:
import vaex
import numpy as np


@vaex.register_dataframe_accessor('online', override=True)
class Online(object):
    """DataFrame accessor (``df.online``) that streams the data in mini-batches.

    Both public methods return a generator yielding ``(X, y)`` pairs, one per
    batch, repeated for ``num_epochs`` passes over the DataFrame.
    """

    def __init__(self, df):
        self.df = df

    def _resolve_columns(self, features, target):
        """Return ``(features, column_names)`` as fresh lists.

        ``column_names`` is ``features`` followed by ``target`` (when given).
        When ``features`` is None, every DataFrame column except ``target``
        is used.
        """
        if features is None:
            features = [name for name in self.df.get_column_names() if name != target]
        else:
            # Copy so the caller's sequence is never mutated.
            features = list(features)
        column_names = list(features)
        if target is not None:
            column_names.append(target)
        return features, column_names

    def to_pandas_x_y(self, features=None, target=None, num_epochs=1, batch_size=1000, shuffle=False):
        """Return a generator of ``(X, y)`` pandas batches.

        Args:
            features: Column names used for ``X``; None selects every column
                except ``target``.
            target: Column name used for ``y``; when None, ``y`` is None.
            num_epochs: Number of full passes over the DataFrame.
            batch_size: Number of rows requested per chunk.
            shuffle: When true, rows inside each batch are shuffled from the
                second epoch onwards (the first epoch keeps storage order).

        Yields:
            ``(X, y)`` where ``X`` is a pandas DataFrame holding only the
            feature columns and ``y`` is the target Series (or None).
        """
        features, column_names = self._resolve_columns(features, target)
        n_samples = len(self.df)
        progressbar = vaex.utils.progressbars(True, title="training")
        progressbar(0)

        def iterator():
            for epoch in range(num_epochs):
                for i1, _i2, chunk in self.df.to_pandas_df(column_names=column_names,
                                                           chunk_size=batch_size):
                    if shuffle and epoch > 0:
                        chunk = chunk.sample(frac=1)
                    # Select only the feature columns: `column_names` also
                    # contains the target, which must not leak into X.
                    X = chunk[features]
                    y = chunk[target] if target is not None else None
                    yield X, y
                    progressbar((n_samples * epoch + i1) / (num_epochs * n_samples))
            progressbar(1.0)
        return iterator()

    def to_x_y(self, features=None, target=None, num_epochs=1, batch_size=1000, shuffle=False):
        """Return a generator of ``(X, y)`` numpy batches.

        Args:
            features: Column names used for ``X``; None selects every column
                except ``target``.
            target: Column name used for ``y``; when None, ``y`` is None.
            num_epochs: Number of full passes over the DataFrame.
            batch_size: Number of rows requested per chunk.
            shuffle: When true, rows inside each batch are shuffled from the
                second epoch onwards (the first epoch keeps storage order).

        Yields:
            ``(X, y)`` where ``X`` is a 2-D array of shape
            ``(rows, len(features))`` and ``y`` is the last stacked column
            (the target) or None.
        """
        features, column_names = self._resolve_columns(features, target)
        num_features = len(features)
        n_samples = len(self.df)
        progressbar = vaex.utils.progressbars(True, title="training")
        progressbar(0)

        def iterator():
            for epoch in range(num_epochs):
                for i1, _i2, chunk in self.df.evaluate_iterator(column_names,
                                                                chunk_size=batch_size,
                                                                progress=False):
                    # Stack the per-column arrays and transpose to (rows, columns).
                    batch = np.array(chunk).T
                    if shuffle and epoch > 0:
                        # Shuffles whole rows in place, keeping X and y aligned.
                        np.random.shuffle(batch)
                    X = batch[:, 0:num_features]
                    # Bind y on every iteration inside the generator so that
                    # target=None yields None instead of an unbound local.
                    y = batch[:, -1] if target is not None else None
                    yield X, y
                    progressbar((n_samples * epoch + i1) / (num_epochs * n_samples))
            progressbar(1.0)
        return iterator()

# Get data

In [3]:
from sklearn.datasets import make_classification
import vaex

# Synthetic 3-class classification problem with 10 informative features.
X, y = make_classification(n_samples=10000, n_informative=10, n_classes=3)
# One vaex column per feature; the count is taken from X itself rather than
# hard-coded, so it keeps working if the number of generated features changes.
df = vaex.from_arrays(**{f"feature{i}": X[:, i] for i in range(X.shape[1])})
# Capture the feature names before the target column is added to the DataFrame.
features, target = df.get_column_names(), 'target'
df[target] = y
df.head(2)

#,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,target
0,0.235455,5.26659,3.29424,-0.85068,0.711974,0.0685351,-0.348895,0.277649,-0.309844,-2.37508,2.06344,-0.205725,1.42592,3.51194,0.325553,0.8727,0.611526,-0.517743,1.39328,-1.66611,1
1,0.241575,-4.0031,0.927966,2.82486,0.8928,-0.701972,0.282573,0.0780901,-1.063,1.52407,0.184087,-2.05832,1.37858,-2.21525,1.16626,-1.89677,0.349468,-0.19203,-1.81256,-2.64037,2


In [4]:
# Hold out part of the data for evaluation, using vaex.ml's default split settings.
train, test = df.ml.train_test_split()



## Training 

In [6]:
from lightgbm.sklearn import LGBMClassifier
from vaex.ml.sklearn import Predictor
import warnings

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

num_epochs = 2
batch_size = 5000
model = None

# Each fit call is configured for a single iteration on a 3-class objective.
params = {
    'verbosity': -1,
    'objective': 'multiclass',
    'num_class': 3,
    'num_iterations': 1,
}

for X, y in train.online.to_x_y(
    features=features,
    target=target,
    batch_size=batch_size,
    num_epochs=num_epochs,
):
    if model is None:
        # First batch: create and fit a fresh classifier.
        model = LGBMClassifier(**params).fit(X, y)
    else:
        # Later batches: refit, passing the current model as init_model.
        model = model.fit(X, y, init_model=model)

# Wrap the fitted classifier so its predictions become a DataFrame column.
model = Predictor(
    model=model,
    features=features,
    target=target,
    prediction_name='prediction',
)
train = model.transform(train)

training [########################################] 100.00% elapsed time  :     0.27s =  0.0m =  0.0h
 

In [7]:
from goldilox import Pipeline
from sklearn.metrics import accuracy_score

# Build an inference pipeline from the state of the transformed training DataFrame.
pipeline = Pipeline.from_vaex(train)
# accuracy_score expects (y_true, y_pred); pass the ground truth first.
accuracy = accuracy_score(test[target].values, pipeline.inference(test)['prediction'].values)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8315
