# XGBoost - batch learning

## [Add a vaex accessor](https://vaex.io/docs/tutorial.html#Adding-DataFrame-accessors) (optional)
* The numpy version (`to_x_y`) is about 2x faster than the pandas version (`to_pandas_x_y`)

In [1]:
import vaex
import numpy as np


@vaex.register_dataframe_accessor('online', override=True)
class Online(object):
    """DataFrame accessor (``df.online``) that streams a DataFrame as (X, y) batches.

    Both public methods return a generator, so the whole DataFrame never has
    to be materialized at once; each yielded batch can be fed to an estimator
    for incremental (batch) training.
    """

    def __init__(self, df):
        self.df = df

    def _resolve_columns(self, features, target):
        """Return ``(features, column_names)`` as fresh lists.

        When ``features`` is None, every column of the DataFrame except
        ``target`` is used. ``column_names`` is ``features`` followed by
        ``target`` (when a target is given), i.e. the columns to evaluate.
        """
        if features is None:
            features = [name for name in self.df.get_column_names() if name != target]
        features = list(features)  # copy so the caller's list is never mutated
        column_names = features if target is None else features + [target]
        return features, column_names

    def to_pandas_x_y(self, features=None, target=None, num_epochs=1, batch_size=1000, shuffle=False):
        """Return a generator of ``(X, y)`` pandas batches, following the sklearn X/y pattern.

        Parameters
        ----------
        features : list of str, optional
            Feature column names; defaults to all columns except ``target``.
        target : str, optional
            Target column name. When None, ``y`` is None in every batch.
        num_epochs : int
            Number of full passes over the DataFrame.
        batch_size : int
            Number of rows requested per batch (passed as ``chunk_size``).
        shuffle : bool
            When True, rows inside each batch are shuffled on every epoch
            after the first one.

        Yields
        ------
        (X, y)
            ``X`` holds only the feature columns of the batch; ``y`` is the
            target column of the batch, or None when no target was given.
        """
        features, column_names = self._resolve_columns(features, target)
        n_samples = len(self.df)
        # The progress bar is created (and reset to 0) eagerly, before iteration starts.
        progressbar = vaex.utils.progressbars(True, title="training")
        progressbar(0)

        def iterator():
            for epoch in range(num_epochs):
                for i1, i2, chunk in self.df.to_pandas_df(column_names=column_names,
                                                          chunk_size=batch_size):
                    if shuffle and epoch > 0:
                        chunk = chunk.sample(frac=1)
                    # Select only the feature columns: the target must not leak into X.
                    X = chunk[features]
                    y = chunk[target] if target is not None else None
                    yield X, y
                    # Overall progress across all epochs, based on the batch's start row.
                    progressbar((n_samples * epoch + i1) / (num_epochs * n_samples))
            progressbar(1.0)
        return iterator()

    def to_x_y(self, features=None, target=None, num_epochs=1, batch_size=1000, shuffle=False):
        """Return a generator of ``(X, y)`` numpy batches, following the sklearn X/y pattern.

        Parameters
        ----------
        features : list of str, optional
            Feature column names; defaults to all columns except ``target``.
        target : str, optional
            Target column name. When None, ``y`` is None in every batch.
        num_epochs : int
            Number of full passes over the DataFrame.
        batch_size : int
            Number of rows requested per batch (passed as ``chunk_size``).
        shuffle : bool
            When True, rows inside each batch are shuffled on every epoch
            after the first one.

        Yields
        ------
        (X, y)
            ``X`` is a 2-D array with one column per feature; ``y`` is the
            last evaluated column (the target), or None when no target was
            given.
        """
        features, column_names = self._resolve_columns(features, target)
        num_features = len(features)
        n_samples = len(self.df)
        # The progress bar is created (and reset to 0) eagerly, before iteration starts.
        progressbar = vaex.utils.progressbars(True, title="training")
        progressbar(0)

        def iterator():
            for epoch in range(num_epochs):
                for i1, i2, columns in self.df.evaluate_iterator(column_names,
                                                                 chunk_size=batch_size,
                                                                 progress=False):
                    # One evaluated array per column -> transpose to (rows, columns).
                    batch = np.array(columns).T
                    if shuffle and epoch > 0:
                        np.random.shuffle(batch)  # shuffles rows in place
                    X = batch[:, 0:num_features]
                    # Computed per batch so that y is always bound, even without a target.
                    y = batch[:, -1] if target is not None else None
                    yield X, y
                    # Overall progress across all epochs, based on the batch's start row.
                    progressbar((n_samples * epoch + i1) / (num_epochs * n_samples))
            progressbar(1.0)
        return iterator()

# Get data

In [2]:
from sklearn.datasets import make_classification
import vaex

# Synthetic 3-class classification problem with 10 informative features.
X, y = make_classification(n_samples=10000, n_informative=10, n_classes=3)
# One vaex column per feature; the count is derived from X instead of being hard-coded.
df = vaex.from_arrays(**{f"feature{i}": X[:, i] for i in range(X.shape[1])})
# Capture the feature names BEFORE the target column is added, so `features` excludes it.
features = df.get_column_names()
target = 'target'
df[target] = y
df.head(2)

#,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,target
0,2.22322,0.569268,1.37791,-2.11287,-1.47488,1.60108,0.0509815,-1.44114,-0.540454,2.27826,1.3163,-0.38335,-0.86742,8.47378,0.99395,-1.60651,-0.514052,-0.0179562,0.407837,-0.863185,1
1,3.39834,0.026573,-0.594668,-0.718106,-3.87205,0.430664,1.23862,2.19065,1.60622,-0.745087,-0.94352,-1.05021,0.0745354,5.89844,-1.74486,0.476258,0.322009,-1.23436,2.98763,1.41796,1


In [3]:
# Split the DataFrame into a training and a test partition using the
# defaults of the vaex.ml accessor.
# NOTE(review): confirm whether this split shuffles rows; if it does not,
# the DataFrame should be shuffled beforehand — TODO verify.
train, test = df.ml.train_test_split()



## Training 

In [None]:
from xgboost.sklearn import XGBClassifier
from vaex.ml.sklearn import Predictor


num_epochs = 2
batch_size = 5000

# NOTE(review): 'update' is not a documented XGBoost parameter name (the
# documented ones are 'process_type' and 'updater', and 'refresh_leaf' only
# applies to the 'refresh' updater). It is kept unchanged here because
# switching to process_type='update' would fail on the first batch, where
# no trees exist yet — confirm the intended training mode.
xgb_params = {
    'update': 'refresh',
    'refresh_leaf': True,
    'objective': 'multi:softmax',
    'num_class': 3,
    'verbosity': 0,
}

model = XGBClassifier(**xgb_params)
booster = None  # no previous model before the first batch

for X, y in train.online.to_x_y(features=features,
                                target=target,
                                batch_size=batch_size,
                                num_epochs=num_epochs):
    # Training continuation is requested through fit(xgb_model=...), not
    # through the constructor: each batch starts from the booster produced
    # by the previous batch.
    model.fit(X, y, xgb_model=booster)
    booster = model.get_booster()

# Wrap the fitted model so its predictions become a 'prediction' column
# on the training DataFrame.
predictor = Predictor(model=model,
                      features=features,
                      target=target,
                      prediction_name='prediction')
train = predictor.transform(train)

training [----------------------------------------]  0.00% estimated time: unknown                 



In [None]:
from goldilox import Pipeline
from sklearn.metrics import accuracy_score

# Build a goldilox pipeline from the transformed training DataFrame and run
# it on the held-out test DataFrame.
pipeline = Pipeline.from_vaex(train)
predictions = pipeline.inference(test)['prediction'].values
# accuracy_score's signature is (y_true, y_pred); keywords make the roles explicit.
accuracy = accuracy_score(y_true=test[target].values, y_pred=predictions)
print(f"Accuracy: {accuracy}")