# CatBoost - batch learning

## [Add a vaex accessor](https://vaex.io/docs/tutorial.html#Adding-DataFrame-accessors) (optional)
* The numpy version (`to_x_y`) is about 2x faster than the pandas version (`to_pandas_x_y`)

In [2]:
import vaex
import numpy as np


@vaex.register_dataframe_accessor('online', override=True)
class Online(object):
    """DataFrame accessor (``df.online``) that yields mini-batches of ``(X, y)``.

    Both public methods return a generator, so batches can be fed one at a
    time to an estimator that supports incremental / continued training.
    """

    def __init__(self, df):
        # The DataFrame this accessor instance is attached to.
        self.df = df

    def _resolve_columns(self, features, target):
        """Return ``(features, column_names)`` as fresh lists.

        ``features`` falls back to every column of the DataFrame except
        ``target`` when it is None. ``column_names`` is ``features`` followed
        by ``target`` (only when a target is given).
        """
        if features is None:
            features = [name for name in self.df.get_column_names() if name != target]
        else:
            # Copy so the caller's sequence is never mutated.
            features = list(features)
        column_names = list(features)
        if target is not None:
            column_names.append(target)
        return features, column_names

    def to_pandas_x_y(self, features=None, target=None, num_epochs=1, batch_size=1000, shuffle=False):
        """Return an iterator of ``(X, y)`` pandas batches.

        Parameters:
            features: column names used for ``X``; None means every column
                except ``target``.
            target: column name used for ``y``; when None, ``y`` is None.
            num_epochs: number of full passes over the DataFrame.
            batch_size: number of rows requested per chunk.
            shuffle: when True, rows inside each batch are shuffled, but only
                from the second epoch onwards (the first pass keeps the
                original order).

        Yields:
            ``(X, y)`` where ``X`` is a pandas DataFrame holding only the
            feature columns and ``y`` is the target Series (or None).
        """
        features, column_names = self._resolve_columns(features, target)
        n_samples = len(self.df)
        total = num_epochs * n_samples
        progressbar = vaex.utils.progressbars(True, title="training")
        progressbar(0)

        def iterator():
            for epoch in range(num_epochs):
                for i1, i2, chunk in self.df.to_pandas_df(column_names=column_names,
                                                          chunk_size=batch_size):
                    if shuffle and epoch > 0:
                        chunk = chunk.sample(frac=1)
                    # Select only the features: column_names also contains the
                    # target, which must not leak into X.
                    X = chunk[features]
                    y = chunk[target] if target is not None else None
                    yield X, y
                    progressbar((n_samples * epoch + i1) / total)
            progressbar(1.0)
        return iterator()

    def to_x_y(self, features=None, target=None, num_epochs=1, batch_size=1000, shuffle=False):
        """Return an iterator of ``(X, y)`` numpy batches.

        Parameters:
            features: column names used for ``X``; None means every column
                except ``target``.
            target: column name used for ``y``; when None, ``y`` is None.
            num_epochs: number of full passes over the DataFrame.
            batch_size: number of rows requested per chunk.
            shuffle: when True, rows inside each batch are shuffled, but only
                from the second epoch onwards (the first pass keeps the
                original order).

        Yields:
            ``(X, y)`` where ``X`` is a 2-D array with one row per sample and
            one column per feature, and ``y`` is the target column (or None).
        """
        features, column_names = self._resolve_columns(features, target)
        num_features = len(features)
        n_samples = len(self.df)
        total = num_epochs * n_samples
        progressbar = vaex.utils.progressbars(True, title="training")
        progressbar(0)

        def iterator():
            for epoch in range(num_epochs):
                for i1, i2, chunks in self.df.evaluate_iterator(column_names,
                                                                chunk_size=batch_size,
                                                                progress=False):
                    # Stack the per-column results and transpose so that each
                    # row is one sample.
                    data = np.array(chunks).T
                    if shuffle and epoch > 0:
                        # Shuffles rows in place; features and target stay aligned
                        # because they live in the same array.
                        np.random.shuffle(data)
                    X = data[:, :num_features]
                    # Always bind y inside the generator: assigning it only under
                    # the `if` would leave it unbound when target is None.
                    y = data[:, -1] if target is not None else None
                    yield X, y
                    progressbar((n_samples * epoch + i1) / total)
            progressbar(1.0)
        return iterator()

# Get data

In [3]:
from sklearn.datasets import make_classification
import vaex

X, y = make_classification(n_samples=10000, n_informative=10, n_classes=3)
# One vaex column per feature; the column count follows the generated array
# instead of being hard-coded.
df = vaex.from_arrays(**{f"feature{i}": X[:, i] for i in range(X.shape[1])})
# Capture the feature names before the target column is added, so that
# `features` never contains the target.
features, target = df.get_column_names(), 'target'
df[target] = y
df.head(2)

#,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,target
0,0.626102,0.277363,1.80445,-0.387044,-1.97737,-1.18679,-0.705145,-0.877054,0.562539,0.331714,-0.308297,-1.37148,0.437633,0.948141,0.212583,0.513524,1.15034,-4.40183,-1.46181,1.09662,2
1,2.63686,-0.0297119,-0.74685,0.527176,-2.39969,0.133636,1.80218,0.622773,0.212612,-1.40264,-1.43593,0.951531,0.36544,-1.65699,-0.807654,2.29668,-2.96094,0.0621024,0.149074,-0.712765,0


In [4]:
# Split the DataFrame into a training part and a held-out test part,
# using train_test_split's default arguments.
train, test = df.ml.train_test_split()



## Training 

In [5]:
from catboost import CatBoostClassifier
from vaex.ml.sklearn import Predictor
import warnings

warnings.filterwarnings('ignore')

num_epochs = 2
batch_size = 5000
model = None

params = {'verbose': 0,
          'iterations': 1,
          'objective': 'MultiClass'}

# Batch learning: the first batch fits a fresh classifier, every later batch
# continues from the current model through `init_model`.
# The loop variables are named *_batch so they do not overwrite the full
# dataset arrays X and y created in the data cell.
for X_batch, y_batch in train.online.to_x_y(features=features,
                                            target=target,
                                            batch_size=batch_size,
                                            num_epochs=num_epochs):
    if model is None:
        model = CatBoostClassifier(**params).fit(X_batch, y_batch)
    else:
        model = model.fit(X_batch, y_batch, init_model=model)

# Wrap the trained classifier so its predictions become a 'prediction'
# column on the vaex DataFrame.
model = Predictor(model=model,
                  features=features,
                  target=target,
                  prediction_name='prediction')
train = model.transform(train)

training [########################################] 100.00% elapsed time  :     0.12s =  0.0m =  0.0h
 

In [6]:
from goldilox import Pipeline
from sklearn.metrics import accuracy_score

pipeline = Pipeline.from_vaex(train)
# accuracy_score expects (y_true, y_pred); name both so the order is explicit.
y_true = test[target].values
y_pred = pipeline.inference(test)['prediction'].values
print(f"Accuracy: {accuracy_score(y_true, y_pred)}")

Accuracy: 0.8195
