# Sklearn - batch learning

## [Add a vaex accessor](https://vaex.io/docs/tutorial.html#Adding-DataFrame-accessors) (optional)
* The numpy version (`to_x_y`) is about 2x faster than the pandas version (`to_pandas_x_y`)

In [2]:
import vaex
import numpy as np


@vaex.register_dataframe_accessor('online', override=True)
class Online(object):
    """DataFrame accessor (``df.online``) that streams the frame as mini-batches.

    Both public methods walk the DataFrame ``num_epochs`` times in chunks of
    ``batch_size`` rows and yield ``(X, y)`` pairs, so the data can be fed to
    an estimator one batch at a time. A vaex progress bar titled "training"
    is updated as batches are consumed.
    """

    def __init__(self, df):
        self.df = df

    def _resolve_columns(self, features, target):
        """Return ``(features, column_names)`` as fresh lists.

        ``features`` defaults to every column of the DataFrame except
        ``target``. ``column_names`` is ``features`` followed by ``target``
        (when one is given), i.e. the columns that must be read per batch.
        The caller's ``features`` sequence is never mutated.
        """
        if features is None:
            features = [name for name in self.df.get_column_names() if name != target]
        features = list(features)
        column_names = features + [target] if target is not None else list(features)
        return features, column_names

    def to_pandas_x_y(self, features=None, target=None, num_epochs=1, batch_size=1000, shuffle=False):
        """Return an iterator of ``(X, y)`` pandas batches.

        Parameters
        ----------
        features : sequence of str, optional
            Feature column names; defaults to all columns except ``target``.
        target : str, optional
            Label column name. When omitted, ``y`` is ``None`` in every batch.
        num_epochs : int
            Number of full passes over the DataFrame.
        batch_size : int
            Number of rows requested per batch.
        shuffle : bool
            If true, rows are shuffled within each batch from the second
            epoch onwards (the first epoch keeps the stored order).

        Yields
        ------
        (X, y)
            ``X`` holds only the feature columns of the batch; ``y`` is the
            target column, or ``None`` when no target was given.
        """
        features, column_names = self._resolve_columns(features, target)
        n_samples = len(self.df)
        progressbar = vaex.utils.progressbars(True, title="training")
        progressbar(0)

        def iterator():
            for epoch in range(num_epochs):
                for i1, i2, chunk in self.df.to_pandas_df(column_names=column_names,
                                                          chunk_size=batch_size):
                    if shuffle and epoch > 0:
                        chunk = chunk.sample(frac=1)
                    # Select features only: the chunk also carries the target
                    # column, which must not leak into X.
                    X = chunk[features]
                    y = chunk[target] if target is not None else None
                    yield X, y
                    progressbar((n_samples * epoch + i1) / (num_epochs * n_samples))
            progressbar(1.0)
        return iterator()

    def to_x_y(self, features=None, target=None, num_epochs=1, batch_size=1000, shuffle=False):
        """Return an iterator of ``(X, y)`` numpy batches.

        Parameters
        ----------
        features : sequence of str, optional
            Feature column names; defaults to all columns except ``target``.
        target : str, optional
            Label column name. When omitted, ``y`` is ``None`` in every batch.
        num_epochs : int
            Number of full passes over the DataFrame.
        batch_size : int
            Number of rows requested per batch.
        shuffle : bool
            If true, rows are shuffled within each batch from the second
            epoch onwards (the first epoch keeps the stored order).

        Yields
        ------
        (X, y)
            ``X`` is a 2-D array with one column per feature; ``y`` is the
            last column of the batch (the target), or ``None`` when no target
            was given. All columns are stacked into a single array, so they
            share one dtype.
        """
        features, column_names = self._resolve_columns(features, target)
        num_features = len(features)
        n_samples = len(self.df)
        progressbar = vaex.utils.progressbars(True, title="training")
        progressbar(0)

        def iterator():
            for epoch in range(num_epochs):
                for i1, i2, chunks in self.df.evaluate_iterator(column_names,
                                                                chunk_size=batch_size,
                                                                progress=False):
                    # One array per column -> rows x columns matrix.
                    batch = np.array(chunks).T
                    if shuffle and epoch > 0:
                        # Shuffles whole rows, so X and y stay aligned.
                        np.random.shuffle(batch)
                    X = batch[:, :num_features]
                    # Bind y on every iteration; a conditional assignment here
                    # would leave it unbound when no target is given.
                    y = batch[:, -1] if target is not None else None
                    yield X, y
                    progressbar((n_samples * epoch + i1) / (num_epochs * n_samples))
            progressbar(1.0)
        return iterator()

# Get data

In [3]:
from sklearn.datasets import make_classification
import vaex

# Synthetic 3-class problem; the fixed seed makes every re-run generate the same data.
X, y = make_classification(n_samples=10000, n_informative=10, n_classes=3, random_state=42)
# One vaex column per column of X, so the feature count is read from the data
# instead of being hard-coded.
df = vaex.from_arrays(**{f"feature{i}": X[:, i] for i in range(X.shape[1])})
# Capture the feature names before the label column is attached to the frame.
features, target = df.get_column_names(), 'target'
df[target] = y
df.head(2)

#,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,target
0,2.45147,-0.715539,0.698,3.21942,0.406325,2.38479,-0.103844,0.391614,0.0976282,5.4911,1.76723,1.06463,-0.443518,0.247427,0.978219,-2.61953,-2.15154,-0.544022,0.0145202,0.853765,2
1,-3.65761,-0.858712,2.27286,5.64451,0.918866,1.85282,1.03418,-1.97162,0.192771,1.60397,0.496333,-2.51502,0.170388,-1.56123,-2.98436,-1.45455,-0.246393,1.8335,0.419694,-0.594665,2


In [4]:
# Partition the frame into a training part and a held-out test part,
# relying on the default settings of vaex's splitter.
train_test_parts = df.ml.train_test_split()
train, test = train_test_parts



## Training 

In [11]:
from sklearn.linear_model import SGDClassifier
from vaex.ml.sklearn import Predictor
import warnings

# Silence every warning for the rest of the session (kept from the original cell).
warnings.filterwarnings('ignore')

# Replace missing feature values with the column mean of the training split.
for feature in features:
    train[feature] = train[feature].fillna(train[feature].mean())

num_epochs = 100
batch_size = 5000

# Incremental learning: every batch goes through partial_fit. The complete label
# set is passed explicitly so it does not depend on which labels happen to
# appear in the first batch.
classes = np.unique(train[target].values)
model = SGDClassifier(random_state=42)  # seeded so re-runs give the same model

for X, y in train.online.to_x_y(features=features,
                                target=target,
                                batch_size=batch_size,
                                num_epochs=num_epochs):
    model.partial_fit(X, y, classes=classes)

# Wrap the fitted estimator so its predictions become a 'prediction' column on the frame.
model = Predictor(model=model,
                  features=features,
                  target=target,
                  prediction_name='prediction')
train = model.transform(train)

training [########################################] 100.00% elapsed time  :     7.53s =  0.1m =  0.0h 
 

In [12]:
from goldilox import Pipeline
from sklearn.metrics import accuracy_score

# Build a goldilox pipeline from the transformed training frame.
pipeline = Pipeline.from_vaex(train)

# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
y_true = test[target].values
y_pred = pipeline.inference(test)['prediction'].values
print(f"Accuracy: {accuracy_score(y_true, y_pred)}")

Accuracy: 0.6625
