# [Vowpal wabbit](https://vowpalwabbit.org/)

## Vaex

In [1]:
from numbers import Number

import numpy as np
import vaex
from vowpalwabbit.DFtoVW import DFtoVW
from vowpalwabbit.pyvw import vw
from vaex.ml.datasets import load_titanic

from goldilox import Pipeline

df, test = load_titanic().ml.train_test_split(test_size=0.2, verbose=False)
target = 'survived'
features = df.get_column_names(regex=f"[^{target}]")
df['survived'] = df['survived'].astype('int')+1 # in VW classification is with int starting from 1


In [2]:
params = {'P':1, 
          "enable_logging": True, 
          'link': 'logistic',
          'oaa': 2}# two classes
model = vw(**params)

for _,_,d in df.to_pandas_df(chunk_size=10):
    for ex in DFtoVW.from_colnames(df=d, y=target, x=features).convert_df():
        model.learn(ex)

model.finish()     
print(' '.join(model.get_log()[-8:]))
print(f"example:\n{ex}\nprediction: {model.predict(ex)}")

0.297994 0.000000         1047         1047.0        1        1        8
 
 finished run
 number of examples = 1047
 weighted example sum = 1047.000000
 weighted label sum = 0.000000
 average loss = 0.297994
 total feature number = 11934

example:
1 | pclass:3 name=Zimmerman, Mr. Leo age:29.0 parch:0 ticket=315082 fare:7.875    
prediction: 1


Sadly, VW is not pickable, it has it's own implementation for serialization.    
It is a small bummer, but we can go around it by implementing a class with *\_\_reduce\_\_()* which implement the VW serialization.     

This is how we can take any model to production as long as there is a way to save it to file!

In [3]:
import traitlets
import tempfile
import base64
import pandas as pd

class VWModell(traitlets.HasTraits):

        # This should work with the reduce's arguments
        def __init__(self, model=None, features=None, target=None, params=None):
            self.params = params or {}
            self.features = features
            self.target = target            
            self.model = self._decode_model(model)

        # This is how you make a class pickalbe
        def __reduce__(self):
            return (self.__class__, (self._encode(), self.features, self.target, self.params))

        # How vw implemented serialization
        def _decode_model(self, encoding):       
            if encoding is None:
                return vw(**self.params)                
            if isinstance(encoding, str):                
                model_data = base64.decodebytes(encoding.encode('ascii'))
                openfilename = tempfile.mktemp()
                with open(openfilename, 'wb') as f:
                    f.write(model_data)
                params = self.params.copy()
                params['i']= openfilename
                return vw(**params)
            else:
                return encoding

        # How vw implemented serialization
        def _encode(self):
            if isinstance(self.model, bytes):
                return self.model
            filename = tempfile.mktemp()
            self.model.save(filename)
            with open(filename, 'rb') as f:
                model_data = f.read()
            encoding =  base64.encodebytes(model_data).decode('ascii')
            return encoding   
        
        def predict(self, data):   
            if isinstance(data, vaex.dataframe.DataFrame):
                data = data.to_pandas_df()
            elif isinstance(data, np.ndarray):
                data = pd.DataFrame(data, columns=features)  
            if self.target not in data:                
                data[self.target] = 1
            examples = DFtoVW.from_colnames(df=data, y=target, x=features).convert_df()            
            return np.array([self.model.predict(ex) for ex in examples])

vw_model = VWModell(model=model, features=features, target=target, params=params)

@vaex.register_function(on_expression=False)
def predict(*columns):
    data = np.array(columns).T                
    return vw_model.predict(data)

df.add_function('predict',predict)
df['prediction'] = df.func.predict(*features)
df.head(2)

#,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_dest,prediction
0,1,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S,--,,"Duluth, MN",1
1,1,2,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S,11,,"Duluth, MN",1


Evaluate:   
We use the pipeline here to apply the model on new data.

In [22]:
from goldilox import Pipeline
from sklearn.metrics import accuracy_score

pipeline = Pipeline.from_vaex(df)
accuracy = accuracy_score(pipeline.inference(test)["prediction"].values, test[target].values)
print(f"Accuracy: {accuracy}")


Accuracy: 0.6297709923664122


# Serve

In [23]:
print(f"Saved to: {pipeline.save('../tests/models/server.pkl')}")
print(f"Check out the docs: http://127.0.0.1:5000\n")
!gl serve ../tests/models/server.pkl

Saved to: ../tests/models/server.pkl
Check out the docs: http://127.0.0.1:5000

[2021-11-27 20:11:02 +0100] [80217] [INFO] Starting gunicorn 20.1.0
[2021-11-27 20:11:02 +0100] [80217] [INFO] Listening at: http://127.0.0.1:5000 (80217)
[2021-11-27 20:11:02 +0100] [80217] [INFO] Using worker: uvicorn.workers.UvicornH11Worker
[2021-11-27 20:11:02 +0100] [80242] [INFO] Booting worker with pid: 80242
[2021-11-27 20:11:02 +0100] [80242] [INFO] Started server process [80242]
[2021-11-27 20:11:02 +0100] [80242] [INFO] Waiting for application startup.
[2021-11-27 20:11:02 +0100] [80242] [INFO] Application startup complete.
^C
[2021-11-27 20:11:43 +0100] [80217] [INFO] Handling signal: int
