# [Vowpal wabbit](https://vowpalwabbit.org/)

## Vaex

In [1]:
from numbers import Number

import numpy as np
import vaex
from vowpalwabbit.DFtoVW import DFtoVW
from vowpalwabbit.pyvw import vw
from vaex.ml.datasets import load_titanic

from goldilox import Pipeline

df, test = load_titanic().ml.train_test_split(test_size=0.2, verbose=False)
target = 'survived'
features = df.get_column_names(regex=f"[^{target}]")
df['survived'] = df['survived'].astype('int')+1 # in VW classification is with int starting from 1


In [3]:
params = {'P':1, 
          "enable_logging": True, 
          'link': 'logistic',
          'oaa': 2}# two classes
model = vw(**params)

for _,_,d in df.to_pandas_df(chunk_size=10):
    for ex in DFtoVW.from_colnames(df=d, y=target, x=features).convert_df():
        model.learn(ex)

model.finish()     
print(' '.join(model.get_log()[-8:]))
print(f"example:\n{ex}\nprediction: {model.predict(ex)}")

0.297994 0.000000         1047         1047.0        1        1        8
 
 finished run
 number of examples = 1047
 weighted example sum = 1047.000000
 weighted label sum = 0.000000
 average loss = 0.297994
 total feature number = 11934

example:
1 | pclass:3 name=Zimmerman, Mr. Leo age:29.0 parch:0 ticket=315082 fare:7.875    
prediction: 1


In [8]:
import traitlets
import tempfile
import base64
import pandas as pd

class VWModell(traitlets.HasTraits):

        # This should work with the reduce's arguments
        def __init__(self, model=None, features=None, target=None, params=None):
            self.params = params or {}
            self.features = features
            self.target = target            
            model = self._decode_model(model)

        # This is how you make a class pickalbe
        def __reduce__(self):
            return (self.__class__, (self._encode(), self.features, self.target, self.params))

        # How vw implemented serialization
        def _decode_model(self, encoding):       
            if encoding is None:
                print(1)
                return vw(**self.params)     
            if isinstance(encoding, bytes):                
                print(2)
                model_data = base64.decodebytes(encoding.encode('ascii'))
                openfilename = tempfile.mktemp()
                with open(openfilename, 'wb') as f:
                    f.write(model_data)
                params = self.params.copy()
                params['i']= openfilename
                return vw(**params)
            else:
                print(3)
                print(encoding)
                return encoding

        # How vw implemented serialization
        def _encode(self):
            if isinstance(self.model, bytes):
                return self.model
            filename = tempfile.mktemp()
            self.model.save(filename)
            with open(filename, 'rb') as f:
                model_data = f.read()
            encoding =  base64.encodebytes(model_data).decode('ascii')
            return encoding   
        
        def predict(self, data):   
            if isinstance(data, vaex.dataframe.DataFrame):
                data = data.to_pandas_df()
            elif isinstance(data, np.ndarray):
                data = pd.DataFrame(data, columns=features)  
            if self.target not in data:                
                data[self.target] = 1
            examples = DFtoVW.from_colnames(df=data, y=target, x=features).convert_df()            
            return np.array([self.model.predict(ex) for ex in examples])

vw_model = VWModell(model=model, features=features, target=target, params=params)

@vaex.register_function(on_expression=False)
def predict(*columns):
    data = np.array(columns).T                
    return vw_model.predict(data)

df.add_function('predict',predict)
df['prediction'] = df.func.predict(*tuple([df[feature] for feature in features]))
df.head(2)

ERROR:MainThread:vaex:error evaluating: prediction at rows 0-2
Traceback (most recent call last):
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/vaex/scopes.py", line 106, in evaluate
    result = self[expression]
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/vaex/scopes.py", line 166, in __getitem__
    raise KeyError("Unknown variables or column: %r" % (variable,))
KeyError: "Unknown variables or column: 'predict(pclass, name, age, parch, ticket, fare, cabin, boat, body, home_dest)'"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/vaex/dataframe.py", line 2047, in data_type
    data = self.evaluate(expression, 0, 1, filtered=False, array_type=array_type, parallel=False)
  File "/Users/y

3


#,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_dest,prediction
0,1,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S,--,,"Duluth, MN",error
1,1,2,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S,11,,"Duluth, MN",error


In [5]:
from sklearn.metrics import accuracy_score
from goldilox import Pipeline
pipeline = Pipeline.from_vaex(df)
pipeline.inference(test)
# accuracy_score(pipeline.inference(test)["prediction"].values, test[target].values)


ERROR:MainThread:vaex:error evaluating: prediction at rows 0-5
Traceback (most recent call last):
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/vaex/scopes.py", line 106, in evaluate
    result = self[expression]
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/vaex/scopes.py", line 166, in __getitem__
    raise KeyError("Unknown variables or column: %r" % (variable,))
KeyError: "Unknown variables or column: 'predict(pclass, name, age, parch, ticket, fare, cabin, boat, body, home_dest)'"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/vaex/dataframe.py", line 2047, in data_type
    data = self.evaluate(expression, 0, 1, filtered=False, array_type=array_type, parallel=False)
  File "/Users/y

<class 'str'>
BwAAADguMTEuMAABAAAAAG0AAIC/AACAPxIAAAAAAAAAAAAAAAAAAAAZAAAAIC0tbGluayBsb2dp
c3RpYyAtLW9hYSAyAAQAAAC9M8NqAAQAAAAAzTq9BQAAAFgjf7wGAAAA/IsQvAcAAAA/i729DgAA
ACqEBr4PAAAA9bSlPRQAAABne8y9FQAAAETOnz0eAAAA+xKEvh8AAADs6mU+IAAAAIj+9b0hAAAA
aHbTPXAAAACsf4I+cQAAANxuGL+4AAAAA0c5PrkAAADCAVO+ggEAAL7VoryDAQAA4Pd8vvYBAAAL
YcQ99wEAAJZv/702AgAASKhQvjcCAAAo8jc+cgMAAEjZ6b1zAwAAJKfMPaIDAABLbsM9owMAAPTm
9r0GBAAAI03bvQcEAAA0Mrw9OgQAAON8r707BAAAXvmIPeYEAABorOU95wQAAHDHAL5SBQAAo/DK
PVMFAACMyu69egUAAGohlj17BQAAsnnAvZwFAAAsMN69nQUAAMfOuz3eBQAAC7KDvd8FAAB8W/G8
4AUAAOD9Qb7hBQAAnHktPuIFAADGpco94wUAAFSbB75MBgAAqxW+PU0GAAD8X/q9pgYAAPVaVz6n
BgAA4beUviAHAAAwEae9IQcAAHjuSj1qBwAAUDGNPWsHAAD5QM29bAcAAJLDlj1tBwAAtXvDvfYH
AACxjbg99wcAAOXK2L0gCAAAamoBviEIAADtMuo9hAgAANoKYj2FCAAA5t7jvegIAACS1Zo96QgA
AO2Gvb0cCQAAmjObPR0JAADMi769JgkAAOgG9r0nCQAAyUzUPW4JAABLDrc9bwkAANq33r0QCgAA
k1iQPBEKAACyD1q+YAoAAB3TsD1hCgAANZPKvWoKAADXDPA9awoAADb8B76MCwAAevC0PY0LAABk
tem9MgwAAGd/qL0zDAAAaAGUPUoMAADT/2w+SwwAAML/nb5iDAAAI9a7PWMMAA

ERROR:MainThread:vaex:error evaluating: prediction at rows 0-5
Traceback (most recent call last):
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/vaex/scopes.py", line 106, in evaluate
    result = self[expression]
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/vaex/scopes.py", line 166, in __getitem__
    raise KeyError("Unknown variables or column: %r" % (variable,))
KeyError: "Unknown variables or column: 'predict(pclass, name, age, parch, ticket, fare, cabin, boat, body, home_dest)'"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/yonatanalexander/Dropbox/Development_box/xdss-projects/goldilox/.venv/lib/python3.7/site-packages/vaex/dataframe.py", line 2047, in data_type
    data = self.evaluate(expression, 0, 1, filtered=False, array_type=array_type, parallel=False)
  File "/Users/y

#,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_dest,prediction
0,1,2,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",error
1,1,2,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",error
2,1,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,--,,"Montreal, PQ / Chesterville, ON",error
3,1,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,--,135.0,"Montreal, PQ / Chesterville, ON",error
4,1,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,--,,"Montreal, PQ / Chesterville, ON",error
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,1,2,"Schabert, Mrs. Paul (Emma Mock)",female,35.0,1,0,13236,57.75,C28,C,11,,"New York, NY",error
258,1,2,"Serepeca, Miss. Augusta",female,30.0,0,0,113798,31.0,--,C,4,,--,error
259,1,2,"Seward, Mr. Frederic Kimber",male,34.0,0,0,113794,26.55,--,S,7,,"New York, NY",error
260,1,2,"Shutes, Miss. Elizabeth W",female,40.0,0,0,PC 17582,153.4625,C125,S,3,,"New York, NY / Greenwich CT",error
