# Online learning benchmarks

* [LightGBM]()
* [XGBoost]()
* [CatBoost]()
* [River]()
* [Sklearn]()
* [Vowpal Wabbit]()

# Data

In [1]:
import vaex

from goldilox import Pipeline

# Open the taxi dataset. The path is relative to the notebook's working directory.
df = vaex.open('../../../datasets/taxi_2009_2015_f32.hdf5')

# Build the pickup-year expression once and reuse it for both splits.
pickup_year = df['pickup_datetime'].dt.year

# NOTE(review): `train` holds 2015 and `test` holds every earlier year; this
# looks inverted relative to a usual chronological split - confirm it is intended.
train = df[pickup_year == 2015]
test = df[pickup_year < 2015]

# Preview the raw data
df.head(3)

#,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,payment_type,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,VTS,2009-01-04 02:52:00.000000000,2009-01-04 03:02:00.000000000,1,CASH,2.63,-73.992,40.7216,,,-73.9938,40.6959,8.9,0.5,,0.0,0,9.4
1,VTS,2009-01-04 03:31:00.000000000,2009-01-04 03:38:00.000000000,3,Credit,4.55,-73.9821,40.7363,,,-73.9558,40.768,12.1,0.5,,2.0,0,14.6
2,VTS,2009-01-03 15:43:00.000000000,2009-01-03 15:57:00.000000000,5,Credit,10.35,-74.0026,40.7397,,,-73.87,40.7702,23.7,0.0,,4.74,0,28.44


# Feature engineering

In [2]:
import numpy as np
import vaex.ml

target = 'trip_duration_min'
train = train.dropna(column_names=['dropoff_latitude', 'dropoff_longitude', 'pickup_latitude'])

# Time in transit (minutes) - This is the target variable.
# The right-hand side is wrapped in parentheses so it can legally span lines;
# a bare trailing `/` followed by a newline is a SyntaxError.
train['trip_duration_min'] = ((train.dropoff_datetime - train.pickup_datetime) /
                              np.timedelta64(1, 'm'))

# Speed (miles per hour) - To be used for cleaning of the training data
train['trip_speed_mph'] = (train.trip_distance /
                           ((train.dropoff_datetime - train.pickup_datetime) /
                            np.timedelta64(1, 'h')))

# Clean data: every filter is built on the current `train` and applied in turn.
passengers_ok = (train.passenger_count > 0) & (train.passenger_count < 7)
train = train[passengers_ok]

distance_ok = (train.trip_distance > 0) & (train.trip_distance < 10)
train = train[distance_ok]

duration_ok = (train.trip_duration_min > 2) & (train.trip_duration_min < 30)
train = train[duration_ok]

speed_ok = (train.trip_speed_mph > 1) & (train.trip_speed_mph < 60)
train = train[speed_ok]

# Bounding box used as the NYC boundaries
long_min, long_max = -74.05, -73.75
lat_min, lat_max = 40.58, 40.90

# Keep only trips whose pickup and dropoff both fall strictly inside the box
inside_nyc = (
    (train.pickup_longitude > long_min) & (train.pickup_longitude < long_max) &
    (train.pickup_latitude > lat_min) & (train.pickup_latitude < lat_max) &
    (train.dropoff_longitude > long_min) & (train.dropoff_longitude < long_max) &
    (train.dropoff_latitude > lat_min) & (train.dropoff_latitude < lat_max)
)
train = train[inside_nyc]

# New features derived from the pickup timestamp
pickup_dt = train.pickup_datetime.dt
train['pickup_time'] = pickup_dt.hour + pickup_dt.minute / 60.
train['pickup_day'] = pickup_dt.dayofweek
train['pickup_month'] = pickup_dt.month - 1  # so it starts from 0
train['pickup_is_weekend'] = (train.pickup_day >= 5).astype('int')

# Cyclical encoding: each periodic column becomes a (sin, cos) pair.
# Entries are (source column, period, sin column, cos column).
cyclical_columns = (
    ('pickup_time', 24., 'pickup_time_x', 'pickup_time_y'),
    ('pickup_day', 7., 'pickup_day_x', 'pickup_day_y'),
    ('pickup_month', 12., 'pickup_month_x', 'pickup_month_y'),
)
for source_name, period, sin_name, cos_name in cyclical_columns:
    train[sin_name] = (np.sin(2 * np.pi * train[source_name] / period)).jit_numba()
    train[cos_name] = (np.cos(2 * np.pi * train[source_name] / period)).jit_numba()

# Model inputs: all `_x` columns, then all `_y` columns, then the weekend flag
x_columns = train.get_column_names(regex='.*_x')
y_columns = train.get_column_names(regex='.*_y')
features = x_columns + y_columns + ['pickup_is_weekend']

# Preview the features
train.head(2)[features]

  result_data = op['op'](a_data, b_data)
  result_data = op['op'](a_data, b_data)


#,pickup_time_x,pickup_day_x,pickup_month_x,pickup_time_y,pickup_day_y,pickup_month_y,pickup_is_weekend
0,-0.96005,0.433884,0,0.279829,-0.900969,1,0
1,-0.785317,-0.974928,0,0.619094,-0.222521,1,1


In [3]:
# Build a goldilox Pipeline from the current state of `train` (cleaning filters
# and engineered columns defined so far) and save it to disk.
processed = Pipeline.from_vaex(train)
# Last expression of the cell: its value is what the notebook displays.
processed.save('transformed.pkl')

'transformed.pkl'

# River

In [4]:
from vaex.ml.incubator.river import RiverModel
import vaex.ml.metrics
from river.linear_model import LinearRegression
from river import optim

import warnings

warnings.filterwarnings("ignore")

# Set up the model: a River linear regression optimised with SGD, wrapped in
# vaex's RiverModel so that it can be fitted on the dataframe.
sgd_optimizer = optim.SGD(lr=0.0001)
linear_regressor = LinearRegression(optimizer=sgd_optimizer, intercept_lr=0.0001)
model = RiverModel(
    model=linear_regressor,
    batch_size=11_000_000,
    features=features,
    target=target,
    prediction_name='river_predictions',
)

# Fit the model, showing a progress widget while it runs
model.fit(train, progress='widget')

# Apply the fitted model back onto `train` (prediction name: 'river_predictions')
train = model.transform(train)

HBox(children=(FloatProgress(value=0.0, max=1.0), Label(value='In progress...')))

In [5]:
# Rebuild the goldilox Pipeline from `train` after the River model transform
# has been applied, and save it under a separate file name.
processed = Pipeline.from_vaex(train)
# Last expression of the cell: its value is what the notebook displays.
processed.save('transformed1.pkl')

'transformed1.pkl'

# Vowpal Wabbit

In [None]:
import base64
import os
import tempfile

import pandas as pd
from vowpalwabbit.DFtoVW import DFtoVW
from vowpalwabbit.pyvw import vw

# Options forwarded to the vw constructor as keyword arguments
params = {'P': 1, "enable_logging": True}
model = vw(**params)

# Stream the training frame as pandas chunks of 10000 rows. Each iteration
# yields a 3-tuple; only its last element (the pandas chunk) is used.
for _chunk_start, _chunk_stop, chunk in train.to_pandas_df(chunk_size=10000):
    converter = DFtoVW.from_colnames(df=chunk, y=target, x=features)
    # Feed the converted examples to the learner one at a time
    for example in converter.convert_df():
        model.learn(example)

# NOTE(review): `model` is passed to VWModell and used for predictions further
# down this cell - confirm it is still usable after finish() has been called.
model.finish()


class VWModell(traitlets.HasTraits):
    """Picklable wrapper around a Vowpal Wabbit model.

    The wrapped model is serialised by saving it to a temporary file and
    base64-encoding the file contents; it is restored by writing those bytes
    back to a temporary file and passing its path to ``vw`` as the ``i`` option.

    NOTE(review): ``traitlets`` is not imported in any visible cell, so this
    class statement needs ``import traitlets`` to run on a fresh kernel.

    Parameters
    ----------
    model : vw instance, str or None
        A live model, a base64 string produced by ``_encode``, or ``None`` to
        create a fresh ``vw(**params)``.
    features : list of str
        Names of the feature columns handed to ``DFtoVW``.
    target : str
        Name of the target column handed to ``DFtoVW``.
    params : dict, optional
        Keyword arguments for the ``vw`` constructor.
    """

    # The argument order must match the tuple returned by __reduce__
    def __init__(self, model=None, features=None, target=None, params=None):
        self.params = params or {}
        self.features = features
        self.target = target
        self.model = self._decode_model(model)

    # This is how you make a class picklable
    def __reduce__(self):
        """Return ``(class, constructor args)`` with the model in encoded form."""
        return (self.__class__, (self._encode(), self.features, self.target, self.params))

    def _decode_model(self, encoding):
        """Turn ``encoding`` into a model object.

        ``None`` creates a new model from ``self.params``; a ``str`` is treated
        as base64 model data and loaded through a temporary file; anything else
        is returned unchanged.
        """
        if encoding is None:
            return vw(**self.params)
        if isinstance(encoding, str):
            model_data = base64.decodebytes(encoding.encode('ascii'))
            # mkstemp creates the file atomically (mktemp is deprecated and racy)
            fd, path = tempfile.mkstemp()
            try:
                with os.fdopen(fd, 'wb') as f:
                    f.write(model_data)
                params = self.params.copy()
                params['i'] = path
                # Assumes vw has finished reading the file once the constructor
                # returns, so it can be deleted afterwards - TODO confirm.
                return vw(**params)
            finally:
                os.remove(path)
        return encoding

    def _encode(self):
        """Return the model as a base64 ASCII string (bytes are passed through)."""
        if isinstance(self.model, bytes):
            return self.model
        fd, filename = tempfile.mkstemp()
        # Only the path is needed: the model writes the file itself
        os.close(fd)
        try:
            self.model.save(filename)
            with open(filename, 'rb') as f:
                model_data = f.read()
        finally:
            # Do not leave a model file behind for every pickle
            os.remove(filename)
        return base64.encodebytes(model_data).decode('ascii')

    def predict(self, data):
        """Predict for every row of ``data``.

        ``data`` may be a vaex DataFrame, a pandas DataFrame, or a 2-D numpy
        array whose columns follow ``self.features``. Returns a numpy array
        with one prediction per row.
        """
        if isinstance(data, vaex.dataframe.DataFrame):
            data = data.to_pandas_df()
        elif isinstance(data, np.ndarray):
            # Use the instance's own feature list so this also works after
            # unpickling, when no module-level `features` exists.
            data = pd.DataFrame(data, columns=self.features)
        if self.target not in data:
            # DFtoVW is given a target column; add a dummy one on a copy so the
            # caller's frame is not modified.
            data = data.copy()
            data[self.target] = 1
        examples = DFtoVW.from_colnames(df=data, y=self.target, x=self.features).convert_df()
        return np.array([self.model.predict(ex) for ex in examples])


# Wrap the trained model so it can be pickled
vw_model = VWModell(model=model, features=features, target=target, params=params)


@vaex.register_function(on_expression=False)
def predict(*columns):
    """Predict with ``vw_model`` from the given column arrays.

    The arrays are stacked and transposed so that each row is one example and
    each column one feature, then passed to ``vw_model.predict``.
    """
    data = np.array(columns).T
    return vw_model.predict(data)


# The engineered feature columns were added to `train`, not to the raw `df`,
# so the prediction column has to be attached to `train`.
train.add_function('predict', predict)
train['prediction'] = train.func.predict(*features)
train.head(2)

# Misc

In [None]:
import numpy as np
import vaex.ml

target = 'trip_duration_min'
train = train.dropna(column_names=['dropoff_latitude', 'dropoff_longitude', 'pickup_latitude'])

# Time in transit (minutes) - This is the target variable.
# The right-hand side is wrapped in parentheses so it can legally span lines;
# a bare trailing `/` followed by a newline is a SyntaxError.
train['trip_duration_min'] = ((train.dropoff_datetime - train.pickup_datetime) /
                              np.timedelta64(1, 'm'))

# Speed (miles per hour) - To be used for cleaning of the training data
train['trip_speed_mph'] = (train.trip_distance /
                           ((train.dropoff_datetime - train.pickup_datetime) /
                            np.timedelta64(1, 'h')))

# Clean data: every filter is built on the current `train` and applied in turn.
passengers_ok = (train.passenger_count > 0) & (train.passenger_count < 7)
train = train[passengers_ok]

distance_ok = (train.trip_distance > 0) & (train.trip_distance < 10)
train = train[distance_ok]

duration_ok = (train.trip_duration_min > 2) & (train.trip_duration_min < 30)
train = train[duration_ok]

speed_ok = (train.trip_speed_mph > 1) & (train.trip_speed_mph < 60)
train = train[speed_ok]

# Bounding box used as the NYC boundaries
long_min, long_max = -74.05, -73.75
lat_min, lat_max = 40.58, 40.90

# Keep only trips whose pickup and dropoff both fall strictly inside the box
inside_nyc = (
    (train.pickup_longitude > long_min) & (train.pickup_longitude < long_max) &
    (train.pickup_latitude > lat_min) & (train.pickup_latitude < lat_max) &
    (train.dropoff_longitude > long_min) & (train.dropoff_longitude < long_max) &
    (train.dropoff_latitude > lat_min) & (train.dropoff_latitude < lat_max)
)
train = train[inside_nyc]


# New features

def arc_distance(theta_1, phi_1, theta_2, phi_2):
    """Arc distance between two points given in degrees.

    ``theta_*`` are the coordinates whose cosine enters the formula and
    ``phi_*`` are the other coordinates. The angular separation is scaled by
    3958.8 before being returned. Works on scalars and array-likes alike.
    """
    # Half-differences converted from degrees to radians
    half_dtheta = (theta_2 - theta_1) / 2 * np.pi / 180
    half_dphi = (phi_2 - phi_1) / 2 * np.pi / 180

    cos_product = np.cos(theta_1 * np.pi / 180) * np.cos(theta_2 * np.pi / 180)
    haversine = np.sin(half_dtheta) ** 2 + cos_product * np.sin(half_dphi) ** 2

    central_angle = 2 * np.arctan2(np.sqrt(haversine), np.sqrt(1 - haversine))
    return central_angle * 3958.8


# Add the arc distance between pickup and dropoff as a column.
# NOTE(review): arc_distance takes cos() of its 1st and 3rd arguments, and
# longitudes are passed in those positions here - confirm the argument order
# is intended (a haversine distance would expect latitudes there).
train['arc_distance'] = arc_distance(train.pickup_longitude,
                                     train.pickup_latitude,
                                     train.dropoff_longitude,
                                     train.dropoff_latitude).jit_numba()


def direction_angle(theta_1, phi_1, theta_2, phi_2):
    """Direction from point 1 to point 2, in degrees.

    Computed as ``arctan2(theta_2 - theta_1, phi_2 - phi_1)`` converted from
    radians to degrees.
    """
    return np.rad2deg(np.arctan2(theta_2 - theta_1, phi_2 - phi_1))


# Direction of travel from pickup to dropoff, in degrees
trip_direction = direction_angle(
    train.pickup_longitude,
    train.pickup_latitude,
    train.dropoff_longitude,
    train.dropoff_latitude,
)
train['direction_angle'] = trip_direction.jit_numba()

# Encode the angle as a (sin, cos) pair
direction_sin = np.sin(2 * np.pi * train.direction_angle / 360.)
direction_cos = np.cos(2 * np.pi * train.direction_angle / 360.)
train['direction_angle_x'] = direction_sin.jit_numba()
train['direction_angle_y'] = direction_cos.jit_numba()

# Select all the features to be used for training the model.
# The three parts are joined inside one parenthesised expression; without the
# joining `+`, only the `_x` columns were assigned and the rest was discarded.
features = (train.get_column_names(regex='.*_x')
            + train.get_column_names(regex='.*_y')
            + ['pickup_is_weekend'])

# Preview the features
train.head(2)[features]