[<img src="ml_impossible.png">](https://towardsdatascience.com/ml-impossible-train-a-1-billion-sample-model-in-20-minutes-with-vaex-and-scikit-learn-on-your-9e2968e6f385)

# `vaex` @ MLOps community

## Modern data science with `vaex`: a new approach to DataFrames and pipelines

### New York Taxi Dataset (2009-2015): Exploratory Data Analysis and Machine Learning example (Part II)

https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [1]:
import os

import vaex
import vaex.ml

import warnings; warnings.simplefilter('ignore')

### ~~Load~~ Point to the data on disk

In [2]:
# Check file size on disk
!du -h /data/yellow_taxi_*

# Read the same file as before
# df = vaex.open('/data/taxi/yellow_taxi_2012.hdf5')
df = vaex.open('/data/yellow_taxi_2009_2015_f32.hdf5')

108G	/data/yellow_taxi_2009_2015_f32.hdf5
28G	/data/yellow_taxi_2009_2015_f32.parquet
164G	/data/yellow_taxi_2009_2015.hdf5
12G	/data/yellow_taxi_2015_f32.arrow
12G	/data/yellow_taxi_2015_f32.hdf5


### Split the data into training and test sets

In [3]:
# Row index separating the training rows from the held-out test rows
split_row = 1_026_944_937
df_train, df_test = df[:split_row], df[split_row:]

# Measure each DataFrame once and reuse the counts below
n_total, n_train, n_test = len(df), len(df_train), len(df_test)

print(f'Number of samples in the full dataset: {n_total:,}')
print(f'Number of samples in the training set: {n_train:,}')
print(f'Number of samples in the test set:      {n_test:,}')

# Sanity check: the two slices together must account for every row
assert n_total == n_test + n_train

Number of samples in the full dataset: 1,173,057,927
Number of samples in the training set: 1,026,944,937
Number of samples in the test set:      146,112,990


### Inspect the test set

In [4]:
# Display a preview of the test DataFrame before any state is applied
df_test

#,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,payment_type,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,VTS,2014-12-15 18:23:00.000000000,2014-12-15 18:58:00.000000000,2,,6.28,-74.00419,40.72119,1.0,,-73.97,,,,,,,
1,VTS,2015-01-15 19:05:39.000000000,2015-01-15 19:23:42.000000000,1,1,1.59,-73.9939,40.75011,1.0,0.0,-73.974785,40.750618,12.0,1.0,0.5,3.25,0.0,17.05
2,CMT,2015-01-10 20:33:38.000000000,2015-01-10 20:53:28.000000000,1,1,3.3,-74.00165,40.724243,1.0,0.0,-73.994415,40.75911,14.5,0.5,0.5,2.0,0.0,17.8
3,CMT,2015-01-10 20:33:38.000000000,2015-01-10 20:43:41.000000000,1,2,1.8,-73.96334,40.802788,1.0,0.0,-73.95182,40.824413,9.5,0.5,0.5,0.0,0.0,10.8
4,CMT,2015-01-10 20:33:39.000000000,2015-01-10 20:35:31.000000000,1,2,0.5,-74.00909,40.713818,1.0,0.0,-74.004326,40.719986,3.5,0.5,0.5,0.0,0.0,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146112985,VTS,2015-12-31 23:59:56.000000000,2016-01-01 00:08:18.000000000,5,1,1.2,-73.99381,40.72087,1.0,0.0,-73.98621,40.72247,7.5,0.5,0.5,1.76,0.0,10.56
146112986,CMT,2015-12-31 23:59:58.000000000,2016-01-01 00:05:19.000000000,2,2,2.0,-73.96527,40.76028,1.0,0.0,-73.939514,40.752388,7.5,0.5,0.5,0.0,0.0,8.8
146112987,CMT,2015-12-31 23:59:59.000000000,2016-01-01 00:12:55.000000000,2,2,3.8,-73.9873,40.73908,1.0,0.0,-73.98867,40.6933,13.5,0.5,0.5,0.0,0.0,14.8
146112988,VTS,2015-12-31 23:59:59.000000000,2016-01-01 00:10:26.000000000,1,2,1.96,-73.99756,40.725693,1.0,0.0,-74.01712,40.705322,8.5,0.5,0.5,0.0,0.0,9.8


### Apply the state to the test DataFrame

In [5]:
# df_test.state_load('./taxi_ml_state.json')  # if the state file is local

# Location of the Google Cloud credentials (token) file. Prefer the standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable so the notebook is not
# tied to one user's home directory; fall back to the original path otherwise.
gcs_token = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS',
                           '/home/jovan/.keys/vaex-282913-a6370a7624ec.json')

# 'cache': False is passed through unchanged from the original options
fs_options = {'token': gcs_token, 'cache': False}

# Apply the saved state (stored in a GCS bucket) to the test DataFrame in place
df_test.state_load('gs://vaex-data/demo/taxi_ml_state.json', fs_options=fs_options)

In [6]:
# Display the test DataFrame again, now that the saved state has been loaded
df_test

#,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,payment_type,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,trip_duration_min,trip_speed_mph,pickup_time,pickup_day,pickup_is_weekend,arc_distance,direction_angle,PCA_0,PCA_1,PCA_2,PCA_3,pickup_time_x,pickup_time_y,pickup_day_x,pickup_day_y,direction_angle_x,direction_angle_y,standard_scaled_arc_distance,predicted_duration_min,pred_final
0,VTS,2015-01-15 19:05:39.000000000,2015-01-15 19:23:42.000000000,1,1,1.59,-73.9939,40.75011,1.0,0.0,-73.974785,40.750618,12.0,1.0,0.5,3.25,0.0,17.05,18.05,5.285318670510585,19.083333333333332,3,0,1.3205364,88.47933,-0.009463026,0.00962955,0.0009385613,-0.0042585577,0.279829014030992,-0.9600498543859287,-0.900968867902419,0.43388373911755823,0.026537478,0.9996478,0.18589446,13.243260238548848,13.243260238548848
1,CMT,2015-01-10 20:33:38.000000000,2015-01-10 20:53:28.000000000,1,1,3.3,-74.00165,40.724243,1.0,0.0,-73.994415,40.75911,14.5,0.5,0.5,2.0,0.0,17.8,19.833333333333332,9.983193133057666,20.55,5,1,0.8311264,11.719215,-0.034722015,7.998012e-05,-0.0031515714,0.016735025,0.619093949309834,-0.785316930880745,-0.2225209339563146,-0.9749279121818236,0.97915477,0.20311569,-0.36002454,10.521180965591727,10.521180965591727
2,CMT,2015-01-10 20:33:38.000000000,2015-01-10 20:43:41.000000000,1,2,1.8,-73.96334,40.802788,1.0,0.0,-73.95182,40.824413,9.5,0.5,0.5,0.0,0.0,10.8,10.05,10.746268372037516,20.55,5,1,0.89671814,28.045202,0.05094445,0.017341323,0.074795686,0.018506352,0.619093949309834,-0.785316930880745,-0.2225209339563146,-0.9749279121818236,0.88257694,0.47016802,-0.28685933,8.98142024075389,8.98142024075389
3,CMT,2015-01-10 20:33:39.000000000,2015-01-10 20:52:58.000000000,1,2,3.0,-73.971176,40.76243,1.0,0.0,-74.00418,40.742653,15.0,0.5,0.5,0.0,0.0,16.3,19.316666666666666,9.318377911993098,20.55,5,1,2.3113708,-120.928696,0.014120486,-0.0009419392,-0.022247035,0.015489833,0.619093949309834,-0.785316930880745,-0.2225209339563146,-0.9749279121818236,-0.5139711,-0.8578075,1.2911342,15.954830808893831,15.954830808893831
4,CMT,2015-01-10 20:33:39.000000000,2015-01-10 20:58:31.000000000,1,2,2.2,-73.98328,40.72601,1.0,0.0,-73.99247,40.749634,14.0,0.5,0.5,0.0,0.0,15.3,24.866666666666667,5.308311107011646,20.55,5,1,0.7786042,-21.26345,-0.022162506,-0.013443824,-0.009868959,0.009774372,0.619093949309834,-0.785316930880745,-0.2225209339563146,-0.9749279121818236,0.93192273,-0.36265683,-0.41861114,10.188928459050407,10.188928459050407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114684110,VTS,2015-12-31 23:59:56.000000000,2016-01-01 00:08:18.000000000,5,1,1.2,-73.99381,40.72087,1.0,0.0,-73.98621,40.72247,7.5,0.5,0.5,1.76,0.0,10.56,8.366666666666667,8.605578031197961,23.983333333333334,3,0,0.5259203,78.12149,-0.03264316,-0.008193069,-0.028742317,-0.010740086,0.9999904807207345,-0.004363309284746544,-0.900968867902419,0.43388373911755823,0.20583715,0.97858626,-0.70047086,8.600918589581578,8.600918589581578
114684111,CMT,2015-12-31 23:59:58.000000000,2016-01-01 00:05:19.000000000,2,2,2.0,-73.96527,40.76028,1.0,0.0,-73.939514,40.752388,7.5,0.5,0.5,0.0,0.0,8.8,5.35,22.429906542056074,23.983333333333334,3,0,1.7860186,107.036514,0.01600008,-0.006937808,0.022333903,-0.032354712,0.9999904807207345,-0.004363309284746544,-0.900968867902419,0.43388373911755823,-0.29298112,0.9561182,0.70512295,11.928962449936897,11.928962449936897
114684112,CMT,2015-12-31 23:59:59.000000000,2016-01-01 00:12:55.000000000,2,2,3.8,-73.9873,40.73908,1.0,0.0,-73.98867,40.6933,13.5,0.5,0.5,0.0,0.0,14.8,12.933333333333334,17.628865758168327,23.983333333333334,3,0,0.8776615,-178.28178,-0.0142205665,-0.0023130435,-0.054195464,-0.025201043,0.9999904807207345,-0.004363309284746544,-0.900968867902419,0.43388373911755823,-0.9995504,-0.029983945,-0.30811632,10.337722683373048,10.337722683373048
114684113,VTS,2015-12-31 23:59:59.000000000,2016-01-01 00:10:26.000000000,1,2,1.96,-73.99756,40.725693,1.0,0.0,-74.01712,40.705322,8.5,0.5,0.5,0.0,0.0,9.8,10.45,11.253588735772093,23.983333333333334,3,0,1.4061307,-136.1602,-0.031087026,-0.0022887718,-0.0603564,0.005064793,0.9999904807207345,-0.004363309284746544,-0.900968867902419,0.43388373911755823,-0.72127926,-0.69264436,0.28137177,12.76441980356352,12.76441980356352


### Check metrics

In [7]:
# Compare the final prediction column against the observed `trip_duration_min`
mae = df_test.ml.metrics.mean_absolute_error('trip_duration_min', 'pred_final')
print('Mean absolute error:', mae)

Mean absolute error: 3.584379318840206


### What about production?

In [8]:
# Start again from a fresh slice of the test rows (same split index as before)
df_test = df[1_026_944_937:]

# Location of the Google Cloud credentials (token) file. Prefer the standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable so the notebook is not
# tied to one user's home directory; fall back to the original path otherwise.
gcs_token = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS',
                           '/home/jovan/.keys/vaex-282913-a6370a7624ec.json')
fs_options = {'token': gcs_token, 'cache': False}
df_test.state_load('gs://vaex-data/demo/taxi_ml_state.json', fs_options=fs_options)

# Set the "production" variable to True
df_test.variables['production'] = True

print(f'Number of samples in the test set in "production mode" is {len(df_test):,}.')

Number of samples in the test set in "production mode" is 146,112,990.


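Be careful, however: in "production mode" the test set keeps all 146,112,990 rows, so samples with missing values (such as row 0 below) are no longer filtered out and show up in the feature columns.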

In [9]:
# Columns shown as model inputs; the order is kept exactly as in the original list
features = [
    # PCA components
    'PCA_0', 'PCA_1', 'PCA_2', 'PCA_3',
    # scaled arc distance
    'standard_scaled_arc_distance',
    # "_x" parts of the time, day and direction features
    'pickup_time_x', 'pickup_day_x', 'direction_angle_x',
    # "_y" parts of the time, day and direction features
    'pickup_time_y', 'pickup_day_y', 'direction_angle_y',
    # weekend flag
    'pickup_is_weekend',
]


# Display only the feature columns of the "production mode" test DataFrame
df_test[features]

#,PCA_0,PCA_1,PCA_2,PCA_3,standard_scaled_arc_distance,pickup_time_x,pickup_day_x,direction_angle_x,pickup_time_y,pickup_day_y,direction_angle_y,pickup_is_weekend
0,-0.038689442,0.00024529733,0.01138061,-0.0029030032,,0.10018806161207608,1.0,,-0.9949685182509117,0.0,,0
1,-0.009463026,0.00962955,0.0009385613,-0.0042585577,0.18589446,0.279829014030992,-0.900968867902419,0.026537478,-0.9600498543859287,0.43388373911755823,0.9996478,0
2,-0.034722015,7.998012e-05,-0.0031515714,0.016735025,-0.36002454,0.619093949309834,-0.2225209339563146,0.97915477,-0.785316930880745,-0.9749279121818236,0.20311569,1
3,0.05094445,0.017341323,0.074795686,0.018506352,-0.28685933,0.619093949309834,-0.2225209339563146,0.88257694,-0.785316930880745,-0.9749279121818236,0.47016802,1
4,-0.047522366,-0.00034094043,-0.041028067,0.0027979594,-0.89751655,0.619093949309834,-0.2225209339563146,0.79164016,-0.785316930880745,-0.9749279121818236,0.6109876,1
...,...,...,...,...,...,...,...,...,...,...,...,...
146112985,-0.03264316,-0.008193069,-0.028742317,-0.010740086,-0.70047086,0.9999904807207345,-0.900968867902419,0.20583715,-0.004363309284746544,0.43388373911755823,0.97858626,0
146112986,0.01600008,-0.006937808,0.022333903,-0.032354712,0.70512295,0.9999904807207345,-0.900968867902419,-0.29298112,-0.004363309284746544,0.43388373911755823,0.9561182,0
146112987,-0.0142205665,-0.0023130435,-0.054195464,-0.025201043,-0.30811632,0.9999904807207345,-0.900968867902419,-0.9995504,-0.004363309284746544,0.43388373911755823,-0.029983945,0
146112988,-0.031087026,-0.0022887718,-0.0603564,0.005064793,0.28137177,0.9999904807207345,-0.900968867902419,-0.72127926,-0.004363309284746544,0.43388373911755823,-0.69264436,0
