# PyGrunn - `vaex` - Machine Learning example: the "server" side

In [1]:
import vaex
import vaex.ml

### Read in the data

In [2]:
# Open the held-out test set and report how many rows it contains
test_set_path = './data/test_set.arrow'
test = vaex.open(test_set_path)
n_samples = len(test)
print('This test set has %i samples.' % n_samples)

This test set has 1000000 samples.


### Inspect the test set

In [3]:
# Bare last expression: the notebook renders a preview of the DataFrame (first and last rows)
test

#,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,CMT,2013-01-01 15:11:48.000000000,2013-01-01 15:18:10.000000000,4,1.0,-73.978165,40.757977,1.0,0.0,-73.98984,40.751173,CSH,6.5,0.0,0.5,0.0,0.0,7.0
1,CMT,2013-01-06 00:18:35.000000000,2013-01-06 00:22:54.000000000,1,1.5,-74.00668,40.731781,1.0,0.0,-73.994499,40.750659,CSH,6.0,0.5,0.5,0.0,0.0,7.0
2,CMT,2013-01-05 18:49:41.000000000,2013-01-05 18:54:23.000000000,1,1.1,-74.004711,40.73777,1.0,0.0,-74.009831,40.726,CSH,5.5,1.0,0.5,0.0,0.0,7.0
3,CMT,2013-01-07 23:54:15.000000000,2013-01-07 23:58:20.000000000,2,0.7,-73.97459999999998,40.759945,1.0,0.0,-73.98473699999998,40.759388,CSH,5.0,0.5,0.5,0.0,0.0,6.0
4,CMT,2013-01-07 23:25:03.000000000,2013-01-07 23:34:24.000000000,1,2.1,-73.976252,40.748528,1.0,0.0,-74.002583,40.747867,CSH,9.5,0.5,0.5,0.0,0.0,10.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,VTS,2013-01-01 12:34:00.000000000,2013-01-01 12:39:00.000000000,6,1.17,-73.974007,40.764175,1.0,,-73.96455,40.773002,CRD,5.5,0.0,0.5,1.1,0.0,7.1
999996,VTS,2013-01-01 12:01:00.000000000,2013-01-01 12:36:00.000000000,2,12.37,-73.87305999999998,40.774145,1.0,,-73.98869,40.747712,CSH,40.0,0.0,0.5,0.0,4.8,45.3
999997,VTS,2013-01-01 12:28:00.000000000,2013-01-01 12:38:00.000000000,4,3.02,-73.98922299999998,40.753687,1.0,,-74.010683,40.713553,CSH,11.0,0.0,0.5,0.0,0.0,11.5
999998,VTS,2013-01-01 12:36:00.000000000,2013-01-01 12:36:00.000000000,1,0.0,0.0,0.0,1.0,,0.0,0.0,CSH,2.5,0.0,0.5,0.0,0.0,3.0


### Load the state into the test DataFrame

In [4]:
# Apply the saved pipeline state to the raw test DataFrame. After this call
# `test` exposes the engineered feature columns and `lightgbm_prediction`,
# with no transformation code repeated in this notebook.
pipeline_state_path = './data/vaex_model_pipeline.json'
test.state_load(pipeline_state_path)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [5]:
# View the transformed test set - it includes the predictions and all intermediate steps!
# NOTE(review): the preview now ends at a lower row index than it did before state_load,
# presumably because the loaded state also carries row filters - confirm.
test

#,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,trip_duration_min,pu_hour,pu_day_of_week,pu_is_weekend,arc_distance_miles,direction_angle,PCA_0,PCA_1,PCA_2,PCA_3,label_encoded_vendor_id,lightgbm_prediction
0,CMT,2013-01-01 15:11:48.000000000,2013-01-01 15:18:10.000000000,4,1.0,-73.978165,40.757977,1.0,0.0,-73.98984,40.751173,CSH,6.5,0.0,0.5,0.0,0.0,7.0,6.366666666666666,15,1,0,0.8170350797859289,-120.23293627513229,0.005563989582974179,0.0034554530129773824,-0.007398977515577462,0.009276893157306015,0,7.364544196271947
1,CMT,2013-01-06 00:18:35.000000000,2013-01-06 00:22:54.000000000,1,1.5,-74.00668,40.731781,1.0,0.0,-73.994499,40.750659,CSH,6.0,0.5,0.5,0.0,0.0,7.0,4.316666666666666,0,6,1,0.915206089778209,32.832067311173155,-0.033099698565622254,0.0055661068803717975,-0.010575714579848391,0.012723459365073513,0,6.422300378809424
2,CMT,2013-01-05 18:49:41.000000000,2013-01-05 18:54:23.000000000,1,1.1,-74.004711,40.73777,1.0,0.0,-74.009831,40.726,CSH,5.5,1.0,0.5,0.0,0.0,7.0,4.7,18,5,1,0.4187481463954876,-156.4907551070662,-0.027438352569111583,0.008339956823645151,-0.039523027790593815,0.010445398154522724,0,6.431858147806219
3,CMT,2013-01-07 23:54:15.000000000,2013-01-07 23:58:20.000000000,2,0.7,-73.97459999999998,40.759945,1.0,0.0,-73.98473699999998,40.759388,CSH,5.0,0.5,0.5,0.0,0.0,6.0,4.083333333333333,23,0,0,0.7004873614138178,-93.14508131372725,0.009462373222828913,0.0022786548158773585,0.0022418228681065477,0.01003965259655025,0,5.840539501229937
4,CMT,2013-01-07 23:25:03.000000000,2013-01-07 23:34:24.000000000,1,2.1,-73.976252,40.748528,1.0,0.0,-74.002583,40.747867,CSH,9.5,0.5,0.5,0.0,0.0,10.5,9.35,23,0,0,1.819360209413874,-91.43802201585738,0.00013781489544046222,-0.004513223053993152,-0.017617758869502797,0.01757690476215882,0,11.853951409124315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745211,VTS,2013-01-01 11:44:00.000000000,2013-01-01 11:48:00.000000000,2,1.01,-73.977867,40.752242,1.0,,-73.984707,40.74635,CSH,5.5,0.0,0.5,0.0,0.0,6.0,4.0,11,1,0,0.4857721080797788,-130.74172519596877,0.0016677981982832183,-0.0007634001124341561,-0.008238445770749805,0.0022837314225409282,1,6.7908121204710366
745212,VTS,2013-01-01 12:30:00.000000000,2013-01-01 12:35:00.000000000,3,1.43,-73.97408799999998,40.755217,1.0,,-73.985583,40.74194,CRD,6.5,0.0,0.5,1.3,0.0,8.3,5.0,12,1,0,0.8336106186269362,-139.11454490848595,0.0064363219895815765,-0.0013900155064279053,-0.012308814501983484,0.00037386902126578404,1,8.219501134401456
745213,VTS,2013-01-01 12:27:00.000000000,2013-01-01 12:38:00.000000000,1,3.22,-73.98677999999998,40.74939,1.0,,-74.000422,40.723847,CSH,12.0,0.0,0.5,0.0,0.0,12.5,11.0,12,1,0,1.0607977719708614,-151.89422599616444,-0.006598518727099495,0.0036233387091474216,-0.03567689647266646,0.0015926040258109411,1,15.97530794255817
745214,VTS,2013-01-01 12:28:00.000000000,2013-01-01 12:38:00.000000000,4,3.02,-73.98922299999998,40.753687,1.0,,-74.010683,40.713553,CSH,11.0,0.0,0.5,0.0,0.0,11.5,10.0,12,1,0,1.668174698555257,-151.86619536928146,-0.005229616762420809,0.008372924523837585,-0.05005047116436957,0.0037501275093600565,1,14.871115191447839


In [6]:
# Check the performance of the loaded pipeline on the test set
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Materialise each column exactly once and reuse the arrays for both metrics;
# calling `.values` per metric would evaluate the prediction column twice.
y_true = test.trip_duration_min.values
y_pred = test.lightgbm_prediction.values

mae_test_score = mean_absolute_error(y_true, y_pred)
mse_test_score = mean_squared_error(y_true, y_pred)

# NOTE: the second line reports the mean squared *error*; the label text is
# kept as-is so it stays consistent with the recorded cell output.
print('The mean absolute error is %2.3f' % mae_test_score)
print('The mean squared score is %2.3f' % mse_test_score)

The mean absolute error is 2.364
The mean squared score is 12.942


### End of part 4