# Get data and packages

## Prep benchmarks

In [5]:
import vaex
import pandas as pd
import numpy as np
import warnings
import datetime as dt
import time
import gc

warnings.filterwarnings("ignore")

benchmarks = {}

def benchmark(f, name, df, repetitions=1, **kwargs):
    times = []
    count = 0
    for i in range(repetitions):
        start_time = time.time()
        ret = f(df, **kwargs)
        times.append(time.time()-start_time)
        count+=1
    benchmarks[name] = np.mean(times)/count
    return benchmarks[name]

def get_results(benchmarks, name):
    results = pd.DataFrame.from_dict(benchmarks, orient='index')
    results.columns = [name]
    return results

### Vaex implementation

In [6]:
def read_file():
    return vaex.open('datasets/taxi_1m.hdf5')
    
def mean(df):
    return df.fare_amount.mean()
    
def standard_deviation(df):
    return df.fare_amount.std()

def sum_columns(df):
    return df.fare_amount + df.passenger_count

def product_columns(df):
    return df.fare_amount * df.passenger_count

def complicated_arithmetic_operation(df):
    theta_1 = df.pickup_longitude
    phi_1 = df.pickup_latitude
    theta_2 = df.dropoff_longitude
    phi_2 = df.dropoff_latitude
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))

def value_counts(df):
    return df.passenger_count.value_counts()

def groupby_statistics(df):
    return df.groupby(by='pickup_hour').agg({'fare_amount': ['mean', 'std'], 
                                               'tip_amount': ['mean', 'std']
                                              })
def join(df, other):
    df_joined = df.join(other=gp, on = 'pickup_hour', rsuffix = '_right')
    

def filter_data(df):
    return df[expr_filter]



## Naive

In [10]:
# Load data
data = vaex.open('datasets/taxi_1m.hdf5')
data['pickup_hour'] = data.pickup_datetime.dt.hour

In [11]:
# benchmark
benchmark(mean, 'mean', data, repetitions=10)
benchmark(standard_deviation,'standard deviation', data, repetitions=10)
benchmark(sum_columns, 'sum columns', data, repetitions=10)
benchmark(product_columns, 'product columns', data, repetitions=10)
benchmark(complicated_arithmetic_operation, 'complicated arithmetic operation', data, repetitions=10)
benchmark(value_counts, 'value counts', data, repetitions=10)
benchmark(groupby_statistics, 'groupby statistics', data, repetitions=10)

gp = groupby_statistics(data)
benchmark(join, 'join', data, repetitions=10, other=gp)
print(f"Done benchmarks on all data")

Done benchmarks on all data


## Filtered

In [12]:
# load data
long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90

expr_filter = (data.pickup_longitude > long_min)  & (data.pickup_longitude < long_max) & \
              (data.pickup_latitude > lat_min)    & (data.pickup_latitude < lat_max) & \
              (data.dropoff_longitude > long_min) & (data.dropoff_longitude < long_max) & \
              (data.dropoff_latitude > lat_min)   & (data.dropoff_latitude < lat_max)

filterd = data[expr_filter]
del data
del gp

deleted = gc.collect()
print(f"Prepare filtered data and deleted {deleted} MB")

Prepare filtered data and deleted 1864 MB


In [13]:
# benchmark
benchmark(mean, 'filtered mean', filterd, repetitions=10)
benchmark(standard_deviation,'filtered standard deviation', filterd, repetitions=10)
benchmark(sum_columns, 'filtered sum columns', filterd, repetitions=10)
benchmark(product_columns, 'filtered product_columns', filterd, repetitions=10)
benchmark(complicated_arithmetic_operation, 'filtered complicated arithmetic_operation', filterd, repetitions=10)
benchmark(value_counts, 'filtered value_counts', filterd, repetitions=10)
benchmark(groupby_statistics, 'filtered groupby statistics', filterd, repetitions=10)

gp = filterd.groupby(by='pickup_hour').agg({'fare_amount': ['mean', 'std'], 
                                        'tip_amount': ['mean', 'std']
                                        })
benchmark(join, 'filtered join', filterd, repetitions=10, other=gp)

0.006154913902282714

In [17]:
name = 'vaex'
results = get_results(benchmarks, name)
results.to_csv(f"results/{name}_1m.csv")
results.head()

Unnamed: 0,vaex
mean,0.0009630513
standard deviation,0.002164185
sum columns,1.227856e-06
product columns,8.893013e-07
complicated arithmetic operation,0.0004458714


In [19]:
filterd

#,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,payment_type,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,pickup_hour
0,VTS,2009-01-04 02:52:00.000000000,2009-01-04 03:02:00.000000000,1,CASH,2.630000114440918,-73.99195861816406,40.72156524658203,,,-73.99380493164062,40.6959228515625,8.899999618530273,0.5,,0.0,0.0,9.399999618530273,2
1,VTS,2009-01-04 03:31:00.000000000,2009-01-04 03:38:00.000000000,3,Credit,4.550000190734863,-73.98210144042969,40.736289978027344,,,-73.95584869384766,40.768028259277344,12.100000381469727,0.5,,2.0,0.0,14.600000381469727,3
2,VTS,2009-01-03 15:43:00.000000000,2009-01-03 15:57:00.000000000,5,Credit,10.350000381469727,-74.0025863647461,40.73974609375,,,-73.86997985839844,40.770225524902344,23.700000762939453,0.0,,4.739999771118164,0.0,28.440000534057617,15
3,DDS,2009-01-01 20:52:58.000000000,2009-01-01 21:14:00.000000000,1,CREDIT,5.0,-73.9742660522461,40.79095458984375,,,-73.9965591430664,40.731849670410156,14.899999618530273,0.5,,3.049999952316284,0.0,18.450000762939453,20
4,DDS,2009-01-24 16:18:23.000000000,2009-01-24 16:24:56.000000000,1,CASH,0.4000000059604645,-74.00157928466797,40.719383239746094,,,-74.00837707519531,40.7203483581543,3.700000047683716,0.0,,0.0,0.0,3.700000047683716,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979057,CMT,2009-01-14 19:43:31.000000000,2009-01-14 19:48:08.000000000,1,Cash,0.6000000238418579,-73.98531341552734,40.76328659057617,,,-73.98221588134766,40.77125549316406,5.5,0.0,,0.0,0.0,5.5,19
979058,CMT,2009-01-12 16:36:30.000000000,2009-01-12 16:40:43.000000000,1,Credit,0.800000011920929,-73.95894622802734,40.78066635131836,,,-73.9701919555664,40.78417205810547,5.900000095367432,0.0,,1.0,0.0,6.900000095367432,16
979059,CMT,2009-01-12 14:16:57.000000000,2009-01-12 14:28:02.000000000,1,Credit,4.699999809265137,-73.98450469970703,40.72896957397461,,,-74.01480102539062,40.70855712890625,12.899999618530273,0.0,,1.9299999475479126,0.0,14.829999923706055,14
979060,CMT,2009-01-22 11:21:35.000000000,2009-01-22 11:37:06.000000000,1,Cash,1.5,-73.99223327636719,40.749298095703125,,,-73.98204040527344,40.762874603271484,9.300000190734863,0.0,,0.0,0.0,9.300000190734863,11
