# Get data and packages

In [1]:
%%capture
%%bash
python -m pip install -U pip
python -m pip install vaex-core==2.0.0a5
python -m pip install vaex-hdf5==0.6.0a1 
python -m pip install -U numpy 
python -m pip install -U ipython ipykernel
# aws s3 cp s3://xdss-public-datasets/demos/taxi_1B.hdf5 datasets/taxi_1B.hdf5

# Please restart the kernel after installation

In [79]:
import gc
import numpy as np
import pandas as pd
import warnings
import time
import gc
import os

# Run configuration: adjust the two values marked "change this" for each run.
instance_type = 'c5d2xlarge' # change this
results_bucket = "s3://vaex-sagemaker-demo/benchmarks" # change this

name = 'vaex'
data_path = 'datasets/taxi_1B.hdf5'
output_file = '{}_{}.csv'.format(name, instance_type)
results_path = 'results/' + output_file
# From here on `results_bucket` holds the full destination URI of the CSV,
# not just the prefix configured above.
results_bucket = results_bucket + '/' + output_file
# Column-wise accumulator filled by `benchmark`: one entry per timed run.
benchmarks = dict(run=[], duration=[], task=[])

# Longitude/latitude bounds applied to pickup and dropoff points by `filter_data`.
long_min, long_max = -74.05, -73.75
lat_min, lat_max = 40.58, 40.90


def get_results(benchmarks):
    """Turn the benchmark accumulator into a pandas DataFrame.

    Parameters
    ----------
    benchmarks : dict
        Mapping of column name to a list of values.

    Returns
    -------
    pandas.DataFrame
        One column per key of ``benchmarks``; built row-wise and then
        transposed.
    """
    keyed_by_row = pd.DataFrame.from_dict(benchmarks, orient='index')
    return keyed_by_row.transpose()

def persist():
    """Save the accumulated benchmark results locally and copy them to S3.

    Reads the module-level ``benchmarks``, ``results_path`` and
    ``results_bucket``. The CSV is always written to ``results_path``; the
    upload through the ``aws`` CLI is best-effort, so a missing CLI or a
    failed copy produces a warning instead of an exception.

    Returns
    -------
    None
    """
    # Local import: the notebook's import cell does not bring in subprocess.
    import subprocess

    gc.collect()
    get_results(benchmarks).to_csv(results_path)

    # Pass the command as an argument list so the paths are never re-parsed
    # by a shell.
    upload_command = ["aws", "s3", "cp", results_path, results_bucket]
    try:
        completed = subprocess.run(upload_command)
    except OSError as error:
        # Typically raised when the aws executable is not installed.
        warnings.warn(f"could not run the aws CLI, results were only saved locally: {error}")
        return
    if completed.returncode != 0:
        warnings.warn(
            f"aws s3 cp exited with status {completed.returncode}; "
            f"results were only saved to {results_path}"
        )
    
def benchmark(f, df, name, **kwargs):
    """Time ``f(df, **kwargs)`` twice and record both runs.

    Each run appends its duration (seconds), the task ``name`` and the run
    number (1 or 2) to the module-level ``benchmarks`` dict. After both runs
    the results are persisted via ``persist()``.

    Parameters
    ----------
    f : callable
        Function under test; called as ``f(df, **kwargs)``.
    df : object
        Passed as the first argument to ``f``.
    name : str
        Label stored in the ``task`` column and printed.
    **kwargs
        Forwarded unchanged to ``f``.

    Returns
    -------
    float
        Duration in seconds of the second (last) run.
    """
    for i in range(2):
        # perf_counter is monotonic and high resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        result = f(df, **kwargs)
        elapsed = time.perf_counter() - start_time
        # Release the result only after the clock has stopped, so freeing it
        # is never part of the measured interval.
        del result
        benchmarks['duration'].append(elapsed)
        benchmarks['task'].append(name)
        benchmarks['run'].append(i+1)
    persist()
    print(f"{name} took: {benchmarks['duration'][-1]}")
    return benchmarks['duration'][-1]


# Create the output folders in pure Python so the cell does not depend on a
# POSIX shell; exist_ok=True keeps the cell safe to re-run.
os.makedirs('results', exist_ok=True)
os.makedirs('datasets', exist_ok=True)
print("We test every benchmark twice and save both results")

We test every benchmark twice and save both results


# Benchmark

In [84]:
import vaex
import numpy as np

# Open the dataset once; every unfiltered benchmark below reuses `data`.
data = vaex.open(data_path)
print("size: {} with {} columns".format(len(data), len(data.columns)))

size: 1173057927 with 18 columns


In [85]:
def open_file(df=None):
    """Open the dataset at the module-level ``data_path`` with vaex.

    ``df`` is ignored; it is accepted only so the function can be called as
    ``f(df, **kwargs)`` by ``benchmark``.
    """
    dataset = vaex.open(data_path)
    return dataset

benchmark(open_file, df=data, name='read_file')

read_file took: 0.0059468746185302734


0.0059468746185302734

In [86]:
def count(df=None):
    """Return ``len(df)``."""
    n_rows = len(df)
    return n_rows

benchmark(count, df=data, name='count')

count took: 2.1457672119140625e-06


2.1457672119140625e-06

In [90]:
def mean(df):
    """Return the mean of the ``fare_amount`` column of ``df``."""
    fares = df.fare_amount
    return fares.mean()

benchmark(mean, df=data, name='mean')

mean took: 1.0801217555999756


1.0801217555999756

In [91]:
def standard_deviation(df):
    """Return the standard deviation of the ``fare_amount`` column of ``df``."""
    fares = df.fare_amount
    return fares.std()

benchmark(standard_deviation, df=data, name='standard deviation')

standard deviation took: 3.5066850185394287


3.5066850185394287

In [92]:
def mean_of_sum(df):
    """Return the mean of ``fare_amount + trip_distance``."""
    total = df.fare_amount + df.trip_distance
    return total.mean()

benchmark(mean_of_sum, df=data, name='sum columns mean')

sum columns mean took: 2.2473130226135254


2.2473130226135254

In [93]:
# Lazy evaluation - instant
def sum_columns(df):
    """Return ``fare_amount + trip_distance`` without aggregating it."""
    total = df.fare_amount + df.trip_distance
    return total

benchmark(sum_columns, df=data, name='sum columns')

sum columns took: 1.5735626220703125e-05


1.5735626220703125e-05

In [94]:
def mean_of_product(df):
    """Return the mean of ``fare_amount * trip_distance``."""
    scaled = df.fare_amount * df.trip_distance
    return scaled.mean()

benchmark(mean_of_product, df=data, name='product columns mean')

product columns mean took: 1.313370704650879


1.313370704650879

In [95]:
# Lazy evaluation - instant
def product(df):
    """Return ``fare_amount * trip_distance`` without aggregating it."""
    scaled = df.fare_amount * df.trip_distance
    return scaled

benchmark(product, df=data, name='product columns')

product columns took: 7.867813110351562e-06


7.867813110351562e-06

In [None]:
def lazy_mean(df):
    """Store ``fare_amount * trip_distance`` as column ``'lazy'`` and return its mean.

    Note: this adds (or overwrites) the ``'lazy'`` column on ``df`` itself.
    """
    df['lazy'] = df.fare_amount * df.trip_distance
    lazy_column = df['lazy']
    return lazy_column.mean()
    
benchmark(lazy_mean, df=data, name='lazy evaluation')  

In [96]:
def mean_of_complicated_arithmetic_operation(df):
    """Compute a trigonometric expression over the coordinates and return its mean.

    The expression combines pickup/dropoff longitude and latitude (treated as
    degrees and converted with ``pi/180``), is stored on ``df`` as the column
    ``'complicated'``, and the mean of that column is returned.

    NOTE(review): the shape resembles the haversine formula, but the cosine
    factor is applied to the longitudes rather than the latitudes. It is kept
    exactly as written; confirm whether that is intended.
    """
    lon_start, lat_start = df.pickup_longitude, df.pickup_latitude
    lon_end, lat_end = df.dropoff_longitude, df.dropoff_latitude

    # Operation order inside each term is kept identical to the original expression.
    lon_term = np.sin((lon_end-lon_start)/2*np.pi/180)**2
    lat_term = (np.cos(lon_start*np.pi/180)*np.cos(lon_end*np.pi/180)
                * np.sin((lat_end-lat_start)/2*np.pi/180)**2)
    inner = lon_term + lat_term

    df['complicated'] = 2 * np.arctan2(np.sqrt(inner), np.sqrt(1-inner))
    return df['complicated'].mean()

benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean')


arithmetic operation mean took: 53.13127875328064


53.13127875328064

In [None]:
# Lazy evaluation - instant
def complicated_arithmetic_operation(df):
    """Build the trigonometric coordinate expression and return it un-aggregated.

    The expression combines pickup/dropoff longitude and latitude (treated as
    degrees and converted with ``pi/180``), is stored on ``df`` as the column
    ``'complicated'``, and that column is returned.

    NOTE(review): the shape resembles the haversine formula, but the cosine
    factor is applied to the longitudes rather than the latitudes. It is kept
    exactly as written; confirm whether that is intended.
    """
    lon_start, lat_start = df.pickup_longitude, df.pickup_latitude
    lon_end, lat_end = df.dropoff_longitude, df.dropoff_latitude

    # Operation order inside each term is kept identical to the original expression.
    lon_term = np.sin((lon_end-lon_start)/2*np.pi/180)**2
    lat_term = (np.cos(lon_start*np.pi/180)*np.cos(lon_end*np.pi/180)
                * np.sin((lat_end-lat_start)/2*np.pi/180)**2)
    inner = lon_term + lat_term

    df['complicated'] = 2 * np.arctan2(np.sqrt(inner), np.sqrt(1-inner))
    return df['complicated']

benchmark(complicated_arithmetic_operation, df=data, name='arithmetic operation')

In [98]:
def value_counts(df):
    """Return the value counts of the ``passenger_count`` column."""
    passengers = df.passenger_count
    return passengers.value_counts()

benchmark(value_counts, df=data, name='value counts')

value counts took: 4.853485345840454


4.853485345840454

In [99]:
def groupby_statistics(df):
    """Group by ``passenger_count`` and aggregate mean and std of fares and tips.

    Returns whatever ``df.groupby(...).agg(...)`` yields for the
    ``fare_amount`` and ``tip_amount`` columns.
    """
    # Each column gets its own list so the two specs never share an object.
    aggregations = {
        column: ['mean', 'std']
        for column in ('fare_amount', 'tip_amount')
    }
    grouped = df.groupby(by='passenger_count')
    return grouped.agg(aggregations)

benchmark(groupby_statistics, df=data, name='groupby statistics')

groupby statistics took: 47.245296478271484


47.245296478271484

In [100]:
other = groupby_statistics(data)

In [101]:
def join_count(df, other):
    """Join ``other`` onto ``df`` on ``passenger_count`` and return the row count.

    Columns of ``other`` that clash with ``df`` receive the ``'_right'`` suffix.
    """
    joined = df.join(other=other, on='passenger_count', rsuffix='_right')
    return len(joined)

benchmark(join_count, data, name='join count', other=other)

join count took: 1.424567699432373


1.424567699432373

In [102]:
def join_data(df, other):
    """Join ``other`` onto ``df`` on ``passenger_count`` and return the result.

    Columns of ``other`` that clash with ``df`` receive the ``'_right'`` suffix.
    """
    joined = df.join(other=other, on='passenger_count', rsuffix='_right')
    return joined

benchmark(join_data, data, name='join', other=other)

join took: 0.9929296970367432


0.9929296970367432

## Filtered data

Dask is not built to run on filtered data the way you normally would, so we apply the same strategy here.

In [103]:
# gc.collect() returns the number of unreachable objects it found, not a size
# in megabytes, so the message reports a count.
print(f"Prepare filtered data and collected {gc.collect()} unreachable objects")

Prepare filtered data and deleted 1024 MB


In [104]:
def filter_data(df):
    """Return ``df`` restricted to trips whose pickup and dropoff lie inside the box.

    The box is given by the module-level ``long_min``/``long_max`` and
    ``lat_min``/``lat_max``; all comparisons are strict.
    """
    conditions = [
        df.pickup_longitude > long_min,
        df.pickup_longitude < long_max,
        df.pickup_latitude > lat_min,
        df.pickup_latitude < lat_max,
        df.dropoff_longitude > long_min,
        df.dropoff_longitude < long_max,
        df.dropoff_latitude > lat_min,
        df.dropoff_latitude < lat_max,
    ]
    # Fold left to right so the combined expression nests exactly like a
    # plain `a & b & c & ...` chain.
    expr_filter = conditions[0]
    for condition in conditions[1:]:
        expr_filter = expr_filter & condition
    return df[expr_filter]

benchmark(filter_data, data, name='filter data')

filter data took: 1.16917085647583


1.16917085647583

In [105]:
# Build the filtered frame once; the remaining benchmarks run against it.
filtered = filter_data(data)

# Drop the notebook's reference to the unfiltered frame before collecting.
del data
# gc.collect() returns the number of unreachable objects it found, not a size
# in megabytes, so the message reports a count.
print(f"cleaned {gc.collect()} unreachable objects")

cleaned 7075 mb


In [106]:
# Re-run the benchmark suite against the filtered frame, in the same order
# as the unfiltered runs above.
for task, label in (
    (mean, 'filtered mean'),
    (standard_deviation, 'filtered standard deviation'),
    (mean_of_sum, 'filtered sum columns mean'),
    (sum_columns, 'filtered sum columns'),
    (mean_of_product, 'filtered product columns mean'),
    (product, 'filtered product columns'),
    (mean_of_complicated_arithmetic_operation, 'filtered arithmetic operation mean'),
    (complicated_arithmetic_operation, 'filtered arithmetic operation'),
    (value_counts, 'filtered value counts'),
    (groupby_statistics, 'filtered groupby statistics'),
):
    benchmark(task, filtered, name=label)

# The join benchmarks need the grouped statistics of the filtered frame.
other = groupby_statistics(filtered)
for task, label in (
    (join_count, 'filtered join count'),
    (join_data, 'filtered join'),
):
    benchmark(task, filtered, name=label, other=other)

print('vaex')
get_results(benchmarks)



filterd mean took: 1.7613804340362549
filtered standard deviation took: 4.014634847640991
filtered sum columns mean took: 4.663895130157471
filtered sum columns took: 8.821487426757812e-06
filterd product columns mean took: 3.725148916244507
filterd product columns took: 9.775161743164062e-06
filterd arithmetic operation mean took: 49.48960566520691
filterd arithmetic operation took: 0.0046312808990478516
filtered value counts took: 22.166354417800903
filtered groupby statistics took: 46.3994722366333
filtered join count took: 0.5538485050201416
filtered join took: 0.45148587226867676
vaex


Unnamed: 0,run,duration,task
0,1,0.00752783,read_file
1,2,0.00560069,read_file
2,1,3.60012e-05,count
3,2,2.14577e-06,count
4,1,0.00786614,read_file
5,2,0.00594687,read_file
6,1,3.50475e-05,count
7,2,2.14577e-06,count
8,1,16.9071,mean
9,2,1.08012,mean
