# Get data and packages

In [1]:
%%capture
%%bash
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
jupyter contrib nbextension install --user
jupyter nbextensions_configurator enable --user

jupyter nbextension enable codefolding/main
jupyter nbextension enable scratchpad/main
jupyter nbextension enable execute_time/ExecuteTime
jupyter nbextension enable autosavetime/main
pip install -U pip dask numpy fsspec>=0.3.3 tqdm pyarrow


In [2]:
%%capture
!pip install vaex-core vaex-hdf5
!pip install -U numpy

In [None]:
!aws s3 cp s3://xdss-public-datasets/demos/taxi_1B.hdf5 ../datasets/taxi_1B.hdf5

In [1]:
import gc
from src.config import repetitions
import time
import numpy as np
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import time
import gc
import os

instace_type = 'mlm52xlarge'
name = 'vaex'
data_path = '../datasets/taxi_1B.hdf5'
results_path = f"../results/{name}_1b_{instace_type}.csv"
benchmarks = {}
print(f"test for {repetitions} repetitions")

test for 1 repetitions


In [4]:
import numpy as np
import pandas as pd
import time 


def get_results(benchmarks, name):
    results = pd.DataFrame.from_dict(benchmarks, orient='index')
    results.columns = [name]
    return results

def persist():
    get_results(benchmarks, name).to_csv(results_path)
    !aws s3 cp  ../results/vaex_1b_mlm52xlarge.csv s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv 
    
def benchmark(f, df, name, repetitions=1, **kwargs):
    times = []
    for i in range(repetitions):
        start_time = time.time()
        ret = f(df, **kwargs)
        times.append(time.time()-start_time)
    benchmarks[name] = np.mean(times)
    persist()
    print(f"{name} took: {benchmarks[name]}")
    return benchmarks[name]


long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90

# Benchmark

In [7]:
import vaex
import numpy as np

data = vaex.open(data_path).head(1000)
print(f"size: {len(data)} with {len(data.columns)} columns")

size: 1000 with 18 columns


In [9]:
def open_file(df=None):
    return vaex.open(data_path)

benchmark(open_file, df=data, name='read_file', repetitions=repetitions)

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
read_file took: 0.00716400146484375


0.00716400146484375

In [10]:
def count(df=None):
    return len(df)

benchmark(count, df=data, name='count', repetitions=repetitions)

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
count took: 3.719329833984375e-05


3.719329833984375e-05

In [11]:
def mean(df):
    return df.fare_amount.mean()

benchmark(mean, df=data, name='mean', repetitions=repetitions)

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
mean took: 0.009560823440551758


0.009560823440551758

In [12]:
def standard_deviation(df):
    return df.fare_amount.std()

benchmark(standard_deviation, df=data, name='standard', repetitions=repetitions)

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
standard took: 0.005056619644165039


0.005056619644165039

Unlike other techonlogies, vaex can return columns, or subset of values to explore lazely, 
but becouse many of the other tecnologies crashed at this point, we return a scalar instead.

In [13]:
def mean_of_sum(df):
    return (df.fare_amount + df.trip_distance).mean()

benchmark(mean_of_sum, df=data, name='sum columns mean', repetitions=repetitions)
benchmarks['sum columns'] =  benchmarks['sum columns mean'] - benchmarks['mean']

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
sum columns mean took: 0.006809234619140625


In [14]:
def mean_of_product(df):
    return (df.fare_amount * df.trip_distance).mean()

benchmark(mean_of_product, df=data, name='product columns mean', repetitions=repetitions)
benchmarks['product columns'] =  benchmarks['product columns mean'] - benchmarks['mean']

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
product columns mean took: 0.0051991939544677734


In [15]:
def mean_of_complicated_arithmetic_operation(df):
    theta_1 = df.pickup_longitude
    phi_1 = df.pickup_latitude
    theta_2 = df.dropoff_longitude
    phi_2 = df.dropoff_latitude
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    df['complicated'] = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return df['complicated'].mean()

benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean', repetitions=repetitions)
benchmarks['arithmetic operation'] =  benchmarks['arithmetic operation mean'] - benchmarks['mean']

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
arithmetic operation mean took: 0.015137195587158203


In [16]:
def value_counts(df):
    return df.fare_amount.value_counts()

benchmark(value_counts, df=data, name='value counts', repetitions=repetitions)

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
value counts took: 0.00772547721862793


0.00772547721862793

In [17]:
def groupby_statistics(df):
    return df.groupby(by='passenger_count').agg({'fare_amount': ['mean', 'std'], 
                                               'tip_amount': ['mean', 'std']
                                              })

benchmark(groupby_statistics, df=data, name='groupby statistics', repetitions=repetitions)

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
groupby statistics took: 0.0168001651763916


0.0168001651763916

In [18]:
other = groupby_statistics(data)

In [19]:
def join(df, other):
    return df.join(other=other, on = 'passenger_count', rsuffix = '_right')

benchmark(join, data, name='join', repetitions=repetitions, other=other)

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
join took: 0.0037360191345214844


0.0037360191345214844

In [20]:
def join_count(df, other):
    return len(df.join(other=other, on = 'passenger_count', rsuffix = '_right'))

benchmark(join_count, data, name='join count', repetitions=repetitions, other=other)

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
join count took: 0.003653287887573242


0.003653287887573242

## Filtered data

Dask is not build to run on filter data like you would normally, so we will apply the same strategy

In [21]:
print(f"Prepare filtered data and deleted {gc.collect()} MB")

Prepare filtered data and deleted 3442 MB


In [22]:
def filter_data(df):
    expr_filter = (df.pickup_longitude > long_min)  & (df.pickup_longitude < long_max) & \
                  (df.pickup_latitude > lat_min)    & (df.pickup_latitude < lat_max) & \
                  (df.dropoff_longitude > long_min) & (df.dropoff_longitude < long_max) & \
                  (df.dropoff_latitude > lat_min)   & (df.dropoff_latitude < lat_max)
    return df[expr_filter]

benchmark(filter_data, data, name='filter data', repetitions=repetitions)

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
filter data took: 0.0014729499816894531


0.0014729499816894531

In [23]:
filterd = filter_data(data)

del data
print(f"cleaned {gc.collect()} mb")

cleaned 291 mb


In [25]:
benchmark(filter_data, filterd, name='filterd count', repetitions=repetitions)
benchmark(mean, filterd, name='filterd mean', repetitions=repetitions)
benchmark(standard_deviation, filterd, name='filtered standard deviation', repetitions=repetitions)
benchmark(mean_of_sum, filterd, name ='filtered sum columns mean', repetitions=repetitions)
benchmarks['filtered sum columns'] =  benchmarks['filtered sum columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_product, filterd, name ='filterd product columns mean', repetitions=repetitions)
benchmarks['filterd product columns'] = benchmarks['filterd product columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_complicated_arithmetic_operation, filterd, name='filterd arithmetic operation mean', repetitions=repetitions)
benchmarks['filterd arithmetic operation'] =  benchmarks['filterd arithmetic operation mean'] - benchmarks['filterd mean']
benchmark(value_counts, filterd, name ='filtered value counts', repetitions=repetitions)
benchmark(groupby_statistics, filterd, name='filtered groupby statistics', repetitions=repetitions)
other = groupby_statistics(filterd)
benchmark(join, filterd, name='filtered join', repetitions=repetitions, other=other)
benchmark(join_count, filterd, name='filtered join count', repetitions=repetitions, other=other)
print('Done!')

upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
filterd count took: 0.0022890567779541016
upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
filterd mean took: 0.00453948974609375
upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
filtered standard deviation took: 0.00540924072265625
upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
filtered sum columns mean took: 0.004755496978759766
upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
filterd product columns mean took: 0.004438161849975586
upload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv
filterd arithmetic operation mean took: 0.015936613082885742
upload: ../