# Get data and packages

In [1]:
%%capture
%%bash
# Install the notebook-extension packages and register them for the current user.
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
jupyter contrib nbextension install --user
jupyter nbextensions_configurator enable --user

# Turn on the individual extensions, one `enable` call per extension.
for extension in \
    codefolding/main \
    scratchpad/main \
    execute_time/ExecuteTime \
    autosavetime/main
do
    jupyter nbextension enable "$extension"
done

In [2]:
%%capture
!pip install vaex-core vaex-hdf5
!pip install -U numpy

In [None]:
# Copy the taxi HDF5 file from the xdss-public-datasets S3 bucket into ../datasets/.
# NOTE(review): the recorded output reports a 107.0 GiB transfer, so check free
# disk space first. The destination is the same path that `data_path` is set to
# in the benchmark-preparation cell.
!aws s3 cp s3://xdss-public-datasets/demos/taxi_1B.hdf5 ../datasets/taxi_1B.hdf5

Completed 17.4 GiB/107.0 GiB (266.8 MiB/s) with 1 file(s) remaining  

## Prep benchmarks

In [None]:
import gc
from src.benchmarks_utils import benchmark, get_results
from src.vaex_utils import *
from src.config import repetitions

# Announce how many times each operation will be repeated.
print(f"test for {repetitions} repetitions")

# Library under test; also used to build the results file name.
name = 'vaex'

# Input dataset and output CSV locations (relative to the notebook directory).
data_path = '../datasets/taxi_1B.hdf5'
results_path = f"../results/{name}_1b_mlm52xlarge.csv"

# Collected results, keyed by operation label; filled by the benchmark cell.
benchmarks = dict()

# Benchmark

In [None]:
# Open the dataset through the project's read_file helper.
data = read_file(data_path=data_path)

# Derive the pickup hour from the pickup timestamp column.
data['pickup_hour'] = data['pickup_datetime'].dt.hour

# Report the dimensions of the loaded frame.
print(f"size: {data.shape[0]} with {data.shape[1]} columns")

In [None]:
# --- Benchmarks on the full dataset -----------------------------------------
# Each entry stores whatever `benchmark` returns for the given operation.
# The dictionary keys appear as the row labels in the recorded results output,
# so they are kept exactly as they are.
benchmarks['read_file'] = benchmark(read_file, df=data, data_path=data_path, repetitions=repetitions)
benchmarks['mean'] = benchmark(mean, data, repetitions=repetitions)
benchmarks['standard deviation'] = benchmark(standard_deviation, data, repetitions=repetitions)
benchmarks['sum columns'] = benchmark(sum_columns, data, repetitions=repetitions)
benchmarks['product columns'] = benchmark(product_columns, data, repetitions=repetitions)
benchmarks['arithmetic operation'] = benchmark(complicated_arithmetic_operation, data, repetitions=repetitions)
benchmarks['value counts'] = benchmark(value_counts, data, repetitions=repetitions)
benchmarks['groupby statistics'] = benchmark(groupby_statistics, data, repetitions=repetitions)
benchmarks['filter'] = benchmark(filter_data, data, repetitions=repetitions)

# gc.collect() returns the number of unreachable objects it found, not a size
# in megabytes, so report it as an object count.
print(f"gc collected {gc.collect()} unreachable objects")

# The `other` argument is evaluated once, before `benchmark` is called.
benchmarks['join'] = benchmark(join, data, repetitions=repetitions, other=groupby_statistics(data))
print("Done benchmarks on all data")

# --- Benchmarks on the filtered dataset -------------------------------------
filterd = filter_data(data)
del data  # drop the notebook-level reference to the unfiltered frame

print(f"Prepared filtered data; gc collected {gc.collect()} unreachable objects")

# NOTE(review): some labels below are not simply 'filtered ' + the full-data
# label (e.g. 'filtered product_columns', 'filtered complicated arithmetic
# operation'). They are kept unchanged because they end up in the saved results.
benchmarks['filtered mean'] = benchmark(mean, filterd, repetitions=repetitions)
benchmarks['filtered standard deviation'] = benchmark(standard_deviation, filterd, repetitions=repetitions)
benchmarks['filtered sum columns'] = benchmark(sum_columns, filterd, repetitions=repetitions)
benchmarks['filtered product_columns'] = benchmark(product_columns, filterd, repetitions=repetitions)
benchmarks['filtered complicated arithmetic operation'] = benchmark(complicated_arithmetic_operation, filterd, repetitions=repetitions)
benchmarks['filtered value counts'] = benchmark(value_counts, filterd, repetitions=repetitions)
benchmarks['filtered groupby statistics'] = benchmark(groupby_statistics, filterd, repetitions=repetitions)
benchmarks['filtered join'] = benchmark(join, filterd, repetitions=repetitions, other=groupby_statistics(filterd))
print("Done benchmarks on filterd data")



Done benchmarks on filterd data


In [None]:
# Build the results table from the collected benchmarks, passing the library
# name (presumably a pandas DataFrame — confirm in src.benchmarks_utils).
results = get_results(benchmarks, name)
# Write the table to results_path as CSV.
results.to_csv(results_path)
# Show the first rows as the cell's output.
results.head()

Unnamed: 0,vaex
read_file,0.006672
mean,1.243813
standard deviation,3.929131
sum columns,19.817493
product columns,0.338438


In [None]:
!aws s3 cp  ../results/vaex_1b_mlm52xlarge.csv s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv 

Completed 660 Bytes/660 Bytes (6.4 KiB/s) with 1 file(s) remainingupload: ../results/vaex_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm52xlarge_results.csv


In [None]:
# NOTE(review): this cell only evaluates the literal 1; it looks like a leftover
# scratch cell and can probably be removed.
1