# Get data and packages

In [1]:
%%capture
%%bash
# Install the notebook-extension packages (output is swallowed by %%capture above).
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
# Register the contrib extensions and the configurator for the current user (--user).
jupyter contrib nbextension install --user
jupyter nbextensions_configurator enable --user

# Switch on the individual extensions: code folding, scratchpad,
# per-cell execution time and the autosave-interval control.
jupyter nbextension enable codefolding/main
jupyter nbextension enable scratchpad/main
jupyter nbextension enable execute_time/ExecuteTime
jupyter nbextension enable autosavetime/main


In [2]:
%%capture
# Install the vaex packages this notebook imports and upgrade numpy; pip output is hidden by %%capture.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
!pip install vaex-core vaex-hdf5
!pip install -U numpy

In [11]:
import gc
from src.config import repetitions
import time
import numpy as np
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import time
import gc
import os

# Run configuration: which machine/library is being measured and where inputs/outputs live.
instance_type = 'mlm4xlarge'
instace_type = instance_type  # original (misspelled) name, kept as an alias for existing references
name = 'vaex'
data_path = '../datasets/taxi_1B.hdf5'
results_path = f"../results/{name}_1b_{instance_type}.csv"
# Maps benchmark name -> mean run time in seconds; filled in by benchmark().
benchmarks = {}
# Create the output directory from results_path itself so the two cannot drift apart.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# Number of timed runs: one for the heavy join/groupby benchmarks, five for the statistics.
single_repetition = 1
statistic_repetition = 5
print(f"test for {single_repetition} repetitions for join and groupby and {statistic_repetition} repetitions for statistics")

test for 1 repetitions for join and groupby and 5 repetitions for statistics


In [12]:
import numpy as np
import pandas as pd
import time 


def get_results(benchmarks, name):
    """Turn the ``{benchmark name: seconds}`` mapping into a one-column DataFrame.

    Each key of ``benchmarks`` becomes a row label and the single column is
    labelled ``name``.
    """
    table = pd.DataFrame.from_dict(benchmarks, orient='index')
    # from_dict labels the lone column 0; relabel it with the caller's name.
    table.columns = [name]
    return table

def persist():
    gc.collect()
    get_results(benchmarks, name).to_csv(results_path)
    !aws s3 cp  ../results/vaex_1b_mlm4xlarge.csv s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv 
    
def benchmark(f, df, name, repetitions=1, **kwargs):
    """Time ``f(df, **kwargs)`` and record the mean duration under ``name``.

    Parameters
    ----------
    f : callable
        Function to measure; called as ``f(df, **kwargs)``. Its return value is discarded.
    df : object
        First positional argument forwarded to ``f``.
    name : str
        Key under which the result is stored in the module-level ``benchmarks`` dict.
    repetitions : int, default 1
        Number of timed calls; the stored value is their arithmetic mean.
    **kwargs
        Extra keyword arguments forwarded to ``f``.

    Returns
    -------
    The mean run time in seconds (NaN when ``repetitions`` is 0).

    Side effects: updates ``benchmarks``, calls ``persist()`` and prints the result.
    """
    times = []

    for _ in range(repetitions):
        # perf_counter is monotonic and high-resolution, unlike the wall clock from
        # time.time(), which matters for the microsecond-scale lazy operations.
        start_time = time.perf_counter()
        f(df, **kwargs)
        times.append(time.perf_counter() - start_time)
    # np.mean([]) would emit a RuntimeWarning; record NaN explicitly when nothing was run.
    benchmarks[name] = np.mean(times) if times else np.nan
    persist()
    print(f"{name} took: {benchmarks[name]}")
    return benchmarks[name]


# Longitude/latitude bounds of the box that filter_data() keeps pickups and dropoffs inside.
long_min, long_max = -74.05, -73.75
lat_min, lat_max = 40.58, 40.90

# Benchmark

In [13]:
import vaex
import numpy as np

# Open the benchmark dataset at data_path and report its row and column counts.
data = vaex.open(data_path)
print(f"size: {len(data)} with {len(data.columns)} columns")

size: 1173057927 with 18 columns


In [14]:
def open_file(df=None):
    """Open the dataset at the module-level ``data_path`` with vaex.

    ``df`` is ignored; it exists only so the function matches the
    ``f(df, **kwargs)`` calling convention of ``benchmark``.
    """
    opened = vaex.open(data_path)
    return opened

# Time opening the file; `data` is passed only to satisfy benchmark()'s signature.
benchmark(
    f=open_file,
    df=data,
    name='read_file',
    repetitions=statistic_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
read_file took: 0.009302616119384766


0.009302616119384766

In [15]:
def count(df=None):
    """Return the number of rows in ``df`` as reported by ``len``."""
    n_rows = len(df)
    return n_rows

# Time the row count of the full dataset.
benchmark(
    f=count,
    df=data,
    name='count',
    repetitions=statistic_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
count took: 9.918212890625e-06


9.918212890625e-06

In [16]:
def mean(df):
    """Return the mean of the ``fare_amount`` column of ``df``."""
    fares = df.fare_amount
    return fares.mean()

# Time the mean of fare_amount over the full dataset.
benchmark(
    f=mean,
    df=data,
    name='mean',
    repetitions=statistic_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
mean took: 42.91337180137634


42.91337180137634

In [17]:
def standard_deviation(df):
    """Return the standard deviation of the ``fare_amount`` column of ``df``."""
    fares = df.fare_amount
    return fares.std()

# Time the standard deviation of fare_amount over the full dataset.
benchmark(
    f=standard_deviation,
    df=data,
    name='standard deviation',
    repetitions=statistic_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
standard deviation took: 7.726777267456055


7.726777267456055

Unlike other technologies, vaex can return columns, or a subset of values, to explore lazily,
but because many of the other technologies crashed at this point, we return a scalar instead.

In [18]:
def mean_of_sum(df):
    """Return the mean of ``fare_amount + trip_distance`` computed row by row."""
    total = df.fare_amount + df.trip_distance
    return total.mean()

# Time the mean of the column sum over the full dataset.
benchmark(
    f=mean_of_sum,
    df=data,
    name='sum columns mean',
    repetitions=statistic_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
sum columns mean took: 40.578082704544066


40.578082704544066

In [19]:
def sum_columns(df):
    """Return ``fare_amount + trip_distance`` as a column-like object; no reduction is applied."""
    total = df.fare_amount + df.trip_distance
    return total

# Time building the column sum without reducing it.
benchmark(
    f=sum_columns,
    df=data,
    name='sum columns',
    repetitions=statistic_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
sum columns took: 2.608299255371094e-05


2.608299255371094e-05

In [20]:
def mean_of_product(df):
    """Return the mean of ``fare_amount * trip_distance`` computed row by row."""
    scaled = df.fare_amount * df.trip_distance
    return scaled.mean()

# Time the mean of the column product over the full dataset.
benchmark(
    f=mean_of_product,
    df=data,
    name='product columns mean',
    repetitions=statistic_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
product columns mean took: 3.457593297958374


3.457593297958374

In [21]:
def product(df):
    """Return ``fare_amount * trip_distance`` as a column-like object; no reduction is applied."""
    scaled = df.fare_amount * df.trip_distance
    return scaled

# Time building the column product without reducing it.
benchmark(
    f=product,
    df=data,
    name='product columns',
    repetitions=statistic_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
product columns took: 2.5987625122070312e-05


2.5987625122070312e-05

In [22]:
def mean_of_complicated_arithmetic_operation(df):
    """Store a trigonometric expression of the four coordinate columns and return its mean.

    Side effect: the expression is assigned to ``df['complicated']`` before the
    mean is taken, so ``df`` gains (or overwrites) that column.

    NOTE(review): the expression resembles a haversine formula with the
    longitude/latitude roles swapped; it is left exactly as written because
    ``benchmark`` only measures its cost — confirm before reusing the values.
    """
    pickup_lon = df.pickup_longitude
    pickup_lat = df.pickup_latitude
    dropoff_lon = df.dropoff_longitude
    dropoff_lat = df.dropoff_latitude
    # Operand order inside each term is kept identical to preserve the computed values.
    lon_term = np.sin((dropoff_lon-pickup_lon)/2*np.pi/180)**2
    lat_term = np.cos(pickup_lon*np.pi/180)*np.cos(dropoff_lon*np.pi/180) * np.sin((dropoff_lat-pickup_lat)/2*np.pi/180)**2
    inner = lon_term + lat_term
    df['complicated'] = 2 * np.arctan2(np.sqrt(inner), np.sqrt(1-inner))
    return df['complicated'].mean()

# Use the notebook's own single_repetition, as the filtered counterpart of this benchmark
# does, instead of `repetitions` imported from src.config, whose value is not visible here.
benchmark(
    f=mean_of_complicated_arithmetic_operation,
    df=data,
    name='arithmetic operation mean',
    repetitions=single_repetition,
)


upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
arithmetic operation mean took: 569.4003694057465


569.4003694057465

In [23]:
def complicated_arithmetic_operation(df):
    """Store a trigonometric expression of the four coordinate columns and return that column.

    Side effect: the expression is assigned to ``df['complicated']``, so ``df``
    gains (or overwrites) that column. No reduction is applied to the result.

    NOTE(review): the expression resembles a haversine formula with the
    longitude/latitude roles swapped; it is left exactly as written because
    ``benchmark`` only measures its cost — confirm before reusing the values.
    """
    pickup_lon = df.pickup_longitude
    pickup_lat = df.pickup_latitude
    dropoff_lon = df.dropoff_longitude
    dropoff_lat = df.dropoff_latitude
    # Operand order inside each term is kept identical to preserve the computed values.
    lon_term = np.sin((dropoff_lon-pickup_lon)/2*np.pi/180)**2
    lat_term = np.cos(pickup_lon*np.pi/180)*np.cos(dropoff_lon*np.pi/180) * np.sin((dropoff_lat-pickup_lat)/2*np.pi/180)**2
    inner = lon_term + lat_term
    df['complicated'] = 2 * np.arctan2(np.sqrt(inner), np.sqrt(1-inner))
    return df['complicated']

# Time building the arithmetic expression column without reducing it.
benchmark(
    f=complicated_arithmetic_operation,
    df=data,
    name='arithmetic operation',
    repetitions=single_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
arithmetic operation took: 0.006922483444213867


0.006922483444213867

In [24]:
def value_counts(df):
    """Return the per-value occurrence counts of the ``vendor_id`` column."""
    vendors = df.vendor_id
    return vendors.value_counts()

# Time counting the distinct vendor_id values over the full dataset.
benchmark(
    f=value_counts,
    df=data,
    name='value counts',
    repetitions=statistic_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
value counts took: 147.60955419540406


147.60955419540406

In [25]:
def groupby_statistics(df):
    """Group ``df`` by ``passenger_count`` and aggregate mean and std of fare and tip amounts."""
    # One fresh ['mean', 'std'] list per aggregated column, in the original column order.
    aggregations = {column: ['mean', 'std'] for column in ('fare_amount', 'tip_amount')}
    grouped = df.groupby(by='passenger_count')
    return grouped.agg(aggregations)

# Time the grouped aggregation once on the full dataset.
benchmark(
    f=groupby_statistics,
    df=data,
    name='groupby statistics',
    repetitions=single_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
groupby statistics took: 616.9009525775909


616.9009525775909

In [26]:
# Per-passenger_count aggregate used as the right-hand table of the join benchmarks.
# benchmark() discards what the measured function returns, so the groupby is computed again here.
other = groupby_statistics(df=data)

In [28]:
# The plain join benchmark is disabled for this run; its definition and call are kept
# below for reference. NOTE(review): the reason it was disabled is not recorded here — confirm.
# def join(df, other):
#     return df.join(other=other, on = 'passenger_count', rsuffix = '_right')

# benchmark(join, data, name='join', repetitions=single_repetition, other=other)
# Record NaN so the 'join' row is still present in the results table.
benchmarks['join'] = np.nan

In [32]:
def join_count(df, other):
    """Join ``other`` onto ``df`` on ``passenger_count`` and return the joined row count.

    Columns of ``other`` that clash with ``df`` get the ``_right`` suffix.
    """
    joined = df.join(other=other, on='passenger_count', rsuffix='_right')
    return len(joined)

# Time the join followed by a row count, once, on the full dataset.
benchmark(
    f=join_count,
    df=data,
    name='join count',
    repetitions=single_repetition,
    other=other,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
join count took: 83.8996832370758


83.8996832370758

## Filtered data

Dask is not built to run on filtered data the way you normally would, so we apply the same strategy here: build the filtered frame once and run every benchmark on it.

In [33]:
# gc.collect() returns the number of unreachable objects it found, not an amount of memory,
# so report it as an object count.
print(f"Prepare filtered data and collected {gc.collect()} unreachable objects")

Prepare filtered data and deleted 0 MB


In [34]:
def filter_data(df):
    """Return the rows of ``df`` whose pickup and dropoff points lie strictly inside the box.

    The box is defined by the module-level ``long_min``/``long_max`` and
    ``lat_min``/``lat_max`` bounds; all eight comparisons are strict.
    """
    # Accumulate the conditions left to right, in the same order as a single chained `&`.
    inside = df.pickup_longitude > long_min
    inside = inside & (df.pickup_longitude < long_max)
    inside = inside & (df.pickup_latitude > lat_min)
    inside = inside & (df.pickup_latitude < lat_max)
    inside = inside & (df.dropoff_longitude > long_min)
    inside = inside & (df.dropoff_longitude < long_max)
    inside = inside & (df.dropoff_latitude > lat_min)
    inside = inside & (df.dropoff_latitude < lat_max)
    return df[inside]

# Time applying the bounding-box filter to the full dataset.
benchmark(
    f=filter_data,
    df=data,
    name='filter data',
    repetitions=statistic_repetition,
)

upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
filter data took: 1.3000512599945069


1.3000512599945069

In [35]:
# Build the filtered frame once; every benchmark in the next cell runs on it.
filterd = filter_data(data)

# Drop the notebook's reference to the unfiltered frame. After this, cells that use `data`
# cannot be re-run without re-opening the file.
del data
# gc.collect() returns a count of unreachable objects, not megabytes.
print(f"cleaned {gc.collect()} unreachable objects")

cleaned 490 mb


In [None]:
# Re-run every benchmark on the filtered frame. Benchmark names are keys of the results
# table and are kept byte-for-byte as they were (including the 'filterd' spelling).
benchmark(mean, filterd, name='filterd mean', repetitions=statistic_repetition)
benchmark(standard_deviation, filterd, name='filtered standard deviation', repetitions=statistic_repetition)
benchmark(mean_of_sum, filterd, name ='filtered sum columns mean', repetitions=statistic_repetition)
benchmark(sum_columns, df=filterd, name='filtered sum columns', repetitions=statistic_repetition)
benchmark(mean_of_product, filterd, name ='filterd product columns mean', repetitions=statistic_repetition)
benchmark(product, df=filterd, name='filterd product columns', repetitions=statistic_repetition)
benchmark(mean_of_complicated_arithmetic_operation, filterd, name='filterd arithmetic operation mean', repetitions=single_repetition)
benchmark(complicated_arithmetic_operation, filterd, name='filterd arithmetic operation', repetitions=single_repetition)

benchmark(value_counts, filterd, name ='filtered value counts', repetitions=statistic_repetition)
benchmark(groupby_statistics, filterd, name='filtered groupby statistics', repetitions=single_repetition)
# Right-hand table for the join benchmarks, rebuilt from the filtered frame.
other = groupby_statistics(filterd)
# `join` is only defined in commented-out code earlier in the notebook, so calling it here
# raised NameError on a fresh kernel. Record NaN instead, as is done for the unfiltered 'join'.
benchmarks['filtered join'] = np.nan
benchmark(join_count, filterd, name='filtered join count', repetitions=single_repetition, other=other)
print('Done!')



upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
filterd mean took: 53.58897795677185
upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
filtered standard deviation took: 8.918181991577148
upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
filtered sum columns mean took: 16.044873905181884
upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
filtered sum columns took: 3.342628479003906e-05
upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
filterd product columns mean took: 8.250533246994019
upload: ../results/vaex_1b_mlm4xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/vaex_1b_mlm4xlarge_results.csv
filterd product columns took: 3.399848937988281e-05
upload: ../results/vaex_1b_mlm4xla

In [37]:
# Label the output, then display the collected {benchmark name: mean seconds} mapping
# as the cell's result.
print('vaex')
benchmarks

vaex


{'read_file': 0.009302616119384766,
 'count': 9.918212890625e-06,
 'mean': 42.91337180137634,
 'standard deviation': 7.726777267456055,
 'sum columns mean': 40.578082704544066,
 'sum columns': 2.608299255371094e-05,
 'product columns mean': 3.457593297958374,
 'product columns': 2.5987625122070312e-05,
 'arithmetic operation mean': 569.4003694057465,
 'arithmetic operation': 0.006922483444213867,
 'value counts': 147.60955419540406,
 'groupby statistics': 616.9009525775909,
 'join': nan,
 'join count': 83.8996832370758,
 'filter data': 1.3000512599945069,
 'filterd mean': 53.58897795677185,
 'filtered standard deviation': 8.918181991577148,
 'filtered sum columns mean': 16.044873905181884,
 'filtered sum columns': 3.342628479003906e-05,
 'filterd product columns mean': 8.250533246994019,
 'filterd product columns': 3.399848937988281e-05,
 'filterd arithmetic operation mean': 184.2358250617981,
 'filterd arithmetic operation': 0.010091781616210938,
 'filtered value counts': 144.21353287