# Get data and packages

In [1]:
%%capture
%%bash
# Notebook extensions used while developing the benchmark.
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
jupyter contrib nbextension install --user
jupyter nbextensions_configurator enable --user

jupyter nbextension enable codefolding/main
jupyter nbextension enable scratchpad/main
jupyter nbextension enable execute_time/ExecuteTime
jupyter nbextension enable autosavetime/main
# The version specifier must be quoted: an unquoted `>=0.3.3` is parsed by the
# shell as an output redirection into a file named "=0.3.3", which silently
# drops the constraint.
pip install -U pip dask numpy "fsspec>=0.3.3" tqdm pyarrow


In [7]:
%%capture
!pip install -U pip datatable vaex-core vaex-hdf5 vaex-arrow

In [4]:
import gc
from src.config import repetitions
import time
import numpy as np
import pandas as pd
import numpy as np
import warnings
import time
import gc
import os

# Label of the library under test; used in the results file name and as the
# column header of the results table.
name = 'datatable'
# Machine label embedded in the results file name
# (presumably a SageMaker ml.m5.2xlarge instance -- TODO confirm).
instance_type = 'mlm52xlarge'
# HDF5 dataset opened by read_file().
data_path = '../datasets/taxi_1B.hdf5'
# Local CSV that persist() rewrites after every benchmark.
results_path = f"../results/{name}_1b_{instance_type}.csv"
# Maps benchmark label -> mean elapsed seconds; filled in by benchmark().
benchmarks = {}
print(f"test {name} for {repetitions} repetitions")

test datatable for 1 repetitions


In [9]:
import numpy as np
import pandas as pd
import time 


def get_results(benchmarks, name):
    """Turn the timing dictionary into a single-column DataFrame.

    Parameters
    ----------
    benchmarks : dict
        Maps a benchmark label to its measured time in seconds.
    name : str
        Header to give to the single value column.

    Returns
    -------
    pandas.DataFrame
        One row per benchmark label (as the index) and one column called
        ``name``. An empty ``benchmarks`` dict yields an empty frame that
        still carries the ``name`` column instead of raising.
    """
    # Passing `columns=` up front (rather than assigning `.columns` afterwards)
    # also works when `benchmarks` is empty, where the resulting frame would
    # otherwise have zero columns and the assignment would raise ValueError.
    return pd.DataFrame.from_dict(benchmarks, orient='index', columns=[name])

def persist():
    get_results(benchmarks, name).to_csv(results_path)
    !aws s3 cp  ../results/datatable_1b_mlm52xlarge.csv s3://vaex-sagemaker-demo/benchmarks/datatable_1b_mlm52xlarge_results.csv 
    
def benchmark(f, df, name, repetitions=1, **kwargs):
    """Time ``f(df, **kwargs)`` and record the mean duration under ``name``.

    Parameters
    ----------
    f : callable
        Function under test; called as ``f(df, **kwargs)``.
    df : object
        Data passed as the first positional argument to ``f``.
    name : str
        Key under which the mean time is stored in the global ``benchmarks``.
    repetitions : int, default 1
        How many times to run ``f``; must be at least 1.
    **kwargs
        Extra keyword arguments forwarded to ``f``.

    Returns
    -------
    float
        Mean elapsed time in seconds over all repetitions.

    Raises
    ------
    ValueError
        If ``repetitions`` is smaller than 1 (the mean of zero runs would
        silently be recorded as NaN).
    """
    if repetitions < 1:
        raise ValueError(f"repetitions must be >= 1, got {repetitions}")
    times = []
    for _ in range(repetitions):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        result = f(df, **kwargs)
        times.append(time.perf_counter() - start_time)
        # Release the result outside the timed region so deallocation cost is
        # never counted, regardless of the repetition.
        del result
    benchmarks[name] = np.mean(times)
    persist()
    print(f"{name} took: {benchmarks[name]}")
    return benchmarks[name]


# Longitude/latitude bounds used by filter_data(): a trip is kept only when
# both its pickup and its dropoff coordinates fall strictly inside this box.
long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90

# Benchmark

In [10]:
import vaex
import numpy as np
import datatable as dt
from datatable import f, math

In [15]:
def read_file(data=None):
    """Open the HDF5 dataset with vaex and wrap its numeric columns in a datatable Frame.

    String columns and columns that are neither float nor integer are left
    out. The ``data`` argument is accepted only so the function fits the
    ``benchmark(f, df, ...)`` calling convention; it is not used.
    """
    vaex_df = vaex.open(data_path)
    selected = {}
    for col_name in vaex_df.get_column_names():
        column = vaex_df.columns[col_name]
        if column.dtype == str:
            # skip strings
            continue
        kind = column.dtype.kind
        if kind == 'f':
            # datatable is picky about <f4 format
            selected[col_name] = column.view(np.float32)
        elif kind == 'i':
            selected[col_name] = column
        # anything that is not int or float is ignored
    return dt.Frame(**selected)

# Load data
data = read_file()
print(f"size: {data.shape[0]} with {data.shape[1]} columns")

size: 1173057927 with 14 columns


In [16]:
key = 'read_file'
# Pass the function directly instead of assigning it to `f`: `f` is the
# column-expression namespace imported from datatable, and benchmark functions
# in this notebook evaluate `f.<column>` when they are called.
benchmark(read_file, df=data, name=key, repetitions=repetitions)

upload: ../results/datatable_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/datatable_1b_mlm52xlarge_results.csv
read_file took: 0.008263111114501953


0.008263111114501953

In [19]:
def count(df=None):
    """Return the number of rows of ``df`` (first element of ``df.shape``)."""
    return df.shape[0]

key = 'count'
# Pass the function directly instead of assigning it to `f`: `f` is the
# column-expression namespace imported from datatable, and benchmark functions
# in this notebook evaluate `f.<column>` when they are called.
benchmark(count, df=data, name=key, repetitions=repetitions)

upload: ../results/datatable_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/datatable_1b_mlm52xlarge_results.csv
count took: 1.239776611328125e-05


1.239776611328125e-05

In [None]:
def mean(df):
    """Compute the mean of the ``fare_amount`` column with a datatable expression."""
    return df[:, dt.mean(dt.f.fare_amount)]

key = 'mean'
# Pass the function directly instead of assigning it to `f`: `f` is the
# column-expression namespace imported from datatable, and benchmark functions
# in this notebook evaluate `f.<column>` when they are called.
benchmark(mean, df=data, name=key, repetitions=repetitions)

In [64]:
def standard_deviation(df):
    """Compute the standard deviation of ``fare_amount`` with a datatable expression."""
    return df[:, dt.sd(dt.f.fare_amount)]

key = 'standard deviation'
# Pass the function directly instead of assigning it to `f`: `f` is the
# column-expression namespace imported from datatable, and benchmark functions
# in this notebook evaluate `f.<column>` when they are called.
benchmark(standard_deviation, df=data, name=key, repetitions=repetitions)

Completed 576 Bytes/576 Bytes (7.9 KiB/s) with 1 file(s) remainingupload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv


13.961443424224854

To time operations that combine two columns, we can't return the full result, since it would be materialized in memory and crash the kernel. Instead, we compute the mean of the result and then subtract the time it takes to compute a plain mean.

In [11]:
def mean_of_sum(df):
    """Compute the mean of ``fare_amount + trip_distance``.

    Uses ``dt.f`` rather than the bare global ``f``: the notebook rebinds the
    global ``f`` to plain functions, which would make ``f.fare_amount`` raise
    AttributeError when this function is called.
    """
    return df[:, dt.mean(dt.f.fare_amount + dt.f.trip_distance)]

key = 'sum columns mean'
# Pass the function directly so the global `f` is not overwritten.
benchmark(mean_of_sum, df=data, name=key, repetitions=repetitions)
# Cost of the column sum alone: total time minus the time of a plain mean.
benchmarks['sum columns'] =  benchmarks['sum columns mean'] - benchmarks['mean']

In [14]:
def mean_of_product(df):
    """Compute the mean of ``fare_amount * trip_distance``.

    Uses ``dt.f`` rather than the bare global ``f``: the notebook rebinds the
    global ``f`` to plain functions, which would make ``f.fare_amount`` raise
    AttributeError when this function is called.
    """
    return df[:, dt.mean(dt.f.fare_amount * dt.f.trip_distance)]

key = 'product columns mean'
# Pass the function directly so the global `f` is not overwritten.
benchmark(mean_of_product, df=data, name=key, repetitions=repetitions)
# Cost of the column product alone: total time minus the time of a plain mean.
benchmarks['product columns'] =  benchmarks['product columns mean'] - benchmarks['mean']

In [15]:
def mean_of_complicated_arithmetic_operation(df):
    """Compute the mean of a trigonometric expression over the four coordinate columns.

    The expression combines pickup/dropoff longitude and latitude through
    sin, cos, sqrt and atan2. ``math`` here is ``datatable.math`` (imported
    via ``from datatable import f, math``), not the standard-library module.

    Columns are referenced through ``dt.f`` rather than the bare global ``f``:
    the notebook rebinds the global ``f`` to plain functions, which would make
    ``f.pickup_longitude`` raise AttributeError when this function is called.
    """
    theta_1 = dt.f.pickup_longitude
    phi_1 = dt.f.pickup_latitude
    theta_2 = dt.f.dropoff_longitude
    phi_2 = dt.f.dropoff_latitude
    temp = (math.sin((theta_2-theta_1)/2*math.pi/180)**2
           + math.cos(theta_1*math.pi/180)*math.cos(theta_2*math.pi/180) * math.sin((phi_2-phi_1)/2*math.pi/180)**2)
    expr = 2 * math.atan2(math.sqrt(temp), math.sqrt(1-temp))
    return df[:, dt.mean(expr)]

key = 'arithmetic operation mean'
# Pass the function directly so the global `f` is not overwritten.
benchmark(mean_of_complicated_arithmetic_operation, df=data, name=key, repetitions=repetitions)
# Cost of the arithmetic alone: total time minus the time of a plain mean.
benchmarks['arithmetic operation'] =  benchmarks['arithmetic operation mean'] - benchmarks['mean']

In [16]:
def value_counts(df):
    """Count the rows for each distinct ``passenger_count`` value.

    datatable Frames do not provide a pandas-style ``value_counts()`` method,
    so the counts are computed with a grouped ``dt.count()``. Returns a Frame
    with one row per distinct ``passenger_count``.
    """
    return df[:, dt.count(), dt.by(dt.f.passenger_count)]

key = 'value counts'
# Pass the function directly instead of assigning it to `f`: `f` is the
# column-expression namespace imported from datatable, and benchmark functions
# in this notebook evaluate `f.<column>` when they are called.
benchmark(value_counts, df=data, name=key, repetitions=repetitions)

In [None]:
def groupby_statistics(df):
    """Compute mean and standard deviation of fare and tip per ``passenger_count``.

    Returns the frame produced by grouping ``df`` on ``passenger_count`` and
    evaluating the four named aggregates below.

    Columns are referenced through ``dt.f`` rather than the bare global ``f``:
    the notebook rebinds the global ``f`` to plain functions, which would make
    ``f.fare_amount`` raise AttributeError when this function is called.
    """
    aggs = {
            'fare_amount_mean': dt.mean(dt.f.fare_amount),
            'fare_amount_std': dt.sd(dt.f.fare_amount),
            'tip_amount_mean': dt.mean(dt.f.tip_amount),
            'tip_amount_std': dt.sd(dt.f.tip_amount),
        }
    # The closing bracket was missing here, which made the whole cell a SyntaxError.
    return df[:, aggs, dt.by(dt.f.passenger_count)]

key = 'groupby statistics'
# Pass the function directly so the global `f` is not overwritten.
benchmark(groupby_statistics, df=data, name=key, repetitions=repetitions)

In [None]:
# Per-passenger_count aggregates; passed as `other` to the join benchmarks below.
other = groupby_statistics(data)

In [None]:
def join(df, other):
    """Join ``df`` with ``other`` on ``passenger_count`` and return the joined frame.

    Note that ``other`` is modified in place: its key is set to
    ``passenger_count`` before the join.
    """
    # like vaex and dask, no precomputed index
    other.key = 'passenger_count'
    return df[:,:,dt.join(other)]

key = 'join'
# Pass the function directly instead of assigning it to `f`: `f` is the
# column-expression namespace imported from datatable, and benchmark functions
# in this notebook evaluate `f.<column>` when they are called.
benchmark(join, data, name=key, repetitions=repetitions, other=other)

In [53]:
def join_count(df, other):
    """Join ``df`` with ``other`` on ``passenger_count`` and return the row count of the result.

    Note that ``other`` is modified in place: its key is set to
    ``passenger_count`` before the join.
    """
    # like vaex and dask, no precomputed index
    other.key = 'passenger_count'
    return df[:,:,dt.join(other)].shape[0]

other = groupby_statistics(data)
key = 'join count'
# Pass the function directly instead of assigning it to `f`: `f` is the
# column-expression namespace imported from datatable, and benchmark functions
# in this notebook evaluate `f.<column>` when they are called.
benchmark(join_count, data, name=key, repetitions=repetitions, other=other)

## Filtered data

Dask is not built to run on filtered data the way you normally would, so we apply the same strategy here.

In [45]:
# gc.collect() returns the number of unreachable objects it found, not a size
# in megabytes, so the message reports an object count.
print(f"Prepare filtered data: gc collected {gc.collect()} unreachable objects")

Prepare filtered data and deleted 798 MB


In [48]:
def filter_data(df):
    """Keep the rows whose pickup and dropoff both lie strictly inside the bounding box.

    The box is given by the module-level ``long_min``/``long_max`` and
    ``lat_min``/``lat_max``.

    Columns are referenced through ``dt.f`` rather than the bare global ``f``:
    the notebook rebinds the global ``f`` to plain functions, which would make
    ``f.pickup_longitude`` raise AttributeError when this function is called.
    """
    expr_filter = (dt.f.pickup_longitude > long_min)  & (dt.f.pickup_longitude < long_max) & \
              (dt.f.pickup_latitude > lat_min)    & (dt.f.pickup_latitude < lat_max) & \
              (dt.f.dropoff_longitude > long_min) & (dt.f.dropoff_longitude < long_max) & \
              (dt.f.dropoff_latitude > lat_min)   & (dt.f.dropoff_latitude < lat_max)
    return df[expr_filter,:]

key = 'filter data'
# Pass the function directly so the global `f` is not overwritten.
benchmark(filter_data, data, name=key, repetitions=repetitions)

cleaned 181 mb


In [None]:
filterd = filter_data(data)

# Drop the notebook's reference to the unfiltered frame; `data` is no longer
# defined after this line.
del data
# gc.collect() returns the number of unreachable objects it found, not a size
# in megabytes, so the message reports an object count.
print(f"cleaned {gc.collect()} unreachable objects")

In [None]:
# Re-run the benchmarks on the filtered frame.
# NOTE(review): some result keys are spelled 'filterd' and others 'filtered';
# they are kept unchanged so the row labels of the results CSV stay the same.

# The 'count' benchmark must time count(), not filter_data().
benchmark(count, filterd, name='filterd count', repetitions=repetitions)
benchmark(mean, filterd, name='filterd mean', repetitions=repetitions)
benchmark(standard_deviation, filterd, name='filtered standard deviation', repetitions=repetitions)
benchmark(mean_of_sum, filterd, name ='filtered sum columns mean', repetitions=repetitions)
benchmarks['filtered sum columns'] =  benchmarks['filtered sum columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_product, filterd, name ='filterd product columns mean', repetitions=repetitions)
benchmarks['filterd product columns'] = benchmarks['filterd product columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_complicated_arithmetic_operation, filterd, name='filterd arithmetic operation mean', repetitions=repetitions)
benchmarks['filterd arithmetic operation'] =  benchmarks['filterd arithmetic operation mean'] - benchmarks['filterd mean']
benchmark(value_counts, filterd, name ='filtered value counts', repetitions=repetitions)
benchmark(groupby_statistics, filterd, name='filtered groupby statistics', repetitions=repetitions)
# `data` was deleted after filtering, so the join partner is built from the
# filtered frame.
other = groupby_statistics(filterd)
# Placeholder: the full join is not run on the filtered data
# (presumably -1 marks "not measured" -- TODO confirm).
benchmarks['filtered join'] = -1
# benchmark() requires `name`; it stores the timing under that key and
# persists all results, including the derived entries assigned above.
benchmark(join_count, filterd, name='filtered join count', repetitions=repetitions, other=other)
print('Done!')