# Get data and packages

In [1]:
%%capture
%%bash
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
jupyter contrib nbextension install --user
jupyter nbextensions_configurator enable --user

jupyter nbextension enable codefolding/main
jupyter nbextension enable scratchpad/main
jupyter nbextension enable execute_time/ExecuteTime
jupyter nbextension enable autosavetime/main
pip install -U pip dask numpy fsspec>=0.3.3 tqdm pyarrow


In [2]:
%%capture
!pip install -U pip dask numpy fsspec>=0.3.3 tqdm pyarrow

In [None]:
!aws s3 cp --recursive s3://xdss-public-datasets/demos/taxi_parquet ../datasets/taxi_parquet

In [1]:
import gc
from src.config import repetitions
import time
import numpy as np
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import time
import gc
import os

name = 'dask'
instance_type = 'mlm52xlarge'
data_path = '../datasets/taxi_parquet'
results_path = f"../results/{name}_1b_{instance_type}.csv"
benchmarks = {}
print(f"test for {repetitions} repetitions")

test for 1 repetitions


In [2]:
import numpy as np
import pandas as pd
import time 


def get_results(benchmarks, name):
    results = pd.DataFrame.from_dict(benchmarks, orient='index')
    results.columns = [name]
    return results

def persist():
    get_results(benchmarks, name).to_csv(results_path)
    !aws s3 cp  ../results/dask_1b_mlm52xlarge.csv s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv 
    
def benchmark(f, df, name, repetitions=1, **kwargs):
    times = []
    for i in range(repetitions):
        start_time = time.time()
        ret = f(df, **kwargs)
        times.append(time.time()-start_time)
    benchmarks[name] = np.mean(times)
    persist()
    print(f"{name} took: {benchmarks[name]}")
    return benchmarks[name]


long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90

# Benchmark

In [3]:
import dask.dataframe as dd
import numpy as np

# Load data
data = dd.read_parquet(data_path, engine='pyarrow')
print(f"size: {len(data)} with {len(data.columns)} columns")

size: 1000000 with 18 columns


In [4]:
def read_file_parquet(df=None):
    return dd.read_parquet(data_path, engine='pyarrow')

benchmark(read_file_parquet, df=data, name='read_file', repetitions=repetitions)

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
read_file took: 0.012697696685791016


0.012697696685791016

In [5]:
def count(df=None):
    return len(df)

benchmark(count, df=data, name='count', repetitions=repetitions)

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
count took: 0.07673335075378418


0.07673335075378418

In [6]:
def mean(df):
    return df.fare_amount.mean().compute()

benchmark(mean, df=data, name='mean', repetitions=repetitions)

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
mean took: 0.04884743690490723


0.04884743690490723

In [7]:
def standard_deviation(df):
    return df.fare_amount.std().compute()

benchmark(standard_deviation, df=data, name='standard', repetitions=repetitions)

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
standard took: 0.05604290962219238


0.05604290962219238

To calculate the time when using two columns, we can't return the response since it will get into memroy and break, so we run a mean calculation on it, and then remove the time it took to run the mean.

In [8]:
def mean_of_sum(df):
    return (df.fare_amount + df.trip_distance).mean().compute()

benchmark(mean_of_sum, df=data, name='sum columns mean', repetitions=repetitions)
benchmarks['sum columns'] =  benchmarks['sum columns mean'] - benchmarks['mean']

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
sum columns mean took: 0.0740196704864502


In [9]:
def mean_of_product(df):
    return (df.fare_amount * df.trip_distance).mean().compute()

benchmark(mean_of_product, df=data, name='product columns mean', repetitions=repetitions)
benchmarks['product columns'] =  benchmarks['product columns mean'] - benchmarks['mean']

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
product columns mean took: 0.06560730934143066


In [10]:
def mean_of_complicated_arithmetic_operation(df):
    theta_1 = df.pickup_longitude
    phi_1 = df.pickup_latitude
    theta_2 = df.dropoff_longitude
    phi_2 = df.dropoff_latitude
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    ret = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return ret.mean().compute()

benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean', repetitions=repetitions)
benchmarks['arithmetic operation'] =  benchmarks['arithmetic operation mean'] - benchmarks['mean']

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
arithmetic operation mean took: 0.35970520973205566


In [11]:
def value_counts(df):
    return df.fare_amount.value_counts().compute()

benchmark(value_counts, df=data, name='value counts', repetitions=repetitions)

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
value counts took: 0.053315162658691406


0.053315162658691406

In [12]:
def groupby_statistics(df):
    return df.groupby(by='passenger_count').agg({'fare_amount': ['mean', 'std'], 
                                               'tip_amount': ['mean', 'std']
                                              }).compute()

benchmark(groupby_statistics, df=data, name='groupby statistics', repetitions=repetitions)

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
groupby statistics took: 1.2294588088989258


1.2294588088989258

In [13]:
other = groupby_statistics(data)

In [14]:
def join_count(df, other):
    return len(dd.merge(df, other, left_index=True, right_index=True))

benchmark(join_count, data, name='join count', repetitions=repetitions, other=other)



upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
join count took: 1.1382417678833008


1.1382417678833008

In [15]:
# will crash
benchmarks['join'] = -1

## Filtered data

Dask is not build to run on filter data like you would normally, so we will apply the same strategy

In [16]:
print(f"Prepare filtered data and deleted {gc.collect()} MB")

Prepare filtered data and deleted 132 MB


In [17]:
expr_filter = (data.pickup_longitude > long_min)  & (data.pickup_longitude < long_max) & \
                  (data.pickup_latitude > lat_min)    & (data.pickup_latitude < lat_max) & \
                  (data.dropoff_longitude > long_min) & (data.dropoff_longitude < long_max) & \
                  (data.dropoff_latitude > lat_min)   & (data.dropoff_latitude < lat_max)

In [20]:
def filter_data(df):
    return df[expr_filter]

benchmark(filter_data, data, name='filter data', repetitions=repetitions)

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
filter data took: 0.0009367465972900391


0.0009367465972900391

In [21]:
filterd = filter_data(data)

del data
print(f"cleaned {gc.collect()} mb")

cleaned 22 mb


In [24]:
benchmark(filter_data, filterd, name='filterd count', repetitions=repetitions)
benchmark(mean, filterd, name='filterd mean', repetitions=repetitions)
benchmark(standard_deviation, filterd, name='filtered standard deviation', repetitions=repetitions)
benchmark(mean_of_sum, filterd, name ='filtered sum columns mean', repetitions=repetitions)
benchmarks['filtered sum columns'] =  benchmarks['filtered sum columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_product, filterd, name ='filterd product columns mean', repetitions=repetitions)
benchmarks['filterd product columns'] = benchmarks['filterd product columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_complicated_arithmetic_operation, filterd, name='filterd arithmetic operation mean', repetitions=repetitions)
benchmarks['filterd arithmetic operation'] =  benchmarks['filterd arithmetic operation mean'] - benchmarks['filterd mean']
benchmark(value_counts, filterd, name ='filtered value counts', repetitions=repetitions)
benchmark(groupby_statistics, filterd, name='filtered groupby statistics', repetitions=repetitions)
other = groupby_statistics(filterd)
benchmarks['filtered join'] = -1
benchmark(join_count, filterd, name='filtered join count', repetitions=repetitions, other=other)
print('Done!')

upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
filterd count took: 0.0004982948303222656
upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
filterd mean took: 1.2559869289398193
upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
filtered standard deviation took: 1.27596116065979
upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
filtered sum columns mean took: 1.283780813217163
upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
filterd product columns mean took: 1.2702300548553467
upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
filterd arithmetic operation mean took: 1.538158655166626
upload: ../results/dask



upload: ../results/dask_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/dask_1b_mlm52xlarge_results.csv
filtered join count took: 1.3692877292633057
Done!
