# Get data and packages

In [2]:
%%capture
%%bash
# Quote the requirement: unquoted square brackets are a shell glob pattern and
# could be expanded against files in the working directory.
pip install -U pip "modin[all]"
# aws s3 cp --recursive s3://xdss-public-datasets/demos/taxi_parquet datasets/taxi_parquet/data_0.parquet

# Can only read a single parquet file - keep only for code references

In [1]:
import gc
import numpy as np
import pandas as pd
import warnings
import time
import gc
import os
import os


# --- Run configuration -------------------------------------------------------
instance_type = 'c5d2xlarge'  # change this
results_bucket = "s3://vaex-sagemaker-demo/benchmarks"  # change this

name = 'modin-ray'
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
data_path = 'datasets/taxi_parquet/data_0.parquet'

# Local CSV with the timings, and the full S3 destination it is copied to.
output_file = f'{name}_{instance_type}_1m.csv'
results_path = f"results/{output_file}"
results_bucket = f"{results_bucket}/{output_file}"

# One list per result column; the helpers below append to all three in lockstep.
benchmarks = {column: [] for column in ('run', 'duration', 'task')}

# Longitude / latitude bounds used by the filter expression later on.
long_min, long_max = -74.05, -73.75
lat_min, lat_max = 40.58, 40.90


def get_results(benchmarks=benchmarks):
    """Return the recorded timings as a DataFrame with one column per key.

    The dict is first loaded with its keys as rows (``orient='index'``) and
    then transposed, so each key of ``benchmarks`` ends up as a column.
    """
    keyed_by_row = pd.DataFrame.from_dict(benchmarks, orient='index')
    return keyed_by_row.transpose()

def persist():
    """Save the collected timings locally and copy them to S3.

    Runs a garbage-collection pass, writes the current ``benchmarks`` table to
    ``results_path`` as CSV and then copies that file to ``results_bucket``
    with the AWS CLI.

    The upload is best-effort: when the ``aws s3 cp`` command reports a
    non-zero status a ``RuntimeWarning`` is issued instead of raising, so a
    benchmark run is never aborted by a failed upload but the failure is no
    longer silent.
    """
    gc.collect()
    get_results(benchmarks).to_csv(results_path)
    exit_status = os.system(f"aws s3 cp {results_path} {results_bucket}")
    if exit_status != 0:
        warnings.warn(
            f"Copying {results_path} to {results_bucket} failed "
            f"(os.system returned {exit_status}); results were only saved locally.",
            RuntimeWarning,
            stacklevel=2,
        )
    
def benchmark(f, df, name, **kwargs):
    """Time ``f(df, **kwargs)`` twice and record both runs under ``name``.

    Args:
        f: Callable under test; invoked as ``f(df, **kwargs)``.
        df: Object passed to ``f`` as its first argument.
        name: Task label stored in the ``task`` column of ``benchmarks``.
        **kwargs: Extra keyword arguments forwarded to ``f`` unchanged.

    Returns:
        The duration in seconds of the last (second) run.

    Side effects:
        Appends two rows to the module-level ``benchmarks`` dict, calls
        ``persist()`` once after both runs and prints the last duration.
    """
    for i in range(2):
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring short durations; time.time() follows the wall clock and
        # can jump when the system time is adjusted.
        start_time = time.perf_counter()
        result = f(df, **kwargs)
        elapsed = time.perf_counter() - start_time
        # Release the result outside the timed region so the next run neither
        # holds the previous result in memory nor pays for freeing it.
        del result
        benchmarks['duration'].append(elapsed)
        benchmarks['task'].append(name)
        benchmarks['run'].append(i+1)
    persist()
    print(f"{name} took: {benchmarks['duration'][-1]}")
    return benchmarks['duration'][-1]
          
def add_nan(name):
    """Record two placeholder runs for ``name`` with a NaN duration.

    Appends rows for run 1 and run 2 to ``benchmarks``, persists the table,
    prints the last recorded duration and returns it (NaN).
    """
    for run_number in (1, 2):
        benchmarks['run'].append(run_number)
        benchmarks['task'].append(name)
        benchmarks['duration'].append(np.nan)
    persist()
    last_duration = benchmarks['duration'][-1]
    print(f"{name} took: {last_duration}")
    return last_duration

        

!mkdir -p results
!mkdir -p datasets
print(f"We test every benchmark twice and save both results")

Test with 1 repetition for join and groupby and 5 repetitions for statistics. (Note: the `benchmark` helper in this notebook currently runs every task twice.)


# Benchmark

In [2]:
# NOTE: this rebinds `pd`. The setup cell imported plain pandas as `pd`; from
# here on `pd` refers to modin.pandas, including inside helpers that look up
# `pd` at call time (e.g. get_results).
import modin.pandas as pd
import numpy as np

# Load data
# Read the parquet file at `data_path` with the pyarrow engine and report its size.
data = pd.read_parquet(data_path, engine='pyarrow')
print(f"size: {len(data)} with {len(data.columns)} columns")



size: 1000000 with 18 columns


In [59]:
def read_file_parquet(df=None):
    """Read the parquet file at ``data_path`` using the pyarrow engine.

    ``df`` is ignored; the parameter exists so the function can be called the
    same way ``benchmark`` calls every task, i.e. ``f(df, **kwargs)``.
    """
    loaded = pd.read_parquet(data_path, engine='pyarrow')
    return loaded

benchmark(read_file_parquet, df=data, name='read_file')



read_file took: 1.5481024742126466


1.5481024742126466

In [60]:
def count(df=None):
    """Return ``len(df)``, i.e. the number of rows for a DataFrame."""
    n_rows = len(df)
    return n_rows

benchmark(count, df=data, name='count')



count took: 1.2254714965820312e-05


1.2254714965820312e-05

In [61]:
def mean(df):
    """Return the mean of the ``fare_amount`` column of ``df``."""
    fares = df.fare_amount
    return fares.mean()

benchmark(mean, df=data, name='mean')



mean took: 0.05955638885498047


0.05955638885498047

In [62]:
def standard_deviation(df):
    """Return the standard deviation of the ``fare_amount`` column of ``df``."""
    fares = df.fare_amount
    return fares.std()

benchmark(standard_deviation, df=data, name='standard deviation')



standard deviation took: 0.04032907485961914


0.04032907485961914

To measure operations that combine two columns, we can't return the full result, since it would be loaded into memory and break the run. Instead we compute the mean of the result, and then subtract the time it took to run the mean.

In [63]:
def mean_of_sum(df):
    """Return the mean of ``fare_amount + trip_distance`` computed row-wise."""
    row_totals = df.fare_amount + df.trip_distance
    return row_totals.mean()

benchmark(mean_of_sum, df=data, name='sum columns mean')
add_nan('sum columns')



sum columns mean took: 0.13871545791625978


In [64]:
def mean_of_product(df):
    """Return the mean of ``fare_amount * trip_distance`` computed row-wise."""
    row_products = df.fare_amount * df.trip_distance
    return row_products.mean()

benchmark(mean_of_product, df=data, name='product columns mean')
add_nan('product columns')



product columns mean took: 0.14782986640930176


In [65]:
def mean_of_complicated_arithmetic_operation(df):
    """Return the mean of a trigonometric expression over the trip coordinates.

    The expression converts the pickup/dropoff longitudes and latitudes from
    degrees to radians (``* pi / 180``), combines squared half-difference
    sines with a cosine product, and takes ``2 * arctan2(sqrt(t), sqrt(1 - t))``
    per row before averaging.

    NOTE(review): the shape resembles the haversine formula, but the cosine
    factors are applied to the longitudes rather than the latitudes - confirm
    whether that is intended. It is kept unchanged here because the function
    serves as an arithmetic workload for the benchmark.
    """
    lon_start, lat_start = df.pickup_longitude, df.pickup_latitude
    lon_end, lat_end = df.dropoff_longitude, df.dropoff_latitude

    half_lon_term = np.sin((lon_end - lon_start) / 2 * np.pi / 180) ** 2
    cos_product = np.cos(lon_start * np.pi / 180) * np.cos(lon_end * np.pi / 180)
    half_lat_term = np.sin((lat_end - lat_start) / 2 * np.pi / 180) ** 2
    inner = half_lon_term + cos_product * half_lat_term

    angle = np.arctan2(np.sqrt(inner), np.sqrt(np.subtract(1, inner)))
    doubled = np.multiply(angle, 2)
    return doubled.mean()

benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean')
add_nan('arithmetic operation')




arithmetic operation mean took: 1.4310154914855957


In [66]:
def value_counts(df):
    """Return how often each distinct ``fare_amount`` value occurs in ``df``."""
    fares = df.fare_amount
    return fares.value_counts()

benchmark(value_counts, df=data, name='value counts')



value counts took: 0.13008618354797363


0.13008618354797363

In [67]:
def groupby_statistics(df):
    """Return mean and std of fare and tip amounts per ``passenger_count``.

    The result has one row per passenger count and a two-level column index:
    (``fare_amount`` | ``tip_amount``) x (``mean`` | ``std``).
    """
    aggregations = {
        column: ['mean', 'std'] for column in ('fare_amount', 'tip_amount')
    }
    grouped = df.groupby(by='passenger_count')
    return grouped.agg(aggregations)

benchmark(groupby_statistics, df=data, name='groupby statistics')



groupby statistics took: 1.2036387920379639


1.2036387920379639

In [69]:
other = groupby_statistics(data)



In [70]:
def join_count(df, other):
    """Merge ``df`` and ``other`` on their indexes and return the row count."""
    merged = pd.merge(df, other, left_index=True, right_index=True)
    return len(merged)

benchmark(join_count, data, name='join count', other=other)



join count took: 0.39369869232177734


0.39369869232177734

In [None]:
def join_data(df, other):
    """Merge ``df`` and ``other`` on their indexes and return the merged frame."""
    return pd.merge(df, other, left_index=True, right_index=True)

# The 'join' task must time join_data (which returns the merged frame);
# join_count is already recorded separately under 'join count'.
benchmark(join_data, data, name='join', other=other)

## Filtered data

Dask is not built to run on filtered data the way you normally would, so we will apply the same strategy here.

In [71]:
# gc.collect() returns the number of unreachable objects it found, not a size
# in megabytes, so report it as an object count.
print(f"Prepare filtered data and collected {gc.collect()} unreachable objects")

Prepare filtered data and deleted 22 MB


In [None]:
# Boolean mask keeping trips whose pickup and dropoff coordinates both fall
# strictly inside the configured longitude/latitude bounds. It is built once
# from `data`, outside of the timed function.
expr_filter = (
    (data.pickup_longitude > long_min) & (data.pickup_longitude < long_max)
    & (data.pickup_latitude > lat_min) & (data.pickup_latitude < lat_max)
    & (data.dropoff_longitude > long_min) & (data.dropoff_longitude < long_max)
    & (data.dropoff_latitude > lat_min) & (data.dropoff_latitude < lat_max)
)

def filter_data(df):
    """Return the rows of ``df`` selected by the precomputed ``expr_filter`` mask."""
    selected = df[expr_filter]
    return selected

benchmark(filter_data, data, name='filter data')

In [None]:
filtered = filter_data(data)

# Drop the reference to the unfiltered frame before benchmarking the filtered one.
del data
# gc.collect() returns the number of unreachable objects it found, not a size
# in megabytes, so report it as an object count.
print(f"cleaned {gc.collect()} unreachable objects")

In [None]:
# Repeat the earlier tasks on the filtered frame; every task name gets a
# 'filtered ' prefix so the rows can be told apart in the results table.
benchmark(mean, filtered, name='filtered mean')
benchmark(standard_deviation, filtered, name='filtered standard deviation')
# Each add_nan call records NaN placeholder rows for a task that is not timed
# directly (only its "... mean" variant is measured).
benchmark(mean_of_sum, filtered, name ='filtered sum columns mean')
add_nan('filtered sum columns')
benchmark(mean_of_product, filtered, name ='filtered product columns mean')
add_nan('filtered product columns')
benchmark(mean_of_complicated_arithmetic_operation, filtered, name='filtered arithmetic operation mean')
add_nan('filtered arithmetic operation')
benchmark(value_counts, filtered, name ='filtered value counts')
benchmark(groupby_statistics, filtered, name='filtered groupby statistics')
# Recompute the per-passenger-count statistics on the filtered frame; they are
# the right-hand side of the join below.
other = groupby_statistics(filtered)
add_nan('filtered join')
benchmark(join_count, filtered, name='filtered join count', other=other)
# Show which configuration was benchmarked and display the full results table.
print(name)
get_results(benchmarks)