# Get data and packages

In [1]:
%%capture
%%bash
# Install/upgrade h2o and pin numpy to 1.17.0.
pip install -U h2o numpy==1.17.0
# Copy the taxi parquet data from S3 to the local path that the benchmarks read.
aws s3 cp s3://xdss-public-datasets/demos/taxi_parquet datasets/taxi_parquet/data_0.parquet

# Run in memory - here for code reference

In [2]:
import gc
import numpy as np
import pandas as pd
import warnings
import time
import gc
import os
import os


# --- Run configuration: edit the two "change this" values before running ---
instance_type = 'c5d2xlarge'  # change this
results_bucket = "s3://vaex-sagemaker-demo/benchmarks"  # change this

name = 'h2o'
data_path = 'datasets/taxi_parquet/data_0.parquet'  # a single file for testing

# Local and remote destinations of the results CSV for this run.
output_file = f'{name}_{instance_type}_1m.csv'
results_path = f"results/{output_file}"
# From here on `results_bucket` holds the full S3 key of the results file.
results_bucket = f"{results_bucket}/{output_file}"

# Column-oriented store of timings; every recorded run appends one entry to each list.
benchmarks = {column: [] for column in ('run', 'duration', 'task')}

# Longitude/latitude bounds used to build the pickup/dropoff coordinate filter.
long_min, long_max = -74.05, -73.75
lat_min, lat_max = 40.58, 40.90


def get_results(benchmarks=benchmarks):
    """Return the collected timings as a pandas DataFrame.

    The mapping is first loaded with one row per key and then transposed,
    so each key of ``benchmarks`` ('run', 'duration', 'task') becomes a column.
    """
    by_key = pd.DataFrame.from_dict(benchmarks, orient='index')
    return by_key.transpose()

def persist():
    """Write the collected timings to ``results_path`` and copy the file to S3.

    The upload stays best-effort: a failing ``aws s3 cp`` does not raise, but
    it is reported so that lost results do not go unnoticed.
    """
    gc.collect()
    get_results(benchmarks).to_csv(results_path)
    # os.system returns the command's status; anything non-zero means the copy failed.
    status = os.system(f"aws s3 cp {results_path} {results_bucket}")
    if status != 0:
        print(f"WARNING: upload of {results_path} to {results_bucket} failed (status {status})")
    
def benchmark(f, df, name, **kwargs):
    """Time ``f(df, **kwargs)`` twice and record both runs under ``name``.

    Each run appends its duration (seconds), the task name and the 1-based
    run number to the module-level ``benchmarks`` store. After both runs the
    results are persisted and the duration of the last run is printed.

    Parameters
    ----------
    f : callable
        Function under test; called as ``f(df, **kwargs)``. Its return value
        is discarded.
    df : object
        First positional argument forwarded to ``f``.
    name : str
        Task label stored alongside the timings.
    **kwargs
        Extra keyword arguments forwarded to ``f``.

    Returns
    -------
    float
        Duration of the last run, in seconds.
    """
    for run_index in range(2):
        # perf_counter is monotonic and high-resolution, so very short tasks
        # are not distorted by wall-clock adjustments or coarse clock ticks.
        start_time = time.perf_counter()
        f(df, **kwargs)
        benchmarks['duration'].append(time.perf_counter() - start_time)
        benchmarks['task'].append(name)
        benchmarks['run'].append(run_index + 1)
    persist()
    print(f"{name} took: {benchmarks['duration'][-1]}")
    return benchmarks['duration'][-1]
          
def add_nan(name):
    """Record two NaN placeholder runs for the task ``name``.

    Appends NaN durations (with run numbers 1 and 2) to the module-level
    ``benchmarks`` store, persists the results, prints the last recorded
    duration and returns it.
    """
    for run_number in (1, 2):
        benchmarks['duration'].append(np.nan)
        benchmarks['task'].append(name)
        benchmarks['run'].append(run_number)
    persist()
    last_duration = benchmarks['duration'][-1]
    print(f"{name} took: {benchmarks['duration'][-1]}")
    return last_duration

        

# Make sure the local output and dataset directories exist
# (`results_path` and `data_path` point inside them).
!mkdir -p results
!mkdir -p datasets
print(f"We test every benchmark twice and save both results")

test for 1 repetitions for join and groupby and 5 repetitions for statistics


# Benchmark

In [5]:
# h2o.init() connects to a local H2O server, starting one if none is running.
import h2o
h2o.init()
import numpy as np

# Load data
# Import the parquet file at `data_path` as an H2O frame and report its size.
data = h2o.import_file(data_path)
print(f"size: {len(data)} with {len(data.columns)} columns")

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpcfu6t73t
  JVM stdout: /tmp/tmpcfu6t73t/h2o_ec2_user_started_from_python.out
  JVM stderr: /tmp/tmpcfu6t73t/h2o_ec2_user_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,12 days
H2O cluster name:,H2O_from_python_ec2_user_4f3ubq
H2O cluster total nodes:,1
H2O cluster free memory:,3.358 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%
size: 1000000 with 18 columns


In [8]:
def read_file_parquet(df=None):
    """Import the file at ``data_path`` through H2O and return the result.

    ``df`` is accepted only so the function fits the ``benchmark`` calling
    convention; it is not used.
    """
    imported = h2o.import_file(data_path)
    return imported

benchmark(read_file_parquet, df=data, name='read_file')

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
read_file took: 2.806292772293091


2.806292772293091

In [9]:
def count(df=None):
    """Return ``len(df)``."""
    n_rows = len(df)
    return n_rows

benchmark(count, df=data, name='count')

count took: 5.388259887695313e-06


5.388259887695313e-06

In [11]:
def mean(df):
    """Return the mean of the ``fare_amount`` column of ``df``."""
    fares = df['fare_amount']
    return fares.mean()

benchmark(mean, df=data, name='mean')

mean took: 0.03358917236328125


0.03358917236328125

In [13]:
def standard_deviation(df):
    """Return the result of ``sd()`` on the ``fare_amount`` column of ``df``."""
    fares = df['fare_amount']
    return fares.sd()

benchmark(standard_deviation, df=data, name='standard deviation')

standard deviation took: 0.018494558334350587


0.018494558334350587

To measure the time of an operation on two columns, we can't return the resulting column, since it would be pulled into memory and break. Instead we run a mean calculation on the result, and then subtract the time it takes to run the mean itself.

In [14]:
def mean_of_sum(df):
    """Return the mean of ``fare_amount + trip_distance``."""
    total = df['fare_amount'] + df['trip_distance']
    return total.mean()

benchmark(mean_of_sum, df=data, name='sum columns mean')


sum columns mean took: 0.09631581306457519


In [15]:
def sum_columns(df):
    """Add ``fare_amount`` and ``trip_distance`` and return the mean of the sum.

    The sum is reduced with ``mean()`` rather than returned as a column.
    """
    column_sum = df['fare_amount'] + df['trip_distance']
    return column_sum.mean()
benchmark(sum_columns, df=data, name='sum columns')

sum columns took: 0.07579655647277832


0.07579655647277832

In [17]:
def mean_of_product(df):
    """Return the mean of ``fare_amount * trip_distance``."""
    product = df['fare_amount'] * df['trip_distance']
    return product.mean()

benchmark(mean_of_product, df=data, name='product columns mean')

product columns mean took: 0.08022994995117187


0.08022994995117187

In [18]:
def product_columns(df):
    """Return the element-wise product ``fare_amount * trip_distance``.

    NOTE(review): unlike ``sum_columns``, the result is returned without
    being reduced. If the frame library evaluates expressions lazily, this
    timing may only cover building the expression — confirm.
    """
    fares = df['fare_amount']
    distances = df['trip_distance']
    return fares * distances

benchmark(product_columns, df=data, name='product columns')

product columns took: 0.00018525123596191406


0.00018525123596191406

In [19]:
def mean_of_complicated_arithmetic_operation(df):
    """Return the mean of a trigonometric expression over the trip coordinates.

    The four coordinate columns are converted to pandas via
    ``as_data_frame()`` and then to NumPy arrays, so the arithmetic itself
    runs in NumPy on in-memory data.

    Parameters
    ----------
    df : frame-like
        Must support ``df[column].as_data_frame()`` for the columns
        ``pickup_longitude``, ``pickup_latitude``, ``dropoff_longitude`` and
        ``dropoff_latitude``.

    Returns
    -------
    float
        Mean of the computed per-row value.
    """
    def column_values(column):
        # DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
        # .values returns the same 2-D array on old and new pandas versions.
        return df[column].as_data_frame().values

    theta_1 = column_values('pickup_longitude')
    phi_1 = column_values('pickup_latitude')
    theta_2 = column_values('dropoff_longitude')
    phi_2 = column_values('dropoff_latitude')
    # The expression is kept exactly as written so recorded results stay comparable.
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    distance = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return distance.mean()

benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean')

  from ipykernel import kernelapp as app
  app.launch_new_instance()


arithmetic operation mean took: 3.015794038772583


3.015794038772583

In [23]:
def value_counts(df):
    """Return the result of ``table()`` on the ``fare_amount`` column of ``df``."""
    fares = df['fare_amount']
    return fares.table()

benchmark(value_counts, df=data, name='value counts')

value counts took: 0.00010175704956054688


0.00010175704956054688

In [24]:
def groupby_statistics(df):
    """Group ``df`` by ``passenger_count`` and request per-group statistics.

    Mean and standard deviation of ``fare_amount`` and ``tip_amount`` are
    requested on the grouping, and the grouping's frame is returned.
    """
    grouped = df.group_by(by=['passenger_count'])
    with_means = grouped.mean(col=['fare_amount', 'tip_amount'])
    with_means.sd(col=['fare_amount', 'tip_amount'])
    return grouped.get_frame()

benchmark(groupby_statistics, df=data, name='groupby statistics')

groupby statistics took: 0.04497408866882324


0.04497408866882324

In [32]:
other = groupby_statistics(data)

In [37]:
def join(df, other):
    """Return ``df`` merged with ``other`` using the frame's default ``merge``."""
    merged = df.merge(other)
    return merged

benchmark(join, data, name='join', other=other)

join took: 0.10525631904602051


0.10525631904602051

In [39]:
def join_count(df, other):
    """Merge ``df`` with ``other`` and return the length of the result."""
    merged = df.merge(other)
    return len(merged)

benchmark(join_count, data, name='join count', other=other)

join count took: 1.5625560283660889


1.5625560283660889

## Filtered data

Dask is not built to run on filtered data the way you normally would, so we apply the same strategy here.

In [40]:
# gc.collect() returns the number of unreachable objects it found, not a size
# in megabytes, so the message reports an object count.
print(f"Prepare filtered data; gc collected {gc.collect()} unreachable objects")

Prepare filtered data and deleted 0 MB


In [43]:
# Row mask over `data`: keeps rows whose pickup and dropoff coordinates both lie
# strictly inside the [long_min, long_max] x [lat_min, lat_max] box.
expr_filter = (data['pickup_longitude'] > long_min)  & (data['pickup_longitude'] < long_max) & \
                  (data['pickup_latitude'] > lat_min)    & (data['pickup_latitude'] < lat_max) & \
                  (data['dropoff_longitude'] > long_min) & (data['dropoff_longitude'] < long_max) & \
                  (data['dropoff_latitude'] > lat_min)   & (data['dropoff_latitude'] < lat_max)

def filter_data(df):
    """Return ``df`` indexed by the module-level ``expr_filter`` mask."""
    selected = df[expr_filter]
    return selected

benchmark(filter_data, data, name='filter data')

filter data took: 5.7125091552734374e-05


5.7125091552734374e-05

In [44]:
# Build the filtered frame, then drop the Python reference to the unfiltered
# frame so that only `filtered` is used from here on.
filtered = filter_data(data)

del data
# gc.collect() returns the number of unreachable objects it found, not a size
# in megabytes, so the message reports an object count.
print(f"cleaned {gc.collect()} unreachable objects")

cleaned 0 mb


In [45]:
# Re-run the benchmarks on the filtered frame. Tasks that are not measured on
# filtered data are recorded as NaN placeholders via add_nan, so they still
# appear under their task names in the results.
benchmark(mean, filtered, name='filtered mean')
benchmark(standard_deviation, filtered, name='filtered standard deviation')
benchmark(mean_of_sum, filtered, name ='filtered sum columns mean')
add_nan('filtered sum columns')
benchmark(mean_of_product, filtered, name ='filtered product columns mean')
add_nan('filtered product columns')
benchmark(mean_of_complicated_arithmetic_operation, filtered, name='filtered arithmetic operation mean')
add_nan('filtered arithmetic operation')
benchmark(value_counts, filtered, name ='filtered value counts')
benchmark(groupby_statistics, filtered, name='filtered groupby statistics')
# The join partner is rebuilt from the filtered frame before timing the join count.
other = groupby_statistics(filtered)
add_nan('filtered join')
benchmark(join_count, filtered, name='filtered join count', other=other)
print(name)
# Last expression of the cell: display the full table of recorded timings.
get_results(benchmarks)

filterd mean took: 0.3057099342346191
filtered standard deviation took: 0.016369342803955078
filtered sum columns mean took: 0.07889642715454101
filterd product columns mean took: 0.07714705467224121


  from ipykernel import kernelapp as app
  app.launch_new_instance()


filterd arithmetic operation mean took: 2.8690340518951416
filtered value counts took: 0.000301361083984375
filtered groupby statistics took: 0.03866004943847656
filtered join count took: 1.5344624519348145
Done!
