# Get data and packages

In [1]:
%%capture
# Install/upgrade the H2O Python client and a pinned NumPy; %%capture suppresses pip's output.
# NOTE(review): h2o is not version-pinned, so -U pulls whatever release is newest.
# The cluster banner recorded in this notebook reports 3.28.0.1 — consider pinning
# if the timings need to be reproducible.
!pip install -U h2o numpy==1.17.0
# Disabled command that copies the dataset to the path used by `data_path`.
# !aws s3 cp s3://xdss-public-datasets/demos/taxi_1m.csv ../datasets/taxi_1m.csv

In [2]:
import pandas as pd
import numpy as np
import h2o
import numpy as np
import warnings
import datetime as dt
import time
import gc
import os

# Connect to a local H2O server (the recorded output shows one being started when none is found).
h2o.init()

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Make sure the directory that will receive the results CSV exists.
os.makedirs('../results', exist_ok=True)

# Maps benchmark name -> recorded timing; populated by benchmark().
benchmarks = {}

def benchmark(f, name, df, repetitions=1, **kwargs):
    """Time ``f(df, **kwargs)`` and record the mean duration of one call.

    Parameters
    ----------
    f : callable
        Function under test, invoked as ``f(df, **kwargs)``. Its return
        value is discarded.
    name : str
        Key under which the timing is stored in the module-level
        ``benchmarks`` dict.
    df : object
        First positional argument forwarded to ``f``.
    repetitions : int, default 1
        Number of timed calls; must be at least 1.
    **kwargs
        Extra keyword arguments forwarded to ``f`` on every call.

    Returns
    -------
    numpy.float64
        Mean duration of a single call, in seconds.

    Raises
    ------
    ValueError
        If ``repetitions`` is smaller than 1 (there would be nothing to average).
    """
    if repetitions < 1:
        raise ValueError("repetitions must be at least 1")

    times = []
    for _ in range(repetitions):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        f(df, **kwargs)
        times.append(time.perf_counter() - start_time)

    # np.mean already divides by the number of runs; dividing by the run
    # count a second time would understate the timing by that factor.
    benchmarks[name] = np.mean(times)
    return benchmarks[name]

def get_results(benchmarks, name):
    """Turn a ``{benchmark name: timing}`` mapping into a one-column DataFrame.

    The mapping's keys become the index and the single column is labelled
    ``name``.
    """
    table = pd.DataFrame.from_dict(data=benchmarks, orient='index')
    table.columns = [name]
    return table

# CSV file loaded by read_file().
data_path = '../datasets/taxi_1m.csv'
# Column label for this framework's timings; also used to build the results filename.
name = 'h2o'
# How many times each benchmarked function is called.
repetitions = 1

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpf3h4u75t
  JVM stdout: /tmp/tmpf3h4u75t/h2o_ec2_user_started_from_python.out
  JVM stderr: /tmp/tmpf3h4u75t/h2o_ec2_user_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,18 days
H2O cluster name:,H2O_from_python_ec2_user_jtmzlv
H2O cluster total nodes:,1
H2O cluster free memory:,6.983 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


### H2O implementation

In [3]:
def read_file(df=None):
    """Import the CSV at ``data_path`` into H2O and return the resulting frame.

    Parameters
    ----------
    df : object, optional
        Ignored. Accepted only so the function can be passed to
        ``benchmark``, which always calls ``f(df, **kwargs)``; without it
        that call raises ``TypeError``.
    """
    return h2o.import_file(data_path)
    
def mean(df):
    """Return the mean of the ``fare_amount`` column."""
    fares = df['fare_amount']
    return fares.mean()
    
def standard_deviation(df):
    """Return the result of calling ``sd()`` on the ``fare_amount`` column."""
    fares = df['fare_amount']
    return fares.sd()

def sum_columns(df):
    """Return ``fare_amount + passenger_count``."""
    fares = df['fare_amount']
    passengers = df['passenger_count']
    return fares + passengers

def product_columns(df):
    """Return ``fare_amount * passenger_count``."""
    fares = df['fare_amount']
    passengers = df['passenger_count']
    return fares * passengers

def complicated_arithmetic_operation(df):
    """Compute a haversine-style distance between pickup and dropoff points.

    Each coordinate column is pulled out of ``df`` via ``as_data_frame()``
    and converted to a NumPy array; the arithmetic itself runs in NumPy.

    Returns
    -------
    numpy.ndarray
        The angular distance scaled by 3958.8, one row per input row.
    """
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .values yields the same ndarray on every pandas version.
    theta_1 = df['pickup_longitude'].as_data_frame().values
    phi_1 = df['pickup_latitude'].as_data_frame().values
    theta_2 = df['dropoff_longitude'].as_data_frame().values
    phi_2 = df['dropoff_latitude'].as_data_frame().values

    # NOTE(review): relative to the standard haversine formula the roles of
    # longitude (theta) and latitude (phi) look swapped here. Left unchanged
    # so the workload stays the same — confirm whether that is intended.
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    distance = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return distance * 3958.8

def value_counts(df):
    """Return the result of calling ``table()`` on the ``passenger_count`` column."""
    passengers = df['passenger_count']
    return passengers.table()

def groupby_statistics(df):
    """Group by ``pickup_hour`` and request mean and sd of the fare and tip columns.

    Returns whatever ``get_frame()`` yields on the group-by object.
    """
    stat_columns = ('fare_amount', 'tip_amount')
    grouped = df.group_by(by=['pickup_hour'])
    # sd() is called on whatever mean() returns, mirroring a chained call.
    after_mean = grouped.mean(col=list(stat_columns))
    after_mean.sd(col=list(stat_columns))
    return grouped.get_frame()
                                 
def join(df, other):
    """Merge ``other`` into ``df`` using ``df.merge`` with its default settings."""
    merged = df.merge(other)
    return merged
    

def filter_data(df):
    """Keep rows whose pickup and dropoff coordinates fall strictly inside a bounding box.

    The box is -74.05 < longitude < -73.75 and 40.58 < latitude < 40.90,
    applied to both the pickup and the dropoff point.
    """
    long_min, long_max = -74.05, -73.75
    lat_min, lat_max = 40.58, 40.90

    # (column, exclusive lower bound, exclusive upper bound), in evaluation order.
    bounds = [
        ('pickup_longitude', long_min, long_max),
        ('pickup_latitude', lat_min, lat_max),
        ('dropoff_longitude', long_min, long_max),
        ('dropoff_latitude', lat_min, lat_max),
    ]

    mask = None
    for column, lower, upper in bounds:
        above_lower = df[column] > lower
        mask = above_lower if mask is None else mask & above_lower
        mask = mask & (df[column] < upper)
    return df[mask]


# All data

In [4]:
# Load the dataset and derive the hour-of-day column used by the group-by benchmark.
data = read_file()
data['pickup_hour'] = data['pickup_datetime'].hour()
# shape is (rows, columns): report shape[1] for the column count, not shape[0] twice.
print(f"size: {data.shape[0]} with {data.shape[1]} columns")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [16]:
# Benchmark every single-frame operation on the full dataset, in a fixed order.
all_data_cases = [
    (read_file, 'read_file'),
    (mean, 'mean'),
    (standard_deviation, 'standard deviation'),
    (sum_columns, 'sum columns'),
    (product_columns, 'product columns'),
    (complicated_arithmetic_operation, 'complicated arithmetic operation'),
    (value_counts, 'value counts'),
    (groupby_statistics, 'groupby statistics'),
    (filter_data, 'filter'),
]
for case_fn, case_name in all_data_cases:
    benchmark(case_fn, case_name, data, repetitions=repetitions)

# Run a garbage collection pass before the join benchmark.
gc.collect()
# The right-hand side of the join is built outside the timed call.
benchmark(join, 'join', data, repetitions=repetitions, other=groupby_statistics(data))
print(f"Done benchmarks on all data")

Done benchmarks on all data


## Filtered

In [17]:
# Keep only the rows inside the bounding box, then drop the reference to the full frame.
filterd = filter_data(data)

del data

# gc.collect() returns the number of unreachable objects it found, not a size
# in MB, so the message reports it as an object count.
print(f"Prepare filtered data; gc collected {gc.collect()} unreachable objects")

Prepare filtered data and deleted 281 MB


In [18]:
# Repeat the single-frame benchmarks on the filtered subset, in a fixed order.
filtered_cases = [
    (mean, 'filtered mean'),
    (standard_deviation, 'filtered standard deviation'),
    (sum_columns, 'filtered sum columns'),
    (product_columns, 'filtered product_columns'),
    (complicated_arithmetic_operation, 'filtered complicated arithmetic_operation'),
    (value_counts, 'filtered value_counts'),
    (groupby_statistics, 'filtered groupby statistics'),
]
for case_fn, case_name in filtered_cases:
    benchmark(case_fn, case_name, filterd, repetitions=repetitions)

# The right-hand side of the join is built outside the timed call.
benchmark(join, 'filtered join', filterd, repetitions=repetitions, other=groupby_statistics(filterd))
print(f"Done benchmarks on filterd data")

0.01129647970199585

In [18]:
# Collect the recorded timings into a one-column table, write it to CSV, and preview it.
results = get_results(benchmarks, name)
results_csv = f"../results/{name}_1m.csv"
results.to_csv(results_csv)
results.head()

Unnamed: 0,h2o
mean,0.005461
standard deviation,0.002196
sum columns,2.1e-05
product columns,1.9e-05
complicated arithmetic operation,0.583356


In [20]:
# Copy the results CSV to S3 with the AWS CLI.
# NOTE(review): the local filename is hard-coded as h2o_1m.csv rather than derived
# from `name`; it matches only while name == 'h2o'.
!aws s3 cp  ../results/h2o_1m.csv s3://vaex-sagemaker-demo/benchmarks/h2o_1m_results.csv 