# Install packages

In [6]:
%%capture
%%bash
# Upgrade (-U) the libraries this notebook imports: datatable itself, plus
# numpy and vaex (vaex-core / vaex-hdf5), which read_file uses to open the
# HDF5 dataset.
# NOTE(review): no versions are pinned, so a re-run installs whatever is the
# latest release at that time.
python -m pip install -U datatable
python -m pip install -U pip numpy vaex-core vaex-hdf5
# Upgrade the IPython / kernel packages as well.
python -m pip install -U ipython ipykernel 

# Please restart the kernel

In [1]:
import gc
import numpy as np
import pandas as pd
import warnings
import time
import gc
import os

# --- Benchmark configuration -------------------------------------------------
instance_type = 'c5d4xlarge' # change this
results_bucket = "s3://xdss-benchmarks/benchmarks" # change this

# Library under test; also used to build the name of the results file.
name = 'datatable'
data_path = 'datasets/taxi_1B.hdf5'

# Local and remote locations of the results CSV. Note that results_bucket is
# rebound here from the bucket prefix to the full destination of the file.
output_file = '{}_{}.csv'.format(name, instance_type)
results_path = "results/" + output_file
results_bucket = results_bucket + "/" + output_file

# One list per recorded field; the lists grow in lockstep, one entry per run.
benchmarks = {field: [] for field in ('run', 'duration', 'task')}

# Longitude / latitude bounds (not used by the cells visible in this part of
# the notebook).
long_min, long_max = -74.05, -73.75
lat_min, lat_max = 40.58, 40.90


def get_results(benchmarks=benchmarks):
    """Return the recorded benchmark fields as a pandas DataFrame.

    Parameters
    ----------
    benchmarks : dict
        Mapping of field name to list of values. Defaults to the module-level
        ``benchmarks`` dict as bound when this function is defined.

    Returns
    -------
    pandas.DataFrame
        One column per key of ``benchmarks``.
    """
    # Build with the keys as the index first, then flip so they become columns.
    fields_as_rows = pd.DataFrame.from_dict(benchmarks, orient='index')
    return fields_as_rows.transpose()

def persist():
    """Save the benchmark results locally and copy them to S3 (best effort).

    Runs a garbage collection, writes the current results to ``results_path``
    as CSV, then invokes ``aws s3 cp`` to copy that file to ``results_bucket``.
    A failed upload does not raise; it prints a warning so the failure is
    visible instead of being silently ignored.
    """
    # Local import: subprocess is not among the notebook's top-level imports.
    import subprocess

    gc.collect()
    get_results(benchmarks).to_csv(results_path)

    # Pass the command as an argument list so the paths are never re-parsed
    # by a shell.
    upload_cmd = ["aws", "s3", "cp", results_path, results_bucket]
    try:
        completed = subprocess.run(upload_cmd)
    except OSError as exc:
        # e.g. the aws CLI is not installed; keep the local CSV and carry on.
        print(f"WARNING: could not run the aws CLI: {exc}")
        return
    if completed.returncode != 0:
        print(f"WARNING: upload to {results_bucket} failed "
              f"(exit code {completed.returncode})")
    
def benchmark(f, df, name, **kwargs):
    """Time ``f(df, **kwargs)`` twice and record both runs.

    For each run the elapsed time in seconds, the task ``name`` and the
    1-based run number are appended to the module-level ``benchmarks`` dict.
    After both runs the results are persisted via ``persist()``.

    Parameters
    ----------
    f : callable
        Function under test; called as ``f(df, **kwargs)``.
    df : object
        First positional argument handed to ``f``.
    name : str
        Task label stored with each measurement.
    **kwargs
        Extra keyword arguments forwarded to ``f``.

    Returns
    -------
    float
        Duration in seconds of the last (second) run.
    """
    for run in (1, 2):
        # perf_counter is monotonic and high-resolution, so very short tasks
        # are not distorted by wall-clock granularity or clock adjustments.
        started = time.perf_counter()
        # The result is deliberately not bound to a name: keeping the first
        # run's result alive while the second run executes would hold two
        # results in memory at once.
        f(df, **kwargs)
        elapsed = time.perf_counter() - started
        benchmarks['duration'].append(elapsed)
        benchmarks['task'].append(name)
        benchmarks['run'].append(run)
    persist()
    last_duration = benchmarks['duration'][-1]
    print(f"{name} took: {last_duration}")
    return last_duration
          
def add_nan(name):
    """Record NaN durations for both runs of task ``name``.

    Appends two placeholder entries (run 1 and run 2) with a duration of
    ``np.nan`` to the module-level ``benchmarks`` dict, persists the results,
    prints the usual summary line and returns the last recorded duration
    (``nan``).
    """
    for run in (1, 2):
        benchmarks['run'].append(run)
        benchmarks['task'].append(name)
        benchmarks['duration'].append(np.nan)
    persist()
    last_duration = benchmarks['duration'][-1]
    print(f"{name} took: {last_duration}")
    return last_duration

          
# Create the working directories with the standard library instead of shelling
# out, so the cell does not depend on a Unix `mkdir` being available.
os.makedirs('results', exist_ok=True)
os.makedirs('datasets', exist_ok=True)
print("We test every benchmark twice and save both results")

We test every benchmark twice and save both results


# Benchmark

In [2]:
import datatable as dt
from datatable import f, math
import vaex
import numpy as np

In [3]:
# This is a hack to let datatable read hdf5, it currently can't read parquet, multiple files, or a huge csv
def read_file(data=None):
    """Open ``data_path`` with vaex and wrap its numeric columns in a datatable Frame.

    Float columns are passed as a ``float32`` view, integer columns are passed
    as they are, and string columns or any other dtype are left out.

    Parameters
    ----------
    data : object, optional
        Ignored. It exists so the function can be invoked as ``f(df)`` by
        ``benchmark``.

    Returns
    -------
    datatable.Frame
        Frame built from the selected columns, keyed by column name.
    """
    source = vaex.open(data_path)
    selected = {}
    for column_name in source.get_column_names():
        values = source.columns[column_name]
        if values.dtype == str:
            continue  # skip strings
        kind = values.dtype.kind
        if kind == 'f':
            # datatable is picky about <f4 format
            # NOTE(review): .view() reinterprets the existing bytes rather than
            # converting them; this presumably relies on the float columns
            # already being 4-byte floats -- confirm.
            selected[column_name] = values.view(np.float32)
        elif kind == 'i':
            selected[column_name] = values
        # anything that is neither int nor float is ignored
    return dt.Frame(**selected)

# Load data
data = read_file()
# data.shape is (rows, columns)
print("size: {} with {} columns".format(*data.shape))

size: 1173057927 with 14 columns


In [4]:
# read_file never uses its argument (it always opens data_path); `data` is
# passed only because benchmark calls the function as f(df).
benchmark(read_file, df=data, name='read_file')

read_file took: 0.006750583648681641


0.006750583648681641

In [5]:
def count(df=None):
    """Return the number of rows of ``df``, i.e. the first entry of ``df.shape``."""
    n_rows = df.shape[0]
    return n_rows

# count only reads df.shape[0], so this measures an attribute lookup rather
# than a pass over the data.
benchmark(count, df=data, name='count')

count took: 4.76837158203125e-07


4.76837158203125e-07

In [6]:
def mean(df):
    """Select the mean of the ``fare_amount`` column from ``df``."""
    fare_mean = dt.mean(dt.f.fare_amount)
    return df[:, fare_mean]

benchmark(mean, data, name='mean')

mean took: 5.278338432312012


5.278338432312012

In [7]:
def standard_deviation(df):
    """Select the standard deviation (``dt.sd``) of ``fare_amount`` from ``df``."""
    fare_sd = dt.sd(dt.f.fare_amount)
    return df[:, fare_sd]

benchmark(standard_deviation, data, name='standard deviation')

standard deviation took: 8.283175706863403


8.283175706863403

In [8]:
def mean_of_sum(df):
    """Select the mean of ``fare_amount + trip_distance`` from ``df``."""
    summed = f.fare_amount + f.trip_distance
    return df[:, dt.mean(summed)]

benchmark(mean_of_sum, data, name='sum columns mean')

sum columns mean took: 14.607588768005371


14.607588768005371

In [9]:
def sum_columns(df):
    """Select the expression ``fare_amount + trip_distance`` from ``df``.

    NOTE(review): the timing recorded for this task is on the order of 1e-5 s,
    which suggests the selection is returned without the sum being computed
    eagerly -- confirm before comparing against eager libraries.
    """
    summed = f.fare_amount + f.trip_distance
    return df[:, summed]

benchmark(sum_columns, data, name='sum columns')

sum columns took: 1.4781951904296875e-05


1.4781951904296875e-05

In [10]:
def mean_of_product(df):
    """Select the mean of ``fare_amount * trip_distance`` from ``df``."""
    product = f.fare_amount * f.trip_distance
    return df[:, dt.mean(product)]

benchmark(mean_of_product, data, name='product columns mean')

product columns mean took: 14.51786184310913


14.51786184310913

In [11]:
def product_columns(df):
    """Select the expression ``fare_amount * trip_distance`` from ``df``.

    NOTE(review): the timing recorded for this task is on the order of 1e-5 s,
    which suggests the selection is returned without the product being
    computed eagerly -- confirm before comparing against eager libraries.
    """
    product = f.fare_amount * f.trip_distance
    return df[:, product]

benchmark(product_columns, data, name='product columns')

product columns took: 1.4543533325195312e-05


1.4543533325195312e-05

In [12]:
def lazy_mean(df):
    """Add a derived ``lazy`` column and select its mean.

    The derived column is ``fare_amount * trip_distance``. It is attached to a
    shallow copy of ``df`` rather than to ``df`` itself, so the frame passed in
    (the notebook-wide ``data``) is left unchanged and re-running this cell, or
    any later cell, sees the same columns as before.

    Parameters
    ----------
    df : datatable.Frame
        Frame providing ``fare_amount`` and ``trip_distance``.

    Returns
    -------
    datatable.Frame
        Result of selecting ``dt.mean`` of the ``lazy`` column.
    """
    # Frame.copy() is shallow by default, so no column data is duplicated.
    working = df.copy()
    working['lazy'] = working[:, f.fare_amount * f.trip_distance]
    return working[:, dt.mean(dt.f.lazy)]

benchmark(lazy_mean, df=data, name='lazy evaluation')


lazy evaluation took: 14.478155136108398


14.478155136108398

In [21]:
# Memory crash
from datatable import f, count

def value_counts(df):
    """Select ``count(passenger_count)`` with ``'passenger_count'`` as third selector.

    The third selector is presumably treated by datatable as the group-by
    column, giving one count per distinct passenger count -- confirm.
    ``dt.count`` is spelled out because the bare name ``count`` is also used
    for the notebook's own row-count helper.
    """
    occurrences = dt.count(f.passenger_count)
    return df[:, occurrences, 'passenger_count']

# add_nan('value counts')
benchmark(value_counts, data, name='value counts')

value counts took: 5.268906354904175


5.268906354904175

In [24]:
# Memory crash
from datatable import math

def _arithmetic_expr():
    """Build the trigonometric f-expression shared by both arithmetic benchmarks.

    The expression combines the pickup/dropoff longitude and latitude columns
    (converted from degrees to radians) through sin, cos, sqrt and atan2.

    NOTE(review): the shape resembles a haversine formula, but the cosine
    terms use the longitude columns; it is kept exactly as written,
    presumably so timings stay comparable with earlier runs -- confirm.
    """
    theta_1 = f.pickup_longitude
    phi_1 = f.pickup_latitude
    theta_2 = f.dropoff_longitude
    phi_2 = f.dropoff_latitude
    temp = (math.sin((theta_2-theta_1)/2*math.pi/180)**2
           + math.cos(theta_1*math.pi/180)*math.cos(theta_2*math.pi/180) * math.sin((phi_2-phi_1)/2*math.pi/180)**2)
    return 2 * math.atan2(math.sqrt(temp), math.sqrt(1-temp))


def complicated_arithmetic_operation(df):
    """Select the shared arithmetic expression from ``df``."""
    return df[:, _arithmetic_expr()]


def mean_of_complicated_arithmetic_operation(df):
    """Select the mean of the shared arithmetic expression from ``df``."""
    return df[:, dt.mean(_arithmetic_expr())]


# Neither function is benchmarked here; NaN placeholders are recorded for
# both tasks instead.
add_nan("arithmetic operation")
add_nan("arithmetic operation mean")

arithmetic operation took: nan
arithmetic operation mean took: nan


nan

In [15]:
def groupby_statistics(df):
    """Select mean and standard deviation of fare and tip amounts by passenger count.

    The output columns are named ``<column>_mean`` and ``<column>_std`` for
    ``fare_amount`` and ``tip_amount``; grouping is done with
    ``dt.by(f.passenger_count)``.
    """
    aggregations = {}
    for column in ('fare_amount', 'tip_amount'):
        aggregations[column + '_mean'] = dt.mean(f[column])
        aggregations[column + '_std'] = dt.sd(f[column])
    grouping = dt.by(f.passenger_count)
    return df[:, aggregations, grouping]

benchmark(groupby_statistics, data, name='groupby statistics')

groupby statistics took: 42.71509027481079


42.71509027481079

In [16]:
# Build the right-hand frame for the join benchmarks from the group-by result.
# The result is round-tripped through pandas before being wrapped in a new
# dt.Frame (presumably to obtain a plain, fully computed frame -- confirm).
other = dt.Frame(groupby_statistics(data).to_pandas())
# Mark passenger_count as the key column; presumably required so that
# dt.join(other) can match rows on it -- verify against the datatable docs.
other.key = 'passenger_count'

In [17]:
def join_count(df, other):
    """Join ``other`` onto ``df`` and return the number of rows of the result."""
    # like vaex and dask, no precomputed index
    joined = df[:, :, dt.join(other)]
    n_rows = joined.shape[0]
    return n_rows

benchmark(join_count, data, other=other, name='join count')

join count took: 5.821533203125


5.821533203125

In [18]:
def join_data(df, other):
    """Join ``other`` onto ``df``, take its first 20 rows, and return the joined frame."""
    # like vaex and dask, no precomputed index
    joined = df[:, :, dt.join(other)]
    # The head() result is discarded; presumably it is here to force some
    # rows of the join to be produced inside the timed region -- confirm.
    joined.head(20)
    return joined

benchmark(join_data, data, other=other, name='join')

join took: 5.821089506149292


5.821089506149292

## Filtered data
For the entire dataset, none of this runs without crashing.

In [19]:
# gc.collect() returns the number of unreachable objects it found, not an
# amount of memory, so the message reports objects rather than MB.
print(f"Prepare filtered data and deleted {gc.collect()} objects")

Prepare filtered data and deleted 0 MB


## Memory crash on filter