# Get data and packages

In [1]:
%%capture
%%bash
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
jupyter contrib nbextension install --user
jupyter nbextensions_configurator enable --user

jupyter nbextension enable codefolding/main
jupyter nbextension enable scratchpad/main
jupyter nbextension enable execute_time/ExecuteTime
jupyter nbextension enable autosavetime/main
pip install -U pip dask numpy fsspec>=0.3.3 tqdm pyarrow


In [4]:
%%capture
%%bash
!pip install --upgrade turicreate

In [1]:
import gc
from src.config import repetitions
import time
import numpy as np
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import time
import gc
import os

instance_type = 'mlm52xlarge'
name = 'turicreate'
data_path = '../datasets/taxi_1B.sf'
instance_type = 'mlm52xlarge'
results_path = f"../results/{name}_1b_{instance_type}.csv"
benchmarks = {}
print(f"test for {repetitions} repetitions")

test for 1 repetitions


In [9]:
import numpy as np
import pandas as pd
import time 


def get_results(benchmarks, name):
    results = pd.DataFrame.from_dict(benchmarks, orient='index')
    results.columns = [name]
    return results

def persist():
    get_results(benchmarks, name).to_csv(results_path)
    !aws s3 cp  ../results/turicreate_1b_mlm52xlarge.csv s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv 
    
def benchmark(f, df, name, repetitions=1, **kwargs):
    times = []
    for i in range(repetitions):
        start_time = time.time()
        ret = f(df, **kwargs)
        times.append(time.time()-start_time)
    benchmarks[name] = np.mean(times)
    persist()
    print(f"{name} took: {benchmarks[name]}")
    return benchmarks[name]


long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90

# Benchmark

In [10]:
import turicreate as tc
import numpy as np

# Load data
data = tc.SFrame(data_path)
print(f"size: {data.shape[0]} with {data.shape[1]} columns")

size: 1000 with 18 columns


In [11]:
def open_file(df=None):
    return tc.SFrame(data_path)

benchmark(open_file, df=data, name='read_file', repetitions=repetitions)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
read_file took: 0.0012040138244628906


0.0012040138244628906

In [12]:
def count(df=None):
    return len(df)

benchmark(count, df=data, name='count', repetitions=repetitions)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
count took: 3.0517578125e-05


3.0517578125e-05

In [13]:
def mean(df):
    return df['fare_amount'].mean()

benchmark(mean, df=data, name='mean', repetitions=repetitions)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
mean took: 0.001392364501953125


0.001392364501953125

In [14]:
def standard_deviation(df):
    return df['fare_amount'].std()

benchmark(standard_deviation, df=data, name='standard', repetitions=repetitions)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
standard took: 0.0012896060943603516


0.0012896060943603516

Unlike other techonlogies, vaex can return columns, or subset of values to explore lazely, 
but becouse many of the other tecnologies crashed at this point, we return a scalar instead.

In [15]:
def mean_of_sum(df):
    return (df['fare_amount'] + df['trip_distance']).mean()

benchmark(mean_of_sum, df=data, name='sum columns mean', repetitions=repetitions)
benchmarks['sum columns'] =  benchmarks['sum columns mean'] - benchmarks['mean']

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
sum columns mean took: 0.0019731521606445312


In [16]:
def mean_of_product(df):
    return (df['fare_amount'] * df['trip_distance']).mean()

benchmark(mean_of_product, df=data, name='product columns mean', repetitions=repetitions)
benchmarks['product columns'] =  benchmarks['product columns mean'] - benchmarks['mean']

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
product columns mean took: 0.002002716064453125


In [17]:
def mean_of_complicated_arithmetic_operation(df):
    theta_1 = df['pickup_longitude']
    phi_1 = df['pickup_latitude']
    theta_2 = df['dropoff_longitude']
    phi_2 = df['dropoff_latitude']
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    return (2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))).mean()

benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean', repetitions=repetitions)
benchmarks['arithmetic operation'] =  benchmarks['arithmetic operation mean'] - benchmarks['mean']

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
arithmetic operation mean took: 0.00744318962097168


In [18]:
def value_counts(df):
    return df['passenger_count'].value_counts()

benchmark(value_counts, df=data, name='value counts', repetitions=repetitions)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
value counts took: 0.010123968124389648


0.010123968124389648

In [19]:
def groupby_statistics(df):
    return df.groupby(key_column_names = 'passenger_count', 
           operations = {
               'fare_amount_mean':tc.aggregate.MEAN('fare_amount'),
               'fare_amount_std':tc.aggregate.STD('fare_amount'),
               'tip_amount_mean':tc.aggregate.MEAN('tip_amount'),
               'tip_amount_mean':tc.aggregate.STD('tip_amount')
           })

benchmark(groupby_statistics, df=data, name='groupby statistics', repetitions=repetitions)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
groupby statistics took: 0.009274482727050781


0.009274482727050781

In [20]:
other = groupby_statistics(data)

In [None]:
def join(df, other):
    return df.join(other, on = 'passenger_count')

benchmark(join, data, name='join', repetitions=repetitions, other=other)

In [22]:
def join_count(df, other):
    return len(df.join(other, on = 'passenger_count'))

benchmark(join_count, data, name='join count', repetitions=repetitions, other=other)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
join count took: 0.008756399154663086


0.008756399154663086

Turi have a special tool for statistics called "Sketch"

In [23]:
def skeatch_mean(df):
    return tc.Sketch(df['fare_amount']).mean()

benchmark(skeatch_mean, data, name='skeatch mean', repetitions=repetitions)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
skeatch mean took: 0.04694008827209473


0.04694008827209473

In [24]:
def skeatch_standatd_deviation(df):
    return  tc.Sketch(df['fare_amount']).std()

benchmark(skeatch_standatd_deviation, data, name='skeatch standatd deviation', repetitions=repetitions)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
skeatch standatd deviation took: 0.04503369331359863


0.04503369331359863

In [25]:
def skeatch_frequent_items(df):
    return tc.Sketch(df['passenger_count']).frequent_items()

benchmark(skeatch_frequent_items, data, name='skeatch frequent items', repetitions=repetitions)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
skeatch frequent items took: 0.04623079299926758


0.04623079299926758

## Filtered data

Dask is not build to run on filter data like you would normally, so we will apply the same strategy

In [26]:
print(f"Prepare filtered data and deleted {gc.collect()} MB")

Prepare filtered data and deleted 0 MB


In [27]:
def filter_data(df):
    expr_filter = (df['pickup_longitude'] > long_min)  & (df['pickup_longitude'] < long_max) & \
                  (df['pickup_latitude'] > lat_min)    & (df['pickup_latitude'] < lat_max) & \
                  (df['dropoff_longitude'] > long_min) & (df['dropoff_longitude'] < long_max) & \
                  (df['dropoff_latitude'] > lat_min)   & (df['dropoff_latitude'] < lat_max)
    return df[expr_filter]

benchmark(filter_data, data, name='filter data', repetitions=repetitions)

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
filter data took: 0.0015516281127929688


0.0015516281127929688

In [28]:
filterd = filter_data(data)

del data
print(f"cleaned {gc.collect()} mb")

cleaned 0 mb


In [29]:
benchmark(filter_data, filterd, name='filterd count', repetitions=repetitions)
benchmark(mean, filterd, name='filterd mean', repetitions=repetitions)
benchmark(standard_deviation, filterd, name='filtered standard deviation', repetitions=repetitions)
benchmark(mean_of_sum, filterd, name ='filtered sum columns mean', repetitions=repetitions)
benchmarks['filtered sum columns'] =  benchmarks['filtered sum columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_product, filterd, name ='filterd product columns mean', repetitions=repetitions)
benchmarks['filterd product columns'] = benchmarks['filterd product columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_complicated_arithmetic_operation, filterd, name='filterd arithmetic operation mean', repetitions=repetitions)
benchmarks['filterd arithmetic operation'] =  benchmarks['filterd arithmetic operation mean'] - benchmarks['filterd mean']
benchmark(value_counts, filterd, name ='filtered value counts', repetitions=repetitions)
benchmark(groupby_statistics, filterd, name='filtered groupby statistics', repetitions=repetitions)
benchmark(skeatch_mean, filterd, name='filterd skeatch mean', repetitions=repetitions)
benchmark(skeatch_standatd_deviation, filterd, name='skeatch standatd deviation', repetitions=repetitions)
benchmark(skeatch_frequent_items, filterd, name='skeatch frequent items', repetitions=repetitions)
other = groupby_statistics(filterd)
benchmark(join, filterd, name='filtered join', repetitions=repetitions, other=other)
benchmark(join_count, filterd, name='filtered join count', repetitions=repetitions, other=other)
print('Done!')

upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
filterd count took: 0.007070064544677734
upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
filterd mean took: 0.002871274948120117
upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
filtered standard deviation took: 0.0034503936767578125
upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
filtered sum columns mean took: 0.003434896469116211
upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv
filterd product columns mean took: 0.003442525863647461
upload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.cs