# Get data and packages

In [1]:
%%capture
%%bash
# Notebook extensions plus the configurator UI.
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
jupyter contrib nbextension install --user
jupyter nbextensions_configurator enable --user

jupyter nbextension enable codefolding/main
jupyter nbextension enable scratchpad/main
jupyter nbextension enable execute_time/ExecuteTime
jupyter nbextension enable autosavetime/main
# The version specifier must be quoted: unquoted, bash reads ">=0.3.3" as a
# redirect of pip's stdout into a file named "=0.3.3" and the constraint is lost.
pip install -U pip dask numpy "fsspec>=0.3.3" tqdm pyarrow


In [4]:
%%capture
%%bash
# This line is executed by bash, where the IPython "!" shell prefix is not
# valid ("!pip" would be looked up as a command name), so call pip directly.
pip install --upgrade turicreate

In [10]:
import gc
from src.config import repetitions
import time
import numpy as np
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import time
import gc
import os

# Run configuration. `name` and `instance_type` are embedded in the results
# file name; `instance_type` was previously assigned twice with the same value.
name = 'turicreate'
instance_type = 'mlm52xlarge'
data_path = '../datasets/taxi_1B.sf'
results_path = f"../results/{name}_1b_{instance_type}.csv"
# Benchmark label -> mean elapsed seconds; filled in by benchmark().
benchmarks = {}
print(f"test for {repetitions} repetitions")

test for 1 repetitions


In [11]:
import numpy as np
import pandas as pd
import time 


def get_results(benchmarks, name):
    """Turn the collected measurements into a single-column DataFrame.

    Parameters
    ----------
    benchmarks : dict
        Maps a benchmark label to its measured value.
    name : str
        Label given to the single value column.

    Returns
    -------
    pandas.DataFrame
        One row per key of ``benchmarks`` (the keys form the index) and one
        column called ``name``.
    """
    # Naming the column at construction time (instead of assigning
    # ``results.columns`` afterwards) also works when ``benchmarks`` is empty,
    # where the frame would otherwise have no column to rename.
    return pd.DataFrame.from_dict(benchmarks, orient='index', columns=[name])

def persist():
    get_results(benchmarks, name).to_csv(results_path)
    !aws s3 cp  ../results/turicreate_1b_mlm52xlarge.csv s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv 
    
def benchmark(f, df, name, repetitions=1, **kwargs):
    """Time ``f(df, **kwargs)`` and store the mean duration under ``name``.

    Parameters
    ----------
    f : callable
        Function under test; called as ``f(df, **kwargs)``.
    df : object
        First positional argument handed to ``f``.
    name : hashable
        Key under which the result is stored in the module-level ``benchmarks``.
    repetitions : int, default 1
        How many times ``f`` is called.
    **kwargs
        Forwarded to ``f`` on every call.

    Returns
    -------
    The mean elapsed time in seconds, as computed by ``np.mean``.

    Side effects: updates ``benchmarks``, calls ``persist()`` and prints the result.
    """
    times = []
    for _ in range(repetitions):
        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # intervals; time.time() can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        f(df, **kwargs)  # the return value is not needed, only the duration
        times.append(time.perf_counter() - start_time)
    benchmarks[name] = np.mean(times)
    persist()
    print(f"{name} took: {benchmarks[name]}")
    return benchmarks[name]


# Longitude / latitude limits that filter_data applies to the pickup and
# dropoff coordinates.
long_min, long_max = -74.05, -73.75
lat_min, lat_max = 40.58, 40.90

# Benchmark

In [12]:
import turicreate as tc
import numpy as np

# Open the dataset and report its dimensions (shape is (rows, columns)).
data = tc.SFrame(data_path)
print("size: {} with {} columns".format(data.shape[0], data.shape[1]))

size: 1173057927 with 18 columns


In [13]:
def open_file(df=None):
    """Open the SFrame stored at ``data_path`` and return it.

    ``df`` is accepted only so the function fits the ``f(df, **kwargs)`` call
    made by ``benchmark``; it is not used.
    """
    return tc.SFrame(data_path)

# The label must be a string: with the function object as key, the results CSV
# row would be named after the function's repr instead of a stable label.
key = 'open file'
f = open_file
benchmark(f, df=data, name=key, repetitions=repetitions)

Completed 72 Bytes/72 Bytes (1023 Bytes/s) with 1 file(s) remainingupload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv


0.001241445541381836

In [14]:
def count(df=None):
    """Return the number of rows of ``df`` as reported by ``len``."""
    n_rows = len(df)
    return n_rows

# Time the row count and record it under 'count'.
f, key = count, 'count'
benchmark(f, df=data, name=key, repetitions=repetitions)

Completed 101 Bytes/101 Bytes (1.8 KiB/s) with 1 file(s) remainingupload: ../results/turicreate_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/turicreate_1b_mlm52xlarge_results.csv


3.5762786865234375e-05

In [None]:
def mean(df):
    """Return the mean of the ``fare_amount`` column of ``df``."""
    fares = df['fare_amount']
    return fares.mean()

# Time the single-column mean and record it under 'mean'.
f, key = mean, 'mean'
benchmark(f, df=data, name=key, repetitions=repetitions)

In [10]:
def standard_deviation(df):
    """Return the result of ``.std()`` on the ``fare_amount`` column of ``df``."""
    fares = df['fare_amount']
    return fares.std()

# Time the single-column standard deviation.
f, key = standard_deviation, 'standard deviation'
benchmark(f, df=data, name=key, repetitions=repetitions)

Unlike other technologies, vaex can return columns, or a subset of values, to explore lazily,
but because many of the other technologies crashed at this point, we return a scalar instead.

In [11]:
def mean_of_sum(df):
    """Return the mean of the element-wise sum ``fare_amount + trip_distance``."""
    total = df['fare_amount'] + df['trip_distance']
    return total.mean()

key = 'sum columns mean'
f = mean_of_sum
benchmark(f, df=data, name=key, repetitions=repetitions)
# Derived metric: time of the combined operation minus the plain 'mean' timing.
benchmarks['sum columns'] =  benchmarks['sum columns mean'] - benchmarks['mean']
# benchmark() already persisted before the derived entry existed; persist again
# so it reaches the results file, as the filtered-data cells do.
persist()

In [14]:
def mean_of_product(df):
    """Return the mean of the element-wise product ``fare_amount * trip_distance``."""
    product = df['fare_amount'] * df['trip_distance']
    return product.mean()

key = 'product columns mean'
f = mean_of_product
benchmark(f, df=data, name=key, repetitions=repetitions)
# Derived metric: time of the combined operation minus the plain 'mean' timing.
benchmarks['product columns'] =  benchmarks['product columns mean'] - benchmarks['mean']
# benchmark() already persisted before the derived entry existed; persist again
# so it reaches the results file, as the filtered-data cells do.
persist()

In [15]:
def mean_of_complicated_arithmetic_operation(df):
    """Return the mean of a trigonometric expression over the coordinate columns.

    With t1/t2 the pickup/dropoff longitudes and p1/p2 the pickup/dropoff
    latitudes (each multiplied by pi/180 inside the trig calls):

        a = sin((t2 - t1) / 2)**2 + cos(t1) * cos(t2) * sin((p2 - p1) / 2)**2
        result = mean(2 * arctan2(sqrt(a), sqrt(1 - a)))

    NOTE(review): this has the shape of the haversine central angle, but with
    longitude and latitude in each other's usual places - confirm it is intended.
    """
    def sin_sq_half(delta):
        # sin^2 of half the difference, degrees -> radians
        return np.sin(delta/2*np.pi/180)**2

    def cos_deg(angle):
        return np.cos(angle*np.pi/180)

    lon_a, lat_a = df['pickup_longitude'], df['pickup_latitude']
    lon_b, lat_b = df['dropoff_longitude'], df['dropoff_latitude']
    a = (sin_sq_half(lon_b - lon_a)
         + cos_deg(lon_a)*cos_deg(lon_b) * sin_sq_half(lat_b - lat_a))
    central = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return central.mean()

key = 'arithmetic operation mean'
f  = mean_of_complicated_arithmetic_operation
benchmark(f, df=data, name=key, repetitions=repetitions)
# Derived metric: time of the combined operation minus the plain 'mean' timing.
benchmarks['arithmetic operation'] =  benchmarks['arithmetic operation mean'] - benchmarks['mean']
# benchmark() already persisted before the derived entry existed; persist again
# so it reaches the results file, as the filtered-data cells do.
persist()

In [16]:
def value_counts(df):
    """Return ``.value_counts()`` of the ``passenger_count`` column of ``df``."""
    passengers = df['passenger_count']
    return passengers.value_counts()

# Time the per-value frequency count of passenger_count.
f, key = value_counts, 'value counts'
benchmark(f, df=data, name=key, repetitions=repetitions)

In [None]:
def groupby_statistics(df):
    """Group ``df`` by ``passenger_count`` and aggregate fares and tips.

    Returns the result of ``df.groupby`` with four aggregate columns: mean and
    standard deviation of ``fare_amount`` and of ``tip_amount``.
    """
    return df.groupby(key_column_names = 'passenger_count', 
           operations = {
               'fare_amount_mean':tc.aggregate.MEAN('fare_amount'),
               'fare_amount_std':tc.aggregate.STD('fare_amount'),
               'tip_amount_mean':tc.aggregate.MEAN('tip_amount'),
               # Previously a second 'tip_amount_mean' key, which replaced the
               # MEAN entry above and left only three aggregates.
               'tip_amount_std':tc.aggregate.STD('tip_amount')
           })

key = 'groupby statistics'
f = groupby_statistics
benchmark(f, df=data, name=key, repetitions=repetitions)

In [None]:
def join_count(df, other):
    """Return the number of rows of ``df`` joined with ``other`` on ``passenger_count``.

    Parameters
    ----------
    df : frame-like
        Left side of the join; must provide ``join(other, on=...)``.
    other : frame-like
        Right side of the join.
    """
    joined = df.join(other, on = 'passenger_count')
    # `.count()` is the pandas DataFrame spelling; the row count of the joined
    # frame is obtained with len() directly.
    return len(joined)

# The grouped statistics table serves as the right-hand side of the join.
other = groupby_statistics(data)
f, key = join_count, 'join count'
benchmark(f, data, name=key, repetitions=repetitions, other=other)

In [None]:
def join(df, other):
    """Join ``df`` with ``other`` on the ``passenger_count`` column and return the result."""
    joined = df.join(other, on='passenger_count')
    return joined

# Time the join against the grouped statistics table held in `other`.
f, key = join, 'join'
benchmark(f, data, name=key, repetitions=repetitions, other=other)

Turi Create has a special tool for statistics called "Sketch".

In [None]:
def skeatch_mean(df):
    """Return ``mean()`` of a ``tc.Sketch`` built over the ``fare_amount`` column."""
    sketch = tc.Sketch(df['fare_amount'])
    return sketch.mean()

f, key = skeatch_mean, 'skeatch mean'
benchmark(f, data, name=key, repetitions=repetitions)

In [None]:
def skeatch_standatd_deviation(df):
    """Return ``std()`` of a ``tc.Sketch`` built over the ``fare_amount`` column."""
    sketch = tc.Sketch(df['fare_amount'])
    return sketch.std()

f, key = skeatch_standatd_deviation, 'skeatch standatd deviation'
benchmark(f, data, name=key, repetitions=repetitions)


In [None]:
def skeatch_frequent_items(df):
    """Return ``frequent_items()`` of a ``tc.Sketch`` over the ``passenger_count`` column."""
    sketch = tc.Sketch(df['passenger_count'])
    return sketch.frequent_items()

f, key = skeatch_frequent_items, 'skeatch frequent items'
benchmark(f, data, name=key, repetitions=repetitions)

## Filtered data

Dask is not built to run on filtered data the way you normally would, so we apply the same strategy here.

In [None]:
# gc.collect() returns the number of unreachable objects it found, not a size
# in MB, so the message reports an object count.
print(f"Prepare filtered data; gc collected {gc.collect()} objects")

In [None]:
def filter_data(df):
    """Keep rows whose pickup and dropoff coordinates lie strictly inside the bounds.

    Longitudes are compared against ``long_min`` / ``long_max`` and latitudes
    against ``lat_min`` / ``lat_max`` (module-level values); the result is
    ``df`` indexed by the combined mask.
    """
    def inside(column, low, high):
        # Open interval: low < value < high.
        values = df[column]
        return (values > low) & (values < high)

    keep = (inside('pickup_longitude', long_min, long_max)
            & inside('pickup_latitude', lat_min, lat_max)
            & inside('dropoff_longitude', long_min, long_max)
            & inside('dropoff_latitude', lat_min, lat_max))
    return df[keep]

# Time the construction of the filtered frame.
f, key = filter_data, 'filter data'
benchmark(f, data, name=key, repetitions=repetitions)

In [None]:
filterd = filter_data(data)

# Drop the reference to the unfiltered frame; the cells below benchmark on `filterd`.
del data
# gc.collect() returns a count of unreachable objects, not megabytes.
print(f"cleaned {gc.collect()} objects")

In [None]:
key = 'filterd count'
# This entry is the row count on the filtered frame; it previously re-ran
# filter_data, so the 'filterd count' timing measured filtering instead.
f = count
benchmark(f, filterd, name=key, repetitions=repetitions)

In [None]:
# Mean of fare_amount on the filtered frame.
f, key = mean, 'filterd mean'
benchmark(f, filterd, name=key, repetitions=repetitions)

In [None]:
# Standard deviation of fare_amount on the filtered frame.
f, key = standard_deviation, 'filterd standard deviation'
benchmark(f, filterd, name=key, repetitions=repetitions)

In [None]:
f, key = mean_of_sum, 'filtered sum columns mean'
benchmark(f, filterd, name=key, repetitions=repetitions)
# Derived metric: combined timing minus the filtered 'mean' timing, then saved.
benchmarks['filtered sum columns'] = (
    benchmarks['filtered sum columns mean'] - benchmarks['filterd mean']
)
persist()

In [None]:
f, key = mean_of_product, 'filterd product columns mean'
benchmark(f, filterd, name=key, repetitions=repetitions)
# Derived metric: combined timing minus the filtered 'mean' timing, then saved.
benchmarks['filterd product columns'] = (
    benchmarks['filterd product columns mean'] - benchmarks['filterd mean']
)
persist()

In [None]:
f, key = mean_of_complicated_arithmetic_operation, 'filterd arithmetic operation mean'
benchmark(f, filterd, name=key, repetitions=repetitions)
# Derived metric: combined timing minus the filtered 'mean' timing, then saved.
benchmarks['filterd arithmetic operation'] = (
    benchmarks['filterd arithmetic operation mean'] - benchmarks['filterd mean']
)
persist()

In [None]:
# Frequency count of passenger_count on the filtered frame.
f, key = value_counts, 'filtered value counts'
benchmark(f, filterd, name=key, repetitions=repetitions)

In [None]:
# Grouped fare/tip statistics on the filtered frame.
f, key = groupby_statistics, 'filtered groupby statistics'
benchmark(f, filterd, name=key, repetitions=repetitions)

In [None]:
key = 'filtered join'
f = join
# join(df, other) requires the right-hand frame; without other=other the call
# made by benchmark() raises TypeError. Reuses the `other` table built earlier.
benchmark(f, filterd, name=key, repetitions=repetitions, other=other)

In [None]:
key = 'filtered join count'
f = join_count
# join_count(df, other) requires the right-hand frame; without other=other the
# call made by benchmark() raises TypeError. Reuses the `other` table built earlier.
benchmark(f, filterd, name=key, repetitions=repetitions, other=other)

In [None]:
# Final marker; a plain string is enough since nothing is interpolated.
print("Done benchmarks on filterd data")