# Get data and packages
Use the `conda_tensorflow_p36` kernel.

In [1]:
%%capture
%%bash
pip install --upgrade turicreate
aws s3 cp --recursive s3://xdss-public-datasets/demos/taxi_1B.sf datasets/taxi_1B.sf

In [7]:
import gc
import numpy as np
import pandas as pd
import warnings
import time
import gc
import os

# --- Run configuration ---------------------------------------------------
instance_type = 'c5d2xlarge' # change this
results_bucket = "s3://vaex-sagemaker-demo/benchmarks" # change this

name = 'turicreate'
data_path = 'datasets/taxi_1B.sf'
output_file = '{}_{}.csv'.format(name, instance_type)
results_path = 'results/' + output_file
# From here on `results_bucket` holds the full S3 destination of the results
# file (bucket prefix + file name), not just the bucket prefix.
results_bucket = '/'.join([results_bucket, output_file])

# One independent list per result column; filled by the benchmark helpers.
benchmarks = {column: [] for column in ('sketch', 'run', 'duration', 'task')}

# Longitude / latitude limits used when filtering trips by coordinates.
long_min, long_max = -74.05, -73.75
lat_min, lat_max = 40.58, 40.90


def get_results(benchmarks=benchmarks):
    """Return the collected benchmark records as a pandas DataFrame.

    The dict is first loaded with its keys as rows (``orient='index'``) and
    then transposed, so every key of ``benchmarks`` ends up as a column and
    every recorded run as a row.
    """
    keyed_by_row = pd.DataFrame.from_dict(benchmarks, orient='index')
    return keyed_by_row.transpose()

def persist():
    """Save the benchmark results locally and copy them to S3.

    Runs a garbage-collection pass, writes ``get_results(benchmarks)`` to
    ``results_path`` as CSV and uploads that file to ``results_bucket`` with
    the AWS CLI. A failed upload does not raise (the local CSV is still
    written); it is reported with a warning instead of being ignored.
    """
    gc.collect()
    get_results(benchmarks).to_csv(results_path)
    exit_status = os.system(f"aws s3 cp {results_path} {results_bucket}")
    if exit_status != 0:
        # os.system returns a non-zero status when the command fails.
        warnings.warn(
            f"Upload of {results_path} to {results_bucket} failed "
            f"(exit status {exit_status}); results are only stored locally."
        )
    
def benchmark(f, df, name, sketch=0, **kwargs):
    """Time ``f(df, **kwargs)`` twice and record both runs in ``benchmarks``.

    Parameters
    ----------
    f : callable
        Function under test; invoked as ``f(df, **kwargs)``.
    df : object
        Data handed to ``f`` as its first positional argument.
    name : str
        Task label stored in the ``task`` column.
    sketch : int, optional
        Flag stored in the ``sketch`` column for both runs. It is consumed
        here and is not forwarded to ``f``.
    **kwargs
        Extra keyword arguments forwarded to ``f``.

    Returns
    -------
    float
        Wall-clock duration, in seconds, of the last run.
    """
    for run in (1, 2):
        start_time = time.time()
        ret = f(df, **kwargs)  # result itself is unused; only the duration matters
        benchmarks['duration'].append(time.time() - start_time)
        benchmarks['task'].append(name)
        benchmarks['run'].append(run)
        # `sketch` is a named parameter, so it can never show up in `kwargs`;
        # record the parameter itself rather than looking it up in `kwargs`.
        benchmarks['sketch'].append(sketch)
    persist()
    print(f"{name} took: {benchmarks['duration'][-1]}")
    return benchmarks['duration'][-1]
          
def add_nan(name, sketch=0):
    """Record two placeholder runs with a NaN duration for task ``name``.

    Parameters
    ----------
    name : str
        Task label stored in the ``task`` column.
    sketch : int, optional
        Flag stored in the ``sketch`` column for both placeholder rows.

    Returns
    -------
    float
        The last recorded duration (``nan``).
    """
    for run in (1, 2):
        benchmarks['duration'].append(np.nan)
        benchmarks['task'].append(name)
        benchmarks['run'].append(run)
        # Keep every column list the same length; otherwise later rows no
        # longer line up when the dict is turned into a table.
        benchmarks['sketch'].append(sketch)
    persist()
    print(f"{name} took: {benchmarks['duration'][-1]}")
    return benchmarks['duration'][-1]

          
!mkdir -p results
!mkdir -p datasets
print(f"We test every benchmark twice and save both results")

We test every benchmark twice and save both results


# Benchmark

In [8]:
import turicreate as tc
import numpy as np

# Load data
data = tc.SFrame(data_path)
print(f"size: {data.shape[0]} with {data.shape[1]} columns")

size: 1173057927 with 18 columns


In [9]:
def open_file(df=None):
    """Open the SFrame stored at ``data_path`` and return it.

    ``df`` is accepted but not used.
    """
    sframe = tc.SFrame(data_path)
    return sframe

benchmark(open_file, df=data, name='read_file')

read_file took: 0.0010561943054199219


0.0010561943054199219

In [10]:
def count(df=None):
    """Return the length of ``df`` as reported by ``len``."""
    n_rows = len(df)
    return n_rows

benchmark(count, df=data, name='count')

count took: 1.6689300537109375e-06


1.6689300537109375e-06

In [11]:
def skeatch_mean(df):
    """Return the mean of ``fare_amount`` taken from a ``tc.Sketch`` of the column."""
    fare_sketch = tc.Sketch(df['fare_amount'])
    return fare_sketch.mean()

benchmark(skeatch_mean, data, name='skeatch mean', sketch=1)

skeatch mean took: 97.00968027114868


97.00968027114868

In [12]:
def skeatch_standatd_deviation(df):
    """Return the standard deviation of ``fare_amount`` taken from a ``tc.Sketch`` of the column."""
    fare_sketch = tc.Sketch(df['fare_amount'])
    return fare_sketch.std()

benchmark(skeatch_standatd_deviation, data, name='skeatch standatd deviation', sketch=1)

skeatch standatd deviation took: 96.99326801300049


96.99326801300049

In [13]:
def skeatch_frequent_items(df):
    """Return the frequent items of ``passenger_count`` taken from a ``tc.Sketch`` of the column."""
    passenger_sketch = tc.Sketch(df['passenger_count'])
    return passenger_sketch.frequent_items()

benchmark(skeatch_frequent_items, data, name='skeatch frequent items', sketch=1)

skeatch frequent items took: 36.2844877243042


36.2844877243042

In [14]:
def mean(df):
    """Return the mean of the ``fare_amount`` column of ``df``."""
    fares = df['fare_amount']
    return fares.mean()

benchmark(mean, df=data, name='mean')

mean took: 92.37087798118591


92.37087798118591

In [15]:
def standard_deviation(df):
    """Return the standard deviation of the ``fare_amount`` column of ``df``."""
    fares = df['fare_amount']
    return fares.std()

benchmark(standard_deviation, df=data, name='standard deviation')

standard deviation took: 91.88946914672852


91.88946914672852

In [16]:
def mean_of_sum(df):
    """Return the mean of ``fare_amount + trip_distance``."""
    fare_plus_distance = df['fare_amount'] + df['trip_distance']
    return fare_plus_distance.mean()

benchmark(mean_of_sum, df=data, name='sum columns mean')

sum columns mean took: 156.42764568328857


156.42764568328857

In [17]:
def sum_columns(df):
    """Return ``fare_amount + trip_distance`` without aggregating the result."""
    fares, distances = df['fare_amount'], df['trip_distance']
    return fares + distances

benchmark(sum_columns, df=data, name='sum columns')

sum columns took: 0.0001900196075439453


0.0001900196075439453

In [18]:
def mean_of_product(df):
    """Return the mean of ``fare_amount * trip_distance``."""
    fare_times_distance = df['fare_amount'] * df['trip_distance']
    return fare_times_distance.mean()

benchmark(mean_of_product, df=data, name='product columns mean')

product columns mean took: 158.18946170806885


158.18946170806885

In [19]:
def product(df):
    """Return ``fare_amount * trip_distance`` without aggregating the result."""
    fares, distances = df['fare_amount'], df['trip_distance']
    return fares * distances

benchmark(product, df=data, name='product columns')

product columns took: 0.00010371208190917969


0.00010371208190917969

In [None]:
def lazy_mean(df):
    """Store ``fare_amount * trip_distance`` as column ``lazy`` and return its mean.

    Note: this assigns the ``lazy`` column on the frame that is passed in,
    so the caller's ``df`` is modified.
    """
    fare_times_distance = df['fare_amount'] * df['trip_distance']
    df['lazy'] = fare_times_distance
    # Read the column back from the frame before aggregating, as the mean is
    # taken on the stored column rather than on the temporary product.
    stored = df['lazy']
    return stored.mean()
    
benchmark(lazy_mean, df=data, name='lazy evaluation')    

In [None]:
def value_counts(df):
    """Return ``value_counts()`` of the ``passenger_count`` column of ``df``."""
    passengers = df['passenger_count']
    return passengers.value_counts()

benchmark(value_counts, df=data, name='value counts')

In [None]:
def mean_of_complicated_arithmetic_operation(df):
    """Return the mean of a trigonometric expression over pickup/dropoff coordinates.

    The expression combines sines and cosines of the coordinates (converted
    with ``np.pi/180``) and finishes with ``2 * arctan2(sqrt(t), sqrt(1-t))``.

    NOTE(review): this resembles a haversine-style angle, but longitude and
    latitude appear in swapped roles compared with the textbook formula.
    Confirm whether that is intentional; the arithmetic is kept as-is here.
    """
    pickup_lon, pickup_lat = df['pickup_longitude'], df['pickup_latitude']
    dropoff_lon, dropoff_lat = df['dropoff_longitude'], df['dropoff_latitude']

    lon_term = np.sin((dropoff_lon-pickup_lon)/2*np.pi/180)**2
    cos_term = np.cos(pickup_lon*np.pi/180)*np.cos(dropoff_lon*np.pi/180)
    lat_term = np.sin((dropoff_lat-pickup_lat)/2*np.pi/180)**2
    temp = lon_term + cos_term * lat_term

    angle = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return angle.mean()

benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean')

In [None]:
def complicated_arithmetic_operation(df):
    """Return a trigonometric expression over pickup/dropoff coordinates, unaggregated.

    The expression combines sines and cosines of the coordinates (converted
    with ``np.pi/180``) and finishes with ``2 * arctan2(sqrt(t), sqrt(1-t))``.

    NOTE(review): this resembles a haversine-style angle, but longitude and
    latitude appear in swapped roles compared with the textbook formula.
    Confirm whether that is intentional; the arithmetic is kept as-is here.
    """
    pickup_lon, pickup_lat = df['pickup_longitude'], df['pickup_latitude']
    dropoff_lon, dropoff_lat = df['dropoff_longitude'], df['dropoff_latitude']

    lon_term = np.sin((dropoff_lon-pickup_lon)/2*np.pi/180)**2
    cos_term = np.cos(pickup_lon*np.pi/180)*np.cos(dropoff_lon*np.pi/180)
    lat_term = np.sin((dropoff_lat-pickup_lat)/2*np.pi/180)**2
    temp = lon_term + cos_term * lat_term

    angle = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
    return angle

benchmark(complicated_arithmetic_operation, df=data, name='arithmetic operation')

In [None]:
def groupby_statistics(df):
    """Group ``df`` by ``passenger_count`` and aggregate fare and tip statistics.

    The result has one output column per aggregate: mean and standard
    deviation of ``fare_amount`` and mean and standard deviation of
    ``tip_amount``.
    """
    return df.groupby(key_column_names = 'passenger_count',
           operations = {
               'fare_amount_mean':tc.aggregate.MEAN('fare_amount'),
               'fare_amount_std':tc.aggregate.STD('fare_amount'),
               'tip_amount_mean':tc.aggregate.MEAN('tip_amount'),
               # Distinct key: reusing 'tip_amount_mean' here would replace the
               # MEAN entry above, because dict keys must be unique.
               'tip_amount_std':tc.aggregate.STD('tip_amount')
           })

benchmark(groupby_statistics, df=data, name='groupby statistics')

In [None]:
other = groupby_statistics(data)
other.materialize()

In [None]:
def join_data(df, other):
    """Join ``df`` with ``other`` on the ``passenger_count`` column and return the result."""
    joined = df.join(other, on = 'passenger_count')
    return joined

benchmark(join_data, data, name='join', other=other)

In [None]:
def join_count(df, other):
    """Join ``df`` with ``other`` on ``passenger_count`` and return the length of the result."""
    joined = df.join(other, on = 'passenger_count')
    return len(joined)

benchmark(join_count, data, name='join count', other=other)

## Filtered data

In [None]:
# gc.collect() returns the number of unreachable objects it found, not a size
# in MB, so the message reports it as an object count.
print(f"Prepare filtered data; garbage collector found {gc.collect()} unreachable objects")

In [None]:
def filter_data(df):
    """Return the rows of ``df`` whose pickup and dropoff coordinates lie strictly inside the configured limits.

    Longitudes are compared against ``long_min``/``long_max`` and latitudes
    against ``lat_min``/``lat_max``; both ends are exclusive.
    """
    limits = {'longitude': (long_min, long_max), 'latitude': (lat_min, lat_max)}
    expr_filter = None
    # Same eight comparisons, combined with `&` in the same order:
    # pickup lon, pickup lat, dropoff lon, dropoff lat (lower bound first).
    for point in ('pickup', 'dropoff'):
        for axis in ('longitude', 'latitude'):
            lower, upper = limits[axis]
            column = df[f'{point}_{axis}']
            for condition in (column > lower, column < upper):
                expr_filter = condition if expr_filter is None else expr_filter & condition
    return df[expr_filter]

benchmark(filter_data, data, name='filter data')

In [None]:
# Build the filtered frame once and materialize it before the full frame is dropped.
filtered = filter_data(data)
filtered.materialize()

# Drop the reference to the unfiltered frame; `data` is not available after this cell.
del data
# gc.collect() returns the number of unreachable objects it found, not a size in MB.
print(f"cleaned {gc.collect()} unreachable objects")

In [None]:
# Re-run the benchmarks on the filtered frame.
benchmark(mean, filtered, name='filtered mean')
benchmark(standard_deviation, filtered, name='filtered standard deviation')
benchmark(mean_of_sum, filtered, name ='filtered sum columns mean')
benchmark(sum_columns, df=filtered, name='filtered sum columns')
benchmark(mean_of_product, filtered, name ='filtered product columns mean')
# The element-wise product function in this notebook is named `product`;
# `product_columns` is not defined anywhere and raised a NameError.
benchmark(product, df=filtered, name='filtered product columns')
benchmark(mean_of_complicated_arithmetic_operation, filtered, name='filtered arithmetic operation mean')
benchmark(complicated_arithmetic_operation, filtered, name='filtered arithmetic operation')
benchmark(value_counts, filtered, name ='filtered value counts')
benchmark(groupby_statistics, filtered, name='filtered groupby statistics')

# Sketch-based variants (flagged with sketch=1).
benchmark(skeatch_mean, filtered, name='filtered mean', sketch=1)
benchmark(skeatch_standatd_deviation, filtered, name='filtered standatd deviation', sketch=1)
benchmark(skeatch_frequent_items, filtered, name='filtered value counts', sketch=1)

# `data` was deleted after `filtered` was built, so the group statistics used
# for the joins are computed from `filtered` (referencing `data` here raised a
# NameError).
other = groupby_statistics(filtered)
other.materialize()
benchmark(join_count, filtered, name='filtered join count', other=other)
benchmark(join_data, filtered, name='filtered join', other=other)
print(name)
get_results(benchmarks)