# Get data and packages

In [2]:
!pip install -U pip
!conda update -y --all
!pip install --upgrade tensorflow==2.0.0-beta1
!pip install --upgrade turicreate
!aws s3 cp --recursive s3://xdss-public-datasets/demos/taxi_1m.sframe ../datasets/taxi_1m.sframe

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/00/b6/9cfa56b4081ad13874b0c6f96af8ce16cfbc1cb06bedf8e9164ce5551ec1/pip-19.3.1-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 21.5MB/s ta 0:00:01
[?25hInstalling collected packages: pip
  Found existing installation: pip 10.0.1
    Uninstalling pip-10.0.1:
      Successfully uninstalled pip-10.0.1
Successfully installed pip-19.3.1
Solving environment: | 
  - defaults::jupyter_console-6.0.0-py36_0, defaults::libgfortran-3.0.0-1, defaults::prompt_toolkit-2.0.10-py_0
  - conda-forge::libgfortran-3.0.0-1, defaults::jupyter_console-6.0.0-py36_0, defaults::prompt_toolkit-2.0.10-py_0
  - defaults::jupyter_console-5.2.0-py36_1, defaults::libgfortran-3.0.0-1, defaults::prompt_toolkit-3.0.2-py_0
  - conda-forge::libgfortran-3.0.0-1, defaults::jupyter_console-5.2.0-py36_1, defaults::prompt_toolkit-3.0.2-pydone


  current version: 4.5.12
  latest version: 4.8.0

Please update conda by run

## Prep benchmarks

In [9]:
import turicreate
import numpy as np
import warnings
import datetime as dt
import time
import gc

warnings.filterwarnings("ignore")

benchmarks = {}

def benchmark(f, name, df, repetitions=1, **kwargs):
    times = []
    count = 0
    for i in range(repetitions):
        start_time = time.time()
        ret = f(df, **kwargs)
        times.append(time.time()-start_time)
        count+=1
    benchmarks[name] = np.mean(times)/count
    return benchmarks[name]

def get_results(benchmarks, name):
    results = pd.DataFrame.from_dict(benchmarks, orient='index')
    results.columns = [name]
    return results

### Turicreate implementation

In [63]:
def read_file():
    return tc.SFrame('../datasets/taxi_1m.sframe')
    
def mean(df):
    return df['fare_amount'].mean()
    
def standard_deviation(df):
    return df['fare_amount'].std()

def sum_columns(df):
    return df['fare_amount'] + df['passenger_count']

def product_columns(df):
    return df['fare_amount'] * df['passenger_count']

def complicated_arithmetic_operation(df):
    theta_1 = df['pickup_longitude']
    phi_1 = df['pickup_latitude']
    theta_2 = df['dropoff_longitude']
    phi_2 = df['dropoff_latitude']
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))

def value_counts(df):
    return df['passenger_count'].value_counts()

def groupby_statistics(df):
    return df.groupby(key_column_names = 'pickup_hour', 
           operations = {
               'fare_amount_mean':tc.aggregate.MEAN('fare_amount'),
               'fare_amount_std':tc.aggregate.STD('fare_amount'),
               'tip_amount_mean':tc.aggregate.MEAN('tip_amount'),
               'tip_amount_mean':tc.aggregate.STD('tip_amount')
           })

def join(df, other):
    df_joined = df.join(gp, on = 'pickup_hour')
    

def filter_data(df):
    return df[expr_filter]


def skeatch_mean(df):
    return tc.Sketch(df['fare_amount']).mean()
    
def skeatch_standatd_deviation(df):
    return tc.Sketch(df['fare_amount']).std()

def skeatch_frequent_items(df):
    return tc.Sketch(df['passenger_count']).frequent_items()



## Naive

In [64]:
# Load data
import pandas as pd
data = tc.SFrame('../datasets/taxi_1m.sframe')
data['pickup_hour'] = df['pickup_datetime'].str_to_datetime().apply(lambda x: x.hour)

In [65]:
# benchmark
benchmark(mean, 'mean', data, repetitions=10)
benchmark(standard_deviation,'standard deviation', data, repetitions=10)
benchmark(sum_columns, 'sum columns', data, repetitions=10)
benchmark(product_columns, 'product columns', data, repetitions=10)
benchmark(complicated_arithmetic_operation, 'complicated arithmetic operation', data, repetitions=10)
benchmark(value_counts, 'value counts', data, repetitions=10)
benchmark(groupby_statistics, 'groupby statistics', data, repetitions=10)

# Sketch
benchmark(skeatch_mean, 'sketch mean', data, repetitions=10)
benchmark(skeatch_standatd_deviation,'sketch standard deviation', data, repetitions=10)
benchmark(skeatch_frequent_items,'skeatch frequent items', data, repetitions=10)

gp = groupby_statistics(data)

benchmark(join, 'join', data, repetitions=10, other=gp)
print(f"Done benchmarks on all data")

Done benchmarks on all data


## Filtered

In [66]:
# load data
long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90

expr_filter = (df['pickup_longitude'] > long_min)  & (df['pickup_longitude'] < long_max) & \
              (df['pickup_latitude'] > lat_min)    & (df['pickup_latitude'] < lat_max) & \
              (df['dropoff_longitude'] > long_min) & (df['dropoff_longitude'] < long_max) & \
              (df['dropoff_latitude'] > lat_min)   & (df['dropoff_latitude'] < lat_max)

filterd = data[expr_filter]
del data
del gp

deleted = gc.collect()
print(f"Prepare filtered data and deleted {deleted} MB")

Prepare filtered data and deleted 1877 MB


In [68]:
# benchmark
benchmark(mean, 'filtered mean', filterd, repetitions=10)
benchmark(standard_deviation,'filtered standard deviation', filterd, repetitions=10)
benchmark(sum_columns, 'filtered sum columns', filterd, repetitions=10)
benchmark(product_columns, 'filtered product_columns', filterd, repetitions=10)
benchmark(complicated_arithmetic_operation, 'filtered complicated arithmetic_operation', filterd, repetitions=10)
benchmark(value_counts, 'filtered value_counts', filterd, repetitions=10)
benchmark(groupby_statistics, 'filtered groupby statistics', filterd, repetitions=10)

# Sketch
benchmark(skeatch_mean, 'filtered sketch mean', filterd, repetitions=10)
benchmark(skeatch_standatd_deviation,'filtered sketch standard deviation', filterd, repetitions=10)
benchmark(skeatch_frequent_items,'filtered skeatch frequent items', filterd, repetitions=10)

gp = groupby_statistics(filterd)
benchmark(join, 'filtered join', filterd, repetitions=10, other=gp)

0.61381427526474

In [69]:
name = 'turicreate'
results = get_results(benchmarks, name)
results.to_csv(f"../results/{name}_1m.csv")
results.head()

Unnamed: 0,turicreate
mean,0.004361
standard deviation,0.004402
sum columns,2.1e-05
product columns,2e-05
complicated arithmetic operation,0.24316
