# Get data and packages

In [13]:
import pandas as pd
import numpy as np
import warnings
import datetime as dt
import time
import gc

warnings.filterwarnings("ignore")

benchmarks = {}

def benchmark(f, name, df, repetitions=1, **kwargs):
    times = []
    count = 0
    for i in range(repetitions):
        start_time = time.time()
        ret = f(df, **kwargs)
        times.append(time.time()-start_time)
        count+=1
    benchmarks[name] = np.mean(times)/count
    return benchmarks[name]

def get_results(benchmarks, name):
    results = pd.DataFrame.from_dict(benchmarks, orient='index')
    results.columns = [name]
    return results

data_path = '../datasets/taxi_1m.csv'

### Pandas implementation

In [14]:
def read_file():
    return pd.read_csv(data_path)
    
def mean(df):
    return df.fare_amount.mean()
    
def standard_deviation(df):
    return df.fare_amount.std()

def sum_columns(df):
    return df.eval('fare_amount + passenger_count') 

def product_columns(df):
    return df.eval('fare_amount * passenger_count') 

def complicated_arithmetic_operation(df):
    theta_1 = df.pickup_longitude
    phi_1 = df.pickup_latitude
    theta_2 = df.dropoff_longitude
    phi_2 = df.dropoff_latitude
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))

def value_counts(df):
    return df.passenger_count.value_counts()

def groupby_statistics(df):
    return df.groupby(by='pickup_hour').agg({'fare_amount': ['mean', 'std'], 
                                               'tip_amount': ['mean', 'std']
                                              })
def join(df, other):
    return df.join(other=gp, on = 'pickup_hour', rsuffix = '_right')
    

def filter_data(df):
    long_min = -74.05
    long_max = -73.75
    lat_min = 40.58
    lat_max = 40.90

    expr_filter = (data.pickup_longitude > long_min)  & (data.pickup_longitude < long_max) & \
                  (data.pickup_latitude > lat_min)    & (data.pickup_latitude < lat_max) & \
                  (data.dropoff_longitude > long_min) & (data.dropoff_longitude < long_max) & \
                  (data.dropoff_latitude > lat_min)   & (data.dropoff_latitude < lat_max)
    return df[expr_filter]


# Naive

In [15]:
# Load data
data = read_file()
data.pickup_datetime = pd.to_datetime(data.pickup_datetime)
data['pickup_hour'] = data.pickup_datetime.dt.hour

In [16]:
# benchmark
benchmark(mean, 'mean', data, repetitions=10)
benchmark(standard_deviation,'standard deviation', data, repetitions=10)
benchmark(sum_columns, 'sum columns', data, repetitions=10)
benchmark(product_columns, 'product columns', data, repetitions=10)
benchmark(complicated_arithmetic_operation, 'complicated arithmetic operation', data, repetitions=10)
benchmark(value_counts, 'value counts', data, repetitions=10)
benchmark(groupby_statistics, 'groupby statistics', data, repetitions=10)
benchmark(filter_data, 'filter', data, repetitions=10)
gc.collect()

gp = groupby_statistics(data)
benchmark(join, 'join', data, repetitions=10, other=gp)
print(f"Done benchmarks on all data")

Done benchmarks on all data


## Filtered

In [17]:
filterd = filter_data(data)
del data
del gp

print(f"Prepare filtered data and deleted {gc.collect()} MB")

Prepare filtered data and deleted 281 MB


In [18]:
# benchmark
benchmark(mean, 'filtered mean', filterd, repetitions=10)
benchmark(standard_deviation,'filtered standard deviation', filterd, repetitions=10)
benchmark(sum_columns, 'filtered sum columns', filterd, repetitions=10)
benchmark(product_columns, 'filtered product_columns', filterd, repetitions=10)
benchmark(complicated_arithmetic_operation, 'filtered complicated arithmetic_operation', filterd, repetitions=10)
benchmark(value_counts, 'filtered value_counts', filterd, repetitions=10)
benchmark(groupby_statistics, 'filtered groupby statistics', filterd, repetitions=10)

gp = filterd.groupby(by='pickup_hour').agg({'fare_amount': ['mean', 'std'], 
                                        'tip_amount': ['mean', 'std']
                                        })
benchmark(join, 'filtered join', filterd, repetitions=10, other=gp)
print(f"Done benchmarks on all data")

0.01129647970199585

In [21]:
name = 'pandas'
results = get_results(benchmarks, name)
results.to_csv(f"results/{name}_1m.csv")
results.head()

Unnamed: 0,pandas
mean,0.000133
standard deviation,0.000241
sum columns,0.000609
product columns,0.000509
complicated arithmetic operation,0.022738


In [20]:
!aws s3 cp  ../results/pandas_1m.csv s3://vaex-sagemaker-demo/benchmarks/pandas_1m_results.csv 