# Get data

## Prep benchmarks

In [11]:
import pandas as pd
import numpy as np
import warnings
import datetime as dt
import time


warnings.filterwarnings("ignore")

# Maps each benchmarked function's ``__name__`` to its mean wall-clock
# duration per call, in seconds. Populated by ``benchmark``.
benchmarks = {}


def benchmark(f, repetitions=1):
    """Time ``f`` and record its mean duration per call.

    Args:
        f: Zero-argument callable to time. Its ``__name__`` is the key
            under which the result is stored in ``benchmarks``.
        repetitions: Number of times ``f`` is called; must be at least 1.

    Returns:
        The mean wall-clock seconds per call of ``f``. The same value is
        stored in ``benchmarks[f.__name__]``.

    Raises:
        ValueError: If ``repetitions`` is smaller than 1.
    """
    if repetitions < 1:
        raise ValueError("repetitions must be at least 1")

    times = []
    for _ in range(repetitions):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval is not affected by system clock adjustments.
        start_time = time.perf_counter()
        f()
        times.append(time.perf_counter() - start_time)

    # np.mean already averages over the repetitions; dividing by the
    # repetition count again would under-report the per-call time.
    benchmarks[f.__name__] = np.mean(times)
    return benchmarks[f.__name__]

def get_results(benchmarks, name):
    """Convert a benchmark mapping into a single-column DataFrame.

    Args:
        benchmarks: Mapping of benchmark label to measured value.
        name: Label to use for the single result column.

    Returns:
        A DataFrame indexed by the keys of ``benchmarks`` with one column
        called ``name``. An empty mapping yields an empty frame that still
        carries the ``name`` column.
    """
    # Passing ``columns`` directly (instead of renaming afterwards) also
    # works when the mapping is empty and there is no column to rename.
    return pd.DataFrame.from_dict(benchmarks, orient='index', columns=[name])

### Pandas implementation

In [12]:
# Load the taxi CSV once; the benchmark functions below read these
# module-level objects instead of reloading the data.
df = pd.read_csv('datasets/taxi_1m.csv')

# Parse the pickup timestamp and derive the hour used as grouping key.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['pickup_hour'] = df['pickup_datetime'].dt.hour

# Longitude/latitude bounds (exclusive) applied to both pickup and
# dropoff coordinates when building the row filter.
long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90

# Boolean mask: True where pickup and dropoff both lie strictly inside
# the bounds above.
expr_filter = (
    (df.pickup_longitude > long_min) & (df.pickup_longitude < long_max)
    & (df.pickup_latitude > lat_min) & (df.pickup_latitude < lat_max)
    & (df.dropoff_longitude > long_min) & (df.dropoff_longitude < long_max)
    & (df.dropoff_latitude > lat_min) & (df.dropoff_latitude < lat_max)
)

# Rows of ``df`` selected by the mask (name kept as spelled in this file).
filterd = df[expr_filter]

# Per-hour mean/std of fare and tip; used as the right-hand side of the
# join benchmarks.
hourly_aggregations = {
    'fare_amount': ['mean', 'std'],
    'tip_amount': ['mean', 'std'],
}
gp = df.groupby(by='pickup_hour').agg(hourly_aggregations)
del hourly_aggregations



def read_csv():
    """Load ``datasets/taxi_1m.csv`` with pandas and return the DataFrame."""
    loaded = pd.read_csv('datasets/taxi_1m.csv')
    return loaded
    
def mean():
    """Return the mean of the ``fare_amount`` column of the global ``df``."""
    fares = df['fare_amount']
    return fares.mean()
    
def standard_deviation():
    """Return the standard deviation of ``fare_amount`` in the global ``df``."""
    fares = df['fare_amount']
    return fares.std()

def sum_columns():
    """Evaluate ``fare_amount + passenger_count`` on ``df`` via ``DataFrame.eval``."""
    expression = 'fare_amount + passenger_count'
    return df.eval(expression)

def product_columns():
    """Evaluate ``fare_amount * passenger_count`` on ``df`` via ``DataFrame.eval``."""
    expression = 'fare_amount * passenger_count'
    return df.eval(expression)

def complicated_arithmetic_operation():
    """Run a trigonometric expression over the coordinate columns of ``df``.

    The pickup/dropoff longitude and latitude columns are scaled by
    ``pi/180`` and combined through sin, cos, sqrt and arctan2.

    NOTE(review): the expression resembles the haversine formula, but the
    longitude and latitude roles look swapped relative to the textbook
    form -- confirm whether that is intended before relying on the values.

    Returns:
        The element-wise result ``2 * arctan2(sqrt(t), sqrt(1 - t))``.
    """
    lon_from = df.pickup_longitude
    lat_from = df.pickup_latitude
    lon_to = df.dropoff_longitude
    lat_to = df.dropoff_latitude

    # Operation order is kept as-is so floating-point results do not shift.
    sin_half_dlon = np.sin((lon_to - lon_from) / 2 * np.pi / 180)
    sin_half_dlat = np.sin((lat_to - lat_from) / 2 * np.pi / 180)
    cos_product = np.cos(lon_from * np.pi / 180) * np.cos(lon_to * np.pi / 180)

    inner = sin_half_dlon ** 2 + cos_product * sin_half_dlat ** 2
    return 2 * np.arctan2(np.sqrt(inner), np.sqrt(1 - inner))

def value_counts():
    """Count the occurrences of each ``passenger_count`` value in ``df``."""
    passengers = df['passenger_count']
    return passengers.value_counts()

def groupby_statistics():
    """Group ``df`` by ``pickup_hour`` and compute mean/std of fare and tip."""
    per_column_stats = {
        'fare_amount': ['mean', 'std'],
        'tip_amount': ['mean', 'std'],
    }
    grouped = df.groupby(by='pickup_hour')
    return grouped.agg(per_column_stats)
def join():
    """Join the per-hour aggregates ``gp`` onto ``df`` by ``pickup_hour``.

    Returns:
        The joined DataFrame. Overlapping column names coming from ``gp``
        receive the ``_right`` suffix.
    """
    # Return the result like the other benchmark functions do, rather
    # than binding it to a local that is discarded.
    return df.join(other=gp, on='pickup_hour', rsuffix='_right')
    

def filter_data():
    """Select the rows of ``df`` where the precomputed ``expr_filter`` mask is true."""
    selected = df[expr_filter]
    return selected


def mean_filtered():
    """Return the mean of ``fare_amount`` over the filtered rows.

    Operates on ``filterd`` (``df`` restricted by ``expr_filter``) so that
    this benchmark measures the filtered data rather than repeating the
    unfiltered ``mean`` benchmark.
    """
    return filterd.fare_amount.mean()
    
def standard_deviation_filtered():
    """Return the standard deviation of ``fare_amount`` over the filtered rows.

    Operates on ``filterd`` (``df`` restricted by ``expr_filter``) so that
    this benchmark measures the filtered data rather than repeating the
    unfiltered ``standard_deviation`` benchmark.
    """
    return filterd.fare_amount.std()

def sum_columns_filtered():
    """Evaluate ``fare_amount + passenger_count`` on the filtered rows.

    Operates on ``filterd`` (``df`` restricted by ``expr_filter``) so that
    this benchmark measures the filtered data rather than repeating the
    unfiltered ``sum_columns`` benchmark.
    """
    return filterd.eval('fare_amount + passenger_count')

def product_columns_filtered():
    """Evaluate ``fare_amount * passenger_count`` on the filtered rows.

    Operates on ``filterd`` (``df`` restricted by ``expr_filter``) so that
    this benchmark measures the filtered data rather than repeating the
    unfiltered ``product_columns`` benchmark.
    """
    return filterd.eval('fare_amount * passenger_count')

def complicated_arithmetic_operation_filtered():
    """Run the trigonometric coordinate expression over the filtered rows.

    Applies the same expression as ``complicated_arithmetic_operation``,
    but reads the coordinate columns from ``filterd`` (``df`` restricted
    by ``expr_filter``) so the benchmark measures the filtered data
    instead of repeating the unfiltered one.

    Returns:
        The element-wise result ``2 * arctan2(sqrt(t), sqrt(1 - t))``.
    """
    theta_1 = filterd.pickup_longitude
    phi_1 = filterd.pickup_latitude
    theta_2 = filterd.dropoff_longitude
    phi_2 = filterd.dropoff_latitude
    # Coordinates are scaled by pi/180 before the trigonometric calls.
    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
    return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))

def value_counts_filtered():
    """Count the occurrences of each ``passenger_count`` value in the filtered rows.

    Operates on ``filterd`` (``df`` restricted by ``expr_filter``) so that
    this benchmark measures the filtered data rather than repeating the
    unfiltered ``value_counts`` benchmark.
    """
    return filterd.passenger_count.value_counts()

def groupby_statistics_filtered():
    """Group the filtered rows by ``pickup_hour`` and compute fare/tip mean and std.

    Operates on ``filterd`` (``df`` restricted by ``expr_filter``) so that
    this benchmark measures the filtered data rather than repeating the
    unfiltered ``groupby_statistics`` benchmark.
    """
    return filterd.groupby(by='pickup_hour').agg({
        'fare_amount': ['mean', 'std'],
        'tip_amount': ['mean', 'std'],
    })
def join_filtered():
    """Join the per-hour aggregates ``gp`` onto the filtered rows by ``pickup_hour``.

    Operates on ``filterd`` (``df`` restricted by ``expr_filter``) so that
    this benchmark measures the filtered data rather than repeating the
    unfiltered ``join`` benchmark.

    Returns:
        The joined DataFrame. Overlapping column names coming from ``gp``
        receive the ``_right`` suffix.
    """
    return filterd.join(other=gp, on='pickup_hour', rsuffix='_right')
    

def filter_data():
    """Return ``df`` restricted by the boolean mask ``expr_filter``.

    An identical ``filter_data`` is defined earlier in this file; being
    the later definition, this is the one the name ends up bound to.
    """
    mask = expr_filter
    return df[mask]


## Benchmark

In [13]:
# Results are keyed by ``__name__`` in ``benchmarks``, so each function is
# listed exactly once: a repeated entry would only re-run the function and
# overwrite its own result.
functions_to_benchmark = [
    mean,
    standard_deviation,
    sum_columns,
    product_columns,
    complicated_arithmetic_operation,
    value_counts,
    groupby_statistics,
    join,
    filter_data,
    mean_filtered,
    standard_deviation_filtered,
    sum_columns_filtered,
    product_columns_filtered,
    complicated_arithmetic_operation_filtered,
    value_counts_filtered,
    groupby_statistics_filtered,
    join_filtered,
]

# Time every function over 10 repetitions; ``benchmark`` stores each
# result in the module-level ``benchmarks`` dict.
for f in functions_to_benchmark:
    benchmark(f, repetitions=10)

In [15]:
# Turn the collected timings into a one-column frame labelled 'pandas'
# and write it to disk as CSV.
results = get_results(benchmarks, name='pandas')
results.to_csv(path_or_buf='results/pandas_1m.csv')