# Pandas as API references

# Install packages

In [5]:
%%capture
%%bash
# pyarrow is needed to read parquet. The space before '#' matters: without it
# bash treats "numpy#" as a package name and the following words as packages too.
pip install -U pyarrow numpy  # to read parquet

In [9]:
import gc
import numpy as np
import pandas as pd
import warnings
import time
import gc
import os
import os


# --- Run configuration ------------------------------------------------------
instance_type = 'c5d4xlarge' # change this
results_bucket = "s3://xdss-benchmarks/benchmarks" # change this

name = 'pandas'
data_path = 'datasets/taxi_parquet/data_0.parquet' # single file
output_file = f'{name}_{instance_type}_1m_results.csv'
results_path = f"results/{output_file}"
# From here on `results_bucket` is the full destination of the results file.
results_bucket = f"{results_bucket}/{output_file}"
benchmarks = {}  # benchmark name -> mean duration, filled in by benchmark()
single_repetition = 1
statistic_repetition = 5

# Longitude/latitude bounds used to build the row filter for the "filtered" runs.
long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90


def get_results(benchmarks, name):
    """Turn the ``benchmarks`` mapping into a one-column DataFrame.

    Each key of ``benchmarks`` becomes a row label and its value the cell in
    the single column, which is labelled ``name``.
    """
    table = pd.DataFrame.from_dict(benchmarks, orient='index')
    table.columns = [name]
    return table

def persist():
    """Save the timings collected so far and upload them.

    Runs a garbage collection, writes the current ``benchmarks`` table to
    ``results_path`` as CSV, then shells out to ``aws s3 cp`` to copy that file
    to ``results_bucket``. The exit status of the upload command is ignored.
    """
    gc.collect()
    results_table = get_results(benchmarks, name)
    results_table.to_csv(results_path)
    upload_command = f"aws s3 cp {results_path} {results_bucket}"
    os.system(upload_command)
    
def benchmark(f, df, name, repetitions=1, **kwargs):
    """Time ``f(df, **kwargs)`` and record the mean duration under ``name``.

    Parameters
    ----------
    f : callable
        Function under test; called as ``f(df, **kwargs)``.
    df : object
        First positional argument forwarded to ``f``.
    name : str
        Key under which the mean duration is stored in ``benchmarks``.
    repetitions : int, default 1
        Number of timed calls to average over.
    **kwargs
        Extra keyword arguments forwarded to ``f``.

    Returns
    -------
    The mean duration in seconds, which is also stored in ``benchmarks[name]``,
    persisted via ``persist()`` and printed.
    """
    times = []
    for _ in range(repetitions):
        # perf_counter() is a monotonic, high-resolution clock; time.time() can
        # jump with system clock adjustments and is too coarse for calls that
        # finish in microseconds.
        start_time = time.perf_counter()
        result = f(df, **kwargs)
        elapsed = time.perf_counter() - start_time
        # Release the result only after the clock has stopped so freeing it
        # never counts towards the measured time.
        del result
        times.append(elapsed)
    benchmarks[name] = np.mean(times)
    persist()
    print(f"{name} took: {benchmarks[name]}")
    return benchmarks[name]


# Create the working directories in pure Python (no shell escape needed);
# exist_ok=True keeps the cell safe to re-run.
os.makedirs('results', exist_ok=True)
os.makedirs('datasets', exist_ok=True)
print(f"test for {single_repetition} repetitions for join and groupby and {statistic_repetition} repetitions for statistics")

test for 1 repetitions for join and groupby and 5 repetitions for statistics


# Benchmark

In [10]:
import pandas as pd
import numpy as np

# Load data: read the parquet file at `data_path` once with the pyarrow engine.
# `data` is the frame handed to the unfiltered benchmarks below.
data = pd.read_parquet(data_path, engine='pyarrow')
# Report row and column counts as a quick sanity check of what was loaded.
print(f"size: {len(data)} with {len(data.columns)} columns")

size: 1000000 with 18 columns


In [11]:
def read_file_parquet(df=None):
    """Read the parquet file at ``data_path`` with the pyarrow engine.

    ``df`` is ignored; it is accepted only because ``benchmark`` always passes
    a frame as the first argument.
    """
    loaded = pd.read_parquet(data_path, engine='pyarrow')
    return loaded

# Mean read time over `statistic_repetition` runs, stored under 'read_file'.
# `data` is passed only because benchmark() forwards a frame; the reader ignores it.
benchmark(read_file_parquet, df=data, name='read_file', repetitions=statistic_repetition)

read_file took: 0.794035005569458


0.794035005569458

In [12]:
def count(df=None):
    """Return ``len(df)`` (the number of rows when ``df`` is a DataFrame)."""
    n_rows = len(df)
    return n_rows

# Mean time of len(data) over `statistic_repetition` runs, stored under 'count'.
benchmark(count, df=data, name='count', repetitions=statistic_repetition)

count took: 8.58306884765625e-06


8.58306884765625e-06

In [13]:
def mean(df):
    """Return the mean of the ``fare_amount`` column of ``df``."""
    fares = df["fare_amount"]
    return fares.mean()

# Mean time of the fare_amount mean over `statistic_repetition` runs, stored under 'mean'.
benchmark(mean, df=data, name='mean', repetitions=statistic_repetition)

mean took: 0.0015757083892822266


0.0015757083892822266

In [14]:
def standard_deviation(df):
    """Return the standard deviation of the ``fare_amount`` column (pandas ``Series.std`` defaults)."""
    fares = df["fare_amount"]
    return fares.std()

# Mean time of the fare_amount standard deviation over `statistic_repetition` runs.
benchmark(standard_deviation, df=data, name='standard deviation', repetitions=statistic_repetition)

standard deviation took: 0.00252532958984375


0.00252532958984375

In [15]:
def mean_of_sum(df):
    """Return the mean of the element-wise sum ``fare_amount + trip_distance``."""
    total = df["fare_amount"] + df["trip_distance"]
    return total.mean()

# Mean time of summing two columns and reducing to a scalar mean, over `statistic_repetition` runs.
benchmark(mean_of_sum, df=data, name='sum columns mean', repetitions=statistic_repetition)


sum columns mean took: 0.011311006546020509


In [16]:
def sum_columns(df):
    """Return the element-wise sum of the ``fare_amount`` and ``trip_distance`` columns."""
    fares = df["fare_amount"]
    distances = df["trip_distance"]
    return fares + distances

# Mean time of the element-wise column sum (no reduction) over `statistic_repetition` runs.
benchmark(sum_columns, df=data, name='sum columns', repetitions=statistic_repetition)

sum columns took: 0.0025166988372802733


0.0025166988372802733

In [17]:
def mean_of_product(df):
    """Return the mean of the element-wise product ``fare_amount * trip_distance``."""
    product = df["fare_amount"] * df["trip_distance"]
    return product.mean()

# Mean time of multiplying two columns and reducing to a scalar mean, over `statistic_repetition` runs.
benchmark(mean_of_product, df=data, name='product columns mean', repetitions=statistic_repetition)

product columns mean took: 0.003281116485595703


0.003281116485595703

In [19]:
def product_columns(df):
    """Return the element-wise product of the ``fare_amount`` and ``trip_distance`` columns."""
    fares = df["fare_amount"]
    distances = df["trip_distance"]
    return fares * distances

# Mean time of the element-wise column product (no reduction) over `statistic_repetition` runs.
benchmark(product_columns, df=data, name='product columns', repetitions=statistic_repetition)

product columns took: 0.0025734901428222656


0.0025734901428222656

In [20]:
def complicated_arithmetic_operation(df):
    """Evaluate a trigonometric expression over the pickup/dropoff coordinates.

    Reads the four ``pickup_*`` / ``dropoff_*`` longitude and latitude columns,
    scales them by pi/180, and returns ``2 * arctan2(sqrt(t), sqrt(1 - t))``
    element-wise, where ``t`` combines squared sines of the half-differences
    with a cosine product.

    NOTE(review): the expression resembles a haversine formula, but the cosine
    factor is taken over the longitude columns; left exactly as written since
    the cell is used as an arithmetic workload. Confirm before reusing the
    values as distances.
    """
    pickup_lon = df["pickup_longitude"]
    pickup_lat = df["pickup_latitude"]
    dropoff_lon = df["dropoff_longitude"]
    dropoff_lat = df["dropoff_latitude"]

    sin_half_dlon = np.sin((dropoff_lon - pickup_lon) / 2 * np.pi / 180)
    sin_half_dlat = np.sin((dropoff_lat - pickup_lat) / 2 * np.pi / 180)
    cos_product = np.cos(pickup_lon * np.pi / 180) * np.cos(dropoff_lon * np.pi / 180)

    temp = sin_half_dlon**2 + cos_product * sin_half_dlat**2
    half_angle = np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))
    return half_angle * 2

# Timed `single_repetition` time(s); stored under 'arithmetic operation'.
benchmark(complicated_arithmetic_operation, df=data, name='arithmetic operation', repetitions=single_repetition)

arithmetic operation took: 0.6707596778869629


0.6707596778869629

In [21]:
def mean_of_complicated_arithmetic_operation(df):
    """Return the mean of ``complicated_arithmetic_operation(df)``.

    Delegates to ``complicated_arithmetic_operation`` rather than repeating the
    formula, so the two benchmarks cannot drift apart; the element-wise result
    is then reduced to a scalar with ``.mean()``.
    """
    return complicated_arithmetic_operation(df).mean()

# Timed `single_repetition` time(s); stored under 'arithmetic operation mean'.
benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean', repetitions=single_repetition)

arithmetic operation mean took: 0.2180957794189453


0.2180957794189453

In [22]:
def value_counts(df):
    """Return how often each distinct ``fare_amount`` value occurs in ``df``."""
    fares = df["fare_amount"]
    return fares.value_counts()

# Mean time of fare_amount value_counts over `statistic_repetition` runs.
benchmark(value_counts, df=data, name='value counts', repetitions=statistic_repetition)

value counts took: 0.016116619110107422


0.016116619110107422

In [23]:
def groupby_statistics(df):
    """Group ``df`` by ``passenger_count`` and aggregate fares and tips.

    Returns one row per ``passenger_count`` value with the mean and standard
    deviation of ``fare_amount`` and ``tip_amount`` (two-level column labels).
    """
    aggregations = {
        'fare_amount': ['mean', 'std'],
        'tip_amount': ['mean', 'std'],
    }
    grouped = df.groupby(by='passenger_count')
    return grouped.agg(aggregations)

# Timed `single_repetition` time(s); stored under 'groupby statistics'.
benchmark(groupby_statistics, df=data, name='groupby statistics', repetitions=single_repetition)

groupby statistics took: 0.1632380485534668


0.1632380485534668

In [24]:
# Per-passenger_count statistics table; used as the right-hand frame in the join benchmarks below.
other = groupby_statistics(data)

In [25]:
def join_data(df, other):
    """Inner-join ``df`` with ``other`` on their indexes and return the result.

    When the two frames have a different number of column levels (as happens
    when ``other`` is a multi-aggregation groupby result), the column labels
    are first collapsed to tuples: newer pandas releases refuse to merge frames
    with differing column levels, while older ones only warned.

    NOTE(review): the merge pairs ``df``'s row index with ``other``'s index.
    If ``other`` is indexed by passenger_count, confirm whether
    ``left_on='passenger_count'`` was the intended key.
    """
    if df.columns.nlevels != other.columns.nlevels:
        flattened = []
        for frame in (df, other):
            # Shallow copy shares the data, so relabelling the columns is cheap
            # and leaves the caller's frame untouched.
            frame = frame.copy(deep=False)
            frame.columns = frame.columns.to_flat_index()
            flattened.append(frame)
        df, other = flattened
    return pd.merge(df, other, left_index=True, right_index=True)

# Timed `single_repetition` time(s); `other` reaches join_data through benchmark()'s **kwargs.
benchmark(join_data, data, name='join', repetitions=single_repetition, other=other)



join took: 0.05270791053771973


0.05270791053771973

In [27]:
def join_count(df, other):
    """Return the number of rows in the index-on-index inner join of ``df`` and ``other``.

    When the two frames have a different number of column levels, the column
    labels are first collapsed to tuples: newer pandas releases refuse to merge
    frames with differing column levels, while older ones only warned.
    """
    if df.columns.nlevels != other.columns.nlevels:
        flattened = []
        for frame in (df, other):
            # Shallow copy shares the data, so relabelling the columns is cheap
            # and leaves the caller's frame untouched.
            frame = frame.copy(deep=False)
            frame.columns = frame.columns.to_flat_index()
            flattened.append(frame)
        df, other = flattened
    joined = pd.merge(df, other, left_index=True, right_index=True)
    return len(joined)

# Timed `single_repetition` time(s); `other` reaches join_count through benchmark()'s **kwargs.
benchmark(join_count, data, name='join count', repetitions=single_repetition, other=other)



join count took: 0.012810945510864258


0.012810945510864258

## Filtered data

In [28]:
# gc.collect() returns the number of unreachable objects it found, not a size
# in MB, so the message reports it as an object count.
print(f"Prepare filtered data; gc collected {gc.collect()} unreachable objects")

Prepare filtered data and deleted 0 MB


In [29]:
# Row mask over `data`: True where both the pickup and the dropoff coordinates
# lie strictly inside the configured longitude/latitude bounds.
expr_filter = (
    (data.pickup_longitude > long_min) & (data.pickup_longitude < long_max)
    & (data.pickup_latitude > lat_min) & (data.pickup_latitude < lat_max)
    & (data.dropoff_longitude > long_min) & (data.dropoff_longitude < long_max)
    & (data.dropoff_latitude > lat_min) & (data.dropoff_latitude < lat_max)
)

In [30]:
def filter_data(df):
    """Return the rows of ``df`` selected by the module-level ``expr_filter`` mask.

    The mask is built once from ``data`` outside this function, so only the
    row selection itself happens here.
    """
    mask = expr_filter
    return df[mask]

# Mean time of applying the precomputed `expr_filter` mask over `statistic_repetition` runs.
benchmark(filter_data, data, name='filter data', repetitions=statistic_repetition)

filter data took: 0.14194197654724122


0.14194197654724122

In [31]:
# Keep only the filtered rows for the remaining benchmarks.
filtered = filter_data(data)

# Drop the name of the unfiltered frame; later cells only use `filtered`.
del data
# gc.collect() returns the number of unreachable objects it found, not a size
# in MB, so the message reports it as an object count.
print(f"cleaned {gc.collect()} unreachable objects")

cleaned 0 mb


In [34]:
# Repeat the statistics, arithmetic, groupby and join benchmarks on the filtered frame.
benchmark(mean, filtered, name='filtered mean', repetitions=statistic_repetition)
benchmark(standard_deviation, filtered, name='filtered standard deviation', repetitions=statistic_repetition)
benchmark(mean_of_sum, filtered, name ='filtered sum columns mean', repetitions=statistic_repetition)
benchmark(sum_columns, df=filtered, name='filtered sum columns', repetitions=statistic_repetition)
benchmark(mean_of_product, filtered, name ='filtered product columns mean', repetitions=statistic_repetition)
benchmark(product_columns, df=filtered, name='filtered product columns', repetitions=statistic_repetition)
benchmark(mean_of_complicated_arithmetic_operation, filtered, name='filtered arithmetic operation mean', repetitions=single_repetition)
benchmark(complicated_arithmetic_operation, filtered, name='filtered arithmetic operation', repetitions=single_repetition)
benchmark(value_counts, filtered, name ='filtered value counts', repetitions=statistic_repetition)
benchmark(groupby_statistics, filtered, name='filtered groupby statistics', repetitions=single_repetition)
# Rebuild the join's right-hand table from the filtered rows.
other = groupby_statistics(filtered)
benchmark(join_data, filtered, name='filtered join', repetitions=single_repetition, other=other)
# Was `filtfilterederd`, an undefined name that raised NameError.
benchmark(join_count, filtered, name='filtered join count', repetitions=single_repetition, other=other)
print(name)
# get_results() requires the column label as its second argument.
get_results(benchmarks, name)

filterd mean took: 0.0012643814086914062
filtered standard deviation took: 0.0024105072021484374
filtered sum columns mean took: 0.002856540679931641
filtered sum columns took: 0.0021256446838378907
filterd product columns mean took: 0.002691030502319336
filterd product columns took: 0.0015808582305908204
filterd arithmetic operation mean took: 0.21105647087097168
filterd arithmetic operation took: 0.20889568328857422
filtered value counts took: 0.012538766860961914
filtered groupby statistics took: 0.04370427131652832




filtered join took: 0.011433124542236328
filtered join count took: 0.010342121124267578
Done!
