# Get data and packages
Use the condat_python3 kernel, not the pyspark one

In [None]:
%%capture
%%bash
aws s3 cp --recursive s3://xdss-public-datasets/demos/taxi_parquet datasets/taxi_parquet

In [None]:
import gc
import numpy as np
import pandas as pd
import warnings
import time
import gc
import os

instance_type = 'c5d2xlarge' # change this
results_bucket = f"s3://vaex-sagemaker-demo/benchmarks" # change this

name = 'spark'
data_path = 'datasets/taxi_parquet'
output_file = f'{name}_{instance_type}_results.csv'
results_path = f"results/{output_file}"
results_bucket = f"{results_bucket}/{output_file}" 
benchmarks = {}
single_repetition = 1
statistic_repetition = 5

long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90


def get_results(benchmarks, name):
    results = pd.DataFrame.from_dict(benchmarks, orient='index')
    results.columns = [name]
    return results

def persist():
    gc.collect()
    get_results(benchmarks, name).to_csv(results_path)
    os.system(f"aws s3 cp {results_path} {results_bucket}")
    
def benchmark(f, df, name, repetitions=1, **kwargs):
    times = []
    for i in range(repetitions):
        start_time = time.time()
        ret = f(df, **kwargs)
        times.append(time.time()-start_time)
    benchmarks[name] = np.mean(times)
    persist()
    print(f"{name} took: {benchmarks[name]}")
    return benchmarks[name]


!mkdir -p results
!mkdir -p datasets
print(f"test for {single_repetition} repetitions for join and groupby and {statistic_repetition} repetitions for statistics")

# Benchmark

In [31]:
from pyspark import sql, SparkConf, SparkContext
import pyspark.sql.functions as f

conf = SparkConf().setAppName('Benchmarks')

# make sure you have enough memory for this
conf.setExecutorEnv('spark.executor.memory', '2g') 
conf.setExecutorEnv('spark.driver.memory', '12g')
sc = SparkContext(conf = conf)
sqlContext = sql.SQLContext(sc)
data = sqlContext.read.parquet(data_path)
print(f"size: {data.count()} with {len(data.columns)} columns")

size: 1173057928 with 18 columns


In [32]:
# read/ open
def open_file(df=None):
    return sqlContext.read.parquet(data_path)

benchmark(open_file, df=data, name='read_file', repetitions=statistic_repetition)

read_file took: 0.07748112678527833


0.07748112678527833

In [33]:
def count(df=None):
    return df.count()

benchmark(count, df=data, name='count', repetitions=statistic_repetition)

count took: 0.3561862468719482


0.3561862468719482

In [34]:
def mean(df):
    return df.select(f.mean('fare_amount')).collect()

benchmark(mean, df=data, name='mean', repetitions=statistic_repetition)

mean took: 2.4659940719604494


2.4659940719604494

In [35]:
def standard_deviation(df):
    return df.select(f.stddev('fare_amount')).collect()

benchmark(standard_deviation, df=data, name='standard deviation', repetitions=statistic_repetition)

standard took: 3.7465686321258547


3.7465686321258547

Unlike other techonlogies, vaex can return columns, or subset of values to explore lazely, 
but becouse many of the other tecnologies crashed at this point, we return a scalar instead.

In [36]:
def mean_of_sum(df):
    return df.select(f.mean(df['fare_amount'] + df['trip_distance'])).collect()

benchmark(mean_of_sum, df=data, name='sum columns mean', repetitions=statistic_repetition)
benchmarks['sum columns'] =  np.nan

sum columns mean took: 3.65928635597229


In [37]:
def mean_of_product(df):
    return df.select(f.mean(df['fare_amount'] * df['trip_distance'])).collect()

benchmark(mean_of_product, df=data, name='product columns mean', repetitions=statistic_repetition)
benchmarks['product columns'] =  np.nan

product columns mean took: 3.7690373420715333


In [38]:
def mean_of_complicated_arithmetic_operation(df):
    theta_1 = df['pickup_longitude']
    phi_1 = df['pickup_latitude']
    theta_2 = df['dropoff_longitude']
    phi_2 = df['dropoff_latitude']
    temp = (f.cos(theta_1)*np.pi/180) * (f.cos(theta_2)*np.pi/180) * (f.sin(phi_2-phi_1)/2*np.pi/180)**2
    expression = 2 * f.atan2(f.sqrt(temp), f.sqrt(1-temp))
    return df.select(f.mean(expression)).collect()

benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean', repetitions=single_repetition)
benchmarks['arithmetic operation'] =  np.nan

arithmetic operation mean took: 96.96435236930847


In [39]:
def value_counts(df):
    return df.select('fare_amount').distinct().collect()

benchmark(value_counts, df=data, name='value counts', repetitions=statistic_repetition)

value counts took: 11.73745174407959


11.73745174407959

In [40]:
def groupby_statistics(df):
    ret = df.groupby('passenger_count').agg(
        f.mean('fare_amount'),
        f.stddev('fare_amount'),
        f.mean('tip_amount'),
        f.stddev('tip_amount')
    )
    ret.take(3)
    return ret

benchmark(groupby_statistics, df=data, name='groupby statistics', repetitions=single_repetition)

groupby statistics took: 32.12943077087402


32.12943077087402

In [None]:
other = groupby_statistics(data)

In [None]:
def join(df, other):
    ret = df.join(other, on = 'passenger_count')
    ret.take(3)
    return ret
    
benchmark(join, data, name='join', repetitions=single_repetition, other=other)

In [None]:
def join_count(df, other):
    return df.join(other, on = 'passenger_count').count()

benchmark(join_count, data, name='join count', repetitions=single_repetition, other=other)

## Filtered data

Dask is not build to run on filter data like you would normally, so we will apply the same strategy

In [45]:
print(f"Prepare filtered data and deleted {gc.collect()} MB")

Prepare filtered data and deleted 0 MB


In [46]:
def filter_data(df):
    expr_filter = (df.pickup_longitude > long_min)  & (df.pickup_longitude < long_max) & \
              (df.pickup_latitude > lat_min)    & (df.pickup_latitude < lat_max) & \
              (df.dropoff_longitude > long_min) & (df.dropoff_longitude < long_max) & \
              (df.dropoff_latitude > lat_min)   & (df.dropoff_latitude < lat_max)
    ret = df[expr_filter]
    ret.take(3) # evaluate the filter
    return ret
benchmark(filter_data, data, name='filter data', repetitions=statistic_repetition)

filter took: 0.12175817489624023


0.12175817489624023

In [47]:
filterd = filter_data(data)

del data
print(f"cleaned {gc.collect()} mb")

cleaned 194 mb


In [None]:
benchmark(filter_data, filterd, name='filterd count', repetitions=statistic_repetition)
benchmark(mean, filterd, name='filterd mean', repetitions=statistic_repetition)
benchmark(standard_deviation, filterd, name='filtered standard deviation', repetitions=statistic_repetition)
benchmark(mean_of_sum, filterd, name ='filtered sum columns mean', repetitions=statistic_repetition)
benchmarks['filtered sum columns'] =  np.nan
benchmark(mean_of_product, filterd, name ='filterd product columns mean', repetitions=statistic_repetition)
benchmarks['filterd product columns'] = np.nan
benchmark(mean_of_complicated_arithmetic_operation, filterd, name='filterd arithmetic operation mean', repetitions=statistic_repetition)
benchmarks['filterd arithmetic operation'] =  np.nan
benchmark(value_counts, filterd, name ='filtered value counts', repetitions=statistic_repetition)
benchmark(groupby_statistics, filterd, name='filtered groupby statistics', repetitions=single_repetition)
other = groupby_statistics(filterd)
benchmark(join, filterd, name='filtered join', repetitions=single_repetition, other=other)
benchmark(join_count, filterd, name='filtered join count', repetitions=single_repetition, other=other)
print('Done!')

filterd count took: 0.08333163261413574
filterd mean took: 17.987318897247313
filtered standard deviation took: 17.573638582229613
filtered sum columns mean took: 21.039139175415038
filterd product columns mean took: 20.938899803161622


In [None]:
print('spark')
benchmarks