# Install  packages - No need
Use the condat_python3 kernel, not the pyspark one

In [1]:
import gc
import numpy as np
import pandas as pd
import warnings
import time
import gc
import os

instance_type = 'c5d4xlarge' # change this
results_bucket = f"s3://xdss-benchmarks/benchmarks" # change this

name = 'spark'
data_path = 'datasets/taxi_parquet'
output_file = f'{name}_{instance_type}.csv'
results_path = f"results/{output_file}"
results_bucket = f"{results_bucket}/{output_file}" 
benchmarks = {
    'run':[],
    'duration': [],
    'task': []   
}

long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90


def get_results(benchmarks=benchmarks):
    return pd.DataFrame.from_dict(benchmarks, orient='index').T

def persist():
    gc.collect()
    get_results(benchmarks).to_csv(results_path)
    os.system(f"aws s3 cp {results_path} {results_bucket}")
    
def benchmark(f, df, name, **kwargs):    
    for i in range(2):
        start_time = time.time()
        ret = f(df, **kwargs)
        benchmarks['duration'].append(time.time()-start_time)
        benchmarks['task'].append(name)
        benchmarks['run'].append(i+1)
    persist()
    print(f"{name} took: {benchmarks['duration'][-1]}")
    return benchmarks['duration'][-1]
          
def add_nan(name):
    for i in range(2):
        benchmarks['duration'].append(np.nan)
        benchmarks['task'].append(name)
        benchmarks['run'].append(i+1)
    persist()
    print(f"{name} took: {benchmarks['duration'][-1]}")
    return benchmarks['duration'][-1]

          


!mkdir -p results
!mkdir -p datasets
print(f"We test every benchmark twice and save both results")

We test every benchmark twice and save both results


# Benchmark

In [2]:
from pyspark import sql, SparkConf, SparkContext
import pyspark.sql.functions as f

conf = SparkConf().setAppName('Benchmarks')

# make sure you have enough memory for this
conf.setExecutorEnv('spark.executor.memory', '2g') 
conf.setExecutorEnv('spark.driver.memory', '12g')
sc = SparkContext(conf = conf)
sqlContext = sql.SQLContext(sc)
data = sqlContext.read.parquet(data_path)
print(f"size: {data.count()} with {len(data.columns)} columns")

size: 1173057928 with 18 columns


In [3]:
# read/ open
def open_file(df=None):
    return sqlContext.read.parquet(data_path)

benchmark(open_file, df=data, name='read_file')

read_file took: 0.09594082832336426


0.09594082832336426

In [4]:
def count(df=None):
    return df.count()

benchmark(count, df=data, name='count')

count took: 0.3005337715148926


0.3005337715148926

In [5]:
def mean(df):
    return df.select(f.mean('fare_amount')).collect()

benchmark(mean, df=data, name='mean')

mean took: 1.230557918548584


1.230557918548584

In [6]:
def standard_deviation(df):
    return df.select(f.stddev('fare_amount')).collect()

benchmark(standard_deviation, df=data, name='standard deviation')

standard deviation took: 1.939530849456787


1.939530849456787

Unlike other techonlogies, vaex can return columns, or subset of values to explore lazely, 
but becouse many of the other tecnologies crashed at this point, we return a scalar instead.

In [7]:
def mean_of_sum(df):
    return df.select(f.mean(df.fare_amount + df.trip_distance)).collect()

benchmark(mean_of_sum, df=data, name='sum columns mean')

sum columns mean took: 2.1712541580200195


2.1712541580200195

In [8]:
def sum_columns(df):
    return df.select(df.fare_amount + df.trip_distance)

benchmark(sum_columns, df=data, name='sum columns')

sum columns took: 0.0028922557830810547


0.0028922557830810547

In [9]:
def mean_of_product(df):
    return df.select(f.mean(df['fare_amount'] * df['trip_distance'])).collect()

benchmark(mean_of_product, df=data, name='product columns mean')

product columns mean took: 2.267188310623169


2.267188310623169

In [10]:
def product_columns(df):
    return df.select(df.fare_amount * df.trip_distance)

benchmark(product_columns, df=data, name='product columns')

product columns took: 0.002756834030151367


0.002756834030151367

In [11]:
def lazy_mean(df):
    df = df.withColumn("lazy", df['fare_amount'] * df['trip_distance'])
    return df.select(f.mean('lazy')).collect()
    
benchmark(lazy_mean, df=data, name='lazy evaluation')   

lazy evaluation took: 2.469942808151245


2.469942808151245

In [12]:
def value_counts(df):
    return df.select('passenger_count').distinct().collect()

benchmark(value_counts, df=data, name='value counts')

value counts took: 4.320232629776001


4.320232629776001

In [13]:
def mean_of_complicated_arithmetic_operation(df):
    theta_1 = df['pickup_longitude']
    phi_1 = df['pickup_latitude']
    theta_2 = df['dropoff_longitude']
    phi_2 = df['dropoff_latitude']
    temp = (f.cos(theta_1)*np.pi/180) * (f.cos(theta_2)*np.pi/180) * (f.sin(phi_2-phi_1)/2*np.pi/180)**2
    expression = 2 * f.atan2(f.sqrt(temp), f.sqrt(1-temp))
    return df.select(f.mean(expression)).collect()

benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean')

arithmetic operation mean took: 45.67928624153137


45.67928624153137

In [14]:
# crash
def complicated_arithmetic_operation(df):
    theta_1 = df['pickup_longitude']
    phi_1 = df['pickup_latitude']
    theta_2 = df['dropoff_longitude']
    phi_2 = df['dropoff_latitude']
    temp = (f.cos(theta_1)*np.pi/180) * (f.cos(theta_2)*np.pi/180) * (f.sin(phi_2-phi_1)/2*np.pi/180)**2
    expression = 2 * f.atan2(f.sqrt(temp), f.sqrt(1-temp))
    return df.select(expression).collect()

# benchmark(complicated_arithmetic_operation, df=data, name='arithmetic operation mean')
add_nan('arithmetic operation')

arithmetic operation took: nan


nan

In [15]:
def groupby_statistics(df):
    ret = df.groupby('passenger_count').agg(
        f.mean('fare_amount'),
        f.stddev('fare_amount'),
        f.mean('tip_amount'),
        f.stddev('tip_amount')
    )
    ret.take(3)
    return ret

benchmark(groupby_statistics, df=data, name='groupby statistics')

groupby statistics took: 16.090863943099976


16.090863943099976

In [5]:
other = groupby_statistics(data)

In [6]:
def join_count(df, other):
    return df.join(other, on = 'passenger_count').count()

benchmark(join_count, data, name='join count', other=other)

join count took: 258.1469507217407


258.1469507217407

In [7]:
# Crash 
def join_data(df, other):
    ret = df.join(other, on = 'passenger_count')
    ret.take(3)
    return ret
    
# benchmark(join_data, data, name='join', other=other)
add_nan('join')

join took: nan


nan

## Filtered data

Dask is not build to run on filter data like you would normally, so we will apply the same strategy

In [8]:
print(f"Prepare filtered data and deleted {gc.collect()} MB")

Prepare filtered data and deleted 0 MB


In [9]:
def filter_data(df):
    expr_filter = (df.pickup_longitude > long_min)  & (df.pickup_longitude < long_max) & \
              (df.pickup_latitude > lat_min)    & (df.pickup_latitude < lat_max) & \
              (df.dropoff_longitude > long_min) & (df.dropoff_longitude < long_max) & \
              (df.dropoff_latitude > lat_min)   & (df.dropoff_latitude < lat_max)
    ret = df[expr_filter]
    ret.take(3) # evaluate the filter
    return ret
benchmark(filter_data, data, name='filter data')

filter data took: 0.09316062927246094


0.09316062927246094

In [10]:
filtered = filter_data(data)

del data
print(f"cleaned {gc.collect()} mb")

cleaned 192 mb


In [15]:
benchmark(filter_data, filtered, name='filtered count')
benchmark(mean, filtered, name='filtered mean')
benchmark(standard_deviation, filtered, name='filtered standard deviation')
benchmark(mean_of_sum, filtered, name ='filtered sum columns mean')
benchmark(sum_columns, filtered, name ='filtered sum columns')
benchmark(mean_of_product, filtered, name ='filtered product columns mean')
benchmark(product_columns, filtered, name ='filtered product columns')
benchmark(mean_of_complicated_arithmetic_operation, filtered, name='filtered arithmetic operation mean')
add_nan('filtered arithmetic operation')
benchmark(value_counts, filtered, name ='filtered value counts')
benchmark(groupby_statistics, filtered, name='filtered groupby statistics')
other = groupby_statistics(filtered)
benchmark(join_count, filtered, name='filtered join count', other=other)
add_nan('filtered join')
print(name)
get_results(benchmarks)

filtered mean took: 10.708798170089722
filtered standard deviation took: 11.21460509300232
filtered sum columns mean took: 13.624402523040771
filtered sum columns took: 0.0021200180053710938
filtered product columns mean took: 13.47539734840393
filtered product columns took: 0.0021560192108154297
filtered arithmetic operation mean took: 45.62188744544983
filtered arithmetic operation took: nan
filtered value counts took: 13.600687980651855
filtered groupby statistics took: 26.765904903411865
filtered join count took: 262.21544551849365
filtered join took: nan
spark


Unnamed: 0,run,duration,task
0,1,0.105811,read_file
1,2,0.0959408,read_file
2,1,0.377406,count
3,2,0.300534,count
4,1,1.4994,mean
5,2,1.23056,mean
6,1,3.67782,standard deviation
7,2,1.93953,standard deviation
8,1,2.27602,sum columns mean
9,2,2.17125,sum columns mean
