# Get data and packages

In [1]:
%%capture
%%bash
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
jupyter contrib nbextension install --user
jupyter nbextensions_configurator enable --user

jupyter nbextension enable codefolding/main
jupyter nbextension enable scratchpad/main
jupyter nbextension enable execute_time/ExecuteTime
jupyter nbextension enable autosavetime/main
pip install -U pip dask numpy fsspec>=0.3.3 tqdm pyarrow


In [None]:
!aws s3 cp --recursive s3://xdss-public-datasets/demos/taxi_parquet ../datasets/taxi_parquet

In [4]:
import gc
from src.config import repetitions
import time
import numpy as np
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import time
import gc
import os

instance_type = 'mlm52xlarge'
name = 'spark'
data_path = '../datasets/taxi_parquet'
instance_type = 'mlm52xlarge'
results_path = f"../results/{name}_1b_{instance_type}.csv"
benchmarks = {}
print(f"test for {repetitions} repetitions")

test for 1 repetitions


In [5]:
import numpy as np
import pandas as pd
import time 


def get_results(benchmarks, name):
    results = pd.DataFrame.from_dict(benchmarks, orient='index')
    results.columns = [name]
    return results

def persist():
    get_results(benchmarks, name).to_csv(results_path)
    !aws s3 cp  ../results/spark_1b_mlm52xlarge.csv s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv 
    
def benchmark(f, df, name, repetitions=1, **kwargs):
    times = []
    for i in range(repetitions):
        start_time = time.time()
        ret = f(df, **kwargs)
        times.append(time.time()-start_time)
    benchmarks[name] = np.mean(times)
    persist()
    print(f"{name} took: {benchmarks[name]}")
    return benchmarks[name]


long_min = -74.05
long_max = -73.75
lat_min = 40.58
lat_max = 40.90

# Benchmark

In [None]:
from pyspark import sql, SparkConf, SparkContext
import pyspark.sql.functions as f

conf = SparkConf().setAppName('Benchmarks')
conf.setExecutorEnv('spark.executor.memory', '2g')
conf.setExecutorEnv('spark.driver.memory', '30g')
sc = SparkContext(conf = conf)
sqlContext = sql.SQLContext(sc)

data = sqlContext.read.parquet(data_path)
print(f"size: {data.count()} with {len(data.columns)} columns")

In [62]:
# read/ open
def open_file(df=None):
    return sqlContext.read.parquet(data_path)

benchmark(open_file, df=data, name='read_file', repetitions=repetitions)

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
read_file took: 0.04250836372375488


0.04250836372375488

In [63]:
def count(df=None):
    return df.count()

benchmark(count, df=data, name='count', repetitions=repetitions)

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
count took: 0.03326821327209473


0.03326821327209473

In [69]:
def mean(df):
    return df.select(f.mean('fare_amount')).collect()

benchmark(mean, df=data, name='mean', repetitions=repetitions)

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
mean took: 0.059554338455200195


0.059554338455200195

In [70]:
def standard_deviation(df):
    return df.select(f.stddev('fare_amount')).collect()

benchmark(standard_deviation, df=data, name='standard', repetitions=repetitions)

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
standard took: 0.0534512996673584


0.0534512996673584

Unlike other techonlogies, vaex can return columns, or subset of values to explore lazely, 
but becouse many of the other tecnologies crashed at this point, we return a scalar instead.

In [71]:
def mean_of_sum(df):
    return df.select(f.mean(df['fare_amount'] + df['trip_distance'])).collect()

benchmark(mean_of_sum, df=data, name='sum columns mean', repetitions=repetitions)
benchmarks['sum columns'] =  benchmarks['sum columns mean'] - benchmarks['mean']

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
sum columns mean took: 0.05313467979431152


In [72]:
def mean_of_product(df):
    return df.select(f.mean(df['fare_amount'] * df['trip_distance'])).collect()

benchmark(mean_of_product, df=data, name='product columns mean', repetitions=repetitions)
benchmarks['product columns'] =  benchmarks['product columns mean'] - benchmarks['mean']

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
product columns mean took: 0.057082176208496094


In [73]:
def mean_of_complicated_arithmetic_operation(df):
    theta_1 = df['pickup_longitude']
    phi_1 = df['pickup_latitude']
    theta_2 = df['dropoff_longitude']
    phi_2 = df['dropoff_latitude']
    temp = (f.cos(theta_1)*np.pi/180) * (f.cos(theta_2)*np.pi/180) * (f.sin(phi_2-phi_1)/2*np.pi/180)**2
    expression = 2 * f.atan2(f.sqrt(temp), f.sqrt(1-temp))
    return df.select(f.mean(expression)).collect()

benchmark(mean_of_complicated_arithmetic_operation, df=data, name='arithmetic operation mean', repetitions=repetitions)
benchmarks['arithmetic operation'] =  benchmarks['arithmetic operation mean'] - benchmarks['mean']

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
arithmetic operation mean took: 0.5879056453704834


In [74]:
def value_counts(df):
    return df.select('fare_amount').distinct().collect()

benchmark(value_counts, df=data, name='value counts', repetitions=repetitions)

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
value counts took: 0.21796870231628418


0.21796870231628418

In [75]:
def groupby_statistics(df):
    ret = df.groupby('passenger_count').agg(
        f.mean('fare_amount'),
        f.stddev('fare_amount'),
        f.mean('tip_amount'),
        f.stddev('tip_amount')
    )
    ret.take(3)
    return ret

benchmark(groupby_statistics, df=data, name='groupby statistics', repetitions=repetitions)

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
groupby statistics took: 0.2846958637237549


0.2846958637237549

In [76]:
other = groupby_statistics(data)

In [77]:
def join(df, other):
    ret = df.join(other, on = 'passenger_count')
    ret.take(3)
    return ret

benchmark(join, data, name='join', repetitions=repetitions, other=other)

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
join took: 0.47157740592956543


0.47157740592956543

In [78]:
def join_count(df, other):
    return df.join(other, on = 'passenger_count').count()

benchmark(join_count, data, name='join count', repetitions=repetitions, other=other)

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
join count took: 0.2263627052307129


0.2263627052307129

## Filtered data

Dask is not build to run on filter data like you would normally, so we will apply the same strategy

In [79]:
print(f"Prepare filtered data and deleted {gc.collect()} MB")

Prepare filtered data and deleted 1693 MB


In [86]:
def filter_data(df):
    expr_filter = (df.pickup_longitude > long_min)  & (df.pickup_longitude < long_max) & \
              (df.pickup_latitude > lat_min)    & (df.pickup_latitude < lat_max) & \
              (df.dropoff_longitude > long_min) & (df.dropoff_longitude < long_max) & \
              (df.dropoff_latitude > lat_min)   & (df.dropoff_latitude < lat_max)
    ret = df[expr_filter]
    ret.take(3) # evaluate the filter
    return ret
benchmark(filter_data, data, name='filter', repetitions=repetitions)

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
filter took: 0.10109424591064453


0.10109424591064453

In [87]:
filterd = filter_data(data)

del data
print(f"cleaned {gc.collect()} mb")

cleaned 872 mb


In [88]:
benchmark(filter_data, filterd, name='filterd count', repetitions=repetitions)
benchmark(mean, filterd, name='filterd mean', repetitions=repetitions)
benchmark(standard_deviation, filterd, name='filtered standard deviation', repetitions=repetitions)
benchmark(mean_of_sum, filterd, name ='filtered sum columns mean', repetitions=repetitions)
benchmarks['filtered sum columns'] =  benchmarks['filtered sum columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_product, filterd, name ='filterd product columns mean', repetitions=repetitions)
benchmarks['filterd product columns'] = benchmarks['filterd product columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_complicated_arithmetic_operation, filterd, name='filterd arithmetic operation mean', repetitions=repetitions)
benchmarks['filterd arithmetic operation'] =  benchmarks['filterd arithmetic operation mean'] - benchmarks['filterd mean']
benchmark(value_counts, filterd, name ='filtered value counts', repetitions=repetitions)
benchmark(groupby_statistics, filterd, name='filtered groupby statistics', repetitions=repetitions)
other = groupby_statistics(filterd)
benchmark(join, filterd, name='filtered join', repetitions=repetitions, other=other)
benchmark(join_count, filterd, name='filtered join count', repetitions=repetitions, other=other)
print('Done!')

upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
filterd count took: 0.09546041488647461
upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
filterd mean took: 0.17607545852661133
upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
filtered standard deviation took: 0.18120908737182617
upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
filtered sum columns mean took: 0.18535757064819336
upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
filterd product columns mean took: 0.18712377548217773
upload: ../results/spark_1b_mlm52xlarge.csv to s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv
filterd arithmetic operation mean took: 0.6530771255493164
uploa