# Get data and packages

In [1]:
%%capture
%%bash
# Notebook extensions used while running the benchmarks.
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
jupyter contrib nbextension install --user
jupyter nbextensions_configurator enable --user

jupyter nbextension enable codefolding/main
jupyter nbextension enable scratchpad/main
jupyter nbextension enable execute_time/ExecuteTime
jupyter nbextension enable autosavetime/main
# The version specifier must be quoted: unquoted, bash treats `>` as an output
# redirection and writes pip's output to a file named "=0.3.3".
pip install -U pip dask numpy "fsspec>=0.3.3" tqdm pyarrow


In [None]:
# Recursively copy the taxi parquet files from the xdss-public-datasets S3 bucket into ../datasets/taxi_parquet.
!aws s3 cp --recursive s3://xdss-public-datasets/demos/taxi_parquet ../datasets/taxi_parquet

In [5]:
import gc
from src.config import repetitions
import time
import numpy as np
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import time
import gc
import os

# Labels for this run; both are embedded in the results file name.
name = 'spark'
instance_type = 'mlm52xlarge'

# Input parquet directory and the CSV the timings are written to.
data_path = '../datasets/taxi_parquet/'
results_path = f"../results/{name}_1b_{instance_type}.csv"

# Benchmark name -> mean elapsed time; filled in by benchmark().
benchmarks = {}
print(f"test for {repetitions} repetitions")

test for 1 repetitions


In [37]:
import numpy as np
import pandas as pd
import time 


def get_results(benchmarks, name):
    """Turn the timings mapping into a one-column DataFrame.

    Args:
        benchmarks: mapping of benchmark label -> measured value.
        name: label to use for the single value column.

    Returns:
        DataFrame indexed by benchmark label, with one column called ``name``.
    """
    table = pd.DataFrame.from_dict(benchmarks, orient='index')
    # from_dict leaves the value column unnamed (0); label it with the run name.
    table.columns = [name]
    return table

def persist():
    get_results(benchmarks, name).to_csv(results_path)
    !aws s3 cp  ../results/spark_1b_mlm52xlarge.csv s3://vaex-sagemaker-demo/benchmarks/spark_1b_mlm52xlarge_results.csv 
    
def benchmark(f, df, name, repetitions=1, **kwargs):
    """Time ``f(df, **kwargs)`` and record the mean duration under ``name``.

    Args:
        f: callable to measure; invoked as ``f(df, **kwargs)``.
        df: first positional argument handed to ``f``.
        name: key under which the mean duration is stored in ``benchmarks``.
        repetitions: how many times to run ``f``; must be at least 1.
        **kwargs: extra keyword arguments forwarded to ``f``.

    Returns:
        The mean elapsed time in seconds over all repetitions.

    Raises:
        ValueError: if ``repetitions`` is smaller than 1.
    """
    if repetitions < 1:
        # An empty sample would silently store NaN as the timing.
        raise ValueError(f"repetitions must be >= 1, got {repetitions}")
    times = []
    for _ in range(repetitions):
        # perf_counter is monotonic and high resolution, unlike the wall clock.
        start_time = time.perf_counter()
        f(df, **kwargs)
        times.append(time.perf_counter() - start_time)
    benchmarks[name] = np.mean(times)
    # Save after every benchmark so partial results survive a later crash.
    persist()
    print(f"{name} took: {benchmarks[name]}")
    return benchmarks[name]


# Longitude/latitude limits of the box that filter_data keeps pickups and dropoffs inside.
long_min, long_max = -74.05, -73.75
lat_min, lat_max = 40.58, 40.90

# Benchmark

In [None]:
from pyspark import sql, SparkConf, SparkContext
import pyspark.sql.functions as f
# Second alias for the same module: later cells rebind the name `f`, so the
# benchmark functions use `F` to reach pyspark.sql.functions.
from pyspark.sql import functions as F

# Spark properties must be set with SparkConf.set(); setExecutorEnv() only
# exports environment variables to the executors and leaves memory untouched.
conf = (
    SparkConf()
    .setAppName('Benchmarks')
    .set('spark.executor.memory', '2g')
    .set('spark.driver.memory', '30g')
)
# getOrCreate lets this cell be re-run without "only one SparkContext" errors.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = sql.SQLContext(sc)

data = sqlContext.read.parquet(data_path)
# Spark DataFrames do not support len(); the row count comes from count().
print(f"size: {data.count()} with {len(data.columns)} columns")

In [7]:
# read/ open
def open_file(df=None, data_path=None):
    """Read the parquet dataset at ``data_path`` through ``sqlContext``.

    ``df`` is not used; it is accepted because ``benchmark`` always passes the
    frame as the first argument.
    """
    return sqlContext.read.parquet(data_path)

# The result key must be a string (it becomes a row label in the results CSV),
# and data_path has to be forwarded or the reader is called with None.
key = 'open_file'
benchmark(open_file, df=data, name=key, repetitions=repetitions, data_path=data_path)

In [8]:
def count(df=None):
    """Return the number of rows in ``df``."""
    return df.count()

key = 'count'
# Pass the function directly: rebinding `f` would shadow the pyspark.sql.functions alias.
benchmark(count, df=data, name=key, repetitions=repetitions)

In [9]:
def mean(df):
    """Return the collected mean of the ``fare_amount`` column."""
    # Use the `F` alias: `f` is rebound to plain functions elsewhere in the notebook.
    return df.select(F.mean('fare_amount')).collect()

key = 'mean'
benchmark(mean, df=data, name=key, repetitions=repetitions)

In [10]:
def standard_deviation(df):
    """Return the collected standard deviation of the ``fare_amount`` column."""
    # Use the `F` alias: `f` is rebound to plain functions elsewhere in the notebook.
    return df.select(F.stddev('fare_amount')).collect()

key = 'standard deviation'
benchmark(standard_deviation, df=data, name=key, repetitions=repetitions)

Unlike other technologies, vaex can return columns, or a subset of values, to explore lazily,
but because many of the other technologies crashed at this point, we return a scalar instead.

In [11]:
def mean_of_sum(df):
    """Return the collected mean of ``fare_amount + trip_distance``."""
    # Use the `F` alias: `f` is rebound to plain functions elsewhere in the notebook.
    return df.select(F.mean(df['fare_amount'] + df['trip_distance'])).collect()

key = 'sum columns mean'
benchmark(mean_of_sum, df=data, name=key, repetitions=repetitions)
# Subtract the plain-mean timing to approximate the cost of the addition alone.
benchmarks['sum columns'] = benchmarks['sum columns mean'] - benchmarks['mean']

In [14]:
def mean_of_product(df):
    """Return the collected mean of ``fare_amount * trip_distance``."""
    # Use the `F` alias: `f` is rebound to plain functions elsewhere in the notebook.
    return df.select(F.mean(df['fare_amount'] * df['trip_distance'])).collect()

key = 'product columns mean'
benchmark(mean_of_product, df=data, name=key, repetitions=repetitions)
# Subtract the plain-mean timing to approximate the cost of the multiplication alone.
benchmarks['product columns'] = benchmarks['product columns mean'] - benchmarks['mean']

In [15]:
def mean_of_complicated_arithmetic_operation(df):
    """Return the collected mean of a trigonometric expression over the pickup/dropoff coordinates."""
    theta_1 = df['pickup_longitude']
    phi_1 = df['pickup_latitude']
    theta_2 = df['dropoff_longitude']
    phi_2 = df['dropoff_latitude']
    # NOTE(review): the pi/180 scaling is applied to the cos/sin results rather
    # than to their arguments; the formula is left unchanged - confirm it is intended.
    # `F` is used because `f` is rebound to plain functions elsewhere in the notebook.
    temp = (F.cos(theta_1)*np.pi/180) * (F.cos(theta_2)*np.pi/180) * (F.sin(phi_2-phi_1)/2*np.pi/180)**2
    expression = 2 * F.atan2(F.sqrt(temp), F.sqrt(1-temp))
    return df.select(F.mean(expression)).collect()

key = 'arithmetic operation mean'
benchmark(mean_of_complicated_arithmetic_operation, df=data, name=key, repetitions=repetitions)
# Subtract the plain-mean timing to approximate the cost of the arithmetic alone.
benchmarks['arithmetic operation'] = benchmarks['arithmetic operation mean'] - benchmarks['mean']

In [16]:
def value_counts(df):
    """Return the collected number of occurrences of each distinct ``fare_amount`` value."""
    # Group and count so per-value frequencies are computed; distinct() alone
    # only lists the unique values without counting them.
    return df.groupBy('fare_amount').count().collect()

key = 'value counts'
# Pass the function directly: rebinding `f` would shadow the pyspark.sql.functions alias.
benchmark(value_counts, df=data, name=key, repetitions=repetitions)

In [None]:
def groupby_statistics(df):
    """Group by ``passenger_count`` and aggregate mean/stddev of fare and tip amounts.

    Returns the aggregated Spark DataFrame after fetching three rows from it.
    """
    # Use the `F` alias: `f` is rebound to plain functions elsewhere in the notebook.
    ret = df.groupby('passenger_count').agg(
        F.mean('fare_amount'),
        F.stddev('fare_amount'),
        F.mean('tip_amount'),
        F.stddev('tip_amount')
    )
    # Fetch a few rows so the aggregation is actually executed inside the timed call.
    ret.take(3)
    return ret

key = 'groupby statistics'
benchmark(groupby_statistics, df=data, name=key, repetitions=repetitions)

In [None]:
# Per-passenger_count aggregate frame; passed as `other` to the join benchmarks.
other = groupby_statistics(data)

In [None]:
def join(df, other):
    """Join ``df`` with ``other`` on ``passenger_count`` and return the joined frame."""
    ret = df.join(other, on = 'passenger_count')
    # Fetch a few rows so the join is actually executed inside the timed call.
    ret.take(3)
    return ret

key = 'join'
# Pass the function directly: rebinding `f` would shadow the pyspark.sql.functions alias.
benchmark(join, data, name=key, repetitions=repetitions, other=other)

In [None]:
def join_count(df, other):
    """Return the row count of ``df`` joined with ``other`` on ``passenger_count``."""
    return df.join(other, on = 'passenger_count').count()

key = 'join count'
# Pass the function directly: rebinding `f` would shadow the pyspark.sql.functions alias.
benchmark(join_count, data, name=key, repetitions=repetitions, other=other)

In [None]:
# Save the timings collected so far to the results CSV and upload it.
persist()

## Filtered data

Dask is not built to run on filtered data the way you normally would, so we apply the same strategy here.

In [None]:
# gc.collect() returns the number of unreachable objects it found, not a size in MB.
print(f"Prepare filtered data; garbage collector freed {gc.collect()} objects")

In [None]:
def filter_data(df):
    """Keep rows whose pickup and dropoff coordinates lie strictly inside the bounding box.

    The box is given by the module-level ``long_min``/``long_max`` and
    ``lat_min``/``lat_max``. Returns the filtered frame after fetching three rows.
    """
    expr_filter = (df.pickup_longitude > long_min)  & (df.pickup_longitude < long_max) & \
              (df.pickup_latitude > lat_min)    & (df.pickup_latitude < lat_max) & \
              (df.dropoff_longitude > long_min) & (df.dropoff_longitude < long_max) & \
              (df.dropoff_latitude > lat_min)   & (df.dropoff_latitude < lat_max)
    ret = df[expr_filter]
    ret.take(3) # evaluate the filter
    return ret

key = 'filter data'
# Pass the function directly: rebinding `f` would shadow the pyspark.sql.functions alias.
benchmark(filter_data, data, name=key, repetitions=repetitions)


In [None]:
# Keep only the filtered frame for the remaining benchmarks.
filterd = filter_data(data)

del data
# gc.collect() returns the number of unreachable objects it found, not megabytes.
print(f"cleaned {gc.collect()} objects")

In [None]:
# Re-run the benchmarks on the filtered frame. The result keys keep their
# existing spelling because they become row labels in the results CSV.
# 'filterd count' must time count(), not a second pass of filter_data().
benchmark(count, filterd, name='filterd count', repetitions=repetitions)
benchmark(mean, filterd, name='filterd mean', repetitions=repetitions)
benchmark(standard_deviation, filterd, name='filtered standard deviation', repetitions=repetitions)
benchmark(mean_of_sum, filterd, name ='filtered sum columns mean', repetitions=repetitions)
benchmarks['filtered sum columns'] =  benchmarks['filtered sum columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_product, filterd, name ='filterd product columns mean', repetitions=repetitions)
benchmarks['filterd product columns'] = benchmarks['filterd product columns mean'] - benchmarks['filterd mean']
benchmark(mean_of_complicated_arithmetic_operation, filterd, name='filterd arithmetic operation mean', repetitions=repetitions)
benchmarks['filterd arithmetic operation'] =  benchmarks['filterd arithmetic operation mean'] - benchmarks['filterd mean']
benchmark(value_counts, filterd, name ='filtered value counts', repetitions=repetitions)
benchmark(groupby_statistics, filterd, name='filtered groupby statistics', repetitions=repetitions)
# `data` has been deleted by this point, so build the join partner from the filtered frame.
other = groupby_statistics(filterd)
# NOTE(review): -1 looks like a "not measured" placeholder for the filtered join - confirm.
benchmarks['filtered join'] = -1
# benchmark() requires `name`; it stores the timing under that key and persists it.
benchmark(join_count, filterd, name='filtered join count', repetitions=repetitions, other=other)
print('Done!')