# Get data and packages

In [2]:
import pandas as pd
import os
from pyspark import sql, SparkConf, SparkContext
import pyspark.sql.functions as f
import numpy as np
import warnings
import datetime as dt
import time
import gc

warnings.filterwarnings("ignore")
os.makedirs('../results', exist_ok=True)

benchmarks = {}

def benchmark(f, name, df, repetitions=1, **kwargs):
    times = []
    count = 0
    for i in range(repetitions):
        start_time = time.time()
        ret = f(df, **kwargs)
        times.append(time.time()-start_time)
        count+=1
    benchmarks[name] = np.mean(times)/count
    return benchmarks[name]

def get_results(benchmarks, name):
    results = pd.DataFrame.from_dict(benchmarks, orient='index')
    results.columns = [name]
    return results

conf = SparkConf().setAppName('Read_CSV')
sc = SparkContext(conf = conf)
sqlContext = sql.SQLContext(sc)

data_path = '../datasets/taxi_1m.csv'
name = 'spark'
repetitions = 1

### Spark implementation

In [3]:
def read_file():
    return sqlContext.read.csv(data_path, sep = ',', header = 'True', inferSchema = 'true')
    
def mean(df):
    return df.select(f.mean('fare_amount')).collect()
    
def standard_deviation(df):
    return df.select(f.stddev('fare_amount')).collect()

def sum_columns(df):
    return df.select(df['fare_amount'] + df['passenger_count']).collect()

def product_columns(df):
    return df.select(df['fare_amount'] * df['passenger_count']).collect() 

def complicated_arithmetic_operation(df):
    theta_1 = df['pickup_longitude']
    phi_1 = df['pickup_latitude']
    theta_2 = df['dropoff_longitude']
    phi_2 = df['dropoff_latitude']
    temp = ((np.cos(df.select(theta_1).collect())*np.pi/180)*np.cos(df.select(theta_2).collect())*np.pi/180) \
            * (np.sin((df.select(phi_2-phi_1).collect()))/2*np.pi/180)**2

    return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))

def value_counts(df):
    return df.select('fare_amount').distinct().collect()

def groupby_statistics(df):
    return df.groupby('pickup_hour').agg(
        f.mean('fare_amount'),
        f.stddev('fare_amount'),
        f.mean('tip_amount'),
        f.stddev('tip_amount'))

def join(df, other):
    return df.join(other, on = 'pickup_hour')
    

def filter_data(df):
    long_min = -74.05
    long_max = -73.75
    lat_min = 40.58
    lat_max = 40.90

    expr_filter = (df.pickup_longitude > long_min)  & (df.pickup_longitude < long_max) & \
              (df.pickup_latitude > lat_min)    & (df.pickup_latitude < lat_max) & \
              (df.dropoff_longitude > long_min) & (df.dropoff_longitude < long_max) & \
              (df.dropoff_latitude > lat_min)   & (df.dropoff_latitude < lat_max)
    return df.filter(expr_filter)


# All data

In [11]:
# Load data
data = read_file()
data = data.withColumn('pickup_hour', sql.functions.split(data['pickup_datetime'], ' ').getItem(1))
print(f"size: {data.count()} with {len(data.columns)} columns")

size: 1000000 with 19 columns


In [None]:
# benchmark
benchmark(read_file, 'read_file', data, repetitions=repetitions)
benchmark(mean, 'mean', data, repetitions=repetitions)
benchmark(standard_deviation,'standard deviation', data, repetitions=repetitions)
benchmark(sum_columns, 'sum columns', data, repetitions=repetitions)
benchmark(product_columns, 'product columns', data, repetitions=repetitions)
benchmark(complicated_arithmetic_operation, 'complicated arithmetic operation', data, repetitions=repetitions)
benchmark(value_counts, 'value counts', data, repetitions=repetitions)
benchmark(groupby_statistics, 'groupby statistics', data, repetitions=repetitions)
benchmark(filter_data, 'filter', data, repetitions=repetitions)
gc.collect()
benchmark(join, 'join', data, repetitions=repetitions, other=groupby_statistics(data))
print(f"Done benchmarks on all data")

# Filtered

In [55]:
filterd = filter_data(data)

del data

print(f"Prepare filtered data and deleted {gc.collect()} MB")

Prepare filtered data and deleted 414 MB


In [58]:
# benchmark
benchmark(mean, 'filtered mean', filterd, repetitions=repetitions)
benchmark(standard_deviation,'filtered standard deviation', filterd, repetitions=repetitions)
benchmark(sum_columns, 'filtered sum columns', filterd, repetitions=repetitions)
benchmark(product_columns, 'filtered product_columns', filterd, repetitions=repetitions)
benchmark(complicated_arithmetic_operation, 'filtered complicated arithmetic_operation', filterd, repetitions=repetitions)
benchmark(value_counts, 'filtered value_counts', filterd, repetitions=repetitions)
benchmark(groupby_statistics, 'filtered groupby statistics', filterd, repetitions=repetitions)
benchmark(join, 'filtered join', filterd, repetitions=repetitions, other=groupby_statistics(filterd))
print(f"Done benchmarks on filterd data")

Done benchmarks on all data


In [None]:
!mkdir -p 

In [64]:
results = get_results(benchmarks, name)
results.to_csv(f"../results/{name}_1m.csv")
results.head()

Unnamed: 0,spark
mean,0.046338
standard deviation,0.038415
sum columns,0.367603
product columns,0.361916
complicated arithmetic operation,2.992864


In [60]:
!aws s3 cp  ../results/spark_1m.csv s3://vaex-sagemaker-demo/benchmarks/spark_1m_results.csv 


The user-provided path ../results/spark_1m.csv does not exist.
