In [1]:
from pyspark.sql import SparkSession
from datetime import datetime
from pyspark.sql.types import *
import pandas as pd
import pyspark.sql.functions as f 

spark = SparkSession.builder \
        .appName("Spark Benchmarking") \
        .master('local[*]') \
        .config('spark.driver.memory', '16g')\
        .config("spark.driver.maxResultSize", "6g")\
        .getOrCreate()
import glob
import shutil

schema = StructType([StructField('Project ID', StringType(), True),
                         StructField('Resource Item Name', StringType(),True),
                         StructField('Resource Quantity', DecimalType(), True),
                         StructField('Resource Unit Price', DecimalType(), True),
                         StructField('Resource Vendor Name', StringType(), True),])

def function_to_apply_row(x, y):
    return x*y


In [5]:
%timeit resources = spark.read.load('./data/Resources.csv', format="csv", header='true', schema=schema).collect()

35.4 s ± 1.1 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
resources = spark.read.load('./data/Resources.csv', format="csv", header='true', schema=schema)
resources.show()

+--------------------+--------------------+-----------------+-------------------+--------------------+
|          Project ID|  Resource Item Name|Resource Quantity|Resource Unit Price|Resource Vendor Name|
+--------------------+--------------------+-----------------+-------------------+--------------------+
|000009891526c0ade...|chair move and st...|                1|                350|                null|
|00000ce845c00cbf0...|sony mdr zx100 bl...|               40|                 13|               CDW-G|
|00002d44003ed46b0...|gaiam kids stay-n...|                4|                 19|     Amazon Business|
|00002d44003ed46b0...|cf520x - giant co...|                1|                269|Lakeshore Learnin...|
|00002d44003ed46b0...|serta lounger, mi...|                1|                132|     Amazon Business|
|00002d44003ed46b0...|big joe roma bean...|                2|                 34|     Amazon Business|
|00002eb25d60a09c3...|m48 carton of 6 m...|                3|            

In [7]:
%%timeit -n1 -r7 
resources.toPandas().to_csv("./output/output.csv")

In [0]:
%%timeit -n1 -r7 
resources.coalesce(1).write.format('csv').mode("overwrite").save(f"./output/output_folder")
for filename in glob.glob('./output/output_folder/*.csv'):
    shutil.move(filename, './output/output.csv')


In [0]:
%%timeit -n1 -r7 
resources.withColumn("cost",function_to_apply_row(f.col("Resource Quantity"),f.col("Resource Unit Price")))
resources.show()

In [0]:
%%timeit -n1 -r7 
resources = resources.withColumn("cost",f.col("Resource Quantity")*f.col("Resource Unit Price"))
resources.show()

In [0]:
%timeit -n1 -r7 resources.groupby("Resource Vendor Name").agg({"Resource Quantity" : 'mean'})

%timeit -n1 -r7 resources.na.fill('0').collect()

In [0]:
del resources

In [0]:
%timeit resources = spark.read.load('./data/resources_v2.csv', format="csv", header='true', schema=schema)

In [0]:
resources = spark.read.load('./data/resources_v2.csv', format="csv", header='true', schema=schema)

In [0]:
%%timeit -n1 -r7 
resources.toPandas().to_csv("./output/output.csv")

In [0]:
%%timeit -n1 -r7 
resources.coalesce(1).write.format('csv').mode("overwrite").save(f"./output/output_folder")
for filename in glob.glob('./output/output_folder/*.csv'):
    shutil.move(filename, './output/output.csv')


In [0]:
%%timeit -n1 -r7 
resources = resources.withColumn("cost",function_to_apply_row(f.col("Resource Quantity"),f.col("Resource Unit Price")))
resources.collect()

In [0]:
%%timeit -n1 -r7 
resources = resources.withColumn("cost",f.col("Resource Quantity")*f.col("Resource Unit Price"))
resources.collect()

In [0]:
%timeit -n1 -r7 resources.groupby("Resource Vendor Name").agg({"Resource Quantity" : 'mean'}).collect()

%timeit -n1 -r7 resources.na.fill('0').collect()

In [0]:
del resources

In [0]:
spark.stop()