In [60]:
from json import loads
from pyspark.sql import SparkSession
import warnings
from collections import defaultdict
warnings.filterwarnings("ignore")
from sparkmeasure import StageMetrics,TaskMetrics
from pyspark.sql.functions import col,from_json,udf
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,MapType,FloatType,ArrayType

In [2]:
# Create (or reuse) the Spark session used by every benchmark below.
# The sparkMeasure JAR is pulled in via spark.jars.packages so that the
# StageMetrics listener is available on the JVM side.
spark = (
    SparkSession.builder
    .appName("testing")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1024m")
    .config("spark.jars.packages", "ch.cern.sparkmeasure:spark-measure_2.12:0.17")
    .getOrCreate()
)

In [3]:
# sparkMeasure stage-level collector bound to this session; every benchmark
# cell brackets its workload with stagemetrics.begin() / stagemetrics.end().
stagemetrics = StageMetrics(spark)

In [7]:
# Benchmark input: at most 100000 rows of the cleaned metadata table on HDFS.
# NOTE(review): the frame is not cached, so each action below presumably
# re-reads the source and re-applies the limit -- confirm this is intended.
data = spark.read.parquet('hdfs://namenode:9000/TikiCleaned/metaData').limit(100000)

In [8]:
# Row count per category_id; category_id is the partition column used by the
# write benchmarks, so this shows how rows are spread across partitions.
data.groupby('category_id').count().show()



+-----------+-----+
|category_id|count|
+-----------+-----+
|       8322|88863|
|       1815| 5816|
|       1882| 5321|
+-----------+-----+



                                                                                

In [9]:
# Register the sampled frame as the SQL temp view `data`.
data.createOrReplaceTempView('data')

# Storage space in a variety of formats

## Write statistics

### CSV

In [62]:
# Write benchmark (CSV): write `data` partitioned by category_id 10 times,
# overwriting the same HDFS path, and record sparkMeasure's elapsedTime per run.
csv_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    data.write.partitionBy("category_id").option("header",True).mode('overwrite').csv('hdfs://namenode:9000/testing/data/csv')
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    csv_elapsedTime.append(elapsedTime)

23/01/08 13:39:14 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:39:52 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:40:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:41:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:41:48 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:42:25 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:43:02 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:43:39 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:44:20 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:45:00 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [63]:
# elapsedTime reported by sparkMeasure for each of the 10 CSV write runs.
csv_elapsedTime

[41895, 37839, 40360, 36992, 37097, 37076, 36586, 36807, 40810, 40222]

### Parquet

In [12]:
# Write benchmark (Parquet): write `data` partitioned by category_id 10 times,
# overwriting the same HDFS path, and record sparkMeasure's elapsedTime per run.
parquet_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    data.write.partitionBy("category_id").mode('overwrite').parquet('hdfs://namenode:9000/testing/data/parquet')
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    parquet_elapsedTime.append(elapsedTime)

23/01/08 12:51:50 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:52:27 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:52:59 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:53:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:54:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:54:38 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:55:13 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:55:45 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:56:18 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:56:51 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [13]:
# elapsedTime reported by sparkMeasure for each of the 10 Parquet write runs.
parquet_elapsedTime

[33887, 36467, 32081, 32506, 33377, 32419, 33940, 31878, 33363, 32263]

### JSON

In [14]:
# Write benchmark (JSON): write `data` partitioned by category_id 10 times,
# overwriting the same HDFS path, and record sparkMeasure's elapsedTime per run.
json_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    data.write.partitionBy("category_id").mode('overwrite').json('hdfs://namenode:9000/testing/data/json')
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    json_elapsedTime.append(elapsedTime)

23/01/08 12:57:34 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:58:11 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:58:48 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 12:59:29 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:00:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:00:47 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:01:25 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:02:02 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:02:42 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:03:20 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [15]:
# elapsedTime reported by sparkMeasure for each of the 10 JSON write runs.
json_elapsedTime

[42320, 36728, 37449, 40406, 37107, 40874, 37085, 37211, 39872, 37668]

### ORC

In [16]:
# Write benchmark (ORC): write `data` partitioned by category_id 10 times,
# overwriting the same HDFS path, and record sparkMeasure's elapsedTime per run.
orc_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    data.write.partitionBy("category_id").mode('overwrite').orc('hdfs://namenode:9000/testing/data/orc')
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    orc_elapsedTime.append(elapsedTime)

23/01/08 13:03:53 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:04:25 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:05:00 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:05:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:06:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:06:37 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:07:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:07:43 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:08:15 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:08:52 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [19]:
# elapsedTime reported by sparkMeasure for each of the 10 ORC write runs.
orc_elapsedTime

[32334, 32013, 34310, 32299, 31692, 33027, 32665, 32165, 32108, 36520]

In [18]:
# Disabled: unpartitioned variants of the four writes above (no partitionBy).
# They target the same HDFS paths, so enabling them would replace the
# partitioned output that the read/filter/aggregate benchmarks load.
# data.write.mode('overwrite').csv('hdfs://namenode:9000/testing/data/csv')
# data.write.mode('overwrite').parquet('hdfs://namenode:9000/testing/data/parquet')
# data.write.mode('overwrite').json('hdfs://namenode:9000/testing/data/json')
# data.write.mode('overwrite').orc('hdfs://namenode:9000/testing/data/orc')

## Read statistics

### CSV

In [64]:
# Read benchmark (CSV): 10 times, open the CSV dataset and take 10000 rows,
# recording sparkMeasure's elapsedTime per run. Creating the DataFrame is
# inside the measured region.
# NOTE: this rebinds csv_elapsedTime, replacing the write-benchmark results.
csv_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.read.option("header", "true").csv('hdfs://namenode:9000/testing/data/csv').take(10000)
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    csv_elapsedTime.append(elapsedTime)

23/01/08 13:45:01 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:45:02 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:45:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:45:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:45:05 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:45:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:45:07 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:45:08 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:45:09 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:45:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [65]:
# elapsedTime reported by sparkMeasure for each of the 10 CSV read runs.
csv_elapsedTime

[615, 555, 556, 643, 586, 707, 775, 551, 579, 570]

### Parquet

In [31]:
# Read benchmark (Parquet): 10 times, open the Parquet dataset and take 10000
# rows, recording sparkMeasure's elapsedTime per run. Creating the DataFrame is
# inside the measured region.
# NOTE: this rebinds parquet_elapsedTime, replacing the write-benchmark results.
parquet_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.read.parquet('hdfs://namenode:9000/testing/data/parquet').take(10000)
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    parquet_elapsedTime.append(elapsedTime)

23/01/08 13:12:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:12:05 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:12:05 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:12:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:12:07 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:12:08 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:12:08 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:12:09 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:12:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:12:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [32]:
# elapsedTime reported by sparkMeasure for each of the 10 Parquet read runs.
parquet_elapsedTime

[273, 244, 298, 257, 271, 284, 196, 178, 213, 184]

### JSON

In [33]:
# Read benchmark (JSON): 10 times, open the JSON dataset and take 10000 rows,
# recording sparkMeasure's elapsedTime per run. Creating the DataFrame is
# inside the measured region.
# NOTE(review): no schema is supplied, so the measured time presumably
# includes JSON schema inference -- confirm that is what should be compared.
# NOTE: this rebinds json_elapsedTime, replacing the write-benchmark results.
json_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.read.json('hdfs://namenode:9000/testing/data/json').take(10000)
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    json_elapsedTime.append(elapsedTime)

23/01/08 13:12:41 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:13:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:13:30 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:13:53 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:14:17 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:14:40 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:15:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:15:27 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:15:52 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:16:15 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [34]:
# elapsedTime reported by sparkMeasure for each of the 10 JSON read runs.
json_elapsedTime

[25980, 24505, 23554, 23570, 22885, 22651, 22959, 23929, 24180, 22880]

### ORC

In [37]:
# Read benchmark (ORC): 10 times, open the ORC dataset and take 10000 rows,
# recording sparkMeasure's elapsedTime per run. Creating the DataFrame is
# inside the measured region.
# NOTE: this rebinds orc_elapsedTime, replacing the write-benchmark results.
orc_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.read.orc('hdfs://namenode:9000/testing/data/orc').take(10000)
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    orc_elapsedTime.append(elapsedTime)

23/01/08 13:16:40 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:16:40 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:16:41 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:16:42 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:16:42 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:16:43 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:16:43 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:16:44 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:16:44 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:16:45 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [38]:
# elapsedTime reported by sparkMeasure for each of the 10 ORC read runs.
orc_elapsedTime

[207, 212, 183, 181, 203, 186, 163, 155, 165, 166]

# Filter performance

In [4]:
# Load the four datasets produced by the write benchmarks, one DataFrame per
# storage format. Only the CSV reader is given an option (header row).
df_csv = (
    spark.read
    .option("header", "true")
    .csv('hdfs://namenode:9000/testing/data/csv')
)
df_parquet = (
    spark.read
    .parquet('hdfs://namenode:9000/testing/data/parquet')
)
df_json = (
    spark.read
    .json('hdfs://namenode:9000/testing/data/json')
)
df_orc = (
    spark.read
    .orc('hdfs://namenode:9000/testing/data/orc')
)

                                                                                

In [5]:
# Expose each format-specific DataFrame to Spark SQL under a temp view that
# carries the same name as the Python variable.
for view_name, frame in (
    ('df_csv', df_csv),
    ('df_parquet', df_parquet),
    ('df_json', df_json),
    ('df_orc', df_orc),
):
    frame.createOrReplaceTempView(view_name)

23/01/08 15:16:47 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


## CSV

In [68]:
# Filter benchmark (CSV): 10 times, select rows with price > 100000 from the
# df_csv view and take 1000 of them, recording sparkMeasure's elapsedTime.
# NOTE(review): df_csv is read without a schema, so `price` is presumably a
# string column that Spark casts for the comparison -- confirm.
csv_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select * from df_csv
                where price > 100000
            ''').take(1000)
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    csv_elapsedTime.append(elapsedTime)

23/01/08 13:50:02 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:02 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:05 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:05 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [69]:
# elapsedTime reported by sparkMeasure for each of the 10 CSV filter runs.
csv_elapsedTime

[297, 369, 316, 314, 321, 316, 327, 319, 307, 290]

## Parquet

In [70]:
# Filter benchmark (Parquet): 10 times, select rows with price > 100000 from
# the df_parquet view and take 1000 of them, recording sparkMeasure's
# elapsedTime per run.
parquet_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select * from df_parquet
                where price > 100000
            ''').take(1000)
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    parquet_elapsedTime.append(elapsedTime)

23/01/08 13:50:31 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:50:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [71]:
# elapsedTime reported by sparkMeasure for each of the 10 Parquet filter runs.
parquet_elapsedTime

[92, 75, 40, 40, 44, 38, 51, 55, 37, 46]

In [72]:
# Filter benchmark (JSON): 10 times, select rows with price > 100000 from the
# df_json view and take 1000 of them, recording sparkMeasure's elapsedTime
# per run.
json_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select * from df_json
                where price > 100000
            ''').take(1000)
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    json_elapsedTime.append(elapsedTime)

23/01/08 13:51:07 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:08 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:08 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:08 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:09 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:09 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:11 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:11 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [73]:
# elapsedTime reported by sparkMeasure for each of the 10 JSON filter runs.
json_elapsedTime

[277, 220, 237, 231, 283, 264, 281, 291, 264, 295]

In [74]:
# Filter benchmark (ORC): 10 times, select rows with price > 100000 from the
# df_orc view and take 1000 of them, recording sparkMeasure's elapsedTime
# per run.
orc_elapsedTime = []
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select * from df_orc
                where price > 100000
            ''').take(1000)
    stagemetrics.end()

    # Look the metric up by name instead of by a fixed line index, so an extra
    # or missing header line in the report cannot silently shift the result.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after the number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    orc_elapsedTime.append(elapsedTime)

23/01/08 13:51:45 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:45 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:45 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:45 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:45 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:46 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:46 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:46 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:46 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/08 13:51:47 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [82]:
# elapsedTime reported by sparkMeasure for each of the 10 ORC filter runs.
orc_elapsedTime

[122, 48, 116, 37, 49, 39, 52, 39, 47, 42]

# Aggregate performance

## CSV

In [None]:
# Aggregate benchmark (CSV): 10 times, sum `price` per category_id over the
# df_csv view and collect the result, recording three sparkMeasure metrics
# per run: elapsedTime, executorCpuTime and jvmGCTime.
agg_dict = defaultdict(list)
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select category_id,sum(price) total_price from df_csv
                group by category_id
            ''').collect()
    stagemetrics.end()

    # Build and parse the report once per iteration (it was previously
    # regenerated for every metric) and look metrics up by name rather than
    # by fixed line index, so a changed header cannot shift the values.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after each number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    executorCpuTime = int(report_metrics['executorCpuTime'].split('(')[0])
    jvmGCTime = int(report_metrics['jvmGCTime'].split('(')[0])

    agg_dict['elapsedTime'].append(elapsedTime)
    agg_dict['executorCpuTime'].append(executorCpuTime)
    agg_dict['jvmGCTime'].append(jvmGCTime)

In [59]:
# Per-run metrics collected by the CSV aggregate benchmark.
agg_dict

defaultdict(list,
            {'elapsedTime': [3720,
              3542,
              3656,
              3435,
              3686,
              3680,
              3629,
              3745,
              3487,
              3204],
             'executorCpuTime': [6212,
              5912,
              6081,
              5721,
              6194,
              6233,
              6108,
              6319,
              5988,
              5413],
             'jvmGCTime': [52, 50, 83, 50, 57, 51, 71, 44, 49, 47]})

## Parquet

In [None]:
# Aggregate benchmark (Parquet): 10 times, sum `price` per category_id over
# the df_parquet view and collect the result, recording three sparkMeasure
# metrics per run: elapsedTime, executorCpuTime and jvmGCTime.
# NOTE: this rebinds agg_dict, replacing any earlier benchmark's results.
agg_dict = defaultdict(list)
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select category_id,sum(price) total_price from df_parquet
                group by category_id
            ''').collect()
    stagemetrics.end()

    # Build and parse the report once per iteration (it was previously
    # regenerated for every metric) and look metrics up by name rather than
    # by fixed line index, so a changed header cannot shift the values.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after each number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    executorCpuTime = int(report_metrics['executorCpuTime'].split('(')[0])
    jvmGCTime = int(report_metrics['jvmGCTime'].split('(')[0])

    agg_dict['elapsedTime'].append(elapsedTime)
    agg_dict['executorCpuTime'].append(executorCpuTime)
    agg_dict['jvmGCTime'].append(jvmGCTime)

In [62]:
# Per-run metrics collected by the Parquet aggregate benchmark.
agg_dict

defaultdict(list,
            {'elapsedTime': [444, 456, 526, 466, 468, 451, 441, 425, 516, 474],
             'executorCpuTime': [263,
              256,
              290,
              278,
              278,
              264,
              261,
              246,
              297,
              284],
             'jvmGCTime': [4, 4, 4, 4, 0, 7, 6, 7, 5, 0]})

## JSON

In [None]:
# Aggregate benchmark (JSON): 10 times, sum `price` per category_id over the
# df_json view and collect the result, recording three sparkMeasure metrics
# per run: elapsedTime, executorCpuTime and jvmGCTime.
# NOTE: this rebinds agg_dict, replacing any earlier benchmark's results.
agg_dict = defaultdict(list)
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select category_id,sum(price) total_price from df_json
                group by category_id
            ''').collect()
    stagemetrics.end()

    # Build and parse the report once per iteration (it was previously
    # regenerated for every metric) and look metrics up by name rather than
    # by fixed line index, so a changed header cannot shift the values.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after each number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    executorCpuTime = int(report_metrics['executorCpuTime'].split('(')[0])
    jvmGCTime = int(report_metrics['jvmGCTime'].split('(')[0])

    agg_dict['elapsedTime'].append(elapsedTime)
    agg_dict['executorCpuTime'].append(executorCpuTime)
    agg_dict['jvmGCTime'].append(jvmGCTime)

In [64]:
# Per-run metrics collected by the JSON aggregate benchmark.
agg_dict

defaultdict(list,
            {'elapsedTime': [1973,
              2016,
              1959,
              2068,
              1809,
              2090,
              1799,
              2024,
              2020,
              2100],
             'executorCpuTime': [2868,
              3178,
              2836,
              3196,
              2740,
              3274,
              2713,
              3129,
              3034,
              3256],
             'jvmGCTime': [17, 9, 0, 4, 13, 10, 13, 7, 8, 8]})

## ORC

In [None]:
# Aggregate benchmark (ORC): 10 times, sum `price` per category_id over the
# df_orc view and collect the result, recording three sparkMeasure metrics
# per run: elapsedTime, executorCpuTime and jvmGCTime.
# NOTE: this rebinds agg_dict, replacing any earlier benchmark's results.
agg_dict = defaultdict(list)
for _ in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select category_id,sum(price) total_price from df_orc
                group by category_id
            ''').collect()
    stagemetrics.end()

    # Build and parse the report once per iteration (it was previously
    # regenerated for every metric) and look metrics up by name rather than
    # by fixed line index, so a changed header cannot shift the values.
    report_metrics = dict(
        line.split('=>', 1)
        for line in stagemetrics.report().replace(' ', '').split('\n')
        if '=>' in line
    )
    # Drop the trailing "(...)" suffix after each number before converting.
    elapsedTime = int(report_metrics['elapsedTime'].split('(')[0])
    executorCpuTime = int(report_metrics['executorCpuTime'].split('(')[0])
    jvmGCTime = int(report_metrics['jvmGCTime'].split('(')[0])

    agg_dict['elapsedTime'].append(elapsedTime)
    agg_dict['executorCpuTime'].append(executorCpuTime)
    agg_dict['jvmGCTime'].append(jvmGCTime)

In [66]:
# Per-run metrics collected by the ORC aggregate benchmark.
agg_dict

defaultdict(list,
            {'elapsedTime': [443, 466, 504, 518, 564, 509, 442, 514, 506, 433],
             'executorCpuTime': [269,
              258,
              260,
              271,
              290,
              259,
              237,
              276,
              255,
              239],
             'jvmGCTime': [0, 18, 9, 11, 26, 11, 12, 31, 11, 0]})