In [1]:
from json import loads
from pyspark.sql import SparkSession
import warnings
from collections import defaultdict
warnings.filterwarnings("ignore")
from sparkmeasure import StageMetrics,TaskMetrics
from pyspark.sql.functions import col,from_json,udf
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,MapType,FloatType,ArrayType

In [2]:
# Create (or reuse) the Spark session for the benchmark. The sparkMeasure
# package is requested through spark.jars.packages so the JVM side of
# StageMetrics is available on the cluster.
spark = (
    SparkSession.builder
    .appName("testing")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "2048m")
    .config("spark.jars.packages", "ch.cern.sparkmeasure:spark-measure_2.12:0.17")
    .getOrCreate()
)

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
ch.cern.sparkmeasure#spark-measure_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-62313ba2-5f6e-403e-8df8-7f46d7e284c0;1.0
	confs: [default]
	found ch.cern.sparkmeasure#spark-measure_2.12;0.17 in central
	found com.fasterxml.jackson.module#jackson-module-scala_2.12;2.9.9 in central
	found com.fasterxml.jackson.core#jackson-core;2.9.9 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.9.9 in central
	found com.fasterxml.jackson.core#jackson-databind;2.9.9 in central
	found com.fasterxml.jackson.module#jackson-module-paranamer;2.9.9 in central
	found com.thoughtworks.paranamer#paranamer;2.8 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found org.influxdb#influxdb-java;2.14

In [3]:
# sparkMeasure collector; the benchmark cells bracket each run with begin()/end() on it.
stagemetrics = StageMetrics(spark)

In [4]:
# Load the source dataset from HDFS (Parquet).
data = spark.read.parquet('hdfs://namenode:9000/analysis/metaData')

                                                                                

In [5]:
# Register the DataFrame as the temp view `data` so it can be queried through spark.sql.
data.createOrReplaceTempView('data')

23/01/25 14:21:05 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [6]:
# Peek at the distinct category ids in the dataset (show() prints only the first 20 rows).
spark.sql("""
    select distinct category_id from data
""").show()

                                                                                

+-----------+
|category_id|
+-----------+
|       1975|
|        976|
|       4384|
|       2549|
|      27498|
|       8594|
|       4221|
|      17166|
|       8322|
|       1846|
|        931|
|       1703|
|       1520|
|       1815|
|       1789|
|       6000|
|       1801|
|       1883|
|       1686|
|       8371|
+-----------+
only showing top 20 rows



In [7]:
# Build the benchmark sample: up to `rows_per_category` rows from each of the
# chosen categories, stacked with UNION ALL. The per-category sub-selects are
# generated from the list below instead of being copy-pasted, so the sample
# can be changed in one place.
# NOTE: `limit` without an `order by` does not pin which rows are picked, so
# the exact sample may differ between runs.
sample_category_ids = [1975, 6000, 4384, 2549, 27498]
rows_per_category = 20000

per_category_selects = [
    f"(select * from data where category_id = {category_id} limit {rows_per_category})"
    for category_id in sample_category_ids
]
df_testing = spark.sql(
    "select * from\n(\n"
    + "\nunion all\n".join(per_category_selects)
    + "\n)"
)

In [8]:
# Sanity check on the sample size: 5 categories x 20000 rows should give 100000.
df_testing.count()

                                                                                

100000

In [9]:
# Number of columns in the sample.
len(df_testing.columns)

42

In [10]:
# Row count per sampled category.
df_testing.groupby('category_id').count().show()

+-----------+-----+
|category_id|count|
+-----------+-----+
|       1975|20000|
|       4384|20000|
|       2549|20000|
|      27498|20000|
|       6000|20000|
+-----------+-----+



In [11]:
# Persist the sample to HDFS as Parquet, partitioned by category_id, replacing any earlier copy.
df_testing.write.partitionBy("category_id").mode('overwrite').parquet('hdfs://namenode:9000/testing/df_testing')

                                                                                

In [13]:
# Rebind `data` to the persisted sample; the benchmark cells below write out this DataFrame.
data = spark.read.parquet('hdfs://namenode:9000/testing/df_testing')

In [5]:
# Re-register the temp view so that SQL references to `data` resolve to the reloaded sample.
data.createOrReplaceTempView('data')

23/01/25 15:20:26 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


# Storage space in a variety of formats

## Write statistics

### CSV

In [14]:
# Write benchmark (CSV): write the sample ten times, partitioned by
# category_id, and record sparkMeasure's stage-level elapsedTime for each run.
csv_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    (data.write
         .partitionBy("category_id")
         .option("header", True)
         .mode('overwrite')
         .csv('hdfs://namenode:9000/testing/data/csv'))
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    csv_elapsedTime.append(elapsedTime)

23/01/25 14:21:30 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:21:34 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:21:38 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:21:41 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:21:45 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:21:48 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:21:51 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:21:54 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:21:57 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:00 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [15]:
csv_elapsedTime

[4334, 3476, 2959, 3099, 3103, 2732, 2910, 2413, 2883, 2583]

### Parquet

In [16]:
# Write benchmark (Parquet): write the sample ten times, partitioned by
# category_id, and record sparkMeasure's stage-level elapsedTime for each run.
parquet_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    (data.write
         .partitionBy("category_id")
         .mode('overwrite')
         .parquet('hdfs://namenode:9000/testing/data/parquet'))
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    parquet_elapsedTime.append(elapsedTime)

23/01/25 14:22:01 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:07 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:09 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:12 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:13 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:15 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [17]:
parquet_elapsedTime

[1403, 1318, 1229, 1234, 1226, 1325, 1191, 1262, 1205, 1218]

### Json

In [18]:
# Write benchmark (JSON): write the sample ten times, partitioned by
# category_id, and record sparkMeasure's stage-level elapsedTime for each run.
json_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    (data.write
         .partitionBy("category_id")
         .mode('overwrite')
         .json('hdfs://namenode:9000/testing/data/json'))
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    json_elapsedTime.append(elapsedTime)

23/01/25 14:22:18 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:21 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:24 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:28 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:31 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:35 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:40 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:42 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:46 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:49 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [19]:
json_elapsedTime

[3101, 2852, 2797, 3294, 2791, 3871, 4054, 2538, 3791, 2794]

### Orc

In [20]:
# Write benchmark (ORC): write the sample ten times, partitioned by
# category_id, and record sparkMeasure's stage-level elapsedTime for each run.
orc_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    (data.write
         .partitionBy("category_id")
         .mode('overwrite')
         .orc('hdfs://namenode:9000/testing/data/orc'))
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    orc_elapsedTime.append(elapsedTime)

23/01/25 14:22:52 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:53 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:22:58 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:00 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:01 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:05 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [21]:
orc_elapsedTime

[2180, 1364, 1223, 1285, 1238, 1572, 1407, 1256, 1333, 1170]

In [22]:
# Disabled: the same four writes without partitionBy("category_id"). Not executed.
# data.write.mode('overwrite').csv('hdfs://namenode:9000/testing/data/csv')
# data.write.mode('overwrite').parquet('hdfs://namenode:9000/testing/data/parquet')
# data.write.mode('overwrite').json('hdfs://namenode:9000/testing/data/json')
# data.write.mode('overwrite').orc('hdfs://namenode:9000/testing/data/orc')

## Read statistics

### CSV

In [23]:
# Read benchmark (CSV, all columns): ten runs of opening the dataset and
# taking the first 10000 rows. The spark.read call sits inside the measured
# window, so reader setup is part of each timing.
csv_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.read.option("header", "true").csv('hdfs://namenode:9000/testing/data/csv').take(10000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    csv_elapsedTime.append(elapsedTime)

23/01/25 14:23:07 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:09 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:12 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:13 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:15 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:16 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:17 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:19 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:23:20 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [24]:
csv_elapsedTime

[1182, 863, 1124, 877, 862, 864, 854, 876, 859, 914]

## 1 column

In [25]:
# Read benchmark (CSV, single column): same as the full read but projecting
# only `price` before taking the first 10000 rows.
csv_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.read.option("header", "true").csv('hdfs://namenode:9000/testing/data/csv').select('price').take(10000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    csv_elapsedTime.append(elapsedTime)
csv_elapsedTime

23/01/25 15:49:54 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:49:54 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:49:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:49:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:49:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:49:57 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:49:57 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:49:58 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:49:59 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:49:59 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


[981, 415, 451, 755, 379, 393, 383, 408, 396, 430]

### Parquet

In [27]:
# Read benchmark (Parquet, all columns): ten runs of opening the dataset and
# taking the first 10000 rows, recording sparkMeasure's elapsedTime per run.
parquet_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.read.parquet('hdfs://namenode:9000/testing/data/parquet').take(10000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    parquet_elapsedTime.append(elapsedTime)

23/01/25 15:50:28 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:50:28 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:50:29 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:50:29 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:50:30 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:50:31 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:50:31 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:50:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:50:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:50:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [29]:
parquet_elapsedTime

[269, 220, 172, 180, 197, 185, 197, 239, 170, 178]

In [33]:
# Read benchmark (Parquet, single column): project only `price` before taking
# the first 10000 rows, recording sparkMeasure's elapsedTime per run.
parquet_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.read.parquet('hdfs://namenode:9000/testing/data/parquet').select('price').take(10000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    parquet_elapsedTime.append(elapsedTime)
parquet_elapsedTime

23/01/25 15:58:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:05 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:05 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:05 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


[76, 70, 67, 69, 68, 67, 72, 69, 70, 76]

### Json

In [27]:
# Read benchmark (JSON, all columns): ten runs of opening the dataset and
# taking the first 10000 rows.
# NOTE(review): no schema is passed to spark.read.json, so Spark has to scan
# the input to infer one; that pass happens inside the measured window and is
# presumably why these timings are far larger than the other formats.
json_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.read.json('hdfs://namenode:9000/testing/data/json').take(10000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    json_elapsedTime.append(elapsedTime)

23/01/25 14:23:49 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:24:09 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:24:29 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:24:51 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:25:12 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:25:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:25:53 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:26:14 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:26:34 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:26:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [28]:
json_elapsedTime

[21750, 19982, 19859, 21310, 20930, 19755, 20179, 19837, 20255, 19827]

In [31]:
# Read benchmark (JSON, single column): project only `price` before taking the
# first 10000 rows.
# NOTE(review): no schema is passed to spark.read.json, so schema inference
# over the input is still inside the measured window even though a single
# column is selected afterwards.
json_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.read.json('hdfs://namenode:9000/testing/data/json').select('price').take(10000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    json_elapsedTime.append(elapsedTime)
json_elapsedTime

23/01/25 15:51:31 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:51:53 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:52:14 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:52:36 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:52:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:53:17 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:53:38 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:53:59 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:54:20 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:54:41 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


[24323, 22082, 21434, 21168, 20535, 20793, 20921, 20430, 20668, 21209]

### Orc

In [29]:
# Read benchmark (ORC, all columns): ten runs of opening the dataset and
# taking the first 10000 rows, recording sparkMeasure's elapsedTime per run.
orc_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.read.orc('hdfs://namenode:9000/testing/data/orc').take(10000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    orc_elapsedTime.append(elapsedTime)

23/01/25 14:26:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:26:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:26:57 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:26:58 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:26:58 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:26:59 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:26:59 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:00 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:00 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:01 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [30]:
orc_elapsedTime

[312, 149, 382, 154, 133, 133, 149, 136, 146, 156]

In [None]:
# Read benchmark (ORC, single column): project only `price` before taking the
# first 10000 rows, recording sparkMeasure's elapsedTime per run.
orc_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.read.orc('hdfs://namenode:9000/testing/data/orc').select('price').take(10000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    orc_elapsedTime.append(elapsedTime)
orc_elapsedTime

23/01/25 15:58:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:58:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


# Filter performance

In [31]:
# Open the four copies of the sample written by the write benchmarks, one DataFrame per format.
# NOTE(review): the CSV reader is given neither a schema nor inferSchema, so its columns
# presumably load as strings while the other formats carry typed columns. Confirm this is
# intended for the `price` comparisons and sums in the cells below.
df_csv = spark.read.option("header", "true").csv('hdfs://namenode:9000/testing/data/csv')
df_parquet = spark.read.parquet('hdfs://namenode:9000/testing/data/parquet')
df_json = spark.read.json('hdfs://namenode:9000/testing/data/json')
df_orc = spark.read.orc('hdfs://namenode:9000/testing/data/orc')

                                                                                

In [32]:
# Register each format's DataFrame as a temp view so the SQL benchmarks can query it by name.
df_csv.createOrReplaceTempView('df_csv')
df_parquet.createOrReplaceTempView('df_parquet')
df_json.createOrReplaceTempView('df_json')
df_orc.createOrReplaceTempView('df_orc')

## CSV

In [33]:
# Filter benchmark (CSV): ten runs of a `price > 100000` filter, taking up to
# 1000 matching rows, recording sparkMeasure's elapsedTime per run.
csv_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select * from df_csv
                where price > 100000
            ''').take(1000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    csv_elapsedTime.append(elapsedTime)

23/01/25 14:27:21 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:22 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:22 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:22 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:23 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:23 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:23 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:23 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:24 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:24 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [34]:
csv_elapsedTime

[167, 114, 195, 111, 137, 113, 121, 114, 130, 113]

## Parquet

In [35]:
# Filter benchmark (Parquet): ten runs of a `price > 100000` filter, taking up
# to 1000 matching rows, recording sparkMeasure's elapsedTime per run.
parquet_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select * from df_parquet
                where price > 100000
            ''').take(1000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    parquet_elapsedTime.append(elapsedTime)

23/01/25 14:27:25 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:25 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:25 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:25 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:26 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:26 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:26 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:26 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:27 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:27 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [36]:
parquet_elapsedTime

[139, 118, 54, 57, 58, 73, 53, 50, 50, 56]

In [37]:
# Filter benchmark (JSON): ten runs of a `price > 100000` filter, taking up to
# 1000 matching rows, recording sparkMeasure's elapsedTime per run.
json_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select * from df_json
                where price > 100000
            ''').take(1000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    json_elapsedTime.append(elapsedTime)

23/01/25 14:27:27 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:28 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:28 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:28 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:28 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:29 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:29 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:29 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:30 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:30 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [38]:
json_elapsedTime

[116, 107, 101, 136, 103, 104, 104, 105, 107, 101]

In [39]:
# Filter benchmark (ORC): ten runs of a `price > 100000` filter, taking up to
# 1000 matching rows, recording sparkMeasure's elapsedTime per run.
orc_elapsedTime = []
for i in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select * from df_orc
                where price > 100000
            ''').take(1000)
    stagemetrics.end()

    # Look the metric up by name; a fixed line index silently picks up a
    # different metric if the report layout changes.
    elapsed_line = next(
        line for line in stagemetrics.report().splitlines()
        if 'elapsedTime' in line
    )
    elapsedTime = int(elapsed_line.split('=>')[1].split('(')[0])
    orc_elapsedTime.append(elapsedTime)

23/01/25 14:27:30 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:31 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:31 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:31 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:31 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:33 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics


In [40]:
orc_elapsedTime

[154, 159, 51, 68, 45, 66, 48, 48, 46, 48]

# Aggregate performance

## CSV

In [41]:
# Aggregate benchmark (CSV): ten runs of a group-by/sum over the CSV-backed
# view, collecting elapsedTime, executorCpuTime and jvmGCTime for each run.
agg_dict = defaultdict(list)
for i in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select category_id,sum(price) total_price from df_csv
                group by category_id
            ''').collect()
    stagemetrics.end()

    # report() refreshes the metrics view on every call, so build the report
    # once per run and read all three values from the same text.
    report_lines = stagemetrics.report().splitlines()
    for metric in ('elapsedTime', 'executorCpuTime', 'jvmGCTime'):
        # Match on the metric name instead of a fixed line index, which would
        # silently return a different metric if the report layout changes.
        metric_line = next(line for line in report_lines if metric in line)
        agg_dict[metric].append(int(metric_line.split('=>')[1].split('(')[0]))

23/01/25 14:27:35 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:35 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:35 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:37 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:37 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:38 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:39 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:40 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:40 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:42 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:42 WA

In [42]:
agg_dict

defaultdict(list,
            {'elapsedTime': [2164,
              1742,
              1760,
              1668,
              1641,
              1597,
              1642,
              1622,
              1622,
              1587],
             'executorCpuTime': [5040,
              4126,
              3899,
              3979,
              4176,
              4125,
              4484,
              3880,
              4135,
              4018],
             'jvmGCTime': [78, 51, 40, 52, 28, 36, 46, 60, 38, 45]})

## Parquet

In [43]:
# Aggregate benchmark (Parquet): ten runs of a group-by/sum over the
# Parquet-backed view, collecting elapsedTime, executorCpuTime and jvmGCTime
# for each run.
agg_dict = defaultdict(list)
for i in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select category_id,sum(price) total_price from df_parquet
                group by category_id
            ''').collect()
    stagemetrics.end()

    # report() refreshes the metrics view on every call, so build the report
    # once per run and read all three values from the same text.
    report_lines = stagemetrics.report().splitlines()
    for metric in ('elapsedTime', 'executorCpuTime', 'jvmGCTime'):
        # Match on the metric name instead of a fixed line index, which would
        # silently return a different metric if the report layout changes.
        metric_line = next(line for line in report_lines if metric in line)
        agg_dict[metric].append(int(metric_line.split('=>')[1].split('(')[0]))

23/01/25 14:27:54 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:57 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:27:57 WA

In [44]:
agg_dict

defaultdict(list,
            {'elapsedTime': [527, 426, 387, 323, 448, 592, 412, 483, 559, 433],
             'executorCpuTime': [608,
              457,
              426,
              391,
              431,
              439,
              402,
              440,
              434,
              402],
             'jvmGCTime': [34, 8, 0, 7, 0, 24, 16, 0, 63, 0]})

## Json

In [45]:
# Aggregate benchmark (JSON): ten runs of a group-by/sum over the JSON-backed
# view, collecting elapsedTime, executorCpuTime and jvmGCTime for each run.
agg_dict = defaultdict(list)
for i in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select category_id,sum(price) total_price from df_json
                group by category_id
            ''').collect()
    stagemetrics.end()

    # report() refreshes the metrics view on every call, so build the report
    # once per run and read all three values from the same text.
    report_lines = stagemetrics.report().splitlines()
    for metric in ('elapsedTime', 'executorCpuTime', 'jvmGCTime'):
        # Match on the metric name instead of a fixed line index, which would
        # silently return a different metric if the report layout changes.
        metric_line = next(line for line in report_lines if metric in line)
        agg_dict[metric].append(int(metric_line.split('=>')[1].split('(')[0]))

23/01/25 14:28:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:06 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:08 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:08 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:08 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:09 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:09 WA

In [46]:
# Display the per-run metric lists gathered by the `df_json` benchmark loop above.
agg_dict

defaultdict(list,
            {'elapsedTime': [1765,
              1292,
              1235,
              1169,
              1105,
              1168,
              1179,
              1057,
              1058,
              1083],
             'executorCpuTime': [4293,
              3196,
              3333,
              2915,
              2618,
              2959,
              2698,
              2496,
              2494,
              2686],
             'jvmGCTime': [58, 50, 0, 15, 8, 0, 30, 6, 18, 0]})

## Orc

In [47]:
# Benchmark: run the group-by aggregation against `df_orc` 10 times and record,
# per run, the elapsedTime / executorCpuTime / jvmGCTime values that sparkmeasure
# reports for the stages executed between begin() and end().
agg_dict = defaultdict(list)
for i in range(10):
    stagemetrics.begin()
    tmp = spark.sql('''
                select category_id,sum(price) total_price from df_orc
                group by category_id
            ''').collect()
    stagemetrics.end()

    # Build the report once per run. It was previously requested three times per
    # iteration; each call appears to refresh the PerfStageMetrics temp view
    # (see the WARN lines in the cell output), so the extra calls were wasted work.
    report_text = stagemetrics.report()

    # Metric lines are expected to look like "name => value (...)". Index them by
    # name rather than by fixed line position so a changed report layout does not
    # silently pick up the wrong metric.
    parsed = {}
    for line in report_text.split('\n'):
        name, arrow, value = line.replace(' ', '').partition('=>')
        if arrow:
            parsed[name] = value.split('(')[0]

    elapsedTime = int(parsed['elapsedTime'])
    executorCpuTime = int(parsed['executorCpuTime'])
    jvmGCTime = int(parsed['jvmGCTime'])

    agg_dict['elapsedTime'].append(elapsedTime)
    agg_dict['executorCpuTime'].append(executorCpuTime)
    agg_dict['jvmGCTime'].append(jvmGCTime)

23/01/25 14:28:19 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:19 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:19 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:19 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:19 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:19 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:20 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:20 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:20 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:21 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 14:28:21 WA

In [48]:
# Display the per-run metric lists gathered by the `df_orc` benchmark loop above.
agg_dict

defaultdict(list,
            {'elapsedTime': [354, 297, 308, 330, 282, 300, 294, 296, 304, 283],
             'executorCpuTime': [451,
              365,
              350,
              360,
              343,
              340,
              348,
              343,
              351,
              349],
             'jvmGCTime': [7, 0, 0, 10, 0, 0, 0, 16, 14, 0]})

# Partition

In [14]:
# Enlarge the dataset by stacking four copies of `data` on top of each other.
# NOTE: this rebinds `data` to the enlarged frame, so the cell is not idempotent:
# running it again multiplies the row count by four once more.
data = data.unionAll(data).unionAll(data).unionAll(data)

In [24]:
# Row count of the current `data` frame (triggers a Spark job).
data.count()

400000

In [20]:
# Partition counts to compare: each value is passed to `repartition()` when the
# test copies are written, and names the resulting dataset/temp view.
list_partition = [1,8,9,20,100]

In [21]:
# Write one Parquet copy of `data` per partition count, overwriting any previous
# output at the same HDFS location.
for ptt in list_partition:
    target_path = f'hdfs://namenode:9000/testing/partition/partition_{ptt}'
    repartitioned = data.repartition(ptt)
    repartitioned.write.mode('overwrite').parquet(target_path)

                                                                                

In [22]:
# Load each repartitioned Parquet copy back from HDFS and expose it to Spark SQL
# as the temp view `partition_<count>`.
for ptt in list_partition:
    source_path = f'hdfs://namenode:9000/testing/partition/partition_{ptt}'
    view_name = f'partition_{ptt}'
    tmp = spark.read.parquet(source_path)
    tmp.createOrReplaceTempView(view_name)

## Partition benchmark (1, 8, 9, 20 and 100 partitions)

In [23]:
# Benchmark: for every partition count, run the group-by aggregation against the
# matching `partition_<count>` temp view 10 times, record the elapsedTime /
# executorCpuTime / jvmGCTime values reported by sparkmeasure for each run, and
# print the collected lists once that partition count is done.
for ptt in list_partition:
    agg_dict = defaultdict(list)
    for i in range(10):
        stagemetrics.begin()
        tmp = spark.sql(f'''
                    select category_id,sum(price) total_price from partition_{ptt}
                    group by category_id
                ''').collect()
        stagemetrics.end()

        # Build the report once per run. It was previously requested three times
        # per iteration; each call appears to refresh the PerfStageMetrics temp
        # view (see the WARN lines in the cell output), so the extra calls were
        # wasted work.
        report_text = stagemetrics.report()

        # Metric lines are expected to look like "name => value (...)". Index them
        # by name rather than by fixed line position so a changed report layout
        # does not silently pick up the wrong metric.
        parsed = {}
        for line in report_text.split('\n'):
            name, arrow, value = line.replace(' ', '').partition('=>')
            if arrow:
                parsed[name] = value.split('(')[0]

        elapsedTime = int(parsed['elapsedTime'])
        executorCpuTime = int(parsed['executorCpuTime'])
        jvmGCTime = int(parsed['jvmGCTime'])

        agg_dict['elapsedTime'].append(elapsedTime)
        agg_dict['executorCpuTime'].append(executorCpuTime)
        agg_dict['jvmGCTime'].append(jvmGCTime)

    print('='*20)
    print('partition:', ptt)
    print(agg_dict)

23/01/25 15:25:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:25:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:25:55 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:25:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:25:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:25:56 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:25:57 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:25:57 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:25:57 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:25:57 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:25:58 WA

partition: 1
defaultdict(<class 'list'>, {'elapsedTime': [433, 421, 316, 374, 593, 341, 270, 292, 374, 329], 'executorCpuTime': [372, 364, 340, 334, 334, 334, 303, 320, 340, 329], 'jvmGCTime': [9, 0, 0, 0, 0, 12, 12, 0, 0, 0]})


23/01/25 15:26:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:03 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:04 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:05 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:05 WA

partition: 8
defaultdict(<class 'list'>, {'elapsedTime': [346, 364, 274, 310, 281, 269, 313, 295, 307, 266], 'executorCpuTime': [370, 381, 350, 371, 365, 348, 375, 356, 372, 354], 'jvmGCTime': [26, 0, 0, 7, 0, 0, 0, 8, 0, 0]})


23/01/25 15:26:09 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:10 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:11 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:11 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:11 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:12 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:12 WA

partition: 9
defaultdict(<class 'list'>, {'elapsedTime': [338, 333, 305, 341, 333, 300, 304, 318, 310, 305], 'executorCpuTime': [383, 383, 371, 378, 379, 369, 371, 368, 372, 371], 'jvmGCTime': [0, 38, 0, 0, 0, 0, 0, 0, 12, 0]})


23/01/25 15:26:16 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:16 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:16 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:17 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:17 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:17 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:18 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:18 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:18 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:18 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:18 WA

partition: 20
defaultdict(<class 'list'>, {'elapsedTime': [348, 330, 305, 305, 364, 320, 315, 299, 321, 294], 'executorCpuTime': [425, 419, 413, 419, 422, 408, 409, 407, 409, 394], 'jvmGCTime': [0, 38, 0, 0, 18, 22, 0, 0, 28, 0]})


23/01/25 15:26:23 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:23 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:23 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:24 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:25 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:25 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:25 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:26 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:26 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:26 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
23/01/25 15:26:26 WA

partition: 100
defaultdict(<class 'list'>, {'elapsedTime': [600, 777, 585, 559, 602, 586, 594, 577, 542, 541], 'executorCpuTime': [793, 801, 778, 765, 790, 773, 768, 778, 753, 750], 'jvmGCTime': [0, 204, 0, 0, 0, 0, 24, 11, 0, 24]})


23/01/25 15:26:32 WARN StageMetrics: Stage metrics data refreshed into temp view PerfStageMetrics
