In [1]:
# Location of the JSON metrics file analysed by this notebook
# (presumably a Spark event log for one application -- confirm).
# NOTE(review): hard-coded relative path; change it here to analyse another run.
metrics_file = "metrics/application_1601392010735_0030"

In [2]:
import pyspark
import pyspark.sql.functions as F
import json

# Session handle used by every Spark call in this notebook.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
# Load the metrics file as JSON into a Spark DataFrame; all later cells query this frame.
metrics = spark.read.json(metrics_file)

In [21]:
# Print the schema of the loaded frame to see which fields are available.
metrics.printSchema()

root
 |-- App ID: string (nullable = true)
 |-- App Name: string (nullable = true)
 |-- Block Manager ID: struct (nullable = true)
 |    |-- Executor ID: string (nullable = true)
 |    |-- Host: string (nullable = true)
 |    |-- Port: long (nullable = true)
 |-- Classpath Entries: struct (nullable = true)
 |    |-- /etc/hadoop/conf/: string (nullable = true)
 |    |-- /etc/hive/conf/: string (nullable = true)
 |    |-- /usr/lib/spark/conf/: string (nullable = true)
 |    |-- /usr/lib/spark/jars/HikariCP-2.5.1.jar: string (nullable = true)
 |    |-- /usr/lib/spark/jars/JLargeArrays-1.5.jar: string (nullable = true)
 |    |-- /usr/lib/spark/jars/JTransforms-3.1.jar: string (nullable = true)
 |    |-- /usr/lib/spark/jars/RoaringBitmap-0.7.45.jar: string (nullable = true)
 |    |-- /usr/lib/spark/jars/ST4-4.0.4.jar: string (nullable = true)
 |    |-- /usr/lib/spark/jars/accessors-smart-1.2.jar: string (nullable = true)
 |    |-- /usr/lib/spark/jars/activation-1.1.1.jar: string (nullable =

In [4]:
def collect_and_dictify(df):
    """Collect a DataFrame to the driver as a list of parsed JSON values.

    The frame is serialised with the SQL expression ``to_json(*)``; the first
    field of every collected row is then decoded with ``json.loads``.

    Args:
        df: object exposing ``selectExpr(...).collect()`` (a Spark DataFrame
            in this notebook).

    Returns:
        A list with one decoded JSON value per collected row.
    """
    json_rows = df.selectExpr("to_json(*)").collect()
    decoded = []
    for json_row in json_rows:
        # Only the first field holds the JSON text produced by to_json.
        decoded.append(json.loads(json_row[0]))
    return decoded

def executor_info(df):
    """Return the non-null ``Executor Info`` entries of ``df`` as parsed dicts.

    Selects the ``Executor Info`` column, drops null rows and hands the
    result to ``collect_and_dictify``.
    """
    return collect_and_dictify(df.select("Executor Info").dropna())

def plan_dicts(df):
    """Return the non-null ``sparkPlanInfo`` entries of ``df`` as parsed dicts."""
    plan_rows = df.select("sparkPlanInfo").dropna()
    return collect_and_dictify(plan_rows)

In [36]:
# Field names of the first child node of each plan: take element 1 of
# ``sparkPlanInfo.children`` (element_at is 1-based) and expand the struct.
(
    metrics
    .select("sparkPlanInfo.children")
    .dropna()
    .select(F.element_at("children", 1).alias("child"))
    .select("child.*")
    .columns
)

['children', 'metrics', 'nodeName', 'simpleString']

In [72]:
from collections import namedtuple

# One record per metric declared on a plan node; ``plan_node`` is the id of the
# PlanInfoNode that owns the metric (see flatplan).
MetricNode = namedtuple("MetricNode", "plan_node accumulatorId metricType name")
# One record per node of the flattened plan tree; ``parent`` is the ``plan_node``
# id of the enclosing node (flatplan uses -1 by default for top-level plans).
PlanInfoNode = namedtuple("PlanInfoNode", "plan_node parent nodeName simpleString")

def nextid():
    """Yield 0, 1, 2, ... without end.

    Every call returns an independent generator that starts again at 0.
    """
    current = 0
    while True:
        yield current
        current += 1
    
# Module-level id generator. It is stateful: values continue from wherever the
# last consumer stopped until this cell is re-run.
node_ctr = nextid()

def plan_dicts(df):
    """Return the non-null ``sparkPlanInfo`` entries of ``df`` as parsed dicts.

    NOTE(review): a function with this name is already defined in an earlier
    cell; this definition shadows it and behaves the same way.
    """
    plan_rows = df.select("sparkPlanInfo").dropna()
    return collect_and_dictify(plan_rows)

def _flatplan_walk(dicts, parent, plan_nodes, metric_nodes, ids):
    """Pre-order walk: append one PlanInfoNode per plan dict and one MetricNode per metric."""
    for plan in dicts:
        node_id = next(ids)
        # A missing or None 'metrics' / 'children' entry is treated as empty.
        for metric in plan.get('metrics') or []:
            metric_nodes.append(
                MetricNode(node_id, metric['accumulatorId'], metric['metricType'], metric['name'])
            )

        plan_nodes.append(PlanInfoNode(node_id, parent, plan['nodeName'], plan['simpleString']))

        _flatplan_walk(plan.get('children') or [], node_id, plan_nodes, metric_nodes, ids)


def flatplan(dicts, parent=-1, plan_nodes=None, metric_nodes=None):
    """Flatten nested plan dicts into lists of PlanInfoNode and MetricNode records.

    Each dict must provide ``nodeName`` and ``simpleString``; ``metrics`` (a
    list of dicts with ``accumulatorId``, ``metricType`` and ``name``) and
    ``children`` (a list of nested plan dicts) are optional.

    Node ids are assigned in pre-order from a counter created for this call,
    so the result does not depend on how often the function has been called
    before (the module-level ``node_ctr`` generator is not consulted). On a
    fresh call ids start at 0; when a non-empty ``plan_nodes`` list is passed
    in, numbering continues after the largest ``plan_node`` already present so
    ids stay unique within that list.

    Args:
        dicts: iterable of plan dicts to flatten.
        parent: ``plan_node`` id recorded as parent of the top-level entries.
        plan_nodes: optional list to append PlanInfoNode records to.
        metric_nodes: optional list to append MetricNode records to.

    Returns:
        The tuple ``(plan_nodes, metric_nodes)``.

    Raises:
        KeyError: if a plan dict lacks ``nodeName``/``simpleString`` or a
            metric dict lacks one of its three fields.
    """
    import itertools  # local import: only needed for the per-call id counter

    if plan_nodes is None:
        plan_nodes = []

    if metric_nodes is None:
        metric_nodes = []

    first_id = max((node.plan_node for node in plan_nodes), default=-1) + 1
    _flatplan_walk(dicts, parent, plan_nodes, metric_nodes, itertools.count(first_id))

    return (plan_nodes, metric_nodes)

def plan_dfs(df):
    """Build Spark DataFrames of the flattened plan nodes and their metrics.

    Args:
        df: frame with a ``sparkPlanInfo`` column; it is passed to
            ``plan_dicts`` and the result is flattened with ``flatplan``.

    Returns:
        A ``(plan_nodes_df, metric_nodes_df)`` tuple created with
        ``spark.createDataFrame`` from the PlanInfoNode and MetricNode
        records (each converted to a dict).
    """
    # Use the argument rather than the notebook-global ``metrics`` frame, so
    # the function honours whichever DataFrame the caller passes in.
    plan_records, metric_records = flatplan(plan_dicts(df))
    pndf = spark.createDataFrame(data=[record._asdict() for record in plan_records])
    mndf = spark.createDataFrame(data=[record._asdict() for record in metric_records])
    return (pndf, mndf)

In [73]:
# Spark DataFrames of plan nodes and of the metrics (accumulators) attached to them.
plan_nodes, accumulable_nodes = plan_dfs(metrics)

In [74]:
# The same flattening kept as plain Python lists of namedtuples for inspection.
pn, mn = flatplan(plan_dicts(metrics))

In [44]:
def stageInfo(df):
    """Return the non-null ``Stage Info`` entries of ``df`` as parsed dicts."""
    stage_rows = df.select("Stage Info").dropna()
    return collect_and_dictify(stage_rows)

In [75]:
def tidy_metrics(df, mcol='Task Info', idcol='Task ID', interesting_metrics=None):
    """Explode the accumulables of a struct column into one row per metric.

    The struct column ``mcol`` is expanded, its ``Accumulables`` array is
    exploded, and the ``ID``, ``Name`` and ``Value`` fields of every
    accumulable become the columns ``accumulatorId``, ``Metric Name`` and
    ``Metric Value``. All other fields of ``mcol`` are carried along.

    Args:
        df: Spark DataFrame containing the struct column ``mcol``.
        mcol: name of the struct column to expand.
        idcol: field of ``mcol`` that identifies an observation; it is
            placed first in the output.
        interesting_metrics: optional boolean Column used to filter the
            result. When None, rows are kept only if ``Metric Name`` is one
            of a fixed list of timing metrics.

    Returns:
        A Spark DataFrame with columns ``idcol``, the remaining fields of
        ``mcol`` in sorted order, then ``accumulatorId``, ``Metric Name``
        and ``Metric Value``.
    """
    acc_cols = [F.col('Accumulable.%s' % s).alias('Metric %s' % s) for s in ['ID', 'Name', 'Value']]
    obs = df.select(mcol).dropna().select('%s.*' % mcol)
    # Every field except the id and the array being exploded is passed through.
    cols = [F.col(elt) for elt in sorted(set(obs.columns) - {idcol, 'Accumulables'})]

    if interesting_metrics is None:
        # NOTE(review): 'internal.metrics.jvmGCTime' used to be listed twice;
        # the duplicate is removed here. Possibly another metric was
        # intended in its place -- confirm.
        interesting_metrics = F.col('Metric Name').isin(
            'internal.metrics.resultSerializationTime',
            'write time',
            'shuffle write time',
            'join time',
            'GPU time',
            'GPU decode time',
            'fetch wait time',
            'internal.metrics.executorCpuTime',
            'internal.metrics.executorDeserializeTime',
            'internal.metrics.jvmGCTime'
        )

    exploded = obs.select(
        idcol,
        F.explode('Accumulables').alias('Accumulable'),
        *cols
    )
    tidy = exploded.select(idcol, *(cols + acc_cols))
    # Rename so the result can be joined with the MetricNode frame on accumulatorId.
    return tidy.withColumnRenamed("Metric ID", "accumulatorId").where(interesting_metrics)

def tidy_tasks(df):
    """Tidy the per-task metrics: ``tidy_metrics`` on ``Task Info`` keyed by ``Task ID``."""
    return tidy_metrics(df, mcol='Task Info', idcol='Task ID')

def tidy_stages(df):
    """Tidy the per-stage metrics: ``tidy_metrics`` on ``Stage Info`` keyed by ``Stage ID``."""
    return tidy_metrics(df, mcol='Stage Info', idcol='Stage ID')


In [76]:
# Tidy per-task metric observations, kept as a Spark DataFrame for the join below.
task_metrics = tidy_tasks(metrics)

In [78]:
# Attach plan information to every task metric: join the metric records on
# accumulatorId, then the plan-node records on plan_node, and print a sample.
task_metrics.join(accumulable_nodes, "accumulatorId").join(plan_nodes, "plan_node").show()

+---------+-------------+-------+-------+-----------+------+-------------+-------------------+--------------------+-----+------+-------------+----------+-----------+------------------+------------+----------+------------------+-------------------+------+--------------------+
|plan_node|accumulatorId|Task ID|Attempt|Executor ID|Failed|  Finish Time|Getting Result Time|                Host|Index|Killed|  Launch Time|  Locality|Speculative|       Metric Name|Metric Value|metricType|              name|           nodeName|parent|        simpleString|
+---------+-------------+-------+-------+-----------+------+-------------+-------------------+--------------------+-----+------+-------------+----------+-----------+------------------+------------+----------+------------------+-------------------+------+--------------------+
|       26|         1842|    114|      0|          2| false|1601579838863|                  0|nvspark-dataproc-...|   23| false|1601579835199|RACK_LOCAL|      false|shuffle

In [None]:
# Collect the tidy task metrics to pandas for plotting.
# NOTE(review): this rebinds ``task_metrics`` from a Spark DataFrame to a pandas
# DataFrame, so the Spark join cell above cannot be re-run after this cell.
task_metrics = tidy_tasks(metrics).toPandas()

In [None]:
import altair as alt

In [None]:
# Bar chart of task metrics: one bar position per Task ID, bar height is the
# summed Metric Value, coloured by Metric Name.
alt.Chart(task_metrics).mark_bar().encode(
    x='Task ID:N',
    y=alt.Y('sum(Metric Value):Q'),
    color='Metric Name:N',
    tooltip=['Metric Name', 'Metric Value', 'Task ID']
).interactive()

In [None]:
# Same task chart with stack="normalize", i.e. each Task ID bar shows the
# share of every metric rather than absolute values.
alt.Chart(task_metrics).mark_bar().encode(
    x='Task ID:N',
    y=alt.Y('sum(Metric Value):Q', stack="normalize"),
    color='Metric Name:N',
    tooltip=['Metric Name', 'Metric Value', 'Task ID']
).interactive()

In [None]:
# Collect the tidy per-stage metrics to pandas and chart Metric Value per
# Stage ID, coloured by Metric Name.
stage_metrics = tidy_stages(metrics).toPandas()
alt.Chart(stage_metrics).mark_bar().encode(
    x='Stage ID:N',
    y='Metric Value:Q',
    color='Metric Name:N',
    tooltip=['Details', 'Metric Name', 'Metric Value', 'Stage ID']
).interactive()

In [None]:
# Stage chart with stack="normalize": each Stage ID bar shows the share of
# every metric.
# NOTE(review): stage_metrics is recomputed with the same expression as in the
# previous chart cell; the existing frame could presumably be reused.
stage_metrics = tidy_stages(metrics).toPandas()
alt.Chart(stage_metrics).mark_bar().encode(
    x='Stage ID:N',
    y=alt.Y('sum(Metric Value):Q', stack="normalize"),
    color='Metric Name:N',
    tooltip=['Details', 'Metric Name', 'Metric Value', 'Stage ID']
).interactive()

In [None]:
# Display the pandas frame of stage metrics.
# NOTE(review): shows the whole frame; consider stage_metrics.head() for large runs.
stage_metrics

In [None]:
# Collect the rows that carry a non-null "System Properties" value to the driver.
metrics.select("System Properties").dropna().collect()

In [None]:
def melt(df, id_vars = None, value_vars=None, var_name='variable', value_name='value'):
    """Unpivot a wide Spark DataFrame into (variable, value) rows.

    Every column in ``value_vars`` is turned into a struct holding the column
    name (as ``var_name``) and the column value (as ``value_name``); the
    structs are gathered in one array which is exploded, giving one output
    row per input row and value column.

    Args:
        df: Spark DataFrame to unpivot.
        id_vars: list of columns repeated on every output row; defaults to
            an empty list.
        value_vars: columns to unpivot; defaults to all columns of ``df``
            that are not in ``id_vars``.
        var_name: name of the output column holding the source column name.
        value_name: name of the output column holding the value.

    Returns:
        A DataFrame with the ``id_vars`` columns followed by ``var_name``
        and ``value_name``.

    NOTE(review): all value columns end up in a single array, so they
    presumably need compatible types -- confirm for mixed-type frames.
    """
    if id_vars is None:
        id_vars = []

    if value_vars is None:
        value_vars = [name for name in df.columns if name not in id_vars]

    # Backticks quote the column name so names containing dots resolve as one column.
    name_value_pairs = [
        F.struct(F.lit(source).alias(var_name), F.col("`%s`" % source).alias(value_name))
        for source in value_vars
    ]
    exploded = df.withColumn("value_tuple", F.explode(F.array(*name_value_pairs)))
    unpacked = [F.col("value_tuple")[field].alias(field) for field in [var_name, value_name]]
    return exploded.select(*(id_vars + unpacked))

In [None]:
# Unpivot the fields of the Properties struct into (variable, value) rows,
# drop rows containing nulls and collect the result to the driver.
melt(metrics.select("Properties").dropna().select("Properties.*")).dropna().collect()

In [None]:
# pandas copy of the rows whose Properties struct is not null.
pdf = metrics.where(F.col("Properties").isNotNull()).toPandas()

In [None]:
# Distinct values of the spark.app.id property; the backticks quote the
# dotted field name so it is read as a single column.
metrics.select("Properties").dropna().select("Properties.*").select("`spark.app.id`").distinct().collect()

In [None]:
import pandas as pd
# Remove the column limit so wide frames are displayed in full.
pd.options.display.max_columns = None
# Display the pandas frame of rows with non-null Properties.
# NOTE(review): shows the whole frame; consider pdf.head() for large inputs.
pdf

In [None]:
# pandas copy of the expanded "Stage Info" fields, restricted to rows whose
# Properties struct is not null.
psi = metrics.where(F.col("Properties").isNotNull()).select("Stage Info.*").toPandas()

In [None]:
# Display the stage-info frame.
# NOTE(review): shows the whole frame; consider psi.head() for large inputs.
psi