In [0]:
# Notebook parameters: the target Unity Catalog catalog (required) and the
# schema that will hold the functions defined in the cells below.
DEFAULT_SCHEMA_NAME = "spark_observability"

dbutils.widgets.text("catalog_name", "", "Catalog (required)")
# The fallback schema is also the widget's default so that SQL cells reading
# the raw widget value through :schema_name see the same schema as Python does.
dbutils.widgets.text("schema_name", DEFAULT_SCHEMA_NAME, "Schema")
CATALOG_NAME = dbutils.widgets.get("catalog_name").strip()
SCHEMA_NAME = dbutils.widgets.get("schema_name").strip() or DEFAULT_SCHEMA_NAME

# UC Validation
if not CATALOG_NAME:
    raise ValueError("catalog widget must point to an existing catalog")

# Bind the qualified schema name through the IDENTIFIER clause instead of
# interpolating widget text into the statement: the value can then only ever be
# parsed as an identifier, never as additional SQL.
spark.sql(
    "CREATE SCHEMA IF NOT EXISTS IDENTIFIER(:schema_fqn)",
    args={"schema_fqn": f"{CATALOG_NAME}.{SCHEMA_NAME}"},
)

In [0]:
%sql
-- Point the session at the catalog and schema named by the notebook parameters
-- so the unqualified function names in the following cells resolve there.
-- NOTE(review): these statements read the :catalog_name / :schema_name
-- parameter values directly, not the stripped/defaulted Python variables, so a
-- blank schema_name would not pick up the Python-side fallback -- confirm.
USE CATALOG IDENTIFIER(:catalog_name);
USE SCHEMA IDENTIFIER(:schema_name);

In [0]:
%sql
-- emr_sqlmetrics: one row per SQL execution found in the payload returned by
-- emr_listshssqlraw(clusterid).
--   clusterid : passed through to emr_listshssqlraw
--   limit     : only rows whose serialized node JSON is smaller than this many
--               bytes are returned (default 20000)
--   ranking   : 0 returns every row under the limit; any other value returns
--               only the rows holding that size rank (default 0)
-- Output columns, in order: node JSON, successJobIds, byte length, size rank.
-- NOTE(review): the rank is assigned over ALL executions before the size limit
-- is applied, so a requested rank can be filtered out by the limit -- confirm
-- this is intended.
CREATE OR REPLACE FUNCTION emr_sqlmetrics(
  clusterid string, limit double default 20000, ranking double DEFAULT 0 
)

RETURNS TABLE (nodestring STRING, successJobIds array<string>, stringlength string, rank double)
COMMENT 'this tool returns string snippet of node metrics for a spark sql query, useful for tuning recommendations'
RETURN

WITH parsed AS (
  -- Parse the raw response and cast it to a typed array of SQL executions.
  SELECT
    try_parse_json(emr_listshssqlraw(clusterid))::array<struct<id:long, status:string, description:string, planDescription:string, submissionTime:string, duration:long, successJobIds:array<string>, failedJobIds:string, nodes: array<struct<nodeId: INT, nodeName: STRING, metrics: array<struct<name:STRING, value:STRING>>>> >> AS sqlmetrics
),

executions AS (
  -- One row per SQL execution.
  SELECT explode(sqlmetrics) AS sqlexec
  FROM parsed
),

serialized AS (
  -- Serialize each execution's plan nodes once; later steps reuse the string.
  SELECT
    to_json(sqlexec.nodes) AS nodestring,
    sqlexec.successJobIds AS jobids
  FROM executions
),

measured AS (
  -- Size of the serialized nodes in UTF-8 bytes.
  SELECT
    nodestring,
    jobids,
    len(to_binary(nodestring, 'UTF-8')) AS stringlength
  FROM serialized
),

ranked AS (
  -- Largest payload gets rank 1.
  SELECT
    nodestring,
    jobids,
    stringlength,
    rank() OVER (ORDER BY stringlength DESC) AS rank
  FROM measured
)

SELECT
  nodestring,
  jobids,
  stringlength,
  rank
FROM ranked
WHERE stringlength < limit
  AND (ranking = 0 OR rank = ranking)
ORDER BY stringlength DESC

In [0]:
%sql
-- emr_jobmetrics: job-level rows parsed from emr_listshsjobsraw(clusterid),
-- restricted to the job ids listed in jobidsarr.
--   clusterid : passed through to emr_listshsjobsraw
--   jobidsarr : job ids (as strings) to keep
-- runtimesec is the number of whole seconds between submissionTime and
-- completionTime; it is NULL when either timestamp is NULL.
CREATE OR REPLACE FUNCTION emr_jobmetrics(
  clusterid string, jobidsarr array<string>
)

RETURNS TABLE (jobId string, name string, description string, submissionTime string, completionTime string, stageIds array<string>, status string, numTasks double, numCompletedTasks double, numSkippedTasks double, numFailedTasks double, numCompletedStages double, numSkippedStages double, numFailedStages double, runtimesec long)
COMMENT 'this tool returns spark history server job metrics for a spark applications, useful for tuning recommendations'
RETURN

WITH payload AS (
  -- Parse the raw response and cast it to a typed array of job records.
  SELECT
    try_parse_json(emr_listshsjobsraw(clusterid))::array<struct<jobId:string, name:string, description:string, submissionTime:string, completionTime:string, stageIds:array<string>, status:string, numTasks:double, numCompletedTasks:double, numSkippedTasks:double, numFailedTasks:double, numCompletedStages:double, numSkippedStages:double, numFailedStages:double>> AS jobmetrics
),

jobrows AS (
  -- One row per job.
  SELECT explode(jobmetrics) AS jobrow
  FROM payload
)

SELECT
  jobrow.jobId,
  jobrow.name,
  jobrow.description,
  jobrow.submissionTime,
  jobrow.completionTime,
  jobrow.stageIds,
  jobrow.status,
  jobrow.numTasks,
  jobrow.numCompletedTasks,
  jobrow.numSkippedTasks,
  jobrow.numFailedTasks,
  jobrow.numCompletedStages,
  jobrow.numSkippedStages,
  jobrow.numFailedStages,
  timestampdiff(
    SECOND,
    to_timestamp(jobrow.submissionTime),
    to_timestamp(jobrow.completionTime)
  ) AS runtimesec
FROM jobrows
WHERE array_contains(jobidsarr, jobrow.jobId)

In [0]:
%sql
-- emr_stagemetrics: stage-level rows parsed from emr_listshsstagesraw(clusterid),
-- restricted to the stage ids listed in stageidsarr.
--   clusterid   : passed through to emr_listshsstagesraw
--   stageidsarr : stage ids (as strings) to keep
-- runtimesec is the number of whole seconds between submissionTime and
-- completionTime; it is NULL when either timestamp is NULL.
--
-- The Spark monitoring REST API names the completed-task counter on stage
-- objects numCompleteTasks (job objects use numCompletedTasks). Both spellings
-- are read and coalesced so the numCompletedTasks output column is populated
-- whichever one the payload carries.
-- NOTE(review): numSkippedTasks and the num*Stages fields appear to be
-- job-level fields; they are kept to preserve the return shape and will be
-- NULL if the stage payload does not contain them -- confirm against the
-- output of emr_listshsstagesraw.
CREATE OR REPLACE FUNCTION emr_stagemetrics(
  clusterid string, stageidsarr array<string>
)

RETURNS TABLE (stageId string, attemptId string, name string, description string, submissionTime string, completionTime string, status string, numTasks double, numCompletedTasks double, numSkippedTasks double, numFailedTasks double, numCompletedStages double, numSkippedStages double, numFailedStages double, memoryBytesSpilled long, diskBytesSpilled long, inputBytes long, inputRecords long, outputBytes long, outputRecords long, shuffleReadBytes long, shuffleReadRecords long, shuffleWriteBytes long, shuffleWriteRecords long, runtimesec long)
COMMENT 'this tool returns spark history server stage metrics for a spark applications, useful for tuning recommendations'
RETURN

WITH payload AS (
  -- Parse the raw response and cast it to a typed array of stage records.
  SELECT
    try_parse_json(emr_listshsstagesraw(clusterid))::array<struct<stageId:string, attemptId:string, name:string, description:string, submissionTime:string, completionTime:string, status:string, numTasks:double, numCompletedTasks:double, numCompleteTasks:double, numSkippedTasks:double, numFailedTasks:double, numCompletedStages:double, numSkippedStages:double, numFailedStages:double, memoryBytesSpilled:long, diskBytesSpilled:long, inputBytes:long, inputRecords:long, outputBytes:long, outputRecords:long, shuffleReadBytes:long, shuffleReadRecords:long, shuffleWriteBytes:long, shuffleWriteRecords:long >> AS stagemetrics
),

stagerows AS (
  -- One row per stage attempt.
  SELECT explode(stagemetrics) AS stagerow
  FROM payload
)

-- Explicit column list: the order must match RETURNS TABLE, and the helper
-- field numCompleteTasks must not leak into the output.
SELECT
  stagerow.stageId,
  stagerow.attemptId,
  stagerow.name,
  stagerow.description,
  stagerow.submissionTime,
  stagerow.completionTime,
  stagerow.status,
  stagerow.numTasks,
  coalesce(stagerow.numCompletedTasks, stagerow.numCompleteTasks) AS numCompletedTasks,
  stagerow.numSkippedTasks,
  stagerow.numFailedTasks,
  stagerow.numCompletedStages,
  stagerow.numSkippedStages,
  stagerow.numFailedStages,
  stagerow.memoryBytesSpilled,
  stagerow.diskBytesSpilled,
  stagerow.inputBytes,
  stagerow.inputRecords,
  stagerow.outputBytes,
  stagerow.outputRecords,
  stagerow.shuffleReadBytes,
  stagerow.shuffleReadRecords,
  stagerow.shuffleWriteBytes,
  stagerow.shuffleWriteRecords,
  timestampdiff(
    SECOND,
    to_timestamp(stagerow.submissionTime),
    to_timestamp(stagerow.completionTime)
  ) AS runtimesec
FROM stagerows
WHERE array_contains(stageidsarr, stagerow.stageId)

In [0]:
%sql
-- emr_taskmetrics: issues a GET through the 'shsjobs' connection to the
-- .../stages/<stageid>/<attemptid>/taskSummary endpoint and returns the
-- response body text unparsed.
--   clusterid : used in the path and passed to getsparkcontext / getappid
--   stageid   : stage whose task summary is requested
--   attemptid : stage attempt to query; defaults to 0, which was previously
--               hard-coded in the path, so existing two-argument calls are
--               unchanged. An explicit NULL is treated as 0.
-- The request authenticates with a cookie read from the 'shscreds' secret scope.
CREATE OR REPLACE FUNCTION emr_taskmetrics(
  clusterid string, stageid int, attemptid int DEFAULT 0
)
RETURNS STRING
COMMENT 'Calls shs to get tasks list raw'
RETURN (
  http_request(
    conn => 'shsjobs',
    method => 'GET',
    path => format_string(
      'sparkui/%s/driver-%s/api/v1/applications/%s/stages/%s/%s/taskSummary',
      clusterid,
      getsparkcontext(clusterid):spark_context_id,
      getappid(clusterid),
      stageid,
      coalesce(attemptid, 0)
    ),
    headers => map(
      'Cookie', format_string('DATAPLANE_DOMAIN_DBAUTH=%s', secret('shscreds', 'cookies'))
    )
  )
).text

In [0]:
%sql
-- Approach 2: define the Photon logic up front, execute it inside the UDF, and
-- have the LLM return the final value. This is only meant to be directionally
-- accurate: SHS does not expose expressions, and the result is not a weighted
-- average because every operator is treated essentially the same.
--
-- emr_photonmetrics: returns, for the SQL executions reported by
-- emr_listshssqlraw(clusterid), the fraction of plan nodes whose nodeName is
-- NOT in the exclusion list below (listed operators score 0, all others 1).
-- The result is NULL when there are no plan nodes to score.
CREATE OR REPLACE FUNCTION emr_photonmetrics(
  clusterid string
)

RETURNS DOUBLE
COMMENT 'Analyzes spark history server sql metrics to derive estimate of how much of spark job would benefit from photon'
RETURN

WITH payload AS (
  -- Parse the raw response and cast it to a typed array of SQL executions.
  SELECT
    try_parse_json(emr_listshssqlraw(clusterid))::array<struct<id:long, status:string, description:string, planDescription:string, submissionTime:string, duration:long, successJobIds:string, failedJobIds:string, nodes: array<struct<nodeId: INT, nodeName: STRING, metrics: array<struct<name:STRING, value:STRING>>>> >> AS sqlmetrics
),

executions AS (
  -- One row per SQL execution.
  SELECT explode(sqlmetrics) AS sqlexec
  FROM payload
),

operators AS (
  -- One row per plan node across all executions.
  SELECT explode(sqlexec.nodes) AS planoperator
  FROM executions
),

scored AS (
  -- 0 for operators on the exclusion list, 1 for everything else.
  SELECT
    CASE
      WHEN planoperator.nodeName IN (
        'MapElements',
        'MapPartitions',
        'Scan csv',
        'Scan json',
        'PythonUDF',
        'ScalaUDF',
        'FlatMapGroupsInPandas',
        'DeserializeToObject',
        'SerializeFromObject'
      ) THEN 0
      ELSE 1
    END AS photonbinary
  FROM operators
)

SELECT try_divide(sum(photonbinary), count(*)) AS jobphotonperc
FROM scored
GROUP BY ALL