In [0]:
# Notebook parameters: the target catalog is mandatory, the schema is optional.
DEFAULT_SCHEMA_NAME = "spark_observability"

dbutils.widgets.text("catalog_name", "", "Catalog (required)")
# The widget default matches the Python fallback below so that SQL cells which
# read :schema_name resolve to the same schema this cell creates.
dbutils.widgets.text("schema_name", DEFAULT_SCHEMA_NAME, "Schema")
CATALOG_NAME = dbutils.widgets.get("catalog_name").strip()
SCHEMA_NAME = dbutils.widgets.get("schema_name").strip() or DEFAULT_SCHEMA_NAME

# UC Validation
if not CATALOG_NAME:
    raise ValueError("catalog widget must point to an existing catalog")

# Bind the names as parameters and resolve them with IDENTIFIER() instead of
# splicing widget text into the statement, so the input is only ever parsed as
# an identifier (the same way the following SQL cell treats :catalog_name).
spark.sql(
    "CREATE SCHEMA IF NOT EXISTS IDENTIFIER(:catalog_name || '.' || :schema_name)",
    args={"catalog_name": CATALOG_NAME, "schema_name": SCHEMA_NAME},
)

In [0]:
%sql
-- Make the widget-selected catalog current for the function definitions below.
USE CATALOG IDENTIFIER(:catalog_name);
-- An empty schema widget falls back to 'spark_observability', the same default
-- the setup cell applies when it creates the schema; without this fallback an
-- empty widget value would be passed to IDENTIFIER() as-is.
USE SCHEMA IDENTIFIER(
  CASE
    WHEN trim(:schema_name) = '' THEN 'spark_observability'
    ELSE trim(:schema_name)
  END
);

In [0]:
%sql
-- Issues a GET for the given cluster through the 'clusterapi' connection and
-- returns the text of the response. Other functions in this notebook read the
-- spark_context_id field out of that text.
CREATE OR REPLACE FUNCTION getsparkcontext(clusterid STRING)
  RETURNS STRING
  COMMENT 'Calls to db api to get spark context'
  RETURN (
    http_request(
      conn   => 'clusterapi',
      method => 'GET',
      path   => format_string("api/2.1/clusters/get?cluster_id=%s", clusterid)
    )
  ).text

In [0]:
%sql
-- Requests the applications listing for the cluster's driver through the
-- 'shsjobs' connection and returns the text of the response.
-- The driver segment of the path is built from the spark_context_id reported
-- by getsparkcontext(); the auth cookie value is read from the
-- 'shscreds'/'cookies' secret.
CREATE OR REPLACE FUNCTION listappsraw(clusterid STRING)
  RETURNS STRING
  COMMENT 'Calls shs to list applications submitted against spark cluster'
  RETURN (
    http_request(
      conn    => 'shsjobs',
      method  => 'GET',
      path    => format_string(
                   "sparkui/%s/driver-%s/api/v1/applications",
                   clusterid,
                   getsparkcontext(clusterid):spark_context_id
                 ),
      headers => map(
                   'Cookie',
                   format_string("DATAPLANE_DOMAIN_DBAUTH=%s", secret("shscreds", "cookies"))
                 )
    )
  ).text

In [0]:
%sql
-- Returns the id of the first application in the listing produced by
-- listappsraw(clusterid).
-- try_variant_get returns NULL instead of raising when the response is not
-- valid JSON, is not an array, or is an empty array, which keeps this function
-- consistent with the best-effort try_parse_json it wraps.
CREATE OR REPLACE FUNCTION getappid(
  clusterid string
)
RETURNS STRING
COMMENT 'Calls shs to get appid'
RETURN try_variant_get(try_parse_json(listappsraw(clusterid)), '$[0].id', 'string')

In [0]:
%sql
-- Requests the jobs listing of the cluster's application through the 'shsjobs'
-- connection and returns the text of the response.
-- Path pieces: cluster id, spark_context_id from getsparkcontext(), and the
-- application id from getappid(). The auth cookie value is read from the
-- 'shscreds'/'cookies' secret.
CREATE OR REPLACE FUNCTION listshsjobsraw(clusterid STRING)
  RETURNS STRING
  COMMENT 'Calls shs to get jobs list raw'
  RETURN (
    http_request(
      conn    => 'shsjobs',
      method  => 'GET',
      path    => format_string(
                   "sparkui/%s/driver-%s/api/v1/applications/%s/jobs",
                   clusterid,
                   getsparkcontext(clusterid):spark_context_id,
                   getappid(clusterid)
                 ),
      headers => map(
                   'Cookie',
                   format_string("DATAPLANE_DOMAIN_DBAUTH=%s", secret("shscreds", "cookies"))
                 )
    )
  ).text

In [0]:
%sql
-- Requests the stages listing of the cluster's application through the
-- 'shsjobs' connection and returns the text of the response.
-- Path pieces: cluster id, spark_context_id from getsparkcontext(), and the
-- application id from getappid(). The auth cookie value is read from the
-- 'shscreds'/'cookies' secret.
CREATE OR REPLACE FUNCTION listshsstagesraw(clusterid STRING)
  RETURNS STRING
  COMMENT 'Calls shs to get stages list raw'
  RETURN (
    http_request(
      conn    => 'shsjobs',
      method  => 'GET',
      path    => format_string(
                   "sparkui/%s/driver-%s/api/v1/applications/%s/stages",
                   clusterid,
                   getsparkcontext(clusterid):spark_context_id,
                   getappid(clusterid)
                 ),
      headers => map(
                   'Cookie',
                   format_string("DATAPLANE_DOMAIN_DBAUTH=%s", secret("shscreds", "cookies"))
                 )
    )
  ).text

In [0]:
%sql
-- Requests the SQL executions listing of the cluster's application through the
-- 'shsjobs' connection and returns the text of the response.
-- Path pieces: cluster id, spark_context_id from getsparkcontext(), and the
-- application id from getappid(). The auth cookie value is read from the
-- 'shscreds'/'cookies' secret.
CREATE OR REPLACE FUNCTION listshssqlraw(clusterid STRING)
  RETURNS STRING
  COMMENT 'Calls shs to get sql list raw'
  RETURN (
    http_request(
      conn    => 'shsjobs',
      method  => 'GET',
      path    => format_string(
                   "sparkui/%s/driver-%s/api/v1/applications/%s/sql",
                   clusterid,
                   getsparkcontext(clusterid):spark_context_id,
                   getappid(clusterid)
                 ),
      headers => map(
                   'Cookie',
                   format_string("DATAPLANE_DOMAIN_DBAUTH=%s", secret("shscreds", "cookies"))
                 )
    )
  ).text

In [0]:
%sql
-- Requests the allexecutors listing of the cluster's application through the
-- 'shsjobs' connection and returns the text of the response.
-- Path pieces: cluster id, spark_context_id from getsparkcontext(), and the
-- application id from getappid(). The auth cookie value is read from the
-- 'shscreds'/'cookies' secret.
CREATE OR REPLACE FUNCTION listshsexecutorsraw(clusterid STRING)
  RETURNS STRING
  COMMENT 'Calls shs to get executors list raw'
  RETURN (
    http_request(
      conn    => 'shsjobs',
      method  => 'GET',
      path    => format_string(
                   "sparkui/%s/driver-%s/api/v1/applications/%s/allexecutors",
                   clusterid,
                   getsparkcontext(clusterid):spark_context_id,
                   getappid(clusterid)
                 ),
      headers => map(
                   'Cookie',
                   format_string("DATAPLANE_DOMAIN_DBAUTH=%s", secret("shscreds", "cookies"))
                 )
    )
  ).text

In [0]:
%sql
-- Requests the environment resource of the cluster's application through the
-- 'shsjobs' connection and returns the text of the response.
-- Path pieces: cluster id, spark_context_id from getsparkcontext(), and the
-- application id from getappid(). The auth cookie value is read from the
-- 'shscreds'/'cookies' secret.
CREATE OR REPLACE FUNCTION listshsenvraw(clusterid STRING)
  RETURNS STRING
  COMMENT 'Calls shs to get environment raw'
  RETURN (
    http_request(
      conn    => 'shsjobs',
      method  => 'GET',
      path    => format_string(
                   "sparkui/%s/driver-%s/api/v1/applications/%s/environment",
                   clusterid,
                   getsparkcontext(clusterid):spark_context_id,
                   getappid(clusterid)
                 ),
      headers => map(
                   'Cookie',
                   format_string("DATAPLANE_DOMAIN_DBAUTH=%s", secret("shscreds", "cookies"))
                 )
    )
  ).text

In [0]:
%sql
-- Requests the taskSummary resource of one stage through the 'shsjobs'
-- connection and returns the text of the response.
-- The attempt segment of the path is fixed to 0, so only that attempt of the
-- stage is queried.
-- Path pieces: cluster id, spark_context_id from getsparkcontext(), the
-- application id from getappid(), and the caller-supplied stage id. The auth
-- cookie value is read from the 'shscreds'/'cookies' secret.
CREATE OR REPLACE FUNCTION listshstasksraw(clusterid STRING, stageid INT)
  RETURNS STRING
  COMMENT 'Calls shs to get tasks list raw'
  RETURN (
    http_request(
      conn    => 'shsjobs',
      method  => 'GET',
      path    => format_string(
                   "sparkui/%s/driver-%s/api/v1/applications/%s/stages/%s/0/taskSummary",
                   clusterid,
                   getsparkcontext(clusterid):spark_context_id,
                   getappid(clusterid),
                   stageid
                 ),
      headers => map(
                   'Cookie',
                   format_string("DATAPLANE_DOMAIN_DBAUTH=%s", secret("shscreds", "cookies"))
                 )
    )
  ).text

In [0]:
%sql
-- List-jobs MCP tool.
-- Parses the jobs listing returned by listshsjobsraw(clusterid) into one row
-- per job and adds runtimesec: the number of seconds between submissionTime
-- and completionTime. Rows are ordered by runtimesec, largest first.
CREATE OR REPLACE FUNCTION getslowestjobs(clusterid STRING)
RETURNS TABLE (
  jobId               STRING,
  name                STRING,
  description         STRING,
  submissionTime      STRING,
  completionTime      STRING,
  stageIds            STRING,
  status              STRING,
  numTasks            DOUBLE,
  numCompletedTasks   DOUBLE,
  numSkippedTasks     DOUBLE,
  numFailedTasks      DOUBLE,
  numCompletedStages  DOUBLE,
  numSkippedStages    DOUBLE,
  numFailedStages     DOUBLE,
  runtimesec          BIGINT
)
COMMENT 'Calls shs to get slowestjobs'
RETURN
WITH parsed AS (
  -- Single row holding the whole listing as a typed array of job structs.
  SELECT
    try_parse_json(listshsjobsraw(clusterid))::ARRAY<STRUCT<
      jobId: STRING,
      name: STRING,
      description: STRING,
      submissionTime: STRING,
      completionTime: STRING,
      stageIds: STRING,
      status: STRING,
      numTasks: DOUBLE,
      numCompletedTasks: DOUBLE,
      numSkippedTasks: DOUBLE,
      numFailedTasks: DOUBLE,
      numCompletedStages: DOUBLE,
      numSkippedStages: DOUBLE,
      numFailedStages: DOUBLE
    >> AS jobs
),
exploded AS (
  -- One row per job.
  SELECT explode(jobs) AS job
  FROM parsed
)
SELECT
  job.*,
  timestampdiff(
    SECOND,
    to_timestamp(job.submissionTime),
    to_timestamp(job.completionTime)
  ) AS runtimesec
FROM exploded
ORDER BY runtimesec DESC

In [0]:
%sql
-- Parses the stages listing returned by listshsstagesraw(clusterid) into one
-- row per stage entry and adds runtimesec: the number of seconds between
-- submissionTime and completionTime. Rows are ordered by runtimesec, largest
-- first.
-- NOTE(review): the struct asks for the same counters as the jobs function
-- (numCompletedTasks, numSkippedTasks, num*Stages); confirm the stages
-- endpoint actually returns fields with these names.
CREATE OR REPLACE FUNCTION getsloweststages(clusterid STRING)
RETURNS TABLE (
  stageId              STRING,
  attemptId            STRING,
  name                 STRING,
  description          STRING,
  submissionTime       STRING,
  completionTime       STRING,
  status               STRING,
  numTasks             DOUBLE,
  numCompletedTasks    DOUBLE,
  numSkippedTasks      DOUBLE,
  numFailedTasks       DOUBLE,
  numCompletedStages   DOUBLE,
  numSkippedStages     DOUBLE,
  numFailedStages      DOUBLE,
  memoryBytesSpilled   BIGINT,
  diskBytesSpilled     BIGINT,
  inputBytes           BIGINT,
  inputRecords         BIGINT,
  outputBytes          BIGINT,
  outputRecords        BIGINT,
  shuffleReadBytes     BIGINT,
  shuffleReadRecords   BIGINT,
  shuffleWriteBytes    BIGINT,
  shuffleWriteRecords  BIGINT,
  runtimesec           BIGINT
)
COMMENT 'Calls shs to get slowest stages'
RETURN
WITH parsed AS (
  -- Single row holding the whole listing as a typed array of stage structs.
  SELECT
    try_parse_json(listshsstagesraw(clusterid))::ARRAY<STRUCT<
      stageId: STRING,
      attemptId: STRING,
      name: STRING,
      description: STRING,
      submissionTime: STRING,
      completionTime: STRING,
      status: STRING,
      numTasks: DOUBLE,
      numCompletedTasks: DOUBLE,
      numSkippedTasks: DOUBLE,
      numFailedTasks: DOUBLE,
      numCompletedStages: DOUBLE,
      numSkippedStages: DOUBLE,
      numFailedStages: DOUBLE,
      memoryBytesSpilled: BIGINT,
      diskBytesSpilled: BIGINT,
      inputBytes: BIGINT,
      inputRecords: BIGINT,
      outputBytes: BIGINT,
      outputRecords: BIGINT,
      shuffleReadBytes: BIGINT,
      shuffleReadRecords: BIGINT,
      shuffleWriteBytes: BIGINT,
      shuffleWriteRecords: BIGINT
    >> AS stages
),
exploded AS (
  -- One row per stage entry.
  SELECT explode(stages) AS stage
  FROM parsed
)
SELECT
  stage.*,
  timestampdiff(
    SECOND,
    to_timestamp(stage.submissionTime),
    to_timestamp(stage.completionTime)
  ) AS runtimesec
FROM exploded
ORDER BY runtimesec DESC

In [0]:
%sql
-- Parses the SQL executions listing returned by listshssqlraw(clusterid) into
-- one row per execution, including its plan nodes and their metrics, ordered
-- by the reported duration, largest first.
CREATE OR REPLACE FUNCTION getslowestsql(clusterid STRING)
RETURNS TABLE (
  id               BIGINT,
  status           STRING,
  description      STRING,
  planDescription  STRING,
  submissionTime   STRING,
  duration         BIGINT,
  successJobIds    STRING,
  failedJobIds     STRING,
  nodes            ARRAY<STRUCT<
                     nodeId: INT,
                     nodeName: STRING,
                     metrics: ARRAY<STRUCT<name: STRING, value: STRING>>
                   >>
)
COMMENT 'Calls shs to get slowest sql queries'
RETURN
WITH parsed AS (
  -- Single row holding the whole listing as a typed array of execution structs.
  SELECT
    try_parse_json(listshssqlraw(clusterid))::ARRAY<STRUCT<
      id: BIGINT,
      status: STRING,
      description: STRING,
      planDescription: STRING,
      submissionTime: STRING,
      duration: BIGINT,
      successJobIds: STRING,
      failedJobIds: STRING,
      nodes: ARRAY<STRUCT<
        nodeId: INT,
        nodeName: STRING,
        metrics: ARRAY<STRUCT<name: STRING, value: STRING>>
      >>
    >> AS executions
),
exploded AS (
  -- One row per SQL execution.
  SELECT explode(executions) AS sql_exec
  FROM parsed
)
SELECT sql_exec.*
FROM exploded
ORDER BY sql_exec.duration DESC

In [0]:
%sql
-- Returns the entries of the stages listing (listshsstagesraw) whose stageId
-- equals the requested stage id, with runtimesec added: the number of seconds
-- between submissionTime and completionTime.
-- The full listing is fetched and then filtered; every entry carrying the
-- requested stageId is returned.
-- NOTE(review): the struct asks for the same counters as the jobs function
-- (numCompletedTasks, numSkippedTasks, num*Stages); confirm the stages
-- endpoint actually returns fields with these names.
CREATE OR REPLACE FUNCTION getstage(clusterid STRING, stageid INT)
RETURNS TABLE (
  stageId              STRING,
  attemptId            STRING,
  name                 STRING,
  description          STRING,
  submissionTime       STRING,
  completionTime       STRING,
  status               STRING,
  numTasks             DOUBLE,
  numCompletedTasks    DOUBLE,
  numSkippedTasks      DOUBLE,
  numFailedTasks       DOUBLE,
  numCompletedStages   DOUBLE,
  numSkippedStages     DOUBLE,
  numFailedStages      DOUBLE,
  memoryBytesSpilled   BIGINT,
  diskBytesSpilled     BIGINT,
  inputBytes           BIGINT,
  inputRecords         BIGINT,
  outputBytes          BIGINT,
  outputRecords        BIGINT,
  shuffleReadBytes     BIGINT,
  shuffleReadRecords   BIGINT,
  shuffleWriteBytes    BIGINT,
  shuffleWriteRecords  BIGINT,
  runtimesec           BIGINT
)
COMMENT 'Calls shs to get a specific stage'
RETURN
WITH parsed AS (
  -- Single row holding the whole listing as a typed array of stage structs.
  SELECT
    try_parse_json(listshsstagesraw(clusterid))::ARRAY<STRUCT<
      stageId: STRING,
      attemptId: STRING,
      name: STRING,
      description: STRING,
      submissionTime: STRING,
      completionTime: STRING,
      status: STRING,
      numTasks: DOUBLE,
      numCompletedTasks: DOUBLE,
      numSkippedTasks: DOUBLE,
      numFailedTasks: DOUBLE,
      numCompletedStages: DOUBLE,
      numSkippedStages: DOUBLE,
      numFailedStages: DOUBLE,
      memoryBytesSpilled: BIGINT,
      diskBytesSpilled: BIGINT,
      inputBytes: BIGINT,
      inputRecords: BIGINT,
      outputBytes: BIGINT,
      outputRecords: BIGINT,
      shuffleReadBytes: BIGINT,
      shuffleReadRecords: BIGINT,
      shuffleWriteBytes: BIGINT,
      shuffleWriteRecords: BIGINT
    >> AS stages
),
exploded AS (
  -- One row per stage entry.
  SELECT explode(stages) AS stage
  FROM parsed
)
SELECT
  stage.*,
  timestampdiff(
    SECOND,
    to_timestamp(stage.submissionTime),
    to_timestamp(stage.completionTime)
  ) AS runtimesec
FROM exploded
WHERE stage.stageId = stageid

In [0]:
%sql
-- Returns the entries of the executors listing (listshsexecutorsraw) whose id
-- matches the requested executor id, with uptime added: the number of seconds
-- between addTime and removeTime.
CREATE OR REPLACE FUNCTION getexecutor(
  clusterid string, executorid int
)

RETURNS TABLE (id string, memoryUsed double, diskUsed double, totalCores double, addTime string, removeTime string, maxTasks double, completedTasks double, totalTasks double, totalDuration double, totalGCTime double, totalInputBytes long, totalShuffleRead long, totalShuffleWrite long, maxMemory long, uptime long)
COMMENT 'Calls shs to get a specific executor'
RETURN

with raw as (
  -- Single row holding the whole listing as a typed array of executor structs.
  select try_parse_json(listshsexecutorsraw(clusterid))::array<struct<id:string, memoryUsed:double, diskUsed:double, totalCores:double, addTime:string, removeTime:string, maxTasks:double, completedTasks:double, totalTasks:double, totalDuration:double, totalGCTime:double, totalInputBytes:long, totalShuffleRead:long, totalShuffleWrite:long, maxMemory:long >> as execmetrics),

explode as (
  -- One row per executor entry.
  select explode(execmetrics) as execmetricsexp
  from raw
)

select execmetricsexp.*,
timestampdiff(second, to_timestamp(execmetricsexp.addTime), to_timestamp(execmetricsexp.removeTime)) as uptime
from explode
-- Compare as strings: id is a STRING and is not guaranteed to be numeric, so
-- letting the engine cast it to a number for the comparison can raise a cast
-- error instead of simply not matching.
where execmetricsexp.id = cast(executorid as string)