In [0]:
%sql
-- Aggregated inefficiency data across all Spark jobs.
-- Thin table-function wrapper over prodrs.spark_observability.ineffjobagg.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.ineffjobs()
RETURNS TABLE (
  cluster_id    STRING,
  ineffstageagg DOUBLE
)
COMMENT 'Returns aggregated data regarding inefficient spark jobs'
RETURN
  SELECT
    agg.cluster_id,
    agg.ineffstageagg
  FROM prodrs.spark_observability.ineffjobagg AS agg;

In [0]:
%sql
-- Raw (un-aggregated) inefficiency data across all Spark jobs.
-- Only rows of prodrs.spark_observability.ineffjobraw with goldcheckfilter = 1 are returned.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.ineffjobsraw()
RETURNS TABLE (
  cluster_id           STRING,
  stageid              INT,
  skewbinary           INT,
  spillbinary          INT,
  diskspillbinary      INT,
  chunkbinary          INT,
  prbinary             INT,
  task_completion_rate DOUBLE
)
COMMENT 'Returns raw data regarding inefficient spark jobs'
RETURN
  SELECT
    src.cluster_id,
    src.stage_id,  -- surfaced to callers as stageid (second column of RETURNS TABLE)
    src.skewbinary,
    src.spillbinary,
    src.diskspillbinary,
    src.chunkbinary,
    src.prbinary,
    src.task_completion_rate
  FROM prodrs.spark_observability.ineffjobraw AS src
  WHERE src.goldcheckfilter = 1

In [0]:
%sql
-- Photon analysis for EMR.
-- Returns the clusters whose jobphotonperc is above 0.8. No column list is given
-- after RETURNS TABLE, so the result schema is derived from the SELECT below.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.emrphotonanalysis()
RETURNS TABLE
COMMENT 'Returns analysis regarding which spark jobs are most likely to benefit from photon'
RETURN
  SELECT
    pa.cluster_name,
    pa.jobphotonperc
  FROM prodrs.spark_observability.photonanalysis AS pa
  WHERE pa.jobphotonperc > 0.8

In [0]:
%sql
-- Calls api/2.1/clusters/get for the given cluster through the 'clusterapi'
-- connection and returns the response body (the .text field) as a string.
-- Other functions in this schema read spark_context_id out of this payload.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.getsparkcontext(
  clusterid STRING
)
RETURNS STRING
COMMENT 'Calls to db api to get spark context'
RETURN (
  http_request(
    conn   => 'clusterapi',
    method => 'GET',
    path   => format_string('api/2.1/clusters/get?cluster_id=%s', clusterid)
  )
).text

In [0]:
%sql
-- Lists the applications exposed by the Spark UI of a cluster and returns the
-- raw response body.
--   clusterid : cluster whose Spark UI is queried.
-- The driver path segment is built from the spark_context_id field of the
-- getsparkcontext() payload; the auth cookie value comes from secret('shscreds', 'cookies').
CREATE OR REPLACE FUNCTION prodrs.spark_observability.listappsraw(
  clusterid STRING
)
RETURNS STRING
COMMENT 'Calls shs to list applications submitted against spark cluster'
RETURN (
  http_request(
    conn    => 'shsjobs',
    method  => 'GET',
    path    => format_string(
                 'sparkui/%s/driver-%s/api/v1/applications',
                 clusterid,
                 prodrs.spark_observability.getsparkcontext(clusterid):spark_context_id
               ),
    headers => map(
                 'Cookie',
                 format_string('DATAPLANE_DOMAIN_DBAUTH=%s', secret('shscreds', 'cookies'))
               )
  )
).text

In [0]:
%sql
-- Returns the id of the first application in the listappsraw() response.
-- The response is parsed with try_parse_json, so a body that is not valid JSON
-- yields NULL instead of an error.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.getappid(
  clusterid STRING
)
RETURNS STRING
COMMENT 'Calls shs to get appid'
RETURN
  CAST(
    try_parse_json(prodrs.spark_observability.listappsraw(clusterid))
    AS ARRAY<STRUCT<id: STRING>>
  )[0].id

In [0]:
%sql
-- Fetches the jobs list of a cluster's application and returns the raw response body.
--   clusterid : cluster whose Spark UI is queried.
-- The path is assembled from the cluster id, the spark_context_id field of the
-- getsparkcontext() payload and the application id from getappid().
CREATE OR REPLACE FUNCTION prodrs.spark_observability.listshsjobsraw(
  clusterid STRING
)
RETURNS STRING
COMMENT 'Calls shs to get jobs list raw'
RETURN (
  http_request(
    conn    => 'shsjobs',
    method  => 'GET',
    path    => format_string(
                 'sparkui/%s/driver-%s/api/v1/applications/%s/jobs',
                 clusterid,
                 prodrs.spark_observability.getsparkcontext(clusterid):spark_context_id,
                 prodrs.spark_observability.getappid(clusterid)
               ),
    headers => map(
                 'Cookie',
                 format_string('DATAPLANE_DOMAIN_DBAUTH=%s', secret('shscreds', 'cookies'))
               )
  )
).text

In [0]:
%sql
-- Fetches the stages list of a cluster's application and returns the raw response body.
--   clusterid : cluster whose Spark UI is queried.
-- The path is assembled from the cluster id, the spark_context_id field of the
-- getsparkcontext() payload and the application id from getappid().
CREATE OR REPLACE FUNCTION prodrs.spark_observability.listshsstagesraw(
  clusterid STRING
)
RETURNS STRING
COMMENT 'Calls shs to get stages list raw'
RETURN (
  http_request(
    conn    => 'shsjobs',
    method  => 'GET',
    path    => format_string(
                 'sparkui/%s/driver-%s/api/v1/applications/%s/stages',
                 clusterid,
                 prodrs.spark_observability.getsparkcontext(clusterid):spark_context_id,
                 prodrs.spark_observability.getappid(clusterid)
               ),
    headers => map(
                 'Cookie',
                 format_string('DATAPLANE_DOMAIN_DBAUTH=%s', secret('shscreds', 'cookies'))
               )
  )
).text

In [0]:
%sql
-- Fetches the SQL executions list of a cluster's application and returns the raw response body.
--   clusterid : cluster whose Spark UI is queried.
-- The path is assembled from the cluster id, the spark_context_id field of the
-- getsparkcontext() payload and the application id from getappid().
CREATE OR REPLACE FUNCTION prodrs.spark_observability.listshssqlraw(
  clusterid STRING
)
RETURNS STRING
COMMENT 'Calls shs to get sql list raw'
RETURN (
  http_request(
    conn    => 'shsjobs',
    method  => 'GET',
    path    => format_string(
                 'sparkui/%s/driver-%s/api/v1/applications/%s/sql',
                 clusterid,
                 prodrs.spark_observability.getsparkcontext(clusterid):spark_context_id,
                 prodrs.spark_observability.getappid(clusterid)
               ),
    headers => map(
                 'Cookie',
                 format_string('DATAPLANE_DOMAIN_DBAUTH=%s', secret('shscreds', 'cookies'))
               )
  )
).text

In [0]:
%sql
-- Fetches the allexecutors list of a cluster's application and returns the raw response body.
--   clusterid : cluster whose Spark UI is queried.
-- The path is assembled from the cluster id, the spark_context_id field of the
-- getsparkcontext() payload and the application id from getappid().
CREATE OR REPLACE FUNCTION prodrs.spark_observability.listshsexecutorsraw(
  clusterid STRING
)
RETURNS STRING
COMMENT 'Calls shs to get executors list raw'
RETURN (
  http_request(
    conn    => 'shsjobs',
    method  => 'GET',
    path    => format_string(
                 'sparkui/%s/driver-%s/api/v1/applications/%s/allexecutors',
                 clusterid,
                 prodrs.spark_observability.getsparkcontext(clusterid):spark_context_id,
                 prodrs.spark_observability.getappid(clusterid)
               ),
    headers => map(
                 'Cookie',
                 format_string('DATAPLANE_DOMAIN_DBAUTH=%s', secret('shscreds', 'cookies'))
               )
  )
).text

In [0]:
%sql
-- Fetches the environment of a cluster's application and returns the raw response body.
--   clusterid : cluster whose Spark UI is queried.
-- The path is assembled from the cluster id, the spark_context_id field of the
-- getsparkcontext() payload and the application id from getappid().
CREATE OR REPLACE FUNCTION prodrs.spark_observability.listshsenvraw(
  clusterid STRING
)
RETURNS STRING
COMMENT 'Calls shs to get environment raw'
RETURN (
  http_request(
    conn    => 'shsjobs',
    method  => 'GET',
    path    => format_string(
                 'sparkui/%s/driver-%s/api/v1/applications/%s/environment',
                 clusterid,
                 prodrs.spark_observability.getsparkcontext(clusterid):spark_context_id,
                 prodrs.spark_observability.getappid(clusterid)
               ),
    headers => map(
                 'Cookie',
                 format_string('DATAPLANE_DOMAIN_DBAUTH=%s', secret('shscreds', 'cookies'))
               )
  )
).text

In [0]:
%sql
-- Fetches the taskSummary of one stage of a cluster's application and returns
-- the raw response body.
--   clusterid : cluster whose Spark UI is queried.
--   stageid   : stage whose task summary is requested.
-- NOTE(review): the literal 0 path segment after the stage id is presumably the
-- stage attempt id, i.e. only the first attempt is ever queried -- confirm.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.listshstasksraw(
  clusterid STRING,
  stageid   INT
)
RETURNS STRING
COMMENT 'Calls shs to get tasks list raw'
RETURN (
  http_request(
    conn    => 'shsjobs',
    method  => 'GET',
    path    => format_string(
                 'sparkui/%s/driver-%s/api/v1/applications/%s/stages/%s/0/taskSummary',
                 clusterid,
                 prodrs.spark_observability.getsparkcontext(clusterid):spark_context_id,
                 prodrs.spark_observability.getappid(clusterid),
                 stageid
               ),
    headers => map(
                 'Cookie',
                 format_string('DATAPLANE_DOMAIN_DBAUTH=%s', secret('shscreds', 'cookies'))
               )
  )
).text

In [0]:
%sql
-- MCP tool: list the jobs of a cluster's application, longest-running first.
-- Parses the listshsjobsraw() payload into typed rows and adds runtimesec, the
-- number of seconds between submissionTime and completionTime.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.getslowestjobs(
  clusterid STRING
)
RETURNS TABLE (
  jobId              STRING,
  name               STRING,
  description        STRING,
  submissionTime     STRING,
  completionTime     STRING,
  stageIds           STRING,
  status             STRING,
  numTasks           DOUBLE,
  numCompletedTasks  DOUBLE,
  numSkippedTasks    DOUBLE,
  numFailedTasks     DOUBLE,
  numCompletedStages DOUBLE,
  numSkippedStages   DOUBLE,
  numFailedStages    DOUBLE,
  runtimesec         BIGINT
)
COMMENT 'Calls shs to get slowestjobs'
RETURN
  WITH parsed AS (
    -- try_parse_json yields NULL (not an error) when the body is not valid JSON.
    SELECT
      CAST(
        try_parse_json(prodrs.spark_observability.listshsjobsraw(clusterid))
        AS ARRAY<STRUCT<
          jobId: STRING,
          name: STRING,
          description: STRING,
          submissionTime: STRING,
          completionTime: STRING,
          stageIds: STRING,
          status: STRING,
          numTasks: DOUBLE,
          numCompletedTasks: DOUBLE,
          numSkippedTasks: DOUBLE,
          numFailedTasks: DOUBLE,
          numCompletedStages: DOUBLE,
          numSkippedStages: DOUBLE,
          numFailedStages: DOUBLE
        >>
      ) AS jobmetrics
  ),
  job_rows AS (
    -- One row per array element.
    SELECT explode(jobmetrics) AS job_row
    FROM parsed
  )
  SELECT
    job_row.*,
    timestampdiff(
      SECOND,
      to_timestamp(job_row.submissionTime),
      to_timestamp(job_row.completionTime)
    ) AS runtimesec
  FROM job_rows
  ORDER BY runtimesec DESC

In [0]:
%sql
-- MCP tool: list the stages of a cluster's application, longest-running first.
-- Parses the listshsstagesraw() payload into typed rows and adds runtimesec, the
-- number of seconds between submissionTime and completionTime.
-- NOTE(review): the struct reuses the same counter names as getslowestjobs
-- (numCompletedTasks, numSkippedTasks, num*Stages); confirm the stages payload
-- really carries these keys -- if not, those columns presumably come back NULL.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.getsloweststages(
  clusterid STRING
)
RETURNS TABLE (
  stageId             STRING,
  attemptId           STRING,
  name                STRING,
  description         STRING,
  submissionTime      STRING,
  completionTime      STRING,
  status              STRING,
  numTasks            DOUBLE,
  numCompletedTasks   DOUBLE,
  numSkippedTasks     DOUBLE,
  numFailedTasks      DOUBLE,
  numCompletedStages  DOUBLE,
  numSkippedStages    DOUBLE,
  numFailedStages     DOUBLE,
  memoryBytesSpilled  BIGINT,
  diskBytesSpilled    BIGINT,
  inputBytes          BIGINT,
  inputRecords        BIGINT,
  outputBytes         BIGINT,
  outputRecords       BIGINT,
  shuffleReadBytes    BIGINT,
  shuffleReadRecords  BIGINT,
  shuffleWriteBytes   BIGINT,
  shuffleWriteRecords BIGINT,
  runtimesec          BIGINT
)
COMMENT 'Calls shs to get slowest stages'
RETURN
  WITH parsed AS (
    -- try_parse_json yields NULL (not an error) when the body is not valid JSON.
    SELECT
      CAST(
        try_parse_json(prodrs.spark_observability.listshsstagesraw(clusterid))
        AS ARRAY<STRUCT<
          stageId: STRING,
          attemptId: STRING,
          name: STRING,
          description: STRING,
          submissionTime: STRING,
          completionTime: STRING,
          status: STRING,
          numTasks: DOUBLE,
          numCompletedTasks: DOUBLE,
          numSkippedTasks: DOUBLE,
          numFailedTasks: DOUBLE,
          numCompletedStages: DOUBLE,
          numSkippedStages: DOUBLE,
          numFailedStages: DOUBLE,
          memoryBytesSpilled: BIGINT,
          diskBytesSpilled: BIGINT,
          inputBytes: BIGINT,
          inputRecords: BIGINT,
          outputBytes: BIGINT,
          outputRecords: BIGINT,
          shuffleReadBytes: BIGINT,
          shuffleReadRecords: BIGINT,
          shuffleWriteBytes: BIGINT,
          shuffleWriteRecords: BIGINT
        >>
      ) AS stagemetrics
  ),
  stage_rows AS (
    -- One row per array element.
    SELECT explode(stagemetrics) AS stage_row
    FROM parsed
  )
  SELECT
    stage_row.*,
    timestampdiff(
      SECOND,
      to_timestamp(stage_row.submissionTime),
      to_timestamp(stage_row.completionTime)
    ) AS runtimesec
  FROM stage_rows
  ORDER BY runtimesec DESC

In [0]:
%sql
-- MCP tool: list the SQL executions of a cluster's application, ordered by
-- duration descending. Parses the listshssqlraw() payload into typed rows,
-- including the per-node metrics array.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.getslowestsql(
  clusterid STRING
)
RETURNS TABLE (
  id              BIGINT,
  status          STRING,
  description     STRING,
  planDescription STRING,
  submissionTime  STRING,
  duration        BIGINT,
  successJobIds   STRING,
  failedJobIds    STRING,
  nodes           ARRAY<STRUCT<
                    nodeId: INT,
                    nodeName: STRING,
                    metrics: ARRAY<STRUCT<name: STRING, value: STRING>>
                  >>
)
COMMENT 'Calls shs to get slowest sql queries'
RETURN
  WITH parsed AS (
    -- try_parse_json yields NULL (not an error) when the body is not valid JSON.
    SELECT
      CAST(
        try_parse_json(prodrs.spark_observability.listshssqlraw(clusterid))
        AS ARRAY<STRUCT<
          id: BIGINT,
          status: STRING,
          description: STRING,
          planDescription: STRING,
          submissionTime: STRING,
          duration: BIGINT,
          successJobIds: STRING,
          failedJobIds: STRING,
          nodes: ARRAY<STRUCT<
            nodeId: INT,
            nodeName: STRING,
            metrics: ARRAY<STRUCT<name: STRING, value: STRING>>
          >>
        >>
      ) AS sqlmetrics
  ),
  sql_rows AS (
    -- One row per array element.
    SELECT explode(sqlmetrics) AS sql_row
    FROM parsed
  )
  SELECT sql_row.*
  FROM sql_rows
  ORDER BY sql_row.duration DESC

In [0]:
%sql
-- MCP tool: return the row(s) of one stage of a cluster's application.
-- Parses the listshsstagesraw() payload into typed rows, adds runtimesec (seconds
-- between submissionTime and completionTime) and keeps only the rows whose
-- stageId equals the stageid argument.
-- NOTE(review): the struct reuses the same counter names as getslowestjobs
-- (numCompletedTasks, numSkippedTasks, num*Stages); confirm the stages payload
-- really carries these keys -- if not, those columns presumably come back NULL.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.getstage(
  clusterid STRING,
  stageid   INT
)
RETURNS TABLE (
  stageId             STRING,
  attemptId           STRING,
  name                STRING,
  description         STRING,
  submissionTime      STRING,
  completionTime      STRING,
  status              STRING,
  numTasks            DOUBLE,
  numCompletedTasks   DOUBLE,
  numSkippedTasks     DOUBLE,
  numFailedTasks      DOUBLE,
  numCompletedStages  DOUBLE,
  numSkippedStages    DOUBLE,
  numFailedStages     DOUBLE,
  memoryBytesSpilled  BIGINT,
  diskBytesSpilled    BIGINT,
  inputBytes          BIGINT,
  inputRecords        BIGINT,
  outputBytes         BIGINT,
  outputRecords       BIGINT,
  shuffleReadBytes    BIGINT,
  shuffleReadRecords  BIGINT,
  shuffleWriteBytes   BIGINT,
  shuffleWriteRecords BIGINT,
  runtimesec          BIGINT
)
COMMENT 'Calls shs to get a specific stage'
RETURN
  WITH parsed AS (
    -- try_parse_json yields NULL (not an error) when the body is not valid JSON.
    SELECT
      CAST(
        try_parse_json(prodrs.spark_observability.listshsstagesraw(clusterid))
        AS ARRAY<STRUCT<
          stageId: STRING,
          attemptId: STRING,
          name: STRING,
          description: STRING,
          submissionTime: STRING,
          completionTime: STRING,
          status: STRING,
          numTasks: DOUBLE,
          numCompletedTasks: DOUBLE,
          numSkippedTasks: DOUBLE,
          numFailedTasks: DOUBLE,
          numCompletedStages: DOUBLE,
          numSkippedStages: DOUBLE,
          numFailedStages: DOUBLE,
          memoryBytesSpilled: BIGINT,
          diskBytesSpilled: BIGINT,
          inputBytes: BIGINT,
          inputRecords: BIGINT,
          outputBytes: BIGINT,
          outputRecords: BIGINT,
          shuffleReadBytes: BIGINT,
          shuffleReadRecords: BIGINT,
          shuffleWriteBytes: BIGINT,
          shuffleWriteRecords: BIGINT
        >>
      ) AS stagemetrics
  ),
  stage_rows AS (
    -- One row per array element.
    SELECT explode(stagemetrics) AS stage_row
    FROM parsed
  )
  SELECT
    stage_row.*,
    timestampdiff(
      SECOND,
      to_timestamp(stage_row.submissionTime),
      to_timestamp(stage_row.completionTime)
    ) AS runtimesec
  FROM stage_rows
  -- The right-hand side is the function parameter; the struct field is always
  -- referenced through stage_row, so the two names cannot collide.
  WHERE stage_row.stageId = stageid

In [0]:
%sql
-- MCP tool: return the row(s) of one executor of a cluster's application.
--   clusterid  : cluster whose Spark UI is queried.
--   executorid : numeric executor id to look up.
-- Parses the listshsexecutorsraw() payload into typed rows, adds uptime (seconds
-- between addTime and removeTime; NULL when either timestamp is missing) and
-- keeps only the rows whose id matches executorid.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.getexecutor(
  clusterid  STRING,
  executorid INT
)
RETURNS TABLE (
  id                STRING,
  memoryUsed        DOUBLE,
  diskUsed          DOUBLE,
  totalCores        DOUBLE,
  addTime           STRING,
  removeTime        STRING,
  maxTasks          DOUBLE,
  completedTasks    DOUBLE,
  totalTasks        DOUBLE,
  totalDuration     DOUBLE,
  totalGCTime       DOUBLE,
  totalInputBytes   BIGINT,
  totalShuffleRead  BIGINT,
  totalShuffleWrite BIGINT,
  maxMemory         BIGINT,
  uptime            BIGINT
)
COMMENT 'Calls shs to get a specific executor'
RETURN
  WITH parsed AS (
    -- try_parse_json yields NULL (not an error) when the body is not valid JSON.
    SELECT
      CAST(
        try_parse_json(prodrs.spark_observability.listshsexecutorsraw(clusterid))
        AS ARRAY<STRUCT<
          id: STRING,
          memoryUsed: DOUBLE,
          diskUsed: DOUBLE,
          totalCores: DOUBLE,
          addTime: STRING,
          removeTime: STRING,
          maxTasks: DOUBLE,
          completedTasks: DOUBLE,
          totalTasks: DOUBLE,
          totalDuration: DOUBLE,
          totalGCTime: DOUBLE,
          totalInputBytes: BIGINT,
          totalShuffleRead: BIGINT,
          totalShuffleWrite: BIGINT,
          maxMemory: BIGINT
        >>
      ) AS execmetrics
  ),
  executor_rows AS (
    -- One row per array element.
    SELECT explode(execmetrics) AS exec_row
    FROM parsed
  )
  SELECT
    exec_row.*,
    timestampdiff(
      SECOND,
      to_timestamp(exec_row.addTime),
      to_timestamp(exec_row.removeTime)
    ) AS uptime
  FROM executor_rows
  -- Compare as strings. The id field is a STRING and the executor list also
  -- contains non-numeric ids (e.g. 'driver'); comparing it directly with the INT
  -- parameter makes the engine cast every id to a number, which raises a cast
  -- error under ANSI mode as soon as such a row is evaluated.
  WHERE exec_row.id = CAST(executorid AS STRING)

In [0]:
%sql
-- List the Spark jobs of a single EMR cluster through its persistent application UI.
--   emr_cluster_arn : ARN of the EMR cluster (arn:partition:service:region:account:resource).
-- Returns one JSON document (as STRING) per job of the first application reported
-- by the Spark History Server REST API; an empty array when no application is
-- listed; NULL when the ARN is NULL.
-- The element type is spelled out because a bare ARRAY is not a valid return type.
-- NOTE(review): assumes boto3 and requests are importable inside the UDF sandbox
-- and that AWS credentials resolve through boto3's default chain -- confirm.
CREATE OR REPLACE FUNCTION prodrs.spark_observability.list_jobs_emr(emr_cluster_arn STRING)
RETURNS ARRAY<STRING>
LANGUAGE PYTHON
COMMENT 'Lists spark jobs (one JSON document per job) for an EMR cluster via its persistent app UI'
AS $$
import json
from urllib.parse import urlparse

import boto3
import requests

REQUEST_TIMEOUT_SEC = 30

if emr_cluster_arn is None:
    return None

# ARN layout: arn:partition:service:region:account-id:resource -> index 3 is the region.
arn_parts = emr_cluster_arn.split(":")
if len(arn_parts) < 6 or not arn_parts[3]:
    raise ValueError(f"Not a valid EMR cluster ARN: {emr_cluster_arn!r}")
region = arn_parts[3]

emr_client = boto3.client("emr", region_name=region)

# Create (or re-attach to) the persistent application UI of the cluster.
created_ui = emr_client.create_persistent_app_ui(TargetResourceArn=emr_cluster_arn)
persistent_ui_id = created_ui.get("PersistentAppUIId")
if not persistent_ui_id:
    raise RuntimeError("create_persistent_app_ui returned no PersistentAppUIId")

# "SHS" selects the Spark History Server flavour of the persistent UI.
presigned = emr_client.get_persistent_app_ui_presigned_url(
    PersistentAppUIId=persistent_ui_id,
    PersistentAppUIType="SHS",
)
presigned_url = presigned.get("PresignedURL")
if not presigned_url:
    raise RuntimeError("No presigned URL available yet for the persistent app UI")

session = requests.Session()
session.headers.update(
    {
        "User-Agent": "EMR-Persistent-UI-Client/1.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
)

# Opening the presigned URL stores the auth cookies on the session; later calls
# on the same session send them automatically, so they are not passed explicitly.
landing = session.get(presigned_url, allow_redirects=True, timeout=REQUEST_TIMEOUT_SEC)
landing.raise_for_status()

# The REST API cannot be reached by appending to the presigned URL (it carries a
# query string), so rebuild the base from scheme + host.
# NOTE(review): the "/shs" prefix is how the persistent Spark History Server is
# presumably mounted on that host -- confirm against a real presigned URL.
url_parts = urlparse(presigned_url)
base_url = f"{url_parts.scheme}://{url_parts.netloc}/shs"
app_url = f"{base_url}/api/v1/applications"

app_response = session.get(app_url, allow_redirects=True, timeout=REQUEST_TIMEOUT_SEC)
app_response.raise_for_status()
applications = app_response.json()

# The endpoint returns a JSON list; use the first application, or report no jobs.
if not applications:
    return []
appid = applications[0]["id"]

job_url = f"{app_url}/{appid}/jobs"
job_response = session.get(job_url, allow_redirects=True, timeout=REQUEST_TIMEOUT_SEC)
job_response.raise_for_status()

# Serialise each job so the result fits ARRAY<STRING>.
return [json.dumps(job) for job in job_response.json()]
$$