In [0]:
# Notebook parameters.
# The schema widget defaults to the same schema name the Python fallback below
# uses: the %sql cells read :schema_name straight from the widget and never see
# the Python-side fallback, so an empty widget default would leave them pointing
# at an empty schema name while this cell creates "spark_observability".
dbutils.widgets.text("catalog_name", "", "Catalog (required)")
dbutils.widgets.text("schema_name", "spark_observability", "Schema")
CATALOG_NAME = dbutils.widgets.get("catalog_name").strip()
SCHEMA_NAME = dbutils.widgets.get("schema_name").strip() or "spark_observability"

# UC Validation
if not CATALOG_NAME:
    raise ValueError("catalog widget must point to an existing catalog")

# Bind the widget values as named parameters and resolve them with IDENTIFIER()
# instead of interpolating them into the statement text. This keeps user input
# out of the SQL string and interprets the names the same way the %sql cell's
# USE CATALOG / USE SCHEMA IDENTIFIER(...) statements do.
spark.sql(
    "CREATE SCHEMA IF NOT EXISTS IDENTIFIER(:catalog_name || '.' || :schema_name)",
    args={"catalog_name": CATALOG_NAME, "schema_name": SCHEMA_NAME},
)

In [0]:
%sql
-- Make the widget-selected catalog and schema the session defaults so the
-- unqualified function names created below land there.
-- NOTE: :catalog_name and :schema_name are read directly from the notebook
-- widgets; the Python-side fallback for an empty schema_name does not apply here.
USE CATALOG IDENTIFIER(:catalog_name);
USE SCHEMA IDENTIFIER(:schema_name);

## Service Credential Setup

This notebook uses a **Databricks Service Credential** to authenticate with AWS APIs. The credential (`profiler-credential`) must be created before running the functions below.

### Creating a Service Credential

1. Navigate to **Catalog > External Data > Credentials** in your Databricks workspace
2. Click **Create credential** and select **Service credential**
3. Name it `profiler-credential` (or update the references in this notebook)
4. Configure the IAM role ARN with the permissions listed below

### Required IAM Permissions

**EMR**
- `elasticmapreduce:ListClusters`
- `elasticmapreduce:DescribeCluster`
- `elasticmapreduce:ListSteps`
- `elasticmapreduce:DescribeStep`
- `elasticmapreduce:CreatePersistentAppUI`
- `elasticmapreduce:GetPersistentAppUIPresignedURL`
- `elasticmapreduce:ListInstanceGroups`
- `elasticmapreduce:ListInstanceFleets`

**S3** (if writing output)
- `s3:PutObject`
- `s3:GetObject`
- `s3:ListBucket`

**STS**
- `sts:GetCallerIdentity`

### Documentation

- [Databricks: Create a Service Credential](https://docs.databricks.com/aws/en/connect/unity-catalog/cloud-services/service-credentials)
- [AWS: EMR IAM Policies](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-roles.html)

In [0]:
%sql
-- Grant access to yourself or your group.
-- `@domain.com` looks like a placeholder principal: replace it with the user or
-- group that should be allowed to use the credential before running this cell.
-- If you named the credential differently, change `profiler-credential` too.
GRANT ACCESS ON SERVICE CREDENTIAL `profiler-credential` TO `@domain.com`;


In [0]:
%sql
-- List raw Spark History Server jobs for an EMR cluster.
-- Returns one JSON string per input row:
--   success: {"success": true, "jobs": [...], "count": <n>}
--   failure: {"success": false, "error": "<message>", "error_type": "<exception class>"}
-- Every job record is tagged with app_id and cluster_arn, plus attempt_id when
-- the application reports attempt IDs.
CREATE OR REPLACE FUNCTION emr_listshsjobsraw(emr_cluster_arn STRING)
RETURNS STRING
LANGUAGE PYTHON
ENVIRONMENT (
  dependencies = '["boto3==1.42.0", "requests"]',
  environment_version = "None"
)
-- update with your service credential name
CREDENTIALS (
  `profiler-credential` AS default_cred DEFAULT
)
PARAMETER STYLE PANDAS
HANDLER 'handler_func'
AS $$
import boto3
import json
import requests
import pandas as pd
import time
from typing import Iterator
from urllib.parse import urlparse
from databricks.service_credentials import getServiceCredentialsProvider

def handler_func(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Fetch Spark History Server jobs for every cluster ARN in every batch.

    Yields one pd.Series of JSON strings per input batch, one element per ARN.
    A failure on one row is reported as a {"success": false, ...} document for
    that row instead of being raised.
    """
    # Initialize credentials provider once
    credential_variable = 'default_cred'

    provider = getServiceCredentialsProvider(credential_variable)

    # Cache boto3 clients by region to avoid recreating them for every row
    client_cache = {}

    def get_emr_client(region):
        """Return the cached EMR client for `region`, creating it on first use."""
        if region not in client_cache:
            session = boto3.Session(botocore_session=provider, region_name=region)
            client_cache[region] = session.client('emr', region_name=region)
        return client_cache[region]

    def wait_for_presigned_url_ready(emr_client, persistent_ui_id, max_wait=60, interval=5):
        """Poll until PresignedURLReady is True.

        Returns the presigned URL, or None once `max_wait` seconds have been
        spent sleeping between polls.
        """
        total_waited = 0
        while total_waited < max_wait:
            response = emr_client.get_persistent_app_ui_presigned_url(
                PersistentAppUIId=persistent_ui_id,
                PersistentAppUIType='SHS'
            )
            if response.get("PresignedURLReady", False):
                return response.get("PresignedURL")
            time.sleep(interval)
            total_waited += interval
        return None

    def attempt_paths(app):
        """Return (path_segment, attempt_id) pairs for one application entry.

        The Spark REST API addresses applications that have attempt IDs (YARN
        cluster mode) as "<app-id>/<attempt-id>"; requesting them by the bare
        app id does not resolve. Applications without attempt IDs are
        addressed by the app id alone.
        """
        app_id = app.get('id')
        if not app_id:
            return []
        attempt_ids = [
            attempt.get('attemptId')
            for attempt in (app.get('attempts') or [])
            if attempt.get('attemptId')
        ]
        if not attempt_ids:
            return [(app_id, None)]
        return [(f"{app_id}/{attempt_id}", attempt_id) for attempt_id in attempt_ids]

    # Iterate through batches of data
    for batch in iterator:
        results = []
        # Process each ARN in the current batch
        for arn in batch:
            try:
                # Handle potential nulls or empty strings
                if not arn:
                    results.append(json.dumps({"success": False, "error": "Empty ARN", "error_type": "ValueError"}))
                    continue

                # The region is the 4th colon-separated field of the ARN
                region = arn.split(":")[3]
                emr_client = get_emr_client(region)

                # Create persistent app UI
                createapp = emr_client.create_persistent_app_ui(
                    TargetResourceArn=arn
                )
                persistent_ui_id = createapp.get("PersistentAppUIId")

                # Wait for presigned URL to be ready, then get it
                presigned_url = wait_for_presigned_url_ready(emr_client, persistent_ui_id)

                if not presigned_url:
                    results.append(json.dumps({
                        "success": False,
                        "error": "Presigned URL not ready after waiting",
                        "error_type": "TimeoutError"
                    }))
                    continue

                # Parse the presigned URL to extract the base URL
                # The presigned URL has auth params - we need base_url for API calls
                parsed_url = urlparse(presigned_url)
                base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/shs"
                api_base = f"{base_url}/api/v1"

                # One HTTP session per cluster; the context manager closes its
                # connection pool once this row is done (or fails).
                with requests.Session() as req_session:
                    req_session.headers.update({
                        "User-Agent": "EMR-Observability-Client/1.0",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
                    })

                    # Visit presigned URL to establish authenticated session (sets cookies)
                    response = req_session.get(presigned_url, allow_redirects=True, timeout=30)
                    response.raise_for_status()

                    # Now update headers for JSON API calls
                    req_session.headers.update({"Accept": "application/json"})

                    # Fetch applications using the base URL (not the presigned URL)
                    app_url = f"{api_base}/applications"
                    app_response = req_session.get(app_url, allow_redirects=True, timeout=30)
                    app_response.raise_for_status()
                    apps = app_response.json()

                    # Get jobs for each app (and each attempt, when attempts have IDs)
                    all_jobs = []
                    for app in apps:
                        app_id = app.get('id')
                        for app_path, attempt_id in attempt_paths(app):
                            job_url = f"{api_base}/applications/{app_path}/jobs"
                            job_response = req_session.get(job_url, allow_redirects=True, timeout=30)
                            # Best effort: an application whose jobs cannot be read is skipped
                            if job_response.status_code != 200:
                                continue
                            jobs = job_response.json()
                            for job in jobs:
                                job['app_id'] = app_id
                                job['cluster_arn'] = arn
                                if attempt_id:
                                    job['attempt_id'] = attempt_id
                            all_jobs.extend(jobs)

                results.append(json.dumps({"success": True, "jobs": all_jobs, "count": len(all_jobs)}))

            except Exception as e:
                # Return error JSON for this specific row instead of failing the whole batch
                results.append(json.dumps({"success": False, "error": str(e), "error_type": type(e).__name__}))

        # Yield the results for this batch as a Series
        yield pd.Series(results)
$$;

In [0]:
%sql
-- List raw Spark History Server stages for an EMR cluster.
-- Returns one JSON string per input row:
--   success: {"success": true, "stages": [...], "count": <n>}
--   failure: {"success": false, "error": "<message>", "error_type": "<exception class>"}
-- Every stage record is tagged with app_id and cluster_arn, plus attempt_id
-- when the application reports attempt IDs.
CREATE OR REPLACE FUNCTION emr_listshsstagesraw(emr_cluster_arn STRING)
RETURNS STRING
LANGUAGE PYTHON
ENVIRONMENT (
  dependencies = '["boto3==1.42.0", "requests"]',
  environment_version = "None"
)
-- update with your service credential name
CREDENTIALS (
  `profiler-credential` AS default_cred DEFAULT
)
PARAMETER STYLE PANDAS
HANDLER 'handler_func'
AS $$
import boto3
import json
import requests
import pandas as pd
import time
from typing import Iterator
from urllib.parse import urlparse
from databricks.service_credentials import getServiceCredentialsProvider

def handler_func(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Fetch Spark History Server stages for every cluster ARN in every batch.

    Yields one pd.Series of JSON strings per input batch, one element per ARN.
    A failure on one row is reported as a {"success": false, ...} document for
    that row instead of being raised.
    """
    # Initialize credentials provider once
    credential_variable = 'default_cred'

    provider = getServiceCredentialsProvider(credential_variable)

    # Cache boto3 clients by region to avoid recreating them for every row
    client_cache = {}

    def get_emr_client(region):
        """Return the cached EMR client for `region`, creating it on first use."""
        if region not in client_cache:
            session = boto3.Session(botocore_session=provider, region_name=region)
            client_cache[region] = session.client('emr', region_name=region)
        return client_cache[region]

    def wait_for_presigned_url_ready(emr_client, persistent_ui_id, max_wait=60, interval=5):
        """Poll until PresignedURLReady is True.

        Returns the presigned URL, or None once `max_wait` seconds have been
        spent sleeping between polls.
        """
        total_waited = 0
        while total_waited < max_wait:
            response = emr_client.get_persistent_app_ui_presigned_url(
                PersistentAppUIId=persistent_ui_id,
                PersistentAppUIType='SHS'
            )
            if response.get("PresignedURLReady", False):
                return response.get("PresignedURL")
            time.sleep(interval)
            total_waited += interval
        return None

    def attempt_paths(app):
        """Return (path_segment, attempt_id) pairs for one application entry.

        The Spark REST API addresses applications that have attempt IDs (YARN
        cluster mode) as "<app-id>/<attempt-id>"; requesting them by the bare
        app id does not resolve. Applications without attempt IDs are
        addressed by the app id alone.
        """
        app_id = app.get('id')
        if not app_id:
            return []
        attempt_ids = [
            attempt.get('attemptId')
            for attempt in (app.get('attempts') or [])
            if attempt.get('attemptId')
        ]
        if not attempt_ids:
            return [(app_id, None)]
        return [(f"{app_id}/{attempt_id}", attempt_id) for attempt_id in attempt_ids]

    # Iterate through batches of data
    for batch in iterator:
        results = []
        # Process each ARN in the current batch
        for arn in batch:
            try:
                # Handle potential nulls or empty strings
                if not arn:
                    results.append(json.dumps({"success": False, "error": "Empty ARN", "error_type": "ValueError"}))
                    continue

                # The region is the 4th colon-separated field of the ARN
                region = arn.split(":")[3]
                emr_client = get_emr_client(region)

                # Create persistent app UI
                createapp = emr_client.create_persistent_app_ui(
                    TargetResourceArn=arn
                )
                persistent_ui_id = createapp.get("PersistentAppUIId")

                # Wait for presigned URL to be ready, then get it
                presigned_url = wait_for_presigned_url_ready(emr_client, persistent_ui_id)

                if not presigned_url:
                    results.append(json.dumps({
                        "success": False,
                        "error": "Presigned URL not ready after waiting",
                        "error_type": "TimeoutError"
                    }))
                    continue

                # Parse the presigned URL to extract the base URL
                # The presigned URL has auth params - we need base_url for API calls
                parsed_url = urlparse(presigned_url)
                base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/shs"
                api_base = f"{base_url}/api/v1"

                # One HTTP session per cluster; the context manager closes its
                # connection pool once this row is done (or fails).
                with requests.Session() as req_session:
                    req_session.headers.update({
                        "User-Agent": "EMR-Observability-Client/1.0",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
                    })

                    # Visit presigned URL to establish authenticated session (sets cookies)
                    response = req_session.get(presigned_url, allow_redirects=True, timeout=30)
                    response.raise_for_status()

                    # Now update headers for JSON API calls
                    req_session.headers.update({"Accept": "application/json"})

                    # Fetch applications using the base URL (not the presigned URL)
                    app_url = f"{api_base}/applications"
                    app_response = req_session.get(app_url, allow_redirects=True, timeout=30)
                    app_response.raise_for_status()
                    apps = app_response.json()

                    # Get stages for each app (and each attempt, when attempts have IDs)
                    all_stages = []
                    for app in apps:
                        app_id = app.get('id')
                        for app_path, attempt_id in attempt_paths(app):
                            stage_url = f"{api_base}/applications/{app_path}/stages"
                            stage_response = req_session.get(stage_url, allow_redirects=True, timeout=30)
                            # Best effort: an application whose stages cannot be read is skipped
                            if stage_response.status_code != 200:
                                continue
                            stages = stage_response.json()
                            for stage in stages:
                                stage['app_id'] = app_id
                                stage['cluster_arn'] = arn
                                if attempt_id:
                                    stage['attempt_id'] = attempt_id
                            all_stages.extend(stages)

                results.append(json.dumps({"success": True, "stages": all_stages, "count": len(all_stages)}))

            except Exception as e:
                # Return error JSON for this specific row instead of failing the whole batch
                results.append(json.dumps({"success": False, "error": str(e), "error_type": type(e).__name__}))

        # Yield the results for this batch as a Series
        yield pd.Series(results)
$$;

In [0]:
%sql
-- List raw Spark History Server SQL execution data for an EMR cluster.
-- Returns one JSON string per input row:
--   success: {"success": true, "sql": [...], "count": <n>}
--   failure: {"success": false, "error": "<message>", "error_type": "<exception class>"}
-- Every SQL execution record is tagged with app_id and cluster_arn, plus
-- attempt_id when the application reports attempt IDs.
CREATE OR REPLACE FUNCTION emr_listshssqlraw(emr_cluster_arn STRING)
RETURNS STRING
LANGUAGE PYTHON
ENVIRONMENT (
  dependencies = '["boto3==1.42.0", "requests"]',
  environment_version = "None"
)
-- update with your service credential name
CREDENTIALS (
  `profiler-credential` AS default_cred DEFAULT
)
PARAMETER STYLE PANDAS
HANDLER 'handler_func'
AS $$
import boto3
import json
import requests
import pandas as pd
import time
from typing import Iterator
from urllib.parse import urlparse
from databricks.service_credentials import getServiceCredentialsProvider

def handler_func(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Fetch Spark History Server SQL executions for every cluster ARN in every batch.

    Yields one pd.Series of JSON strings per input batch, one element per ARN.
    A failure on one row is reported as a {"success": false, ...} document for
    that row instead of being raised.
    """
    # Initialize credentials provider once
    credential_variable = 'default_cred'

    provider = getServiceCredentialsProvider(credential_variable)

    # Cache boto3 clients by region to avoid recreating them for every row
    client_cache = {}

    def get_emr_client(region):
        """Return the cached EMR client for `region`, creating it on first use."""
        if region not in client_cache:
            session = boto3.Session(botocore_session=provider, region_name=region)
            client_cache[region] = session.client('emr', region_name=region)
        return client_cache[region]

    def wait_for_presigned_url_ready(emr_client, persistent_ui_id, max_wait=60, interval=5):
        """Poll until PresignedURLReady is True.

        Returns the presigned URL, or None once `max_wait` seconds have been
        spent sleeping between polls.
        """
        total_waited = 0
        while total_waited < max_wait:
            response = emr_client.get_persistent_app_ui_presigned_url(
                PersistentAppUIId=persistent_ui_id,
                PersistentAppUIType='SHS'
            )
            if response.get("PresignedURLReady", False):
                return response.get("PresignedURL")
            time.sleep(interval)
            total_waited += interval
        return None

    def attempt_paths(app):
        """Return (path_segment, attempt_id) pairs for one application entry.

        The Spark REST API addresses applications that have attempt IDs (YARN
        cluster mode) as "<app-id>/<attempt-id>"; requesting them by the bare
        app id does not resolve. Applications without attempt IDs are
        addressed by the app id alone.
        """
        app_id = app.get('id')
        if not app_id:
            return []
        attempt_ids = [
            attempt.get('attemptId')
            for attempt in (app.get('attempts') or [])
            if attempt.get('attemptId')
        ]
        if not attempt_ids:
            return [(app_id, None)]
        return [(f"{app_id}/{attempt_id}", attempt_id) for attempt_id in attempt_ids]

    # Iterate through batches of data
    for batch in iterator:
        results = []
        # Process each ARN in the current batch
        for arn in batch:
            try:
                # Handle potential nulls or empty strings
                if not arn:
                    results.append(json.dumps({"success": False, "error": "Empty ARN", "error_type": "ValueError"}))
                    continue

                # The region is the 4th colon-separated field of the ARN
                region = arn.split(":")[3]
                emr_client = get_emr_client(region)

                # Create persistent app UI
                createapp = emr_client.create_persistent_app_ui(
                    TargetResourceArn=arn
                )
                persistent_ui_id = createapp.get("PersistentAppUIId")

                # Wait for presigned URL to be ready, then get it
                presigned_url = wait_for_presigned_url_ready(emr_client, persistent_ui_id)

                if not presigned_url:
                    results.append(json.dumps({
                        "success": False,
                        "error": "Presigned URL not ready after waiting",
                        "error_type": "TimeoutError"
                    }))
                    continue

                # Parse the presigned URL to extract the base URL
                # The presigned URL has auth params - we need base_url for API calls
                parsed_url = urlparse(presigned_url)
                base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/shs"
                api_base = f"{base_url}/api/v1"

                # One HTTP session per cluster; the context manager closes its
                # connection pool once this row is done (or fails).
                with requests.Session() as req_session:
                    req_session.headers.update({
                        "User-Agent": "EMR-Observability-Client/1.0",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
                    })

                    # Visit presigned URL to establish authenticated session (sets cookies)
                    response = req_session.get(presigned_url, allow_redirects=True, timeout=30)
                    response.raise_for_status()

                    # Now update headers for JSON API calls
                    req_session.headers.update({"Accept": "application/json"})

                    # Fetch applications using the base URL (not the presigned URL)
                    app_url = f"{api_base}/applications"
                    app_response = req_session.get(app_url, allow_redirects=True, timeout=30)
                    app_response.raise_for_status()
                    apps = app_response.json()

                    # Get SQL queries for each app (and each attempt, when attempts have IDs)
                    all_sql = []
                    for app in apps:
                        app_id = app.get('id')
                        for app_path, attempt_id in attempt_paths(app):
                            sql_url = f"{api_base}/applications/{app_path}/sql"
                            sql_response = req_session.get(sql_url, allow_redirects=True, timeout=30)
                            # Best effort: an application whose SQL data cannot be read is skipped
                            if sql_response.status_code != 200:
                                continue
                            sqlqueries = sql_response.json()
                            for sql in sqlqueries:
                                sql['app_id'] = app_id
                                sql['cluster_arn'] = arn
                                if attempt_id:
                                    sql['attempt_id'] = attempt_id
                            all_sql.extend(sqlqueries)

                results.append(json.dumps({"success": True, "sql": all_sql, "count": len(all_sql)}))

            except Exception as e:
                # Return error JSON for this specific row instead of failing the whole batch
                results.append(json.dumps({"success": False, "error": str(e), "error_type": type(e).__name__}))

        # Yield the results for this batch as a Series
        yield pd.Series(results)
$$;

In [0]:
%sql
-- List raw Spark History Server executors for an EMR cluster.
-- Returns one JSON string per input row:
--   success: {"success": true, "executors": [...], "count": <n>}
--   failure: {"success": false, "error": "<message>", "error_type": "<exception class>"}
-- Every executor record is tagged with app_id and cluster_arn, plus attempt_id
-- when the application reports attempt IDs.
CREATE OR REPLACE FUNCTION emr_listshsexecutorsraw(emr_cluster_arn STRING)
RETURNS STRING
LANGUAGE PYTHON
ENVIRONMENT (
  dependencies = '["boto3==1.42.0", "requests"]',
  environment_version = "None"
)
-- update with your service credential name
CREDENTIALS (
  `profiler-credential` AS default_cred DEFAULT
)
PARAMETER STYLE PANDAS
HANDLER 'handler_func'
AS $$
import boto3
import json
import requests
import pandas as pd
import time
from typing import Iterator
from urllib.parse import urlparse
from databricks.service_credentials import getServiceCredentialsProvider

def handler_func(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Fetch Spark History Server executors for every cluster ARN in every batch.

    Yields one pd.Series of JSON strings per input batch, one element per ARN.
    A failure on one row is reported as a {"success": false, ...} document for
    that row instead of being raised.
    """
    # Initialize credentials provider once
    credential_variable = 'default_cred'

    provider = getServiceCredentialsProvider(credential_variable)

    # Cache boto3 clients by region to avoid recreating them for every row
    client_cache = {}

    def get_emr_client(region):
        """Return the cached EMR client for `region`, creating it on first use."""
        if region not in client_cache:
            session = boto3.Session(botocore_session=provider, region_name=region)
            client_cache[region] = session.client('emr', region_name=region)
        return client_cache[region]

    def wait_for_presigned_url_ready(emr_client, persistent_ui_id, max_wait=60, interval=5):
        """Poll until PresignedURLReady is True.

        Returns the presigned URL, or None once `max_wait` seconds have been
        spent sleeping between polls.
        """
        total_waited = 0
        while total_waited < max_wait:
            response = emr_client.get_persistent_app_ui_presigned_url(
                PersistentAppUIId=persistent_ui_id,
                PersistentAppUIType='SHS'
            )
            if response.get("PresignedURLReady", False):
                return response.get("PresignedURL")
            time.sleep(interval)
            total_waited += interval
        return None

    def attempt_paths(app):
        """Return (path_segment, attempt_id) pairs for one application entry.

        The Spark REST API addresses applications that have attempt IDs (YARN
        cluster mode) as "<app-id>/<attempt-id>"; requesting them by the bare
        app id does not resolve. Applications without attempt IDs are
        addressed by the app id alone.
        """
        app_id = app.get('id')
        if not app_id:
            return []
        attempt_ids = [
            attempt.get('attemptId')
            for attempt in (app.get('attempts') or [])
            if attempt.get('attemptId')
        ]
        if not attempt_ids:
            return [(app_id, None)]
        return [(f"{app_id}/{attempt_id}", attempt_id) for attempt_id in attempt_ids]

    # Iterate through batches of data
    for batch in iterator:
        results = []
        # Process each ARN in the current batch
        for arn in batch:
            try:
                # Handle potential nulls or empty strings
                if not arn:
                    results.append(json.dumps({"success": False, "error": "Empty ARN", "error_type": "ValueError"}))
                    continue

                # The region is the 4th colon-separated field of the ARN
                region = arn.split(":")[3]
                emr_client = get_emr_client(region)

                # Create persistent app UI
                createapp = emr_client.create_persistent_app_ui(
                    TargetResourceArn=arn
                )
                persistent_ui_id = createapp.get("PersistentAppUIId")

                # Wait for presigned URL to be ready, then get it
                presigned_url = wait_for_presigned_url_ready(emr_client, persistent_ui_id)

                if not presigned_url:
                    results.append(json.dumps({
                        "success": False,
                        "error": "Presigned URL not ready after waiting",
                        "error_type": "TimeoutError"
                    }))
                    continue

                # Parse the presigned URL to extract the base URL
                # The presigned URL has auth params - we need base_url for API calls
                parsed_url = urlparse(presigned_url)
                base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/shs"
                api_base = f"{base_url}/api/v1"

                # One HTTP session per cluster; the context manager closes its
                # connection pool once this row is done (or fails).
                with requests.Session() as req_session:
                    req_session.headers.update({
                        "User-Agent": "EMR-Observability-Client/1.0",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
                    })

                    # Visit presigned URL to establish authenticated session (sets cookies)
                    response = req_session.get(presigned_url, allow_redirects=True, timeout=30)
                    response.raise_for_status()

                    # Now update headers for JSON API calls
                    req_session.headers.update({"Accept": "application/json"})

                    # Fetch applications using the base URL (not the presigned URL)
                    app_url = f"{api_base}/applications"
                    app_response = req_session.get(app_url, allow_redirects=True, timeout=30)
                    app_response.raise_for_status()
                    apps = app_response.json()

                    # Get executors for each app and attempt (using allexecutors endpoint)
                    all_executors = []
                    for app in apps:
                        app_id = app.get('id')
                        for app_path, attempt_id in attempt_paths(app):
                            executor_url = f"{api_base}/applications/{app_path}/allexecutors"
                            executor_response = req_session.get(executor_url, allow_redirects=True, timeout=30)
                            # Best effort: an application whose executors cannot be read is skipped
                            if executor_response.status_code != 200:
                                continue
                            executors = executor_response.json()
                            for executor in executors:
                                executor['app_id'] = app_id
                                executor['cluster_arn'] = arn
                                if attempt_id:
                                    executor['attempt_id'] = attempt_id
                            all_executors.extend(executors)

                results.append(json.dumps({"success": True, "executors": all_executors, "count": len(all_executors)}))

            except Exception as e:
                # Return error JSON for this specific row instead of failing the whole batch
                results.append(json.dumps({"success": False, "error": str(e), "error_type": type(e).__name__}))

        # Yield the results for this batch as a Series
        yield pd.Series(results)
$$;

In [0]:
%sql
-- List raw Spark History Server environment info for an EMR cluster.
-- Returns one JSON string per input row:
--   success: {"success": true, "environment": [...], "count": <n>}
--   failure: {"success": false, "error": "<message>", "error_type": "<exception class>"}
-- Every environment document is tagged with app_id and cluster_arn, plus
-- attempt_id when the application reports attempt IDs.
CREATE OR REPLACE FUNCTION emr_listshsenvraw(emr_cluster_arn STRING)
RETURNS STRING
LANGUAGE PYTHON
ENVIRONMENT (
  dependencies = '["boto3==1.42.0", "requests"]',
  environment_version = "None"
)
-- update with your service credential name
CREDENTIALS (
  `profiler-credential` AS default_cred DEFAULT
)
PARAMETER STYLE PANDAS
HANDLER 'handler_func'
AS $$
import boto3
import json
import requests
import pandas as pd
import time
from typing import Iterator
from urllib.parse import urlparse
from databricks.service_credentials import getServiceCredentialsProvider

def handler_func(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Fetch Spark History Server environment info for every cluster ARN in every batch.

    Yields one pd.Series of JSON strings per input batch, one element per ARN.
    A failure on one row is reported as a {"success": false, ...} document for
    that row instead of being raised.
    """
    # Initialize credentials provider once
    credential_variable = 'default_cred'

    provider = getServiceCredentialsProvider(credential_variable)

    # Cache boto3 clients by region to avoid recreating them for every row
    client_cache = {}

    def get_emr_client(region):
        """Return the cached EMR client for `region`, creating it on first use."""
        if region not in client_cache:
            session = boto3.Session(botocore_session=provider, region_name=region)
            client_cache[region] = session.client('emr', region_name=region)
        return client_cache[region]

    def wait_for_presigned_url_ready(emr_client, persistent_ui_id, max_wait=60, interval=5):
        """Poll until PresignedURLReady is True.

        Returns the presigned URL, or None once `max_wait` seconds have been
        spent sleeping between polls.
        """
        total_waited = 0
        while total_waited < max_wait:
            response = emr_client.get_persistent_app_ui_presigned_url(
                PersistentAppUIId=persistent_ui_id,
                PersistentAppUIType='SHS'
            )
            if response.get("PresignedURLReady", False):
                return response.get("PresignedURL")
            time.sleep(interval)
            total_waited += interval
        return None

    def attempt_paths(app):
        """Return (path_segment, attempt_id) pairs for one application entry.

        The Spark REST API addresses applications that have attempt IDs (YARN
        cluster mode) as "<app-id>/<attempt-id>"; requesting them by the bare
        app id does not resolve. Applications without attempt IDs are
        addressed by the app id alone.
        """
        app_id = app.get('id')
        if not app_id:
            return []
        attempt_ids = [
            attempt.get('attemptId')
            for attempt in (app.get('attempts') or [])
            if attempt.get('attemptId')
        ]
        if not attempt_ids:
            return [(app_id, None)]
        return [(f"{app_id}/{attempt_id}", attempt_id) for attempt_id in attempt_ids]

    # Iterate through batches of data
    for batch in iterator:
        results = []
        # Process each ARN in the current batch
        for arn in batch:
            try:
                # Handle potential nulls or empty strings
                if not arn:
                    results.append(json.dumps({"success": False, "error": "Empty ARN", "error_type": "ValueError"}))
                    continue

                # The region is the 4th colon-separated field of the ARN
                region = arn.split(":")[3]
                emr_client = get_emr_client(region)

                # Create persistent app UI
                createapp = emr_client.create_persistent_app_ui(
                    TargetResourceArn=arn
                )
                persistent_ui_id = createapp.get("PersistentAppUIId")

                # Wait for presigned URL to be ready, then get it
                presigned_url = wait_for_presigned_url_ready(emr_client, persistent_ui_id)

                if not presigned_url:
                    results.append(json.dumps({
                        "success": False,
                        "error": "Presigned URL not ready after waiting",
                        "error_type": "TimeoutError"
                    }))
                    continue

                # Parse the presigned URL to extract the base URL
                # The presigned URL has auth params - we need base_url for API calls
                parsed_url = urlparse(presigned_url)
                base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/shs"
                api_base = f"{base_url}/api/v1"

                # One HTTP session per cluster; the context manager closes its
                # connection pool once this row is done (or fails).
                with requests.Session() as req_session:
                    req_session.headers.update({
                        "User-Agent": "EMR-Observability-Client/1.0",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
                    })

                    # Visit presigned URL to establish authenticated session (sets cookies)
                    response = req_session.get(presigned_url, allow_redirects=True, timeout=30)
                    response.raise_for_status()

                    # Now update headers for JSON API calls
                    req_session.headers.update({"Accept": "application/json"})

                    # Fetch applications using the base URL (not the presigned URL)
                    app_url = f"{api_base}/applications"
                    app_response = req_session.get(app_url, allow_redirects=True, timeout=30)
                    app_response.raise_for_status()
                    apps = app_response.json()

                    # Get environment info for each app (and each attempt, when attempts have IDs)
                    all_env = []
                    for app in apps:
                        app_id = app.get('id')
                        for app_path, attempt_id in attempt_paths(app):
                            env_url = f"{api_base}/applications/{app_path}/environment"
                            env_response = req_session.get(env_url, allow_redirects=True, timeout=30)
                            # Best effort: an application whose environment cannot be read is skipped
                            if env_response.status_code != 200:
                                continue
                            # The environment endpoint returns a single document per application
                            env_data = env_response.json()
                            env_data['app_id'] = app_id
                            env_data['cluster_arn'] = arn
                            if attempt_id:
                                env_data['attempt_id'] = attempt_id
                            all_env.append(env_data)

                results.append(json.dumps({"success": True, "environment": all_env, "count": len(all_env)}))

            except Exception as e:
                # Return error JSON for this specific row instead of failing the whole batch
                results.append(json.dumps({"success": False, "error": str(e), "error_type": type(e).__name__}))

        # Yield the results for this batch as a Series
        yield pd.Series(results)
$$;

In [0]:
%sql
-- List raw Spark History Server tasks for a given stage on an EMR cluster.
-- Returns one JSON string per input row:
--   on success: {"success": true,  "tasks": [...], "count": <n>}
--   on failure: {"success": false, "error": "...", "error_type": "..."}
CREATE OR REPLACE FUNCTION emr_listshstasksraw(emr_cluster_arn STRING, stage_id INT)
RETURNS STRING
LANGUAGE PYTHON
ENVIRONMENT (
  dependencies = '["boto3==1.42.0", "requests"]',
  environment_version = "None"
)
-- update with your service credential name
CREDENTIALS (
  `profiler-credential` AS default_cred DEFAULT
)
PARAMETER STYLE PANDAS
HANDLER 'handler_func'
AS $$
import boto3
import json
import requests
import pandas as pd
import time
from typing import Iterator, Tuple
from urllib.parse import urlparse
from databricks.service_credentials import getServiceCredentialsProvider

def handler_func(iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
    """Fetch the Spark History Server task list for each (cluster ARN, stage ID) row.

    Args:
        iterator: Batches of (emr_cluster_arn, stage_id) pandas Series pairs.

    Yields:
        One pandas Series of JSON strings per input batch, one string per row.
        A failure on one row is reported as a {"success": false, ...} JSON
        string for that row instead of failing the whole batch.
    """
    # Initialize credentials provider once
    credential_variable = 'default_cred'

    provider = getServiceCredentialsProvider(credential_variable)

    # Cache boto3 clients by region to avoid recreating them for every row
    client_cache = {}

    def get_emr_client(region):
        """Return the cached EMR client for `region`, creating it on first use."""
        if region not in client_cache:
            session = boto3.Session(botocore_session=provider, region_name=region)
            client_cache[region] = session.client('emr', region_name=region)
        return client_cache[region]

    def wait_for_presigned_url_ready(emr_client, persistent_ui_id, max_wait=60, interval=5):
        """Poll until PresignedURLReady is True; return the URL, or None after max_wait seconds."""
        total_waited = 0
        while total_waited < max_wait:
            response = emr_client.get_persistent_app_ui_presigned_url(
                PersistentAppUIId=persistent_ui_id,
                PersistentAppUIType='SHS'
            )
            if response.get("PresignedURLReady", False):
                return response.get("PresignedURL")
            time.sleep(interval)
            total_waited += interval
        return None

    # Iterate through batches of data (tuple of Series for multi-param UDF)
    for arn_batch, stage_id_batch in iterator:
        results = []
        # Process each row in the current batch
        for arn, stage_id in zip(arn_batch, stage_id_batch):
            try:
                # Handle potential nulls or empty strings
                if not arn:
                    results.append(json.dumps({"success": False, "error": "Empty ARN", "error_type": "ValueError"}))
                    continue

                # A NULL stage ID would otherwise be formatted into the URL as
                # "nan"/"None" and could leak a non-JSON NaN into the output.
                if pd.isna(stage_id):
                    results.append(json.dumps({"success": False, "error": "Empty stage ID", "error_type": "ValueError"}))
                    continue

                # NOTE(review): when a batch contains NULL stage IDs the column
                # presumably arrives as floats (e.g. 3.0); coerce to int so the
                # URL path segment and the emitted JSON carry a plain integer.
                stage_id = int(stage_id)

                # The region is the fourth colon-separated field of the ARN
                region = arn.split(":")[3]
                emr_client = get_emr_client(region)

                # Create persistent app UI
                createapp = emr_client.create_persistent_app_ui(
                    TargetResourceArn=arn
                )
                persistent_ui_id = createapp.get("PersistentAppUIId")

                # Wait for presigned URL to be ready, then get it
                presigned_url = wait_for_presigned_url_ready(emr_client, persistent_ui_id)

                if not presigned_url:
                    results.append(json.dumps({
                        "success": False, 
                        "error": "Presigned URL not ready after waiting", 
                        "error_type": "TimeoutError"
                    }))
                    continue

                # Parse the presigned URL to extract the base URL
                # The presigned URL has auth params - we need base_url for API calls
                parsed_url = urlparse(presigned_url)
                base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/shs"
                api_base = f"{base_url}/api/v1"

                # Create a new session for this cluster and establish cookies
                # by first visiting the presigned URL. The context manager
                # closes the session (and its pooled connections) once the row
                # is done, including when a request raises.
                with requests.Session() as req_session:
                    req_session.headers.update({
                        "User-Agent": "EMR-Observability-Client/1.0",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
                    })

                    # Visit presigned URL to establish authenticated session (sets cookies)
                    response = req_session.get(presigned_url, allow_redirects=True, timeout=30)
                    response.raise_for_status()

                    # Now update headers for JSON API calls
                    req_session.headers.update({"Accept": "application/json"})

                    # Fetch applications using the base URL (not the presigned URL)
                    app_url = f"{api_base}/applications"
                    app_response = req_session.get(app_url, allow_redirects=True, timeout=30)
                    app_response.raise_for_status()
                    apps = app_response.json()

                    # Get tasks for the specified stage in each app
                    all_tasks = []
                    for app in apps:
                        app_id = app.get('id')
                        if app_id:
                            # Use taskList endpoint for actual task data (stage_attempt=0)
                            task_url = f"{api_base}/applications/{app_id}/stages/{stage_id}/0/taskList"
                            task_response = req_session.get(task_url, allow_redirects=True, timeout=30)
                            # Non-200 responses are skipped for this application
                            # rather than treated as an error for the row.
                            if task_response.status_code == 200:
                                tasks = task_response.json()
                                # Tag every task with where it came from
                                for task in tasks:
                                    task['app_id'] = app_id
                                    task['stage_id'] = stage_id
                                    task['cluster_arn'] = arn
                                all_tasks.extend(tasks)

                results.append(json.dumps({"success": True, "tasks": all_tasks, "count": len(all_tasks)}))

            except Exception as e:
                # Return error JSON for this specific row instead of failing the whole batch
                results.append(json.dumps({"success": False, "error": str(e), "error_type": type(e).__name__}))

        # Yield the results for this batch as a Series
        yield pd.Series(results)
$$;

In [0]:
%sql
-- Get slowest Spark jobs for an EMR cluster.
-- Every job in a successful SHS response is returned; the result is ordered
-- by runtimesec descending.
CREATE OR REPLACE FUNCTION emr_getslowestjobs(
  clusterid STRING
)
RETURNS TABLE (
  jobId STRING,
  name STRING,
  description STRING,
  submissionTime STRING,
  completionTime STRING,
  stageIds STRING,
  status STRING,
  numTasks DOUBLE,
  numCompletedTasks DOUBLE,
  numSkippedTasks DOUBLE,
  numFailedTasks DOUBLE,
  numCompletedStages DOUBLE,
  numSkippedStages DOUBLE,
  numFailedStages DOUBLE,
  runtimesec LONG
)
COMMENT 'Calls SHS to get slowest jobs sorted by runtime'
RETURN

WITH shs_payload AS (
  -- Call the raw jobs UDF for the cluster and parse its JSON text
  SELECT try_parse_json(emr_listshsjobsraw(clusterid)) AS payload
),

typed_jobs AS (
  -- Keep only payloads flagged success = true and cast the "jobs" entry
  -- to a typed array of structs
  SELECT
    payload:jobs::ARRAY<STRUCT<
      jobId: STRING,
      name: STRING,
      description: STRING,
      submissionTime: STRING,
      completionTime: STRING,
      stageIds: STRING,
      status: STRING,
      numTasks: DOUBLE,
      numCompletedTasks: DOUBLE,
      numSkippedTasks: DOUBLE,
      numFailedTasks: DOUBLE,
      numCompletedStages: DOUBLE,
      numSkippedStages: DOUBLE,
      numFailedStages: DOUBLE
    >> AS job_list
  FROM shs_payload
  WHERE payload:success::BOOLEAN
)

-- One row per array element, plus the seconds elapsed between submission
-- and completion
SELECT
  job.jobId,
  job.name,
  job.description,
  job.submissionTime,
  job.completionTime,
  job.stageIds,
  job.status,
  job.numTasks,
  job.numCompletedTasks,
  job.numSkippedTasks,
  job.numFailedTasks,
  job.numCompletedStages,
  job.numSkippedStages,
  job.numFailedStages,
  TIMESTAMPDIFF(
    SECOND,
    TO_TIMESTAMP(job.submissionTime),
    TO_TIMESTAMP(job.completionTime)
  ) AS runtimesec
FROM typed_jobs
LATERAL VIEW EXPLODE(job_list) job_view AS job
ORDER BY runtimesec DESC

In [0]:
%sql
-- Get slowest Spark stages for an EMR cluster.
-- Every stage in a successful SHS response is returned; the result is ordered
-- by runtimesec descending.
CREATE OR REPLACE FUNCTION emr_getsloweststages(
  clusterid STRING
)
RETURNS TABLE (
  stageId STRING,
  attemptId STRING,
  name STRING,
  submissionTime STRING,
  completionTime STRING,
  status STRING,
  numTasks DOUBLE,
  numCompletedTasks DOUBLE,
  numSkippedTasks DOUBLE,
  numFailedTasks DOUBLE,
  memoryBytesSpilled LONG,
  diskBytesSpilled LONG,
  inputBytes LONG,
  inputRecords LONG,
  outputBytes LONG,
  outputRecords LONG,
  shuffleReadBytes LONG,
  shuffleReadRecords LONG,
  shuffleWriteBytes LONG,
  shuffleWriteRecords LONG,
  runtimesec LONG
)
COMMENT 'Calls SHS to get slowest stages sorted by runtime'
RETURN

WITH shs_payload AS (
  -- Call the raw stages UDF for the cluster and parse its JSON text
  SELECT try_parse_json(emr_listshsstagesraw(clusterid)) AS payload
),

typed_stages AS (
  -- Keep only payloads flagged success = true and cast the "stages" entry
  -- to a typed array of structs
  SELECT
    payload:stages::ARRAY<STRUCT<
      stageId: STRING,
      attemptId: STRING,
      name: STRING,
      submissionTime: STRING,
      completionTime: STRING,
      status: STRING,
      numTasks: DOUBLE,
      numCompletedTasks: DOUBLE,
      numSkippedTasks: DOUBLE,
      numFailedTasks: DOUBLE,
      memoryBytesSpilled: LONG,
      diskBytesSpilled: LONG,
      inputBytes: LONG,
      inputRecords: LONG,
      outputBytes: LONG,
      outputRecords: LONG,
      shuffleReadBytes: LONG,
      shuffleReadRecords: LONG,
      shuffleWriteBytes: LONG,
      shuffleWriteRecords: LONG
    >> AS stage_list
  FROM shs_payload
  WHERE payload:success::BOOLEAN
)

-- One row per array element, plus the seconds elapsed between submission
-- and completion
SELECT
  stage.stageId,
  stage.attemptId,
  stage.name,
  stage.submissionTime,
  stage.completionTime,
  stage.status,
  stage.numTasks,
  stage.numCompletedTasks,
  stage.numSkippedTasks,
  stage.numFailedTasks,
  stage.memoryBytesSpilled,
  stage.diskBytesSpilled,
  stage.inputBytes,
  stage.inputRecords,
  stage.outputBytes,
  stage.outputRecords,
  stage.shuffleReadBytes,
  stage.shuffleReadRecords,
  stage.shuffleWriteBytes,
  stage.shuffleWriteRecords,
  TIMESTAMPDIFF(
    SECOND,
    TO_TIMESTAMP(stage.submissionTime),
    TO_TIMESTAMP(stage.completionTime)
  ) AS runtimesec
FROM typed_stages
LATERAL VIEW EXPLODE(stage_list) stage_view AS stage
ORDER BY runtimesec DESC

In [0]:
%sql
-- Get slowest SQL executions for an EMR cluster.
-- Every SQL execution in a successful SHS response is returned; the result is
-- ordered by duration descending.
CREATE OR REPLACE FUNCTION emr_getslowestsql(
  clusterid STRING
)
RETURNS TABLE (
  id LONG,
  status STRING,
  description STRING,
  planDescription STRING,
  submissionTime STRING,
  duration LONG,
  successJobIds STRING,
  failedJobIds STRING,
  nodes ARRAY<STRUCT<nodeId: INT, nodeName: STRING, metrics: ARRAY<STRUCT<name: STRING, value: STRING>>>>
)
COMMENT 'Calls SHS to get slowest SQL queries sorted by duration'
RETURN

WITH shs_payload AS (
  -- Call the raw SQL-executions UDF for the cluster and parse its JSON text
  SELECT try_parse_json(emr_listshssqlraw(clusterid)) AS payload
),

typed_sql AS (
  -- Keep only payloads flagged success = true and cast the "sql" entry
  -- to a typed array of structs (plan nodes stay nested)
  SELECT
    payload:sql::ARRAY<STRUCT<
      id: LONG,
      status: STRING,
      description: STRING,
      planDescription: STRING,
      submissionTime: STRING,
      duration: LONG,
      successJobIds: STRING,
      failedJobIds: STRING,
      nodes: ARRAY<STRUCT<nodeId: INT, nodeName: STRING, metrics: ARRAY<STRUCT<name: STRING, value: STRING>>>>
    >> AS sql_list
  FROM shs_payload
  WHERE payload:success::BOOLEAN
)

-- One row per SQL execution in the array
SELECT
  sql_query.id,
  sql_query.status,
  sql_query.description,
  sql_query.planDescription,
  sql_query.submissionTime,
  sql_query.duration,
  sql_query.successJobIds,
  sql_query.failedJobIds,
  sql_query.nodes
FROM typed_sql
LATERAL VIEW EXPLODE(sql_list) sql_view AS sql_query
ORDER BY sql_query.duration DESC

In [0]:
%sql
-- Get a specific stage by ID.
-- Returns the entries of the SHS "stages" array whose stageId equals the
-- stageid argument, with the elapsed seconds between submission and
-- completion added as runtimesec.
CREATE OR REPLACE FUNCTION emr_getstage(
  clusterid STRING, 
  stageid INT
)
RETURNS TABLE (
  stageId STRING, 
  attemptId STRING, 
  name STRING, 
  submissionTime STRING, 
  completionTime STRING, 
  status STRING, 
  numTasks DOUBLE, 
  numCompletedTasks DOUBLE, 
  numSkippedTasks DOUBLE, 
  numFailedTasks DOUBLE,
  memoryBytesSpilled LONG, 
  diskBytesSpilled LONG, 
  inputBytes LONG, 
  inputRecords LONG, 
  outputBytes LONG, 
  outputRecords LONG, 
  shuffleReadBytes LONG, 
  shuffleReadRecords LONG, 
  shuffleWriteBytes LONG, 
  shuffleWriteRecords LONG, 
  runtimesec LONG
)
COMMENT 'Calls SHS to get a specific stage by ID'
RETURN

-- Call the raw stages UDF and parse its JSON text
WITH raw AS (
  SELECT try_parse_json(emr_listshsstagesraw(clusterid)) AS parsed_result
),

-- Keep only successful payloads and cast the "stages" entry to a typed array
stages_extracted AS (
  SELECT parsed_result:stages::ARRAY<STRUCT<
    stageId: STRING, 
    attemptId: STRING, 
    name: STRING, 
    submissionTime: STRING, 
    completionTime: STRING, 
    status: STRING, 
    numTasks: DOUBLE, 
    numCompletedTasks: DOUBLE, 
    numSkippedTasks: DOUBLE, 
    numFailedTasks: DOUBLE,
    memoryBytesSpilled: LONG, 
    diskBytesSpilled: LONG, 
    inputBytes: LONG, 
    inputRecords: LONG, 
    outputBytes: LONG, 
    outputRecords: LONG, 
    shuffleReadBytes: LONG, 
    shuffleReadRecords: LONG, 
    shuffleWriteBytes: LONG, 
    shuffleWriteRecords: LONG
  >> AS stagemetrics
  FROM raw
  WHERE parsed_result:success::BOOLEAN = true
),

-- One row per stage struct
exploded AS (
  SELECT EXPLODE(stagemetrics) AS stage
  FROM stages_extracted
)

SELECT 
  stage.stageId,
  stage.attemptId,
  stage.name,
  stage.submissionTime,
  stage.completionTime,
  stage.status,
  stage.numTasks,
  stage.numCompletedTasks,
  stage.numSkippedTasks,
  stage.numFailedTasks,
  stage.memoryBytesSpilled,
  stage.diskBytesSpilled,
  stage.inputBytes,
  stage.inputRecords,
  stage.outputBytes,
  stage.outputRecords,
  stage.shuffleReadBytes,
  stage.shuffleReadRecords,
  stage.shuffleWriteBytes,
  stage.shuffleWriteRecords,
  TIMESTAMPDIFF(SECOND, TO_TIMESTAMP(stage.submissionTime), TO_TIMESTAMP(stage.completionTime)) AS runtimesec
FROM exploded 
-- stage.stageId is a STRING while the parameter is an INT: compare as strings
-- explicitly instead of relying on implicit coercion. The parameter is
-- qualified with the function name because it differs from the struct field
-- stageId only by case, so an unqualified reference is easy to misread and
-- would silently bind to a column if one named stageid were ever in scope.
WHERE stage.stageId = CAST(emr_getstage.stageid AS STRING)

In [0]:
%sql
-- Get a specific executor by ID.
-- Returns the entries of the SHS "executors" array whose id equals the
-- executorid argument, with the seconds between addTime and removeTime
-- added as uptime.
CREATE OR REPLACE FUNCTION emr_getexecutor(
  clusterid STRING,
  executorid STRING
)
RETURNS TABLE (
  id STRING,
  memoryUsed DOUBLE,
  diskUsed DOUBLE,
  totalCores DOUBLE,
  addTime STRING,
  removeTime STRING,
  maxTasks DOUBLE,
  completedTasks DOUBLE,
  totalTasks DOUBLE,
  totalDuration DOUBLE,
  totalGCTime DOUBLE,
  totalInputBytes LONG,
  totalShuffleRead LONG,
  totalShuffleWrite LONG,
  maxMemory LONG,
  uptime LONG
)
COMMENT 'Calls SHS to get a specific executor by ID'
RETURN

WITH shs_payload AS (
  -- Call the raw executors UDF for the cluster and parse its JSON text
  SELECT try_parse_json(emr_listshsexecutorsraw(clusterid)) AS payload
),

typed_executors AS (
  -- Keep only payloads flagged success = true and cast the "executors" entry
  -- to a typed array of structs
  SELECT
    payload:executors::ARRAY<STRUCT<
      id: STRING,
      memoryUsed: DOUBLE,
      diskUsed: DOUBLE,
      totalCores: DOUBLE,
      addTime: STRING,
      removeTime: STRING,
      maxTasks: DOUBLE,
      completedTasks: DOUBLE,
      totalTasks: DOUBLE,
      totalDuration: DOUBLE,
      totalGCTime: DOUBLE,
      totalInputBytes: LONG,
      totalShuffleRead: LONG,
      totalShuffleWrite: LONG,
      maxMemory: LONG
    >> AS executor_list
  FROM shs_payload
  WHERE payload:success::BOOLEAN
)

-- One row per array element, filtered down to the requested executor
SELECT
  executor.id,
  executor.memoryUsed,
  executor.diskUsed,
  executor.totalCores,
  executor.addTime,
  executor.removeTime,
  executor.maxTasks,
  executor.completedTasks,
  executor.totalTasks,
  executor.totalDuration,
  executor.totalGCTime,
  executor.totalInputBytes,
  executor.totalShuffleRead,
  executor.totalShuffleWrite,
  executor.maxMemory,
  TIMESTAMPDIFF(
    SECOND,
    TO_TIMESTAMP(executor.addTime),
    TO_TIMESTAMP(executor.removeTime)
  ) AS uptime
FROM typed_executors
LATERAL VIEW EXPLODE(executor_list) executor_view AS executor
WHERE executor.id = executorid

In [0]:
%sql
-- Analyze Photon compatibility of SQL queries.
-- For each SQL execution, counts plan nodes whose name does not match a list
-- of patterns treated here as not Photon-compatible, and reports that count,
-- the total node count, and their ratio.
-- Note: photon_percentage is a ratio between 0 and 1 (compatible / total),
-- not a value scaled to 100.
CREATE OR REPLACE FUNCTION emr_photonanalysis(
  clusterid STRING
)
RETURNS TABLE (
  sql_id LONG,
  cluster_arn STRING,
  photon_compatible_nodes INT,
  total_nodes INT,
  photon_percentage DOUBLE
)
COMMENT 'Analyzes EMR Spark History Server SQL metrics to estimate how much of a Spark job would benefit from Photon'
RETURN

-- Call the raw SQL-executions UDF and parse its JSON text
WITH raw AS (
  SELECT try_parse_json(emr_listshssqlraw(clusterid)) AS parsed_result
),

-- Keep only successful payloads and cast the "sql" entry to a typed array
sql_extracted AS (
  SELECT parsed_result:sql::ARRAY<STRUCT<
    id: LONG, 
    status: STRING, 
    description: STRING, 
    planDescription: STRING, 
    submissionTime: STRING, 
    duration: LONG, 
    successJobIds: STRING, 
    failedJobIds: STRING,
    app_id: STRING,
    cluster_arn: STRING,
    nodes: ARRAY<STRUCT<nodeId: INT, nodeName: STRING, metrics: ARRAY<STRUCT<name: STRING, value: STRING>>>>
  >> AS sqlmetrics
  FROM raw
  WHERE parsed_result:success::BOOLEAN = true
),

-- One row per SQL execution
sql_exploded AS (
  SELECT EXPLODE(sqlmetrics) AS sql_query
  FROM sql_extracted
),

-- One row per plan node. app_id is carried along because SQL execution IDs
-- are only unique within a single Spark application; without it, executions
-- from different applications that share an ID would be merged below.
nodes_exploded AS (
  SELECT 
    sql_query.id AS sql_id,
    sql_query.app_id,
    sql_query.cluster_arn,
    node
  FROM sql_exploded
  LATERAL VIEW EXPLODE(sql_query.nodes) AS node
),

-- Flag each node: 0 when its name matches one of the listed patterns,
-- 1 otherwise
photon_check AS (
  SELECT 
    sql_id,
    app_id,
    cluster_arn,
    node.nodeName,
    CASE 
      WHEN node.nodeName LIKE '%MapElements%' THEN 0 
      WHEN node.nodeName LIKE '%MapPartitions%' THEN 0 
      WHEN node.nodeName LIKE '%Scan csv%' THEN 0
      WHEN node.nodeName LIKE '%Scan json%' THEN 0 
      WHEN node.nodeName LIKE '%PythonUDF%' THEN 0 
      WHEN node.nodeName LIKE '%ScalaUDF%' THEN 0 
      WHEN node.nodeName LIKE '%FlatMapGroupsInPandas%' THEN 0  
      WHEN node.nodeName LIKE '%DeserializeToObject%' THEN 0
      WHEN node.nodeName LIKE '%SerializeFromObject%' THEN 0  
      ELSE 1 
    END AS photon_compatible
  FROM nodes_exploded
),

-- Aggregate per SQL execution. app_id is part of the grouping key (but not of
-- the output, to keep the declared return schema unchanged).
-- NOTE(review): if the raw payload does not populate app_id it is NULL on
-- every row, and the grouping degenerates to (sql_id, cluster_arn).
aggregated AS (
  SELECT 
    sql_id,
    cluster_arn,
    SUM(photon_compatible) AS photon_compatible_nodes,
    COUNT(*) AS total_nodes,
    TRY_DIVIDE(SUM(photon_compatible), COUNT(*)) AS photon_percentage
  FROM photon_check 
  GROUP BY sql_id, app_id, cluster_arn
)

SELECT 
  sql_id,
  cluster_arn,
  photon_compatible_nodes,
  total_nodes,
  photon_percentage
FROM aggregated
ORDER BY photon_percentage DESC