### Photon Compatibility Analysis
This cell analyzes the `sql_df` DataFrame to identify how much of a Spark application's workload can be accelerated by Photon.
It parses the JSON query plans, flags compatible operations, and calculates a "Photon Compatibility Score" for each application.
A higher score indicates a greater portion of the application's SQL workload is Photon-compatible, suggesting better performance. A low score highlights opportunities for optimization by refactoring non-compatible operations.

In [0]:
import logging

from pyspark.sql.functions import avg, col, count, explode, from_json, when
from pyspark.sql.functions import round as spark_round
from pyspark.sql.types import ArrayType, StringType, StructField, StructType

logger = logging.getLogger(__name__)

# Plan-node names this analysis treats as NOT accelerated by Photon.
# Any node whose nodeName equals one of these entries is scored as
# non-compatible; extend the list to suit the workloads being analyzed.
# Kept as a plain list because it is handed directly to Column.isin().
NON_PHOTON_OPERATIONS = [
    # Custom code paths: typed-Dataset operations and user-defined functions.
    "MapElements",            # often associated with Scala/Java UDFs
    "MapPartitions",          # can be used by various custom operations
    "PythonUDF",              # Python user-defined functions
    "ScalaUDF",               # Scala user-defined functions
    "FlatMapGroupsInPandas",  # Pandas UDFs
    # Text-format file scans.
    "Scan csv",
    "Scan json",
]

# Schema handed to from_json() when parsing a query plan: an array of structs
# of which only the nullable string field "nodeName" is declared, since that
# is the only attribute this analysis reads from each plan node.
NODES_SCHEMA = ArrayType(
    elementType=StructType(
        fields=[
            StructField(name="nodeName", dataType=StringType(), nullable=True),
        ]
    )
)

# Photon compatibility analysis.
#
# Reads the notebook-level DataFrames `sql_df` and `applications_df`, scores each
# application by the percentage of its exploded query-plan nodes whose nodeName
# is not listed in NON_PHOTON_OPERATIONS, and displays the applications ranked
# by that score. Failures are logged and printed instead of being re-raised.
try:
    # Ensure the required DataFrames exist and are not empty before proceeding.
    # head(1) fetches at most one row, so the emptiness check does not need to
    # count every row of sql_df.
    inputs_ready = (
        'sql_df' in locals() and sql_df is not None and len(sql_df.head(1)) > 0
        and 'applications_df' in locals() and applications_df is not None
    )

    if inputs_ready:
        logger.info("🚀 Starting Photon compatibility analysis...")
        print("🚀 Starting Photon compatibility analysis...")

        # Step 1: Get the application names from the applications_df DataFrame.
        # distinct() guards against duplicate rows multiplying the join output.
        app_names_df = applications_df.select(
            "cluster_id", "application_id", "application_name"
        ).distinct()

        # Step 2: Join with app names, parse the raw JSON, and explode the plan nodes.
        # A left join ensures all SQL queries are kept, even if an app name is missing.
        # explode() emits one row per parsed node; a query whose 'nodes' value is
        # null or empty contributes no rows to the score.
        # NOTE(review): assumes sql_raw_json holds a JSON array of node objects
        # and that sql_df carries a cluster_name column — confirm upstream.
        exploded_nodes_df = sql_df.join(
            app_names_df,
            on=["cluster_id", "application_id"],
            how="left"
        ).withColumn(
            "nodes",
            from_json(col("sql_raw_json"), NODES_SCHEMA)
        ).select(
            "cluster_id",
            "cluster_name",
            "application_id",
            "application_name",
            "sql_id",
            explode(col("nodes")).alias("node")
        )

        # Step 3: Flag operations that are compatible with Photon.
        # 'is_photon_op' is 0 when nodeName exactly equals an entry of
        # NON_PHOTON_OPERATIONS and 1 otherwise (including a null nodeName).
        # NOTE(review): the match is exact string equality; node names that carry
        # extra text (for example a table name after "Scan csv") would not
        # match — verify against real plans.
        photon_check_df = exploded_nodes_df.withColumn(
            "is_photon_op",
            when(col("node.nodeName").isin(NON_PHOTON_OPERATIONS), 0).otherwise(1)
        )

        # Step 4: Calculate the percentage of compatible operations for each application.
        # The average of the 0/1 flag times 100 is the percentage. Spark's round()
        # (imported as spark_round) is required here: Python's builtin round()
        # cannot operate on a Column and raises TypeError.
        photon_analysis_df = photon_check_df.groupBy(
            "cluster_id", "cluster_name", "application_id", "application_name"
        ).agg(
            spark_round(avg(col("is_photon_op")) * 100, 2).alias("photon_compatibility_pct")
        ).orderBy(col("photon_compatibility_pct").desc())

        logger.info("✅ Photon compatibility analysis complete.")
        print("✅ Photon compatibility analysis complete.")
        print("📊 Displaying applications ranked by their Photon Compatibility Score:")

        # Step 5: Display the final results, now including the names.
        # display() is expected to be supplied by the notebook environment.
        display(photon_analysis_df)

    else:
        logger.warning("⚠️ 'sql_df' or 'applications_df' is not available or is empty. Skipping Photon analysis.")
        print("⚠️ 'sql_df' or 'applications_df' is not available or is empty. Skipping Photon analysis.")

except NameError:
    # Any unresolved name inside the cell lands here; the message assumes the
    # missing name is one of the input DataFrames.
    logger.exception("❌ A required DataFrame ('sql_df' or 'applications_df') was not found. Please ensure the main analysis has been run successfully.")
    print("❌ A required DataFrame ('sql_df' or 'applications_df') was not found. Please ensure the main analysis has been run successfully.")
except Exception as e:
    # Top-level boundary for the cell: report the failure with its traceback
    # instead of aborting the notebook run.
    logger.exception("❌ An unexpected error occurred during Photon analysis: %s", e)
    print(f"❌ An unexpected error occurred during Photon analysis: {e}")
