In [0]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, when, avg, count, sum, lit, row_number
from pyspark.sql.window import Window
from datetime import datetime
import uuid
import traceback


# ------------------------------
# Spark session for this job (getOrCreate returns an existing session if any)
# ------------------------------
spark = (
    SparkSession.builder
    .appName("HealthcareGoldLayerExtended")
    .getOrCreate()
)

# ------------------------------
# Random UUID string identifying this run
# ------------------------------
job_id = str(uuid.uuid4())

# ------------------------------
# Logging function
# ------------------------------
def log_step(step_name, status="INFO", message=""):
    """Append one log entry for the current run to the Delta log table.

    Args:
        step_name: Short identifier of the pipeline step being logged.
        status: Status label stored with the entry (defaults to "INFO").
        message: Free-form detail text (defaults to an empty string).

    The entry is written as a single-row DataFrame appended to
    ``logs.curated_logs.logs`` and carries the module-level ``job_id``.
    """
    # Local import keeps this function self-contained; the file only imports
    # ``datetime`` at module level.
    from datetime import timezone

    # ``datetime.utcnow()`` is deprecated since Python 3.12. Take the aware UTC
    # time and drop tzinfo so the stored ISO string keeps the exact same
    # format as before (no "+00:00" suffix).
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)

    entry = Row(
        job_id=job_id,
        timestamp=utc_now.isoformat(),
        step=step_name,
        status=status,
        message=message,
    )
    (
        spark.createDataFrame([entry])
        .write
        .format("delta")
        .mode("append")
        .saveAsTable("logs.curated_logs.logs")  # Gold log path
    )

def _publish_gold_table(result_df, table_name, step_name):
    """Overwrite a curated Delta table with ``result_df``, then log and display it.

    Args:
        result_df: Aggregated DataFrame to persist.
        table_name: Fully qualified target table name.
        step_name: Step identifier passed to ``log_step``.
    """
    (
        result_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(table_name)
    )
    # Count the rows that were actually persisted instead of re-running the
    # whole aggregation just to produce the log message.
    saved_rows = spark.table(table_name).count()
    short_name = table_name.rsplit(".", 1)[-1]
    log_step(step_name, "INFO", f"Saved {saved_rows} rows to {short_name}")
    display(result_df)


try:
    log_step("START", "INFO", "Gold Layer extended aggregations started")

    # ------------------------------
    # Read Silver Layer data
    # ------------------------------
    silver_df = spark.table("processed_outputs.silver.processed")
    log_step("LOAD_SILVER", "INFO", f"Loaded {silver_df.count()} rows from Silver Layer")

    # ------------------------------
    # Compute Patient Risk Score
    # ------------------------------
    # Weighted sum of condition flags plus fixed bonuses for current smokers
    # and BMI above 30.
    # NOTE(review): presumably the Had* columns are non-null numeric 0/1
    # flags; a NULL in any of them would make RiskScore NULL for that row
    # -- confirm against the Silver layer contract.
    gold_df = silver_df.withColumn(
        "RiskScore",
        col("HadHeartAttack")*3 +
        col("HadStroke")*3 +
        col("HadDiabetes")*2 +
        col("HadCOPD")*2 +
        col("HadAsthma")*1 +
        when(col("SmokerStatus")=="CURRENT", 2).otherwise(0) +
        when(col("BMI")>30, 2).otherwise(0)
    )
    log_step("COMPUTE_RISKSCORE", "INFO", "RiskScore column computed")

    # ------------------------------
    # Aggregation 1: By State & AgeCategory
    # ------------------------------
    aggregated_df = gold_df.groupBy("State", "AgeCategory").agg(
        avg("RiskScore").alias("AvgRiskScore"),
        count("*").alias("PatientCount"),
        sum("HadDiabetes").alias("DiabetesCases"),
        sum("HadHeartAttack").alias("HeartAttackCases")
    ).withColumn("DataSource", lit("Kaggle Heart Disease Dataset"))
    _publish_gold_table(
        aggregated_df,
        "processed_outputs.curated.health_metrics",
        "SAVE_HEALTH_METRICS",
    )

    # ------------------------------
    # Aggregation 2: Hospital Ranking by Patient Count
    # ------------------------------
    hospital_patient_count = gold_df.groupBy("Hospital_Name").agg(
        count("*").alias("PatientCount")
    )
    # Hospital_Name is a secondary sort key so hospitals with equal patient
    # counts get the same rank on every run; without it row_number() breaks
    # ties arbitrarily.
    window_spec = Window.orderBy(
        col("PatientCount").desc(),
        col("Hospital_Name").asc(),
    )
    ranked_hospitals = hospital_patient_count.withColumn(
        "Rank", row_number().over(window_spec)
    )
    _publish_gold_table(
        ranked_hospitals,
        "processed_outputs.curated.hospital_rankings",
        "SAVE_HOSPITAL_RANKINGS",
    )

    # ------------------------------
    # Aggregation 3: Average RiskScore by Hospital
    # ------------------------------
    risk_by_hospital = gold_df.groupBy("Hospital_Name").agg(
        avg("RiskScore").alias("AvgRiskScore"),
        count("*").alias("PatientCount")
    )
    _publish_gold_table(
        risk_by_hospital,
        "processed_outputs.curated.risk_by_hospital",
        "SAVE_RISK_BY_HOSPITAL",
    )

    # ------------------------------
    # Aggregation 4: Average RiskScore by SmokerStatus
    # ------------------------------
    risk_by_smoker = gold_df.groupBy("SmokerStatus").agg(
        avg("RiskScore").alias("AvgRiskScore"),
        count("*").alias("PatientCount")
    )
    _publish_gold_table(
        risk_by_smoker,
        "processed_outputs.curated.risk_by_smoker",
        "SAVE_RISK_BY_SMOKER",
    )

    # ------------------------------
    # Aggregation 5: Average RiskScore by RaceEthnicityCategory
    # ------------------------------
    risk_by_race = gold_df.groupBy("RaceEthnicityCategory").agg(
        avg("RiskScore").alias("AvgRiskScore"),
        count("*").alias("PatientCount")
    )
    _publish_gold_table(
        risk_by_race,
        "processed_outputs.curated.risk_by_race",
        "SAVE_RISK_BY_RACE",
    )

    # ------------------------------
    # End of Job
    # ------------------------------
    log_step("END", "SUCCESS", "Gold Layer extended aggregations completed successfully ✅")
    print("✅ Gold Layer extended aggregations generated successfully!")

except Exception:
    # Capture the traceback first, then log it on a best-effort basis: if the
    # log table itself is unavailable, the pipeline error must still be the
    # exception that propagates, not the logging failure.
    failure_trace = traceback.format_exc()
    try:
        log_step("ERROR", "FAIL", failure_trace)
    except Exception:
        print("Could not write the ERROR entry to the log table:")
        traceback.print_exc()
    raise


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.