In [0]:
# COMMAND ---------- (cell 1) Widgets & helpers
# Register the two optional date-range widgets; both start out blank.
dbutils.widgets.text("start_date", "")
dbutils.widgets.text("end_date", "")

# Read the widgets back in one pass. A blank value is falsy, so `or None`
# turns "no value supplied" into None for the filter logic further down.
start, end = (
    dbutils.widgets.get(widget_name) or None
    for widget_name in ("start_date", "end_date")
)

import pyspark.sql.functions as F
from datetime import date
from delta.tables import DeltaTable
import requests

# COMMAND ---------- (cell 2) Read Bronze, de‑dup, optional date filter
# Load the hourly observations from the Bronze table.
bronze_raw = spark.read.table("weather_bronze.hourly")

# Keep a single row per (timestamp, latitude, longitude) combination.
bronze = bronze_raw.dropDuplicates(
    ["timestamp_utc", "location_lat", "location_lon"]
)

# Optional half-open window [start, end) on timestamp_utc.
# Each bound is applied as its own filter, and only when it was supplied.
# The previous single expression wrapped the whole comparison in a Python
# conditional, so a missing bound collapsed to a bare string literal
# (e.g. F.lit("1900-01-01")) instead of a boolean predicate, which broke
# the filter whenever only one of the two dates was given.
if start:
    bronze = bronze.filter(F.col("timestamp_utc") >= F.lit(start))
if end:
    bronze = bronze.filter(F.col("timestamp_utc") < F.lit(end))

# Roll the hourly rows up to one row per calendar date and location.
df_daily = (
    bronze
    .withColumn("date", F.to_date("timestamp_utc"))
    .groupBy("date", "location_lat", "location_lon")
    .agg(
        F.count("*").alias("row_count"),             # hourly rows seen that day
        F.avg("temp_c").alias("avg_temp_c"),
        F.max("wind_speed_kmh").alias("max_wind_kmh"),
        F.min("humidity_pct").alias("min_humidity_pct"),
    )
    # Stamp every output row with the time this batch was evaluated.
    .withColumn("process_ts", F.current_timestamp())
)

# COMMAND ---------- (cell 3) Data‑quality expectations
# Row-count expectation: a date strictly before today must have exactly
# 24 hourly rows; any other date may still be partial, so at most 24.
df_daily = df_daily.withColumn(
    "expect_row_count_ok",
    F.when(
        F.col("date") < F.current_date(),
        F.col("row_count") == 24
    ).otherwise(F.col("row_count") <= 24)
)

# Range expectations. `between` evaluates to NULL when the aggregate is
# NULL (a day/location with no non-null readings). A NULL flag would make
# `dq_passed` NULL, and a filter on `~dq_passed` silently skips NULL rows,
# so such a day would never be reported. Coalescing to False makes a
# missing measurement count as a failed expectation.
df_daily = df_daily.withColumn(
    "expect_temp_ok",
    F.coalesce(F.col("avg_temp_c").between(-60, 60), F.lit(False)),
)

df_daily = df_daily.withColumn(
    "expect_humidity_ok",
    F.coalesce(F.col("min_humidity_pct").between(0, 100), F.lit(False)),
)

# Overall verdict: the conjunction of every `expect_*` column present.
dq_cols = [c for c in df_daily.columns if c.startswith("expect_")]
df_daily = df_daily.withColumn(
    "dq_passed",
    F.expr(" AND ".join(dq_cols))
)

# COMMAND ---------- (cell 4) Upsert into Delta Silver
# The Silver database has to exist before the table is created or merged.
spark.sql("CREATE DATABASE IF NOT EXISTS weather_silver")
target = "weather_silver.daily"

if spark.catalog.tableExists(target):
    # Table already there: upsert on the (date, lat, lon) key. Matching rows
    # are overwritten with the fresh values, unseen keys are inserted.
    tgt = DeltaTable.forName(spark, target)
    (tgt.alias("t")
        .merge(
            source=df_daily.alias("s"),
            condition="""
                t.date          = s.date AND
                t.location_lat  = s.location_lat AND
                t.location_lon  = s.location_lon
            """
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # First run: create the table from this batch as a Delta table,
    # partitioned by date.
    (
        df_daily.write
        .format("delta")
        .partitionBy("date")
        .saveAsTable(target)
    )

# COMMAND ---------- (cell 5) Log all DQ issues & fail if needed

# --- Identify failing rows ---
# Rows of this batch whose combined DQ verdict is False.
bad = df_daily.filter(~F.col("dq_passed"))

# Count exactly once. Every count() is a separate Spark action that
# re-evaluates the un-cached lineage behind `bad`; the original called it
# three times, which tripled the work and allowed the logged and raised
# numbers to disagree.
bad_count = bad.count()

if bad_count > 0:
    print(f"⚠️ Logging {bad_count} DQ issue(s) into alerts table...")

    # Ensure the database exists; the alerts table itself is created by
    # the first append below if it is not there yet.
    spark.sql("CREATE DATABASE IF NOT EXISTS weather_silver")

    # Prepare alert rows (one per failing daily row).
    alerts_to_log = (
        bad.withColumn("alert_date", F.current_date())
           # Not applicable to row-level issues. Cast the NULL so the column
           # has a concrete numeric type instead of Spark's untyped NullType.
           .withColumn("dq_completeness_pct", F.lit(None).cast("double"))
           # concat_ws skips NULLs, so only the failed expectations appear.
           .withColumn("reason", F.concat_ws(
               "; ",
               F.when(~F.col("expect_row_count_ok"), F.lit("Row Count Mismatch")),
               F.when(~F.col("expect_temp_ok"), F.lit("Temp Out of Range")),
               F.when(~F.col("expect_humidity_ok"), F.lit("Humidity Out of Range"))
           ))
           .select("alert_date", "dq_completeness_pct", "reason", "date", "row_count")
    )

    (alerts_to_log.write
        .format("delta")
        .mode("append")
        .saveAsTable("weather_silver.alerts"))

    # Still fail the job (so email alerts trigger)
    raise ValueError(f"Data-quality failed for {bad_count} day(s) in this batch")

else:
    print("✅ No DQ issues in current batch.")

# COMMAND ---------- (Final Completeness DQ Alert Block – 7-day %)
# Percentage of Silver daily rows dated from 7 days ago onwards whose
# row-count expectation held, rounded to one decimal place.
dq_check = spark.sql(
    """SELECT 
  ROUND((SUM(CASE WHEN expect_row_count_ok = TRUE THEN 1 ELSE 0 END) * 100.0) / COUNT(*), 1)
  AS dq_completeness_pct
FROM weather_silver.daily
WHERE date >= date_add(current_date(), -7)"""
).collect()[0]["dq_completeness_pct"]

# `dq_check` is None when the window holds no rows (SUM over an empty set
# is NULL). Comparing None with 90 would raise TypeError, so that case is
# handled explicitly and reported as its own alert.
if dq_check is None or dq_check < 90:
    if dq_check is None:
        alert_msg = (
            "⚠️ Data Quality Alert: no rows in weather_silver.daily "
            "for the last 7 days"
        )
        alert_reason = "No rows in last 7 days"
        alert_pct = None
    else:
        alert_msg = f"⚠️ Data Quality Alert: Daily Completeness is {dq_check}% (<90%)"
        alert_reason = "Completeness <90%"
        alert_pct = float(dq_check)  # the SQL result is a Decimal

    # Log this as a separate alert row. The schema is spelled out because
    # column types cannot be inferred from all-None values (`date`,
    # `row_count`), and so that `alert_date` is written as a DATE, matching
    # what F.current_date() produces for the row-level alerts.
    spark.createDataFrame(
        [(date.today(), alert_pct, alert_reason, None, None)],
        "alert_date DATE, dq_completeness_pct DOUBLE, reason STRING, "
        "`date` DATE, row_count BIGINT",
    ).write.format("delta").mode("append").saveAsTable("weather_silver.alerts")

    # Fail the job
    raise ValueError(alert_msg)
else:
    print(f"✅ DQ completeness OK: {dq_check}%")
