In [0]:
import os
# Export SPARK_VERSION=3.3 into this process's environment.
# NOTE(review): nothing in this notebook reads the variable; presumably a
# Spark-version-aware library used elsewhere consumes it - confirm or remove.
os.environ.update(SPARK_VERSION="3.3")

In [0]:
# =============================================================================
# TRAVEL BOOKING SCD2 MERGE PROJECT - DATA QUALITY: BOOKING DATA VALIDATION
# =============================================================================
# Purpose: Native PySpark DQ validation for Booking data (Serverless Compatible)

from pyspark.sql import functions as F
import datetime as _dt

# =============================================================================
# PARAMETER EXTRACTION
# =============================================================================
def validate_arrival_date(raw_value):
    """Return *raw_value* normalised to ``YYYY-MM-DD``.

    The value is later fed to ``F.to_date`` to select the partition and is
    written to the audit table as ``business_date``. Validating it here makes
    a malformed parameter fail immediately with a clear message instead of
    silently selecting no rows and logging audit records with a NULL date.

    Args:
        raw_value: The date as supplied by the job parameter / widget.

    Returns:
        The date as a zero-padded ISO string, e.g. ``"2024-01-05"``.

    Raises:
        ValueError: If the value is empty or not a real calendar date in
            ``YYYY-MM-DD`` form.
    """
    candidate = str(raw_value).strip()
    try:
        parsed = _dt.datetime.strptime(candidate, "%Y-%m-%d")
    except ValueError as err:
        raise ValueError(
            f"arrival_date must be a valid YYYY-MM-DD date, got {raw_value!r}"
        ) from err
    return parsed.date().isoformat()


try:
    arrival_date = dbutils.widgets.get("arrival_date")
except Exception:
    # Deliberately broad: the lookup fails in different ways depending on the
    # runtime (widget not defined, `dbutils` not available at all). In every
    # such case fall back to validating today's data.
    arrival_date = _dt.date.today().strftime("%Y-%m-%d")

# Fail fast on a bad parameter before any table is touched.
arrival_date = validate_arrival_date(arrival_date)

catalog = "travel_bookings"
dataset_name = "booking_inc"

# =============================================================================
# DQ RESULTS STORAGE SETUP
# =============================================================================
# Idempotent DDL: both statements are IF NOT EXISTS, so re-running the
# notebook leaves an existing schema / audit table untouched.
ops_schema_ddl = f"CREATE SCHEMA IF NOT EXISTS {catalog}.ops"
dq_results_ddl = f"""
CREATE TABLE IF NOT EXISTS {catalog}.ops.dq_results (
  business_date DATE,
  dataset STRING,
  check_name STRING,
  status STRING,
  constraint STRING,
  message STRING,
  recorded_at TIMESTAMP
) USING DELTA
"""

# The schema must exist before the table that lives in it, so order matters.
for ddl_statement in (ops_schema_ddl, dq_results_ddl):
    spark.sql(ddl_statement)

# =============================================================================
# SOURCE DATA PREPARATION
# =============================================================================
# Only the rows whose business_date equals the requested arrival date are
# validated.
src = (
    spark.table(f"{catalog}.bronze.{dataset_name}")
    .where(F.col("business_date") == F.to_date(F.lit(arrival_date)))
)

# =============================================================================
# DATA QUALITY CHECKS EXECUTION (NATIVE SQL)
# =============================================================================


def _violation_count(condition):
    """Return an aggregate column counting the rows where *condition* is true.

    ``when`` without ``otherwise`` yields NULL for non-matching rows and
    ``count`` ignores NULLs, so the result is the number of matches. Unlike
    ``sum(when(...).otherwise(0))``, ``count`` returns 0 rather than NULL when
    the input has no rows, so an empty partition reports zero violations
    instead of a ``None`` metric that would be misread as a failed check.
    """
    return F.count(F.when(condition, 1))


# All metrics are computed in one aggregation and collected as a single Row.
metrics = src.agg(
    F.count("*").alias("row_count"),
    _violation_count(F.col("customer_id").isNull()).alias("null_customer_id"),
    _violation_count(F.col("amount").isNull()).alias("null_amount"),
    # NULL values compare as NULL (not true), so they are not counted as
    # negatives here; missing amounts are reported by the completeness metric.
    _violation_count(F.col("amount") < 0).alias("neg_amount"),
    _violation_count(F.col("quantity") < 0).alias("neg_quantity"),
    _violation_count(F.col("discount") < 0).alias("neg_discount"),
).collect()[0]

# =============================================================================
# DQ RESULTS TRANSFORMATION
# =============================================================================
# Map the collected metrics into the audit table schema
# Declarative description of every check:
# (check_name, metric key, constraint label, message prefix, pass predicate)
check_specs = [
    ("Size Check", "row_count", "hasSize > 0", "Count", lambda n: n > 0),
    ("Completeness: customer_id", "null_customer_id", "isComplete", "Nulls", lambda n: n == 0),
    ("Completeness: amount", "null_amount", "isComplete", "Nulls", lambda n: n == 0),
    ("Integrity: amount", "neg_amount", "isNonNegative", "Negatives", lambda n: n == 0),
    ("Integrity: quantity", "neg_quantity", "isNonNegative", "Negatives", lambda n: n == 0),
    ("Integrity: discount", "neg_discount", "isNonNegative", "Negatives", lambda n: n == 0),
]

# One audit tuple per check, in the column order of the audit table
# (recorded_at is added on the DataFrame below).
results_data = [
    (
        arrival_date,
        dataset_name,
        check_name,
        "Success" if passes(metrics[metric_key]) else "Error",
        constraint_label,
        f"{message_prefix}: {metrics[metric_key]}",
    )
    for check_name, metric_key, constraint_label, message_prefix, passes in check_specs
]

result_columns = ["business_date", "dataset", "check_name", "status", "constraint", "message"]
raw_results_df = spark.createDataFrame(results_data, result_columns)

# business_date arrives as a string and is cast to a date; recorded_at stamps
# the rows with the current timestamp.
dq_results_df = (
    raw_results_df
    .withColumn("business_date", F.to_date(F.col("business_date")))
    .withColumn("recorded_at", F.current_timestamp())
)

# =============================================================================
# DQ RESULTS LOGGING & VALIDATION
# =============================================================================
display(dq_results_df)

# Write to the audit table BEFORE evaluating the gate so that failing runs
# are recorded as well.
dq_results_df.write.mode("append").option("mergeSchema", "true").saveAsTable(f"{catalog}.ops.dq_results")

# Fail if any check returned an 'Error' status. The statuses are already
# available in the local results list (tuple layout: business_date, dataset,
# check_name, status, constraint, message), so no extra Spark action is needed
# and the failing checks can be named in the error.
failed_checks = [
    check_name
    for _, _, check_name, status, _, _ in results_data
    if status == "Error"
]
if failed_checks:
    raise ValueError(
        f"DQ failed for {dataset_name}. Check {catalog}.ops.dq_results for details. "
        f"Failed checks: {', '.join(failed_checks)}"
    )

print(f"Booking DQ passed for {arrival_date}")
