In [0]:
import os
# Export SPARK_VERSION into the process environment before any other cell runs.
# NOTE(review): nothing visible in this notebook reads this variable; presumably
# a library loaded later expects it. Confirm before changing or removing.
os.environ.update({"SPARK_VERSION": "3.3"})

In [0]:
from pyspark.sql import functions as F
import datetime as _dt

# 1. PARAMETERS (Consistent with your existing SCD2 project)
# Fixed identifiers for the catalog, schema and dataset being checked.
dataset_name = "customer_inc"
schema = "default"
catalog = "travel_bookings"

# Prefer the "arrival_date" widget value. If reading the widget raises for any
# reason (for example, dbutils is not available in this session), fall back to
# today's date formatted as YYYY-MM-DD.
try:
    arrival_date = dbutils.widgets.get("arrival_date")
except Exception:
    arrival_date = f"{_dt.date.today():%Y-%m-%d}"

# 2. SOURCE DATA
# Read the dataset's bronze table and keep only the rows whose business_date
# equals the requested arrival date (the date string is parsed with to_date).
src = (
    spark.table(f"{catalog}.bronze.{dataset_name}")
    .filter(F.col("business_date") == F.to_date(F.lit(arrival_date)))
)

# 3. DEFINE QUALITY CHECKS (Native Spark SQL)
# The row count and the per-column null counts are computed in ONE aggregation,
# so the filtered source is evaluated once instead of once per metric.
#
# Each null count is count(when(<col> IS NULL, 1)): when() yields NULL for rows
# that do not match, count() skips NULLs, and count() returns 0 for an empty
# input. A sum(...) aggregate would return NULL when no rows match the arrival
# date, which would make every completeness check report
# "Found None null values" and be flagged as an error.
#
# null_counts stays a Row keyed by null_name / null_address / null_email;
# row_count is appended as the last field so existing positions are unchanged.
null_counts = src.agg(
    F.count(F.when(F.col("customer_name").isNull(), 1)).alias("null_name"),
    F.count(F.when(F.col("customer_address").isNull(), 1)).alias("null_address"),
    F.count(F.when(F.col("email").isNull(), 1)).alias("null_email"),
    F.count(F.lit(1)).alias("row_count"),
).collect()[0]

# Row Count Check: the dataset must contain at least one row for the date.
row_count = null_counts["row_count"]
has_size_passed = row_count > 0

# 4. PREPARE DQ RESULTS FOR LOGGING
# Each result is a 6-tuple: date, dataset, check name, status, constraint text,
# and a human-readable message. Status is "Success" when the check's condition
# holds and "Error" otherwise.
size_result = (
    arrival_date,
    dataset_name,
    "Size Check",
    "Success" if has_size_passed else "Error",
    "Size > 0",
    f"Found {row_count} rows",
)

# Completeness checks: a column passes only when its null count is exactly 0.
completeness_results = [
    (
        arrival_date,
        dataset_name,
        f"Completeness: {column}",
        "Success" if null_counts[count_key] == 0 else "Error",
        f"{column} is not null",
        f"Found {null_counts[count_key]} null values",
    )
    for column, count_key in (
        ("customer_name", "null_name"),
        ("customer_address", "null_address"),
        ("email", "null_email"),
    )
]

results_list = [size_result, *completeness_results]

# Build the logging DataFrame from the result tuples, then convert
# business_date from its string form to a date and stamp each row with the
# current timestamp in recorded_at.
dq_columns = ["business_date", "dataset", "check_name", "status", "constraint", "message"]
dq_log_df = spark.createDataFrame(results_list, dq_columns)
dq_log_df = dq_log_df.withColumn("business_date", F.to_date(F.col("business_date")))
dq_log_df = dq_log_df.withColumn("recorded_at", F.current_timestamp())

# 5. STORAGE & ERROR HANDLING
# Append this run's results to the ops table exactly once. The append must not
# be repeated: a second identical write would log every check result twice.
# NOTE(review): mergeSchema is passed to the writer as an option; presumably the
# target is a Delta table where this permits schema evolution. Confirm.
dq_log_df.write.mode("append").option("mergeSchema", "true").saveAsTable(f"{catalog}.ops.dq_results")

# Fail the pipeline if any check is 'Error'.
# The failing rows are displayed first so they are visible in the notebook
# output before the ValueError stops the run.
failed_checks = dq_log_df.where(F.col("status") == "Error")
failed_count = failed_checks.count()
if failed_count > 0:
    display(failed_checks)
    failure_message = f"Data Quality Failed for {dataset_name} on {arrival_date}. See {catalog}.ops.dq_results for details."
    raise ValueError(failure_message)

print(f"Customer DQ passed for {arrival_date}")
