In [0]:
from pyspark.sql.functions import col, count, expr

# Point the session at the catalog and schema that the unqualified
# table names in the checks below are resolved against.
for context_statement in ("USE CATALOG main", "USE SCHEMA ecommerce"):
    spark.sql(context_statement)


### Task 6 - Data Quality & Validation

Objective: Ensure correctness and reliability.

What You Need to Do:
- Perform row count checks
- Validate nulls in key columns
- Perform revenue sanity checks
- Fail the pipeline if validations fail


In [0]:
# ---------------------------------------------------------
# CHECK 1: ROW COUNT CHECK (every listed table must contain rows)
# ---------------------------------------------------------
# Tables inspected by this check; the first empty one aborts the run.
tables_to_check = [
    "customer",
    "fact_sales",
    "order_items",
    "orders",
    "products",
    "order_payments",
]

for table in tables_to_check:
    # Report the size of each table before deciding whether to stop.
    row_count = spark.read.table(table).count()
    print(f"Checking {table}: {row_count} rows found.")

    # A count of zero is the only failing condition.
    if not row_count:
        raise Exception(f"CRITICAL ERROR: Table {table} is empty! Pipeline stopped.")

print("--> Row Count Checks Passed.")


In [0]:
# ---------------------------------------------------------
# CHECK 2: NULL CHECK (Do we have Null Primary Keys?)
# ---------------------------------------------------------
# The key columns validated on fact_sales are order_id and order_item_id.
# A row is invalid when EITHER key is missing, so the two predicates are
# combined with OR (|). Combining them with AND (&) would only flag rows
# where both keys are null at the same time and let a single missing key
# slip through.
null_count = (
    spark.read.table("fact_sales")
    .filter(col("order_id").isNull() | col("order_item_id").isNull())
    .count()
)

# Any row with a missing key fails the pipeline.
if null_count > 0:
    raise Exception(
        f"DATA INTEGRITY ERROR: Found {null_count} rows with a null "
        "order_id or order_item_id in Fact Table!"
    )

print("--> Null ID Checks Passed.")



In [0]:
# ---------------------------------------------------------
# CHECK 3: BUSINESS LOGIC CHECK (Negative Revenue?)
# ---------------------------------------------------------
# Revenue below zero is treated as a sign of a problem in the data, so
# count every fact_sales row whose revenue is negative.
# NOTE(review): rows with a NULL revenue are not matched by `< 0` and are
# therefore not counted here; confirm whether that is intended.
negative_revenue_count = (
    spark.read.table("fact_sales")
    .filter(col("revenue") < 0)
    .count()
)

if negative_revenue_count > 0:
    # Report how many rows are affected, then stop: this notebook treats
    # negative revenue as a hard failure rather than a warning.
    print(f"WARNING: Found {negative_revenue_count} records with negative revenue.")
    raise Exception("BUSINESS LOGIC ERROR: Negative Revenue found! Pipeline stopped.")

print("--> Revenue Sanity Checks Passed.")

# Reached only when none of the checks in this cell raised.
print("------------------------------------------------")
print("SUCCESS: All Data Quality Checks Passed!")