In [0]:
from pyspark.sql import functions as F

# ============================================================
# 1. LOAD SILVER
# ============================================================
df = spark.read.format("delta").load(
    "<input_path>/orders"
)

# ============================================================
# 2. BUSINESS RULES
# ============================================================

# Rule: Drop invalid customer_id
df = df.filter(F.col("customer_id").isNotNull())

# Rule: Safe numeric conversions using expr(try_cast)
df = df.withColumn(
    "quantity",
    F.abs(F.expr("try_cast(quantity AS double)"))
)

df = df.withColumn(
    "total_amount",
    F.expr("try_cast(total_amount AS double)")
)

# Rule: Drop invalid numeric rows
df = df.filter(F.col("quantity").isNotNull())
df = df.filter(F.col("total_amount").isNotNull())

# Rule: Status cleanup
df = df.withColumn(
    "status",
    F.when(F.trim("status") == "", "unknown")
     .otherwise(F.col("status"))
)

# Rule: Drop invalid dates
df = df.filter(F.col("order_date").isNotNull())

# Rule: Round amount
df = df.withColumn("total_amount", F.round(F.col("total_amount"), 2))

# Rule: Add ingestion_date
df = df.withColumn("ingestion_date", F.current_timestamp())

# ============================================================
# 3. WRITE GOLD
# ============================================================
df.write.format("delta").mode("overwrite").save(
    "<output_path>/gold/orders"
)


In [0]:
%sql
SELECT *
FROM delta.`<output_path>/gold/orders`;


order_id,customer_id,product_code,quantity,total_amount,status,order_date,ingestion_date
O004,C096,CODE,1.0,652.85,,2026-01-21,2026-01-30T19:18:14.245Z
O010,C073,BAD,1.0,569.36,pending,2026-01-15,2026-01-30T19:18:14.245Z
O012,C124,CODE,1.0,951.04,cancelled,2026-01-01,2026-01-30T19:18:14.245Z
O014,C054,QMFG,5.0,666.8,completed,2026-01-24,2026-01-30T19:18:14.245Z
O015,C038,QFWI,1.0,908.51,pending,2026-01-14,2026-01-30T19:18:14.245Z
O018,C065,CODE,5.0,739.81,pending,2026-01-05,2026-01-30T19:18:14.245Z
O021,C150,MRPN,1.0,58.09,unknown,2026-01-12,2026-01-30T19:18:14.245Z
O037,C012,FNXY,1.0,406.99,cancelled,2026-01-25,2026-01-30T19:18:14.245Z
O041,C004,BAD,1.0,73.43,unknown,2026-01-04,2026-01-30T19:18:14.245Z
O052,C101,PBSY,1.0,647.82,completed,2026-01-03,2026-01-30T19:18:14.245Z
