In [0]:
from pyspark.sql import functions as F

# ============================================================
# 1. LOAD SILVER
# ============================================================
# Read the silver-layer customers Delta table.
customers_silver = spark.read.format("delta").load(
    "<input_location>/customers"
)

# ============================================================
# 2. BUSINESS RULES
# ============================================================

# Rule: Clean empty strings → NULL
# Only string-typed columns can contain blank text, so the rule is limited to
# them; trimming array/struct/map columns would fail analysis. `startswith`
# also matches collated string types, whose dtype text begins with "string".
string_columns = {
    name
    for name, dtype in customers_silver.dtypes
    if dtype.startswith("string")
}

# Build every cleaned column up front and apply them in ONE projection.
# Calling withColumn once per column in a loop adds a projection per call,
# which bloats the query plan on wide tables.
cleaned_columns = [
    F.when(F.trim(F.col(name)) == "", None).otherwise(F.col(name)).alias(name)
    if name in string_columns
    else F.col(name)
    for name in customers_silver.columns
]

df = (
    customers_silver
    .select(*cleaned_columns)
    # Rule: Drop invalid customer_id / rows missing essential identity fields.
    # Filtering once after the cleanup is sufficient: it removes both original
    # NULLs and blank ids that the cleanup just converted to NULL.
    .filter(F.col("customer_id").isNotNull())
    # Rule: Add ingestion_date (a timestamp taken at query execution).
    .withColumn("ingestion_date", F.current_timestamp())
)

# ============================================================
# 3. WRITE GOLD
# ============================================================
# Replace the gold customers table with the cleaned result.
# NOTE(review): there is no "/" between the location placeholder and "gold",
# unlike the input path above — confirm the placeholder ends with a separator.
df.write.format("delta").mode("overwrite").save(
    "<output_location>gold/customers"
)




In [0]:
%sql
-- Inspect the Delta table at the output location.
-- The back-quoted path identifier must be closed before the terminating ";".
-- NOTE(review): confirm this path resolves to the gold customers table
-- written by the previous cell.
SELECT *
FROM delta.`<output_location>`;


customer_id,first_name,last_name,email,phone,country,created_date,status,email_clean,phone_clean,country_clean,status_clean,date_clean,score_email,score_phone,score_date,score_status,survivor_score,ingestion_date
C001,PIWGPD,KyWmWZ,piwgpd.kywmwz@example.com,123 456 7890,UK,2026-01-02,ACTIVE,piwgpd.kywmwz@example.com,1234567890.0,UK,active,,1,1,1,1,4,2026-01-30T19:24:06.251Z
C002,xiXMfv,dJDwCL,1234567890,Canada,2026/01/26,,,1234567890,,2026/01/26,,,0,0,0,0,0,2026-01-30T19:24:06.251Z
C003,XVywTd,uahNkT,xvywtd.uahnkt@example.com,India,2026-01-07,,,xvywtd.uahnkt@example.com,,2026-01-07,,,1,0,0,0,1,2026-01-30T19:24:06.251Z
C004,LxhcHu,ntYxWc,invalid_email,123 456 7890,Canada,,unknown,invalid_email,1234567890.0,CANADA,unknown,,0,1,0,0,1,2026-01-30T19:24:06.251Z
C005,ZIuxJZ,QUyhSv,invalid_email,1234567890,Canada,,unknown,invalid_email,1234567890.0,CANADA,unknown,,0,1,0,0,1,2026-01-30T19:24:06.251Z
C006,uCoXDT,ZrLBCJ,test@example.com,+1 123 456 7890,USA,,Active,test@example.com,11234567890.0,USA,active,,1,1,0,1,3,2026-01-30T19:24:06.251Z
C007,YcmxLW,KhLCFq,invalid_email,UK,01-17-26,,,invalid_email,,01-17-26,,,0,0,0,0,0,2026-01-30T19:24:06.251Z
C008,mDFmjr,eufzck,mdfmjr.eufzck@example.com,UK,2026/01/03,,,mdfmjr.eufzck@example.com,,2026/01/03,,,1,0,0,0,1,2026-01-30T19:24:06.251Z
C009,rVJnsl,nFqBLk,123 456 7890,Canada,"1, 2",,,1234567890,,"1, 2",,,0,0,0,0,0,2026-01-30T19:24:06.251Z
C010,uuxjzW,hGNVuK,uuxjzw.hgnvuk@example.com,123 456 7890,UK,,active,uuxjzw.hgnvuk@example.com,1234567890.0,UK,active,,1,1,0,1,3,2026-01-30T19:24:06.251Z
