In [0]:
from pyspark.sql import functions as F


# Validation patterns.
# Email: local part, "@", domain, then a dot followed by at least two letters.
email_regex = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"
# Phone: exactly 10 digits (applied after every non-digit character is stripped).
phone_regex = r"^\d{10}$"


silver_df = spark.table("silver.guests")


# Add validation flags for email and phone.
# rlike() evaluates to NULL when its input is NULL, which would make the flag
# (and data_valid below) NULL instead of False: such rows would be labelled
# "... INVALID" in the comment column yet be skipped by a `data_valid = false`
# filter. Coalescing to False makes a missing value count as invalid.
silver_df = (
    silver_df
    .withColumn(
        "email_valid",
        F.coalesce(F.col("email").rlike(email_regex), F.lit(False)),
    )
    .withColumn(
        "phone_valid",
        F.coalesce(
            F.regexp_replace(F.col("phone_number"), "[^0-9]", "").rlike(phone_regex),
            F.lit(False),
        ),
    )
)


# A row is valid only when both the email and the phone number are valid.
silver_df = silver_df.withColumn(
    "data_valid",
    F.col("email_valid") & F.col("phone_valid"),
)

# Record which validation passed or failed, e.g. "email OK | phone INVALID".
silver_df = silver_df.withColumn(
    "comment",
    F.concat_ws(
        " | ",
        F.when(F.col("email_valid"), F.lit("email OK"))
         .otherwise(F.lit("email INVALID")),
        F.when(F.col("phone_valid"), F.lit("phone OK"))
         .otherwise(F.lit("phone INVALID")),
    ),
)




In [0]:


# Persist the validated guests as a Delta table in the silver schema.
# NOTE(review): a later cell drops and rewrites this same table partitioned;
# confirm whether both writes are needed.
tablename = "guests_validated"

# Drop the table first if it already exists.
spark.sql(f"DROP TABLE IF EXISTS silver.{tablename}")

# Overwrite data and schema at the table's storage path and register the table.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", f"/mnt/silver/{tablename}/")
    .saveAsTable(f"silver.{tablename}")
)


In [0]:


# Select the rows that failed validation.
# The filter is null-safe: a NULL data_valid (which can arise when email or
# phone_number is NULL) is treated as invalid rather than silently dropped,
# as a plain `data_valid = false` comparison would do.
invalid_df = silver_df.filter(~F.coalesce(F.col("data_valid"), F.lit(False)))

# Inspect the first 10 failing rows.
# NOTE(review): the output contains guest emails and phone numbers; clear it
# before sharing the notebook.
invalid_df.show(10)



In [0]:
# Settings for the partitioned table: target name, partition column, and the
# column used for Z-ordering.
tablename = "guests_validated"
partition_column = "state"
z_order_column = "guest_number"

# Drop the table if it already exists.
spark.sql(f"DROP TABLE IF EXISTS silver.{tablename}")

# Write the Delta table partitioned by `partition_column`, overwriting data
# and schema at the table's storage path.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", f"/mnt/silver/{tablename}/")
    .partitionBy(partition_column)
    .saveAsTable(f"silver.{tablename}")
)

# Run OPTIMIZE with Z-ordering on `z_order_column`.
spark.sql(f"OPTIMIZE silver.{tablename} ZORDER BY ({z_order_column})")
