In [0]:


# Silver layer transformation 
# dedup sample 
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

spark = SparkSession.builder.getOrCreate()

# Initial batch for July 31.
# NOTE: string literals must use plain ASCII quotes; typographic quotes
# (“ ”) are not valid Python string delimiters and raise a SyntaxError.
df1 = spark.createDataFrame(
    [("2025-07-31", "CA", 100)],
    ["event_date", "region", "cases"],
)

# Late-arrival batch: the very same record delivered a second time.
df2 = spark.createDataFrame(
    [("2025-07-31", "CA", 100)],
    ["event_date", "region", "cases"],
)

# Naïve append (duplicates!): union keeps both copies of the record.
df_naive = df1.union(df2)
print("Naïve count:", df_naive.count())   # 2

# With dedup on natural key (event_date, region) only one copy survives.
df_clean = df_naive.dropDuplicates(["event_date", "region"])
print("Deduped count:", df_clean.count()) # 1

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# 0) Build a toy "raw" DataFrame
spark = SparkSession.builder.getOrCreate()

# Nested struct attached to every raw event.
device_schema = (
    StructType()
    .add("os", StringType())
    .add("region", StringType())
)

# Apart from user_id, every field arrives as a string (or a struct); the
# numeric-looking columns are deliberately left as *_str for later casting.
raw_schema = (
    StructType()
    .add("user_id", IntegerType())
    .add("user_name", StringType())
    .add("date", StringType())
    .add("device", device_schema)
    .add("event_type", StringType())
    .add("cases_str", StringType())
    .add("deaths_str", StringType())
    .add("email", StringType())
)

# Three sample events: the second one has no date and no deaths value, and
# the third repeats the second user's event with the missing values present.
raw_rows = [
    (1, " Alice  ", "2025-08-01", {"os": "iOS", "region": " CA "}, "view_home", "100", "0", "secret@example.com"),
    (2, " Bob", None, {"os": "Android", "region": "NY"}, "addToCart", "200", None, "bob@ex.com"),
    (2, "Bob ", "2025-08-01", {"os": "Android", "region": "NY"}, "addToCart", "200", "5", "bob@ex.com"),
]

raw = spark.createDataFrame(raw_rows, schema=raw_schema)

print("=== raw ===")
display(raw)


# 1) Filter out bad rows: an event without a date cannot be placed in time.
step1 = raw.where(raw["date"].isNotNull())
print("=== step1: filter date IS NOT NULL ===")
display(step1)


# 2) Type-cast & rename: parse the date string and turn the *_str columns
#    into integers. Only the columns listed here are carried forward, so
#    user_name is left behind at this point.
step2 = step1.select(
    to_date(col("date"), "yyyy-MM-dd").alias("event_date"),
    "user_id",
    "event_type",
    col("cases_str").cast(IntegerType()).alias("cases"),
    col("deaths_str").cast(IntegerType()).alias("deaths"),
    "device",
    "email",
)
print("=== step2: cast & rename ===")
display(step2)


# 3) Drop unused columns (email is not needed by the following steps).
step3 = step2.drop("email")
print("=== step3: drop unused (email) ===")
display(step3)


# 4) Flatten nested structs: promote device.os / device.region to
#    top-level columns and leave the struct itself behind.
device = col("device")
step4 = step3.select(
    col("event_date"),
    col("user_id"),
    col("event_type"),
    col("cases"),
    col("deaths"),
    device.getField("os").alias("os"),
    device.getField("region").alias("region"),
)
print("=== step4: flatten device struct ===")
display(step4)


# 5) Normalize column names (skip since already snake_case)


# 6) Trim & clean strings: lower-case the region and strip the padding
#    around it so that equal regions compare equal.
region_clean = trim(lower(col("region")))
step6 = step4.withColumn("region", region_clean)
print("=== step6: trim & lower region ===")
display(step6)


# 7) Drop duplicates on the natural key (event_date, user_id, event_type).
# NOTE(review): no ordering is applied before de-duplicating, so which of
# several rows sharing a key is kept is not controlled here -- confirm
# that is acceptable for real data.
natural_key = ["event_date", "user_id", "event_type"]
step7 = step6.drop_duplicates(natural_key)
print("=== step7: dropDuplicates on (event_date,user_id,event_type) ===")
display(step7)


# 8) Surrogate key generation: SHA-256 over the natural-key columns joined
#    with "|", giving a stable event_id for a given key.
# NOTE(review): presumably concat_ws omits NULL parts rather than
# propagating NULL -- confirm, since that would let keys with a missing
# part produce the same joined string.
natural_key_joined = concat_ws(
    "|",
    col("event_date"),
    col("user_id"),
    col("event_type"),
)
step8 = step7.withColumn("event_id", sha2(natural_key_joined, 256))
print("=== step8: add event_id surrogate key ===")
display(step8)


# 9) Derive new columns (event_hour)
# NOTE(review): event_date comes from to_date() in step 2 and so carries no
# time-of-day; event_hour is therefore expected to be 0 on every row --
# confirm whether a timestamp source is intended here.
step9 = step8.withColumn("event_hour", hour("event_date"))
print("=== step9: add event_hour ===")
display(step9)


# 10) Conformance join: attach a feature_category to each event_type.
lookup = spark.createDataFrame(
    [("view_home", "Browsing"), ("addtocart", "ShoppingCart")],
    ["event_type", "feature_category"],
)

# The lookup spells its key "addtocart" while the events carry "addToCart",
# so an exact-match join would leave feature_category NULL for those rows.
# Match on a lower-cased, trimmed key instead; the event_type value shown
# in the output is left exactly as it arrived.
lookup_keyed = lookup.select(
    lower(trim(col("event_type"))).alias("event_type_key"),
    col("feature_category"),
)

# Keep the column order a plain `on="event_type"` join would produce:
# the join column first, then the remaining left columns, then the
# lookup's columns.
other_columns = [name for name in step9.columns if name != "event_type"]
step10 = (
    step9
    .withColumn("event_type_key", lower(trim(col("event_type"))))
    .join(lookup_keyed, on="event_type_key", how="left")
    .select("event_type", *other_columns, "feature_category")
)
print("=== step10: conformance join to lookup ===")
display(step10)


# 11) Mask or obfuscate PII (email hash)
# Email was dropped in step 3, so it has to be re-attached before it can be
# hashed. Take it from `raw` (one address per user_id) rather than from
# literals in the code: hard-coded addresses put PII in the notebook and
# would label every user other than user 1 with the same address.
# NOTE(review): if a user can have several addresses, the one kept by
# dropDuplicates is not controlled -- confirm one-email-per-user holds.
user_emails = (
    raw
    .where(col("email").isNotNull())
    .select("user_id", "email")
    .dropDuplicates(["user_id"])
)

# Left join so events for users without an email are kept; the trailing
# select keeps email as the last column, as a withColumn would.
step10_with_email = (
    step10
    .join(user_emails, on="user_id", how="left")
    .select(*step10.columns, "email")
)

# Only the SHA-256 digest leaves this step; the clear-text column is
# dropped. A user with no email gets a NULL email_hashed.
step11 = (
    step10_with_email
    .withColumn("email_hashed", sha2(col("email"), 256))
    .drop("email")
)
print("=== step11: mask PII (email_hashed) ===")
display(step11)


# 12) Fill defaults / NULL handling: a missing deaths value becomes 0.
step12 = step11.fillna(0, subset=["deaths"])
print("=== step12: fill null deaths → 0 ===")
display(step12)


# 13) Partition / repartition by event_date. This changes only how rows
#     are distributed, not the rows themselves.
step13 = step12.repartition("event_date")
print("=== step13: repartition by event_date (no visible diff) ===")
display(step13)


# 14) File-size tuning (coalesce) — no visible change in output.
#     Collapses the DataFrame into a single partition.
step14 = step13.coalesce(numPartitions=1)
print("=== step14: coalesce to 1 partition (no visible diff) ===")
display(step14)


# 15) Quality assertions / expectations
# A negative case count is never valid.
bad = step14.filter(col("cases") < 0)
bad_count = bad.count()
print("=== step15: bad rows count (cases < 0) ===", bad_count)

# `cases < 0` evaluates to NULL for a NULL cases value, so rows whose
# cases_str could not be cast in step 2 are invisible to the check above.
# Surface them separately so they do not pass unnoticed.
null_cases_count = step14.filter(col("cases").isNull()).count()
print("=== step15: rows with NULL cases ===", null_cases_count)

# Actually enforce the expectation: printing a count alone never stops a
# run that produced invalid data.
assert bad_count == 0, f"{bad_count} row(s) have negative cases"