In [0]:
# Silver Layer Ingestion for telemetry1 events
# Spark 4.0.0, serverless cluster compatible

from pyspark.sql.functions import (
    current_timestamp, lit, col, date_format,
    lower, trim, monotonically_increasing_id,
    when, substring, concat, date_sub, current_date
)
from pyspark.sql.types import TimestampType
from delta.tables import DeltaTable

# 0) Types used to define the user_profile Delta table schema (the table is initialized below so the later read won't fail)
from pyspark.sql.types import StructType, StructField, StringType
# Storage locations (DBFS) for each layer touched by this job:
# Bronze input, Silver output, quarantined records, and the user_profile dimension.
bronze_path = "dbfs:/tmp/bronze/telemetry1/"
silver_path = "dbfs:/tmp/silver/telemetry1/"
quarantine_path = "dbfs:/tmp/quarantine/telemetry1/"
user_profile_path = "dbfs:/tmp/dim/user_profile/"


# Minimal schema for the user_profile dimension: only the nullable join key.
user_profile_schema = StructType([
    StructField("user_id", StringType(), True)
])

# Bootstrap the user_profile Delta table so the later read does not fail on a
# missing table. Save mode "ignore" writes the empty table only when nothing
# exists at the path yet; the previous "overwrite" replaced any existing
# profile data with an empty table on every run.
spark.createDataFrame([], user_profile_schema) \
    .write \
    .format("delta") \
    .mode("ignore") \
    .save(user_profile_path)


# Ensure the output directories exist (Silver, quarantine, user_profile),
# created in that order.
for _layer_dir in (silver_path, quarantine_path, user_profile_path):
    dbutils.fs.mkdirs(_layer_dir)

# Load the Bronze Delta table that feeds the rest of the pipeline.
# NOTE(review): "mergeSchema" is passed as a read option here with the intent
# of handling schema drift; confirm it has an effect on Delta reads.
bronze_reader = spark.read.format("delta").option("mergeSchema", "true")
bronze_df = bronze_reader.load(bronze_path)

# 1) Null / Garbage-Value Removal
# Keep only rows where every required field is present and the event type
# is not the "BAD_EVENT" sentinel.
has_user = col("user_id").isNotNull()
has_app = col("app_name").isNotNull()
has_event_type = col("event_type").isNotNull()
not_bad_event = col("event_type") != "BAD_EVENT"
has_event_ts = col("event_timestamp").isNotNull()

clean_df = bronze_df.filter(
    has_user & has_app & has_event_type & not_bad_event & has_event_ts
)

# 2) Data-Quality Checks & Quarantines
# Rows whose event_timestamp lies in the future are appended to the
# quarantine location as JSON; all other rows continue through the pipeline.
# NOTE(review): now_ts is a column expression rather than a fixed Python
# value, and the two filters run in separate queries; confirm boundary rows
# cannot end up in both outputs.
now_ts = current_timestamp()
event_ts = col("event_timestamp")

good_df = clean_df.filter(event_ts <= now_ts)
bad_df = clean_df.filter(event_ts > now_ts)

quarantine_writer = bad_df.write.mode("append")
quarantine_writer.json(quarantine_path)

# Promote the nested device.os / device.region fields to top-level "os" and
# "region" columns so they can be used as dedupe and join keys.
device_os = col("device.os")
device_region = col("device.region")
flat_df = good_df.withColumn("os", device_os).withColumn("region", device_region)

# 3) Deduplication
# Two rows are duplicates when they agree on all of these columns.
dedup_keys = [
    "user_id",
    "app_name",
    "event_type",
    "event_timestamp",
    "os",
    "region",
]
dedup_df = flat_df.dropDuplicates(dedup_keys)

# 4) Conformance Join to Dimensions
def make_dim(data, cols):
    """Build a dimension DataFrame from in-memory rows.

    Args:
        data: Row data passed straight to ``spark.createDataFrame``.
        cols: Column names (schema argument) for the resulting DataFrame.

    Returns:
        The DataFrame created by ``spark.createDataFrame(data, cols)``.
    """
    dim_df = spark.createDataFrame(data, cols)
    return dim_df
# Static lookup tables: each maps a natural key to an integer id.
app_dim = make_dim(
    [("Photos", 1), ("Mail", 2), ("Chat", 3), ("Store", 4), ("Search", 5)],
    ["app_name", "app_id"],
)
event_dim = make_dim(
    [("click", 1), ("view", 2), ("purchase", 3), ("login", 4), ("logout", 5), ("scroll", 6)],
    ["event_type", "event_type_id"],
)
os_dim = make_dim(
    [("iOS", 1), ("Android", 2), ("Windows", 3), ("Linux", 4)],
    ["os", "os_id"],
)
region_dim = make_dim(
    [("IN", 1), ("US", 2), ("EU", 3), ("APAC", 4), ("LATAM", 5)],
    ["region", "region_id"],
)

# Left-join each dimension on its natural key, in a fixed order, so events
# without a matching dimension row are kept rather than dropped.
conf_df = dedup_df
for _dim_df, _join_key in (
    (app_dim, "app_name"),
    (event_dim, "event_type"),
    (os_dim, "os"),
    (region_dim, "region"),
):
    conf_df = conf_df.join(_dim_df, _join_key, "left")

# 5) Type Enforcement & Casting
# Cast every dimension id to int, then event_timestamp to a timestamp.
typed_df = conf_df
for _id_col in ("app_id", "event_type_id", "os_id", "region_id"):
    typed_df = typed_df.withColumn(_id_col, col(_id_col).cast("int"))
typed_df = typed_df.withColumn(
    "event_timestamp", col("event_timestamp").cast(TimestampType())
)

# 6) Standardization & Normalization
# Add trimmed (and, for names, lower-cased) copies of the text keys; the
# original columns are left unchanged.
app_name_norm_expr = lower(trim(col("app_name")))
event_type_norm_expr = lower(trim(col("event_type")))
user_id_norm_expr = trim(col("user_id"))

normalized_df = typed_df.withColumn("app_name_norm", app_name_norm_expr)
normalized_df = normalized_df.withColumn("event_type_norm", event_type_norm_expr)
normalized_df = normalized_df.withColumn("user_id_norm", user_id_norm_expr)

# 7) Surrogate-Key Generation
# telemetry_sk comes from monotonically_increasing_id().
# NOTE(review): presumably these ids are not stable across runs; confirm
# before using them as a durable key.
surrogate_key = monotonically_increasing_id()
sk_df = normalized_df.withColumn("telemetry_sk", surrogate_key)

# 8) Metadata Enrichment (Silver timestamp)
ingest_ts = current_timestamp()
enriched_df = sk_df.withColumn("silver_ingest_ts", ingest_ts)

# 9) Business-Rule Derivations
# is_purchase is 1 when the normalized event type equals "purchase", else 0.
is_purchase_event = col("event_type_norm") == "purchase"
purchase_flag = when(is_purchase_event, lit(1)).otherwise(lit(0))
br_df = enriched_df.withColumn("is_purchase", purchase_flag)

# 10) Late-Arriving Data Management / Watermarking
# Keep only events whose timestamp is on or after (current date - 7 days).
cutoff_date = date_sub(current_date(), 7)
recent_df = br_df.filter(col("event_timestamp") >= cutoff_date)

# 11) Row-Level Security / Masking of PII
# user_id_masked = first three characters of user_id followed by "***".
# NOTE(review): this adds a masked column, but the raw user_id (and
# user_id_norm) columns are still carried forward; confirm whether they
# should be dropped before the data is persisted.
user_id_prefix = substring(col("user_id"), 1, 3)
masked_user_id = concat(user_id_prefix, lit("***"))
masked_df = recent_df.withColumn("user_id_masked", masked_user_id)

# 12) Cross-Source Consolidation (e.g., user profile enrichment)
# Left join on user_id so events without a profile row are retained.
profile_reader = spark.read.format("delta")
user_profile_df = profile_reader.load(user_profile_path)
consolidated_df = masked_df.join(user_profile_df, on="user_id", how="left")

# 13) SCD Prep (Effective dates & versioning)
# effective_from mirrors the event time, effective_to is a NULL timestamp,
# and every row is written with version 1.
open_ended = lit(None).cast(TimestampType())

scd_df = consolidated_df.withColumn("effective_from", col("event_timestamp"))
scd_df = scd_df.withColumn("effective_to", open_ended)
scd_df = scd_df.withColumn("version", lit(1))

# 14) Partitioning & File-Layout Optimization
# Write the Silver Delta table partitioned by app_id. Mode "overwrite"
# replaces whatever is currently stored at silver_path.
silver_writer = scd_df.write.format("delta").mode("overwrite").partitionBy("app_id")
silver_writer.save(silver_path)

# Optimize Silver layout
# optimize().where(...) only builds a DeltaOptimizeBuilder; nothing runs until
# an execute* method is called. executeCompaction() performs the actual file
# compaction, restricted to partitions where app_id is not NULL.
silver_table = DeltaTable.forPath(spark, silver_path)
silver_table.optimize().where("app_id IS NOT NULL").executeCompaction()
