In [0]:
# 02_Silver.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, to_date
from pyspark.sql.types import StructType, StructField, StringType
from delta.tables import DeltaTable

# Attach to the Spark session that is already active, or create one if none exists.
spark = SparkSession.builder.getOrCreate()

# ─── 1) Paths ─────────────────────────────────────────────────────────────────
# Storage locations used by the rest of this script: the Bronze Delta input,
# the Silver Delta output, and the feature-category reference CSV.
bronze_path = "/tmp/bronze/cc_events"
silver_path = "/tmp/silver/cc_events_enterprise"
ref_dir = "/tmp/reference"
ref_file = ref_dir + "/cc_features.csv"

# ─── 2) Bootstrap reference CSV if needed ─────────────────────────────────────
# Probe for the CSV itself rather than its parent directory. If the directory
# exists but the file does not (for example, an earlier run stopped right after
# mkdirs), a directory-only check would skip the bootstrap and the reference
# read later in this script would find no file.
try:
    dbutils.fs.ls(f"dbfs:{ref_file}")
except Exception:
    # dbutils signals a missing path by raising. Its exception type is not
    # imported in this file, so the broad catch is kept and any probe failure
    # is treated as "reference file absent".
    # mkdirs is safe to call when the directory already exists.
    dbutils.fs.mkdirs(f"dbfs:{ref_dir}")
    csv_content = """app_name,event_type,feature_category
Photoshop,launch,application
Photoshop,feature_used,core_feature
Illustrator,launch,application
Illustrator,export,export_feature
"""
    dbutils.fs.put(f"dbfs:{ref_file}", csv_content, overwrite=True)

# ─── 3) Read Bronze Delta ────────────────────────────────────────────────────
# Load the Bronze table from its Delta location.
bronze_df = spark.read.load(bronze_path, format="delta")

# ─── 4) Cleanse, flatten, cast, dedupe ───────────────────────────────────────
# Parse the raw event timestamp string once and reuse the expression below.
# The 'Z' in the pattern is quoted, so it is matched as a literal character
# rather than interpreted as a UTC designator.
# NOTE(review): the parsed value is therefore read in the Spark session time
# zone — confirm that zone is UTC, otherwise event_ts is shifted.
parsed_event_ts = to_timestamp(col("event_timestamp"), "yyyy-MM-dd'T'HH:mm:ss'Z'")

silver_ready = (
    bronze_df
    # Keep only events that carry both a user and an event type.
    .where(col("user_id").isNotNull() & col("event_type").isNotNull())
    # Project the Silver columns in one pass: typed timestamp, derived date,
    # and the two fields lifted out of the nested `device` column.
    .select(
        "user_id",
        "app_name",
        "event_type",
        parsed_event_ts.alias("event_ts"),
        to_date(parsed_event_ts).alias("event_date"),
        col("device.os").alias("os"),
        col("device.region").alias("region"),
        "ingest_ts",
        "process_id",
    )
    # One row per (user, app, event type, timestamp).
    .dropDuplicates(["user_id", "app_name", "event_type", "event_ts"])
)

# ─── 5) Read the reference CSV ────────────────────────────────────────────────
# Every reference column is a nullable string; supplying the schema explicitly
# means Spark does not have to infer it from the file.
feature_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("app_name", "event_type", "feature_category")
])

# The first line of the CSV is a header row, not data.
feature_ref = spark.read.csv(ref_file, schema=feature_schema, header=True)

# ─── 6) Join to enrich / conform ─────────────────────────────────────────────
# Left join: every cleansed event is kept; events whose (app_name, event_type)
# pair has no entry in the reference data get a null feature_category.
silver_enriched = silver_ready.join(feature_ref, ["app_name", "event_type"], "left")

# ─── 7) Idempotent upsert into Silver Delta ─────────────────────────────────
# Row identity for the upsert: the same four columns passed to dropDuplicates
# when the batch is built, so each incoming row matches at most one Silver row.
#
# event_type must be part of the match. Without it, two different events that
# share user, app and timestamp resolve to the same target row: inside one
# batch Delta rejects the merge (multiple source rows matching one target row),
# and across batches one event overwrites the other.
#
# app_name and event_ts can be null (only user_id and event_type are filtered
# for nulls upstream). Plain `=` never matches null to null, so such rows would
# be inserted again on every run; `<=>` (null-safe equality) keeps the upsert
# idempotent for them as well.
silver_merge_condition = (
    "s.user_id = b.user_id"
    " AND s.app_name <=> b.app_name"
    " AND s.event_type = b.event_type"
    " AND s.event_ts <=> b.event_ts"
)

if DeltaTable.isDeltaTable(spark, silver_path):
    # Silver already exists: update rows whose key is present, insert the rest.
    (
        DeltaTable.forPath(spark, silver_path)
        .alias("s")
        .merge(silver_enriched.alias("b"), silver_merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # No Delta table at the Silver path yet: write the batch as the initial
    # table, then register that location under a table name.
    silver_enriched.write.format("delta") \
        .mode("overwrite") \
        .save(silver_path)
    spark.sql(f"""
      CREATE TABLE IF NOT EXISTS silver_cc_events_enterprise
      USING DELTA LOCATION '{silver_path}'
    """)

# ─── 8) Verify Silver output ────────────────────────────────────────────────
# Re-read the Silver table from storage and pass it to display() for inspection.
display(spark.read.load(silver_path, format="delta"))