In [0]:
# 1) BRONZE – land “as-is” JSON into a Delta table, append-only

from pyspark.sql.types import *
from pyspark.sql.functions import current_timestamp, lit
import os

# -- 1a) prepare paths
raw_path    = "/tmp/raw/doc_events/"
bronze_path = "/tmp/bronze/doc_events/"

# Build the batch identifier from the Databricks job context.
# `spark.conf.get(key, default)` is used instead of `spark.conf.getAll`
# (which is a property on some PySpark versions, a method on others and
# absent on older ones); when a key is unset the default "interactive" is used.
job_id     = spark.conf.get("spark.databricks.job.id",   "interactive")
run_id     = spark.conf.get("spark.databricks.job.runId", "interactive")
process_id = f"daily_ingest|{job_id}_{run_id}"

# -- 1b) ensure the raw JSON exists (for demo we'll create a tiny sample if missing)
# Check for the file itself (through the /dbfs mount) because that exact file
# is what the read step below loads; an existing-but-empty directory is not enough.
if not os.path.exists("/dbfs" + raw_path + "doc_events.json"):
    dbutils.fs.mkdirs(raw_path)
    # JSON Lines: one complete object per physical line. The reader below uses
    # the default (non-multiLine) mode, where every line is parsed as its own
    # record, so an object spread over several lines would be read as
    # several malformed records.
    sample_records = [
        '{"event_id":"e1","user_id":"u1","doc_id":"d1",'
        '"action":"view","event_time":"2025-08-01T12:00:00Z",'
        '"device":{"os":"iOS","region":"CA"}}',
        '{"event_id":"e2","user_id":"u2","doc_id":"d2",'
        '"action":"edit","event_time":"2025-08-01T12:05:00Z",'
        '"device":{"os":"Android","region":"NY"}}',
    ]
    sample = "\n".join(sample_records)
    dbutils.fs.put(raw_path + "doc_events.json", sample, overwrite=True)

# -- 1c) explicit "as-is" schema: every leaf is a nullable string, and
#        `device` is a nullable nested struct with `os` and `region`.
bronze_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("event_id", "user_id", "doc_id", "action", "event_time")
    ]
    + [
        StructField(
            "device",
            StructType([
                StructField(field_name, StringType(), True)
                for field_name in ("os", "region")
            ]),
            True,
        )
    ]
)

# -- 1d) load the raw JSON file, applying the explicit Bronze schema
df_raw = (
    spark.read
         .schema(bronze_schema)
         .json(raw_path + "doc_events.json")
)

# -- 1e) append the audit columns: load timestamp and the batch identifier
df_bronze = df_raw.select(
    "*",
    current_timestamp().alias("ingest_ts"),
    lit(process_id).alias("batch_id"),
)



# -- 1f) append this batch to the Bronze Delta table
dbutils.fs.mkdirs(bronze_path)
df_bronze.write.save(bronze_path, format="delta", mode="append")

# -- verify: read the table back and show its contents
print("🔍 Bronze rows:")
display(
    spark.read
         .format("delta")
         .load(bronze_path)
)




In [0]:
# 2) SILVER – clean, cast, dedupe, flatten & conform

from pyspark.sql.functions import to_timestamp, col
from delta.tables           import DeltaTable
import os

# -- 2a) paths
bronze_path = "/tmp/bronze/doc_events/"
silver_path = "/tmp/silver/doc_events/"

# -- 2b) read Bronze
df = spark.read.format("delta").load(bronze_path)

# -- 2c/2d) clean, cast, flatten and de-duplicate in a single expression so
#           the cell can be re-run without `df2` being derived from itself.
df2 = (
    # event_id is the natural key used for the Silver upsert; NULL keys never
    # compare equal in a merge condition, so such rows would be inserted
    # again on every run. Drop them together with rows lacking an event time.
    df.filter("event_time IS NOT NULL AND event_id IS NOT NULL")
      # `XXX` parses the trailing zone designator ("Z" or "+hh:mm") as a real
      # offset. Quoting it as a literal 'Z' would ignore it, and the value
      # would be interpreted in the Spark session time zone instead of UTC.
      .withColumn("event_ts", to_timestamp("event_time", "yyyy-MM-dd'T'HH:mm:ssXXX"))
      # flatten the nested device struct
      .withColumn("os",       col("device.os"))
      .withColumn("region",   col("device.region"))
      .select(
        "event_id","user_id","doc_id","action",
        "event_ts","os","region","ingest_ts","batch_id"
      )
      # keep one row per natural key (which duplicate survives is not specified)
      .dropDuplicates(["event_id"])
)

# -- 2e) write/upsert to Silver Delta
if DeltaTable.isDeltaTable(spark, silver_path):
    # Table already exists: upsert on the natural key. Both sides are aliased
    # and the condition is fully qualified, because `event_id` exists in the
    # source and the target and an unqualified reference would be ambiguous.
    (
        DeltaTable.forPath(spark, silver_path).alias("tgt")
          .merge(
            df2.alias("new"),
            "tgt.event_id = new.event_id"
          )
          .whenMatchedUpdateAll()
          .whenNotMatchedInsertAll()
          .execute()
    )
else:
    # First run: create the table. The Delta writer creates the target
    # location itself, so no directory is pre-created (os.makedirs would only
    # touch the driver's local disk, not the path Spark writes to).
    df2.write.format("delta") \
       .mode("overwrite") \
       .save(silver_path)

# -- verify
print("🔍 Silver rows:")
display(spark.read.format("delta").load(silver_path))

In [0]:
# 3) GOLD – aggregate into a consumption-ready fact table

from pyspark.sql.functions import date_trunc, count, countDistinct

# -- 3a) paths
silver_path = "/tmp/silver/doc_events/"
gold_path   = "/tmp/gold/doc_events_usage/"

# -- 3b) read Silver
df_silver = spark.read.format("delta").load(silver_path)

# -- 3c) compute daily usage by doc
gold = (
    df_silver
      # truncate the event timestamp to midnight of its day
      .withColumn("day", date_trunc("DAY","event_ts"))
      .groupBy("day","doc_id")
      .agg(
         # every event row for the (day, doc) pair
         count("*").alias("n_events"),
         # distinct users, not rows: a plain count("user_id") would count a
         # user once per event and so over-report n_users
         countDistinct("user_id").alias("n_users")
      )
)

# -- 3d) replace the Gold Delta table with the fresh aggregate, partitioned on `day`
dbutils.fs.mkdirs(gold_path)
gold.write.save(gold_path, format="delta", mode="overwrite", partitionBy="day")

# -- verify: read the table back and show its contents
print("🔍 Gold rows:")
display(
    spark.read
         .format("delta")
         .load(gold_path)
)