In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

# -----------------------------------------------------------------------------
# 1) Bronze: just land raw data into Delta
# -----------------------------------------------------------------------------
@dlt.table(
    name="bronze_events",
    comment="Raw ingested data; exactly the files you pointed at in 01_Bronze.py"
)
def bronze_events():
    """Land the raw source files as the ``bronze_events`` table.

    Performs a batch JSON read of ``/mnt/raw/source_path`` and returns the
    resulting DataFrame untouched: no filtering, casting, or enrichment
    happens at this layer.
    """
    # Placeholder reader: copy the spark.read() settings from 01_Bronze.py here.
    raw_reader = spark.read.format("json")  # or parquet — whatever your sample does
    return raw_reader.load("/mnt/raw/source_path")


# -----------------------------------------------------------------------------
# 2) Silver: flatten, dedupe, conform, etc.
# -----------------------------------------------------------------------------
@dlt.table(
    name="silver_events",
    comment="Flattened structs, deduplicated, with feature_category joined on"
)
@dlt.expect_or_drop("valid_event_ts", "event_ts IS NOT NULL")   # example expectation
def silver_events():
    """Build the silver table: flatten, deduplicate, and join the lookup.

    Steps:
      1. Read ``bronze_events`` and pull ``app_name``, ``event_type`` and
         ``event_ts`` out of the nested ``event`` struct.
      2. Drop duplicate rows on the natural key
         (``user_id``, ``app_name``, ``event_type``, ``event_ts``).
      3. Left-join the ``feature_catalog`` lookup on
         (``app_name``, ``event_type``).

    The ``valid_event_ts`` expectation declared on this table requires
    ``event_ts IS NOT NULL``.

    Returns:
        The conformed DataFrame that backs ``silver_events``.
    """
    bronze = dlt.read("bronze_events")

    # 2.a) flatten nested structs (if any)
    # Columns projected explicitly below. They must be excluded from the
    # pass-through list, otherwise the result would carry the same column name
    # twice (e.g. ``user_id``), which makes later references ambiguous and
    # cannot be written out as a table.
    projected = ("user_id", "app_name", "event_type", "event_ts")
    excluded = {"event", *projected}
    passthrough = [c for c in bronze.columns if c not in excluded]

    flat = bronze.select(
        col("user_id"),
        col("event.app_name").alias("app_name"),
        col("event.event_type").alias("event_type"),
        col("event.event_ts").alias("event_ts"),
        *passthrough,
    )

    # 2.b) deduplicate on natural key
    deduped = flat.dropDuplicates(list(projected))

    # 2.c) conformance join against small lookup table
    feature_cat = spark.table("feature_catalog")  # or read the small lookup from Bronze

    # Left join so events without a catalog entry are kept.
    return deduped.join(
        feature_cat,
        on=["app_name", "event_type"],
        how="left",
    )


# -----------------------------------------------------------------------------
# 3) Gold: aggregates, roll-ups, business-level tables
# -----------------------------------------------------------------------------
@dlt.table(
    name="gold_event_summary",
    comment="Business-level summary of events per user/app/…"
)
def gold_event_summary():
    """Aggregate ``silver_events`` per (``user_id``, ``feature_category``).

    Returns:
        A DataFrame with one row per group carrying ``event_count`` (row
        count), ``first_seen`` (smallest ``event_ts``) and ``last_seen``
        (largest ``event_ts``).
    """
    grouped = dlt.read("silver_events").groupBy("user_id", "feature_category")

    # ``count``/``min``/``max`` are the Spark column functions brought in by the
    # wildcard import from pyspark.sql.functions, not the Python builtins.
    metrics = [
        count("*").alias("event_count"),
        min("event_ts").alias("first_seen"),
        max("event_ts").alias("last_seen"),
    ]
    return grouped.agg(*metrics)



In [0]:
# medallion_dlt.py

import dlt
from pyspark.sql.functions import (
    current_timestamp, lit,
    to_timestamp, to_date,
    col, count, countDistinct
)
from pyspark.sql.types import StructType, StructField, StringType

# ──────────────────────────────────────────────────────────────────────────────
# 0) Optional: declare the Bronze schema explicitly so Spark does not have to infer it
# ──────────────────────────────────────────────────────────────────────────────
# Explicit schema for the raw telemetry JSON. Every leaf field is a nullable
# string (including ``event_timestamp``); ``device`` is a nullable nested
# struct with ``os`` and ``region``.
bronze_schema = (
    StructType()
    .add("user_id", StringType(), True)
    .add("app_name", StringType(), True)
    .add("event_type", StringType(), True)
    .add("event_timestamp", StringType(), True)
    .add(
        "device",
        StructType()
        .add("os", StringType(), True)
        .add("region", StringType(), True),
        True,
    )
)

# ──────────────────────────────────────────────────────────────────────────────
# 1) Bronze: land raw JSON + audit columns
# ──────────────────────────────────────────────────────────────────────────────
@dlt.table(
    name="bronze_cc_events",
    comment="Raw CreativeCloud telemetry with ingest_ts & process_id",
    # NOTE(review): "pipelines.trigger" is kept as authored; confirm it is a
    # table property the pipeline runtime actually honours.
    table_properties={
        "pipelines.trigger": "continuous"
    }
)
def bronze_cc_events():
    """Land raw telemetry JSON and stamp it with audit columns.

    Reads multi-line JSON from ``/tmp/raw/telemetry`` using ``bronze_schema``
    and appends two columns:

    * ``ingest_ts``  – ``current_timestamp()`` at evaluation time.
    * ``process_id`` – the same timestamp rendered as a string.

    Returns:
        The raw DataFrame plus the ``ingest_ts`` and ``process_id`` columns.
    """
    df = (
        spark.read
             .schema(bronze_schema)
             .option("multiline", True)
             .json("/tmp/raw/telemetry")
    )
    return (
        df
          .withColumn("ingest_ts",  current_timestamp())
          # The ``dlt`` module exposes no ``current_timestamp``; the Spark SQL
          # function imported from pyspark.sql.functions is the one to use.
          .withColumn("process_id", current_timestamp().cast("string"))
    )

# ──────────────────────────────────────────────────────────────────────────────
# 2) Silver lookup: small reference table for conformance
# ──────────────────────────────────────────────────────────────────────────────
@dlt.table(
    name="cc_feature_ref",
    comment="app+event_type → feature_category lookup",
    table_properties={"pipelines.trigger": "once"}
)
def cc_feature_ref():
    """Load the feature reference CSV used for the conformance join.

    Reads ``/tmp/reference/cc_features.csv`` with the first line treated as
    the header and returns it as-is.
    """
    csv_reader = spark.read.option("header", True)
    return csv_reader.csv("/tmp/reference/cc_features.csv")

# ──────────────────────────────────────────────────────────────────────────────
# 3) Silver: clean, flatten, dedupe & conformance join
# ──────────────────────────────────────────────────────────────────────────────
@dlt.table(
    name="silver_cc_events",
    comment="Flattened, cleaned & conformed events",
    table_properties={"pipelines.trigger": "continuous"}
)
@dlt.expect("valid_event_ts", "event_ts IS NOT NULL")
def silver_cc_events():
    """Clean, flatten, deduplicate and conform ``bronze_cc_events``.

    Rows with a null ``user_id`` or ``event_type`` are removed,
    ``event_timestamp`` is parsed into ``event_ts`` / ``event_date``, the
    ``device`` struct is flattened into ``os`` and ``region``, duplicates on
    (``user_id``, ``app_name``, ``event_type``, ``event_ts``) are dropped, and
    the result is left-joined to ``cc_feature_ref`` on
    (``app_name``, ``event_type``).
    """
    # Keep only rows that carry both identifying fields.
    usable = (
        dlt.read("bronze_cc_events")
        .filter(col("user_id").isNotNull())
        .filter(col("event_type").isNotNull())
    )

    # Parse the raw string timestamp once and reuse it for the derived date.
    parsed_ts = to_timestamp("event_timestamp", "yyyy-MM-dd'T'HH:mm:ss'Z'")

    projected = usable.select(
        "user_id",
        "app_name",
        "event_type",
        parsed_ts.alias("event_ts"),
        to_date(parsed_ts).alias("event_date"),
        col("device.os").alias("os"),
        col("device.region").alias("region"),
        "ingest_ts",
        "process_id",
    )

    natural_key = ["user_id", "app_name", "event_type", "event_ts"]
    unique_events = projected.dropDuplicates(natural_key)

    # Conformance join: left join keeps events with no matching reference row.
    feature_ref = dlt.read("cc_feature_ref")
    return unique_events.join(feature_ref, on=["app_name", "event_type"], how="left")

# ──────────────────────────────────────────────────────────────────────────────
# 4) Gold fact: daily feature usage
# ──────────────────────────────────────────────────────────────────────────────
@dlt.table(
    name="feature_usage_fact",
    comment="Daily counts of feature usage",
    table_properties={"pipelines.trigger": "continuous"}
)
def feature_usage_fact():
    """Count events per (``event_date``, ``app_name``, ``feature_category``).

    Rows of ``silver_cc_events`` whose ``feature_category`` is null are
    excluded before counting; the count is exposed as ``usage_count``.
    """
    categorised = dlt.read("silver_cc_events").where(
        col("feature_category").isNotNull()
    )
    per_day = categorised.groupBy("event_date", "app_name", "feature_category")
    return per_day.agg(count("*").alias("usage_count"))

# ──────────────────────────────────────────────────────────────────────────────
# 5) Gold fact: daily active users
# ──────────────────────────────────────────────────────────────────────────────
@dlt.table(
    name="user_activity_fact",
    comment="Daily unique active users by app/region",
    table_properties={"pipelines.trigger": "continuous"}
)
def user_activity_fact():
    """Count distinct users per (``event_date``, ``app_name``, ``region``).

    Reads ``silver_cc_events`` and exposes the distinct ``user_id`` count of
    each group as ``active_users``.
    """
    events = dlt.read("silver_cc_events")
    distinct_users = countDistinct("user_id").alias("active_users")
    return events.groupBy("event_date", "app_name", "region").agg(distinct_users)