In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp
from delta.tables import DeltaTable

# Hive-enabled session used by every step of the imperative pipeline below.
spark = SparkSession.builder \
    .appName("BronzeSilverGold_Imperative_InMemory") \
    .enableHiveSupport() \
    .getOrCreate()

# 1. In-memory raw data → Bronze
# Column names, in the same order as the fields of each sample tuple.
cols = ["user_id", "event_type", "ts"]
# Four sample events; the "purchase" row carries a deliberately bad timestamp.
raw = [
    (1, "click", "2025-08-01T10:00:00"),
    (2, "view", "2025-08-01T10:01:00"),
    (1, "purchase", "bad_ts"),  # bad timestamp row
    (3, "click", "2025-08-01T10:02:00"),
]
df_raw = spark.createDataFrame(data=raw, schema=cols)

# Write and register Bronze
bronze_path = "/tmp/demo/bronze"
(
    df_raw.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_path)
)
# Ensure the database exists, then replace the table entry so that it points
# at the Delta files written just above.
for _bronze_ddl in (
    "CREATE DATABASE IF NOT EXISTS demo",
    "DROP TABLE IF EXISTS demo.bronze_events",
    f"""
  CREATE TABLE demo.bronze_events
  USING DELTA
  LOCATION '{bronze_path}'
""",
):
    spark.sql(_bronze_ddl)

# 2. Read Bronze → clean → Silver
bronze_df = spark.read.format("delta").table("demo.bronze_events")
# Keep only rows whose `ts` starts with a YYYY-MM-DD date followed by "T"
# (e.g. "2025-08-01T..."), so values such as "bad_ts" are removed before the
# cast to timestamp.
# The quantifiers use single braces: this is a plain raw string (not an
# f-string), so doubled braces would be passed to the regex engine literally
# and would not form a valid `{n}` repetition.
silver_df = (
    bronze_df
      .filter(col("ts").rlike(r"^\d{4}-\d{2}-\d{2}T"))
      .withColumn("ts", col("ts").cast("timestamp"))
)

# Write and register Silver
silver_path = "/tmp/demo/silver"
silver_df.write.format("delta") \
    .mode("overwrite") \
    .save(silver_path)
# Drop any stale metastore entry and re-create it on top of the files written
# above, the same way demo.bronze_events is registered. Without the CREATE the
# table would only be dropped and never be queryable by name.
spark.sql("DROP TABLE IF EXISTS demo.silver_events")
spark.sql(f"""
  CREATE TABLE demo.silver_events
  USING DELTA
  LOCATION '{silver_path}'
""")


In [0]:
import dlt
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp

# Obtain (or create) the Hive-enabled SparkSession for the DLT definitions below.
spark = SparkSession.builder \
    .appName("DLT_InMemory_Demo") \
    .enableHiveSupport() \
    .getOrCreate()

# Bronze: stream from the already-registered demo.bronze_events table.
@dlt.table(
    comment="Bronze: raw events",
    table_properties={"delta.enableChangeDataFeed": "true"},
)
def bronze_events():
    """Return a streaming read of the `demo.bronze_events` table, unmodified."""
    source_stream = spark.readStream.table("demo.bronze_events")
    return source_stream

# Silver: typed events, with rows lacking a usable timestamp dropped.
@dlt.table(comment="Silver: cleaned events", partition_cols=["event_date"])
@dlt.expect_or_drop("valid_ts", "ts IS NOT NULL")
def silver_events():
    """Stream Bronze events, cast `ts` to a timestamp and derive `event_date`.

    Rows with a NULL `event_type` are filtered out here. The `valid_ts`
    expectation declared on this table covers rows whose `ts` is NULL.
    """
    incoming = dlt.read_stream("bronze_events")
    with_event_type = incoming.filter("event_type IS NOT NULL")
    # Cast first: the `event_date` expression below reads the already-cast `ts`.
    with_timestamp = with_event_type.withColumn("ts", col("ts").cast("timestamp"))
    return with_timestamp.withColumn("event_date", col("ts").cast("date"))

# Gold: number of Silver events per user, stamped with the computation time.
@dlt.table(comment="Gold: per-user counts")
def gold_event_counts():
    """Count `silver_events` rows per `user_id`.

    The result has the columns `user_id`, `event_count` and `as_of`, where
    `as_of` is filled from `current_timestamp()`.
    """
    per_user = dlt.read("silver_events").groupBy("user_id").count()
    renamed = per_user.withColumnRenamed("count", "event_count")
    return renamed.withColumn("as_of", current_timestamp())
