In [0]:
#09 CDF example 
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp
from delta.tables import DeltaTable

# Hive-enabled session for the watermark-based ("DIY") Silver job.
spark = SparkSession.builder.appName("Silver_Incr_DIY").enableHiveSupport().getOrCreate()

# Fully qualified table names used by process_silver_diy().
CHECKPOINT_TABLE = "demo.silver_checkpoint"  # holds the last_ts watermark
SILVER_TABLE = "demo.silver_events"          # MERGE target
BRONZE_TABLE = "demo.bronze_events"          # incremental source

def process_silver_diy():
    """Incrementally upsert new Bronze rows into Silver using an ``ingest_ts`` watermark.

    Steps:
      1. Read the last watermark (``last_ts``) from ``CHECKPOINT_TABLE``.
      2. Select Bronze rows whose ``ingest_ts`` is later than the watermark,
         drop rows with a null ``ts`` and cast ``ts`` to a timestamp.
      3. MERGE the batch into ``SILVER_TABLE`` on ``event_id``.
      4. Overwrite ``CHECKPOINT_TABLE`` with the batch's maximum ``ingest_ts``.

    Returns without touching Silver or the checkpoint when there is nothing
    new to process.
    """
    # 1) Read last watermark. The checkpoint table may not exist yet on the
    #    first run (it is written at the end of this function), so probe for
    #    it instead of letting spark.table() raise.
    if spark.catalog.tableExists(CHECKPOINT_TABLE):
        ckpt = spark.table(CHECKPOINT_TABLE).select("last_ts").collect()
    else:
        ckpt = []
    last_ts = ckpt[0]["last_ts"] if ckpt else "1970-01-01"

    # 2) Pull only new rows from Bronze by ingest_ts
    bronze_df = spark.table(BRONZE_TABLE)
    new_df = bronze_df.filter(col("ingest_ts") > last_ts)

    # 3) Clean / transform
    clean_df = (new_df
        .filter(col("ts").isNotNull())
        .withColumn("ts", col("ts").cast("timestamp")))

    # 4) Fix the upper bound of this batch *before* merging. Computing the
    #    max after the MERGE (a second, independent scan) could pick up rows
    #    that landed in Bronze in the meantime and advance the watermark past
    #    rows that were never merged.
    max_ts = clean_df.agg({"ingest_ts": "max"}).collect()[0][0]
    if max_ts is None:
        # Empty batch: max() over zero rows is NULL, and a NULL-only row
        # cannot be written back as the watermark. Keep the old checkpoint.
        return
    batch_df = clean_df.filter(col("ingest_ts") <= max_ts)

    # 5) Upsert into Silver via MERGE
    silver = DeltaTable.forName(spark, SILVER_TABLE)
    (silver.alias("t")
          .merge(batch_df.alias("s"), "t.event_id = s.event_id")
          .whenMatchedUpdateAll()
          .whenNotMatchedInsertAll()
          .execute())

    # 6) Update watermark to max ingest_ts processed
    spark.createDataFrame([(max_ts,)], ["last_ts"]) \
         .write.format("delta") \
         .mode("overwrite") \
         .saveAsTable(CHECKPOINT_TABLE)


In [0]:
from pyspark.sql import SparkSession

# ——————————————————————————————————————————————
# 1) Init Spark with Hive support
# ——————————————————————————————————————————————
# enableHiveSupport() is what lets spark_catalog resolve against your HMS.
spark = SparkSession.builder.appName("Enable_CDF_Bronze_HMS").enableHiveSupport().getOrCreate()

# ——————————————————————————————————————————————
# 2) Select the Hive Metastore catalog (not UC), then the working database
# ——————————————————————————————————————————————
spark.sql("USE CATALOG spark_catalog")

# Create the demo database on first use and make it the current one, so the
# unqualified table names below land in it.
spark.sql("CREATE DATABASE IF NOT EXISTS demo")
spark.sql("USE demo")

# ——————————————————————————————————————————————
# 3) Register the Bronze Delta table at its storage location
#    (IF NOT EXISTS: nothing happens when it is already registered)
# ——————————————————————————————————————————————
bronze_path = "dbfs:/mnt/data/bronze_events"  # or s3://… / abfss://… etc.

spark.sql(f"""
  CREATE TABLE IF NOT EXISTS bronze_events (
    event_id     STRING,
    user_id      STRING,
    event_type   STRING,
    ts           STRING,
    ingest_ts    TIMESTAMP
  )
  USING DELTA
  LOCATION '{bronze_path}'
""")

# ——————————————————————————————————————————————
# 4) Turn on the Change Data Feed table property for Bronze
# ——————————————————————————————————————————————
spark.sql("""
  ALTER TABLE bronze_events
  SET TBLPROPERTIES (
    delta.enableChangeDataFeed = true
  )
""")

print("✅ Change Data Feed has been ENABLED on demo.bronze_events in the Hive Metastore.")

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp
from delta.tables import DeltaTable

# Hive-enabled session for the Change-Data-Feed based Silver job.
spark = SparkSession.builder.appName("Silver_Incr_CDF").enableHiveSupport().getOrCreate()

# Fully qualified table names used by process_silver_cdf().
CDF_CHECKPOINT = "demo.silver_cdf_checkpoint"  # holds the last processed Bronze version
SILVER_TABLE = "demo.silver_events"            # MERGE target
BRONZE_TABLE = "demo.bronze_events"            # change-feed source

def process_silver_cdf():
    """Incrementally upsert Bronze changes into Silver using Delta Change Data Feed.

    Steps:
      1. Read the last processed Bronze version from ``CDF_CHECKPOINT``.
      2. Look up the current Bronze version via ``DESCRIBE HISTORY``.
      3. Read the change feed for the versions after the checkpoint, up to
         and including the current version.
      4. Keep only ``insert`` / ``update_postimage`` rows, reduce them to the
         newest change per ``event_id`` and cast ``ts`` to a timestamp.
      5. MERGE the result into ``SILVER_TABLE`` on ``event_id``.
      6. Overwrite ``CDF_CHECKPOINT`` with the current version.

    Deletes in Bronze are not propagated to Silver. Returns without touching
    Silver or the checkpoint when Bronze has no new version.
    """
    # Local imports: these names are only needed for the per-key dedup below.
    from pyspark.sql import Window
    from pyspark.sql.functions import row_number

    # 1) Read last processed version. The checkpoint table may not exist yet
    #    on the first run, so probe for it instead of letting spark.table()
    #    raise. -1 means "nothing processed", which makes the first read
    #    start at version 0.
    if spark.catalog.tableExists(CDF_CHECKPOINT):
        ckpt = spark.table(CDF_CHECKPOINT).select("version").collect()
    else:
        ckpt = []
    last_version = ckpt[0]["version"] if ckpt else -1

    # startingVersion is inclusive, so resume one past the checkpoint;
    # otherwise the last processed commit is re-read on every run.
    start_version = last_version + 1

    # 2) Get current Bronze version
    current_version = spark.sql(f"DESCRIBE HISTORY {BRONZE_TABLE}") \
                           .selectExpr("max(version) as v").collect()[0]["v"]

    if start_version > current_version:
        # No new commits; asking the change feed for a start version beyond
        # the latest one is an error, so stop here.
        return

    # 3) Read the change feed for (last_version, current_version]
    cdf_df = (spark.read.format("delta")
                   .option("readChangeData", "true")
                   .option("startingVersion", start_version)
                   .option("endingVersion", current_version)
                   .table(BRONZE_TABLE))

    # 4) Keep only rows that carry the new state of a record. A plain
    #    `!= "delete"` would also let `update_preimage` rows through, i.e.
    #    the stale pre-update image of every updated row.
    changes = cdf_df.filter(col("_change_type").isin("insert", "update_postimage"))

    #    An event_id can change several times inside the version range; MERGE
    #    needs at most one source row per target row, so keep the newest.
    newest_first = Window.partitionBy("event_id").orderBy(col("_commit_version").desc())
    upserts = (changes
        .withColumn("_rn", row_number().over(newest_first))
        .filter(col("_rn") == 1)
        .drop("_rn")
        .withColumn("ts", col("ts").cast("timestamp")))

    # 5) Upsert into Silver
    silver = DeltaTable.forName(spark, SILVER_TABLE)
    (silver.alias("t")
           .merge(upserts.alias("s"), "t.event_id = s.event_id")
           .whenMatchedUpdateAll()
           .whenNotMatchedInsertAll()
           .execute())

    # 6) Update CDF checkpoint to current_version
    spark.createDataFrame([(current_version,)], ["version"]) \
         .write.format("delta") \
         .mode("overwrite") \
         .saveAsTable(CDF_CHECKPOINT)



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp
from delta.tables import DeltaTable

# --------------------------------------------------------------------
# 0) Bootstrap Spark with Hive support
# --------------------------------------------------------------------
spark = SparkSession.builder.appName("Silver_Incr_Driver").enableHiveSupport().getOrCreate()

# --------------------------------------------------------------------
# 1) Select the Hive Metastore catalog and the demo database
#    (the database is created on first use)
# --------------------------------------------------------------------
spark.sql("USE CATALOG spark_catalog")
spark.sql("CREATE DATABASE IF NOT EXISTS demo")
spark.sql("USE demo")

# --------------------------------------------------------------------
# 2) Register the Bronze Delta table (if missing) and turn on CDF
# --------------------------------------------------------------------
bronze_path = "dbfs:/mnt/data/bronze_events"

# IF NOT EXISTS: nothing happens when the table is already registered.
spark.sql(f"""
  CREATE TABLE IF NOT EXISTS bronze_events (
    event_id   STRING,
    user_id    STRING,
    event_type STRING,
    ts         STRING,
    ingest_ts  TIMESTAMP
  )
  USING DELTA
  LOCATION '{bronze_path}'
""")

# Change Data Feed is what process_silver_cdf() reads from.
spark.sql("""
  ALTER TABLE bronze_events
  SET TBLPROPERTIES (
    delta.enableChangeDataFeed = true
  )
""")

# --------------------------------------------------------------------
# 3) DIY Silver processor: table names
# --------------------------------------------------------------------
BRONZE_TABLE = "demo.bronze_events"          # incremental source
SILVER_TABLE = "demo.silver_events"          # MERGE target
CHECKPOINT_TABLE = "demo.silver_checkpoint"  # holds the last_ts watermark

def process_silver_diy():
    """Incrementally upsert new Bronze rows into Silver using an ``ingest_ts`` watermark.

    Steps:
      1. Read the last watermark (``last_ts``) from ``CHECKPOINT_TABLE``.
      2. Select Bronze rows whose ``ingest_ts`` is later than the watermark,
         drop rows with a null ``ts`` and cast ``ts`` to a timestamp.
      3. MERGE the batch into ``SILVER_TABLE`` on ``event_id``.
      4. Overwrite ``CHECKPOINT_TABLE`` with the batch's maximum ``ingest_ts``.

    When there is nothing new to process, Silver and the checkpoint are left
    untouched and a short notice is printed.
    """
    # 3.1 Read last watermark. The checkpoint table may not exist yet on the
    #     first run (it is written at the end of this function), so probe for
    #     it instead of letting spark.table() raise.
    if spark.catalog.tableExists(CHECKPOINT_TABLE):
        ckpt = spark.table(CHECKPOINT_TABLE).select("last_ts").collect()
    else:
        ckpt = []
    last_ts = ckpt[0]["last_ts"] if ckpt else "1970-01-01"

    # 3.2 Pull only new rows via ingest_ts
    new_df = (spark.table(BRONZE_TABLE)
                  .filter(col("ingest_ts") > last_ts))

    # 3.3 Clean / transform
    clean_df = (new_df
        .filter(col("ts").isNotNull())
        .withColumn("ts", col("ts").cast("timestamp")))

    # 3.4 Fix the upper bound of this batch *before* merging. Computing the
    #     max after the MERGE (a second, independent scan) could pick up rows
    #     that landed in Bronze in the meantime and advance the watermark
    #     past rows that were never merged.
    max_ts = clean_df.agg({"ingest_ts": "max"}).collect()[0][0]
    if max_ts is None:
        # Empty batch: max() over zero rows is NULL, and a NULL-only row
        # cannot be written back as the watermark. Keep the old checkpoint.
        print("DIY Silver: no new Bronze rows. Watermark unchanged =", last_ts)
        return
    batch_df = clean_df.filter(col("ingest_ts") <= max_ts)

    # 3.5 Upsert into Silver
    silver = DeltaTable.forName(spark, SILVER_TABLE)
    (silver.alias("t")
          .merge(batch_df.alias("s"), "t.event_id = s.event_id")
          .whenMatchedUpdateAll()
          .whenNotMatchedInsertAll()
          .execute())

    # 3.6 Update watermark
    spark.createDataFrame([(max_ts,)], ["last_ts"]) \
         .write.format("delta") \
         .mode("overwrite") \
         .saveAsTable(CHECKPOINT_TABLE)

    print("✅ DIY Silver update complete. Watermark =", max_ts)


# --------------------------------------------------------------------
# 4) CDF-based Silver processor
# --------------------------------------------------------------------
# Table holding the last Bronze version processed by process_silver_cdf().
CDF_CHECKPOINT = "demo.silver_cdf_checkpoint"

def process_silver_cdf():
    """Incrementally upsert Bronze changes into Silver using Delta Change Data Feed.

    Steps:
      1. Read the last processed Bronze version from ``CDF_CHECKPOINT``.
      2. Look up the current Bronze version via ``DESCRIBE HISTORY``.
      3. Read the change feed for the versions after the checkpoint, up to
         and including the current version.
      4. Keep only ``insert`` / ``update_postimage`` rows, reduce them to the
         newest change per ``event_id`` and cast ``ts`` to a timestamp.
      5. MERGE the result into ``SILVER_TABLE`` on ``event_id``.
      6. Overwrite ``CDF_CHECKPOINT`` with the current version.

    Deletes in Bronze are not propagated to Silver. When Bronze has no new
    version, Silver and the checkpoint are left untouched and a short notice
    is printed.
    """
    # Local imports: these names are only needed for the per-key dedup below.
    from pyspark.sql import Window
    from pyspark.sql.functions import row_number

    # 4.1 Read last processed version. The checkpoint table may not exist yet
    #     on the first run, so probe for it instead of letting spark.table()
    #     raise. -1 means "nothing processed", which makes the first read
    #     start at version 0.
    if spark.catalog.tableExists(CDF_CHECKPOINT):
        ckpt = spark.table(CDF_CHECKPOINT).select("version").collect()
    else:
        ckpt = []
    last_version = ckpt[0]["version"] if ckpt else -1

    # startingVersion is inclusive, so resume one past the checkpoint;
    # otherwise the last processed commit is re-read on every run.
    start_version = last_version + 1

    # 4.2 Discover current Bronze version
    current_version = (spark
        .sql(f"DESCRIBE HISTORY {BRONZE_TABLE}")
        .selectExpr("max(version) as v")
        .collect()[0]["v"])

    if start_version > current_version:
        # No new commits; asking the change feed for a start version beyond
        # the latest one is an error, so stop here.
        print(f"CDF Silver: no new Bronze versions after {last_version}. Checkpoint unchanged.")
        return

    # 4.3 Read CDF for (last_version, current_version]
    cdf_df = (spark.read.format("delta")
                   .option("readChangeData", "true")
                   .option("startingVersion", start_version)
                   .option("endingVersion",   current_version)
                   .table(BRONZE_TABLE))

    # 4.4 Keep only rows that carry the new state of a record. A plain
    #     `!= "delete"` would also let `update_preimage` rows through, i.e.
    #     the stale pre-update image of every updated row.
    changes = cdf_df.filter(col("_change_type").isin("insert", "update_postimage"))

    #     An event_id can change several times inside the version range;
    #     MERGE needs at most one source row per target row, so keep the
    #     newest.
    newest_first = Window.partitionBy("event_id").orderBy(col("_commit_version").desc())
    upserts = (changes
        .withColumn("_rn", row_number().over(newest_first))
        .filter(col("_rn") == 1)
        .drop("_rn")
        .withColumn("ts", col("ts").cast("timestamp")))

    # 4.5 Merge into Silver
    silver = DeltaTable.forName(spark, SILVER_TABLE)
    (silver.alias("t")
           .merge(upserts.alias("s"), "t.event_id = s.event_id")
           .whenMatchedUpdateAll()
           .whenNotMatchedInsertAll()
           .execute())

    # 4.6 Advance version checkpoint
    spark.createDataFrame([(current_version,)], ["version"]) \
         .write.format("delta") \
         .mode("overwrite") \
         .saveAsTable(CDF_CHECKPOINT)

    print(f"✅ CDF Silver update complete. Processed versions {last_version+1}→{current_version}")


# --------------------------------------------------------------------
# 5) Driver logic: choose which processor to run
# --------------------------------------------------------------------
if __name__ == "__main__":
    # The processor is selected through the Spark conf key "spark.silver.mode"
    # (e.g. --conf spark.silver.mode=CDF). The key must carry the "spark."
    # prefix to match the lookup below; when it is unset the DIY processor
    # runs. Matching is case-insensitive.
    mode = spark.conf.get("spark.silver.mode", "DIY")

    if mode.upper() == "DIY":
        process_silver_diy()
    elif mode.upper() == "CDF":
        process_silver_cdf()
    else:
        # Name the key that is actually read so the message points at the
        # right setting.
        raise ValueError(f"Unknown spark.silver.mode: {mode!r} (expected 'DIY' or 'CDF')")
