# In [0]:
# 03 - Unified batch and streaming writes to Delta
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import time

spark = SparkSession.builder.getOrCreate()

# CLEAN UP
# Recursively remove the three demo locations used further down (batch table,
# stream table, stream checkpoint) before anything is written.
for stale_path in (
    "dbfs:/FileStore/delta_demo_batch",
    "dbfs:/FileStore/delta_demo_stream",
    "dbfs:/FileStore/delta_demo_stream_ckpt",
):
    dbutils.fs.rm(stale_path, recurse=True)

# 1) BATCH WRITE
# ────────────────────────────────────────────────────────────────────────
# Ids 0-9 are written in one batch job. The extra `mod2` column (id % 2) is
# the partition key, so the table directory gets a mod2=0 and a mod2=1 folder.
batch_path = "dbfs:/FileStore/delta_demo_batch"

batch_df = spark.range(0, 10).withColumn("mod2", col("id") % 2)
(
    batch_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("mod2")
    .save(batch_path)
)

# Show what landed on storage: the table root, then each partition folder.
print("📦 Batch write — directory listing:")
display(dbutils.fs.ls(batch_path))

print("🔎 Inside mod2=0 folder:")
display(dbutils.fs.ls(f"{batch_path}/mod2=0"))
print("🔎 Inside mod2=1 folder:")
display(dbutils.fs.ls(f"{batch_path}/mod2=1"))

# Read the table back through Delta, ordered by id for a stable display.
print("\n➡️ Batch snapshot:")
batch_snapshot = spark.read.format("delta").load(batch_path).orderBy("id")
display(batch_snapshot)

print("🔶 Batch DESCRIBE HISTORY:")
batch_history = spark.sql(f"DESCRIBE HISTORY delta.`{batch_path}`")
batch_history.show(truncate=False)

# Count the .json entries in _delta_log; this is reported as the commit count.
logs = dbutils.fs.ls(f"{batch_path}/_delta_log")
batch_commit_count = sum(1 for entry in logs if entry.name.endswith('.json'))
print(f"Batch commits (_delta_log count): {batch_commit_count}")



# 2) STREAMING WRITE
# ────────────────────────────────────────────────────────────────────────
# Writes a synthetic stream into its own Delta table, with a separate
# checkpoint directory for the query's progress.
stream_path = "dbfs:/FileStore/delta_demo_stream"
checkpoint_path = "dbfs:/FileStore/delta_demo_stream_ckpt"

# Synthetic input: the built-in "rate" source configured for 5 rows/sec.
# Only its `value` column is kept, exposed as `id`.
stream_df = (spark.readStream
                .format("rate")
                .option("rowsPerSecond", 5)
                .load()
                .selectExpr("value AS id"))

# NOTE(review): availableNow processes whatever the source reports as available
# when the query starts and then stops on its own. "rate" is an unbounded
# generator, so this may yield very little (or no) data — confirm this trigger
# is intended rather than a processingTime trigger.
query = (stream_df.writeStream
                .format("delta")
                .option("checkpointLocation", checkpoint_path)
                .outputMode("append")
                # run once over all available data, then stop
                .trigger(availableNow=True)
                .start(stream_path))

# PySpark's awaitTermination() takes its timeout in SECONDS, not milliseconds.
# The previous value of 20000 would have blocked for ~5.5 hours on a stalled
# query; 20 seconds is the intended upper bound.
stream_timeout_seconds = 20
query.awaitTermination(stream_timeout_seconds)
# Make sure nothing keeps running if the timeout was hit first.
query.stop()
# Inspect the streaming table the same way as the batch one: history,
# commit count, storage layout, and finally the data itself.
print("🔷 Stream DESCRIBE HISTORY:")
stream_history = spark.sql(f"DESCRIBE HISTORY delta.`{stream_path}`")
stream_history.show(truncate=False)

# Count the .json entries in _delta_log; this is reported as the commit count.
stream_log_dir = f"{stream_path}/_delta_log"
logs = dbutils.fs.ls(stream_log_dir)
stream_commit_count = sum(1 for entry in logs if entry.name.endswith('.json'))
print(f"Stream commits (_delta_log count): {stream_commit_count}")

print("\n📦 Streaming write — directory listing:")
display(dbutils.fs.ls(stream_path))

print("🔎 _delta_log entries:")
display(dbutils.fs.ls(stream_log_dir))

# Read the table back through Delta, ordered by id for a stable display.
print("\n➡️ Stream snapshot:")
stream_snapshot = spark.read.format("delta").load(stream_path).orderBy("id")
display(stream_snapshot)