In [0]:
# ACID Properties
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
import os

def list_parquet_files(directory):
    """Print the path of every ``.parquet`` file found beneath ``directory``.

    A banner naming the directory is printed first, followed by one joined
    path per line. Within each folder the files appear in sorted name order;
    folders are visited in whatever order ``os.walk`` yields them.

    Args:
        directory: Root path to search; it is walked recursively.

    Returns:
        None. The listing is written to standard output only.
    """
    print(f"\n=== Parquet files in: {directory} ===")
    for folder, _subfolders, names in os.walk(directory):
        # Keep only Parquet files, in a stable per-folder order.
        parquet_names = [name for name in sorted(names) if name.endswith(".parquet")]
        for name in parquet_names:
            print(os.path.join(folder, name))

spark = SparkSession.builder.getOrCreate()
path  = "/local_disk0/tmp/delta_acid_demo"

# CLEAN UP
# Remove any table left behind by a previous run. Without this, re-running the
# cell stacks new commits on top of the old log, so the three writes below
# would no longer be versions 0, 1 and 2 and the Parquet listings would also
# include files from earlier runs.
dbutils.fs.rm(path, recurse=True)

# NOTE(review): list_parquet_files() walks the driver's local filesystem with
# os.walk, while Spark resolves `path` against its default filesystem. If those
# are not the same location the listings below will come back empty — confirm.

# ── Step A: VERSION 0 ──
# Create a brand-new Delta table with 5 rows (ids 0–4)
spark.range(0, 5) \
     .write \
     .format("delta") \
     .mode("overwrite") \
     .save(path)

list_parquet_files(path)

# ── Step B: VERSION 1 ──
# Append 5 more rows (ids 5–9)
spark.range(5, 10) \
     .write \
     .format("delta") \
     .mode("append") \
     .save(path)

list_parquet_files(path)

# ── Step C: VERSION 2 ──
# Update in place: the row with id 2 becomes id 200. The `set` value is a
# SQL expression string, not a Python literal.
deltaTable = DeltaTable.forPath(spark, path)
deltaTable.update(
     condition = "id == 2",
     set = {"id": "200"}
)
list_parquet_files(path)

# Commit log: one row per table version written above
print("History")
spark.sql(f"DESCRIBE HISTORY delta.`{path}`").show(truncate=False)

# Time travel: re-read the table as it looked after each earlier commit
print("-version 0")
spark.read.format("delta").option("versionAsOf", 0).load(path).show()

print("-version 1")
spark.read.format("delta").option("versionAsOf", 1).load(path).show()

# No version option: reads the current state of the table
print(" Latest version")
spark.read.format("delta").load(path).show()









In [0]:
# CELL 1: SETUP
# Two locations are needed: one for the Delta table itself and one for the
# streaming checkpoint.
delta_path = "dbfs:/FileStore/delta_unified_demo"
checkpoint = "dbfs:/FileStore/delta_unified_demo_checkpoint"

# Start from a clean slate: remove whatever an earlier run left at either path.
for _stale_location in (delta_path, checkpoint):
    dbutils.fs.rm(_stale_location, recurse=True)

In [0]:
# Aspect 3 - Unified batch and streaming demo
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import time

spark = SparkSession.builder.getOrCreate()

# CLEAN UP
# Remove the batch table, the stream table and the stream checkpoint written
# by any earlier run, so the demo starts from empty locations.
for _leftover in (
    "dbfs:/FileStore/delta_demo_batch",
    "dbfs:/FileStore/delta_demo_stream",
    "dbfs:/FileStore/delta_demo_stream_ckpt",
):
    dbutils.fs.rm(_leftover, recurse=True)

# 1) BATCH WRITE
# ────────────────────────────────────────────────────────────────────────
# IDs 0–9 go out in a single batch, partitioned on id % 2, so the table
# directory ends up with two partition folders (mod2=0 and mod2=1).
batch_path = "dbfs:/FileStore/delta_demo_batch"

(
    spark.range(0, 10)
    .withColumn("mod2", col("id") % 2)
    .write
    .format("delta")
    .mode("overwrite")
    .partitionBy("mod2")
    .save(batch_path)
)

# Top level of the table directory
print("📦 Batch write — directory listing:")
display(dbutils.fs.ls(batch_path))

# Contents of each partition folder
print("🔎 Inside mod2=0 folder:")
display(dbutils.fs.ls(batch_path + "/mod2=0"))
print("🔎 Inside mod2=1 folder:")
display(dbutils.fs.ls(batch_path + "/mod2=1"))

# Current table contents, ordered for readability
print("\n➡️ Batch snapshot:")
display(spark.read.format("delta").load(batch_path).orderBy("id"))

print("🔶 Batch DESCRIBE HISTORY:")
spark.sql(f"DESCRIBE HISTORY delta.`{batch_path}`").show(truncate=False)

# Count the .json entries sitting in the table's _delta_log directory
logs = dbutils.fs.ls(f"{batch_path}/_delta_log")
commit_count = sum(1 for entry in logs if entry.name.endswith(".json"))
print(f"Batch commits (_delta_log count): {commit_count}")



# 2) STREAMING WRITE
# ────────────────────────────────────────────────────────────────────────
stream_path     = "dbfs:/FileStore/delta_demo_stream"
checkpoint_path = "dbfs:/FileStore/delta_demo_stream_ckpt"

# dummy 5 rows/sec stream of increasing IDs
stream_df = (spark.readStream
                .format("rate")
                .option("rowsPerSecond", 5)
                .load()
                .selectExpr("value AS id"))

query = (stream_df.writeStream
                .format("delta")
                .option("checkpointLocation", checkpoint_path)
                .outputMode("append")
                # run once over all available data, then stop
                .trigger(availableNow=True)
                .start(stream_path))

# Wait for the run to finish, but never indefinitely. PySpark's
# awaitTermination() takes its timeout in SECONDS (not milliseconds) and
# returns as soon as the query terminates, so 60 is only an upper bound.
try:
    query.awaitTermination(60)
finally:
    # Always release the stream, even if the wait is interrupted or times out.
    query.stop()

print("🔷 Stream DESCRIBE HISTORY:")
spark.sql(f"DESCRIBE HISTORY delta.`{stream_path}`").show(truncate=False)

# Count the .json entries sitting in the table's _delta_log directory
logs = dbutils.fs.ls(f"{stream_path}/_delta_log")
print(f"Stream commits (_delta_log count): {len([f for f in logs if f.name.endswith('.json')])}")

print("\n📦 Streaming write — directory listing:")
display(dbutils.fs.ls(stream_path))

print("🔎 _delta_log entries:")
display(dbutils.fs.ls(stream_path + "/_delta_log"))

print("\n➡️ Stream snapshot:")
display(spark.read.format("delta").load(stream_path).orderBy("id"))

In [0]:
# Aspect 4 - Schema Evolution 
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col

spark = SparkSession.builder.getOrCreate()
path = "dbfs:/FileStore/delta_schema_demo"

# CLEAN UP
# Start from an empty location so the first write below is version 0.
dbutils.fs.rm(path, recurse=True)

# ── Step A: INITIAL WRITE ──
# Create a table with schema (id: LONG, val: STRING)
spark.range(0, 3) \
     .withColumn("val", lit("alpha")) \
     .write \
     .format("delta") \
     .mode("overwrite") \
     .save(path)

print("Version 0 snapshot:")
display(spark.read.format("delta").load(path))

# ── Step B: TRY A MISMATCHED WRITE ──
# This DataFrame has an extra column 'new_col' that the table does not have
bad_df = spark.range(3, 6) \
              .withColumn("val", lit("beta")) \
              .withColumn("new_col", lit(99.9))

# The failure is the point of this step: catch it and show the message
# instead of letting the cell abort.
try:
    bad_df.write.format("delta").mode("append").save(path)
except Exception as e:
    print("🛑 Write rejected due to schema mismatch:\n", e)

# ── Step C: ALLOW EVOLUTION & APPEND ──
# Now enable schema merging on the WRITE to accept the new column
bad_df.write \
     .format("delta") \
     .mode("append") \
     .option("mergeSchema", "true") \
     .save(path)

# The rejected write in Step B committed nothing, so this append is version 1.
# No read-side option is needed: the table's current schema (including
# new_col) comes from the Delta log.
print("Version 1 snapshot (merged schema):")
spark.read \
     .format("delta") \
     .load(path) \
     .show()

# ── Inspect history & schema ──
print("DESCRIBE HISTORY:")
spark.sql(f"DESCRIBE HISTORY delta.`{path}`").show(truncate=False)

print("Current schema:")
spark.read.format("delta").load(path).printSchema()

In [0]:
# Aspect 5 - Time travel and versioning
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
from pyspark.sql.functions import lit

spark = SparkSession.builder.getOrCreate()
path = "/local_disk0/tmp/delta_time_travel"

# CLEAN UP
# Remove whatever an earlier run left at this location.
dbutils.fs.rm(path, recurse=True)

# ── Version 0: Initial write ──
# Three rows (ids 0–2), all with val = "alpha".
(
    spark.range(0, 3)
    .withColumn("val", lit("alpha"))
    .write
    .format("delta")
    .mode("overwrite")
    .save(path)
)

# ── Version 1: Append new rows ──
# Two more rows (ids 3–4) with val = "beta".
(
    spark.range(3, 5)
    .withColumn("val", lit("beta"))
    .write
    .format("delta")
    .mode("append")
    .save(path)
)

# ── Version 2: Update in place ──
# The `set` values are SQL expression strings, hence the nested quotes
# around gamma.
DeltaTable.forPath(spark, path).update(
    condition="id == 1",
    set={"id": "100", "val": "'gamma'"},
)

# 1) Show commit history
print("=== DESCRIBE HISTORY ===")
spark.sql(f"DESCRIBE HISTORY delta.`{path}`").show(truncate=False)

# 2) Read as of version 0
print("→ VERSION 0 SNAPSHOT:")
(
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load(path)
    .orderBy("id")
    .show()
)

# 3) Read as of version 1
print("→ VERSION 1 SNAPSHOT:")
(
    spark.read.format("delta")
    .option("versionAsOf", 1)
    .load(path)
    .orderBy("id")
    .show()
)

# 4) Read latest (version 2)
print("→ VERSION 2 (LATEST):")
(
    spark.read.format("delta")
    .load(path)
    .orderBy("id")
    .show()
)

# 5) Read by timestamp
# Look up the commit timestamp that the history records for version 1, then
# time-travel to that timestamp.
import datetime
ts = spark.sql(f"SELECT timestamp FROM (DESCRIBE HISTORY delta.`{path}`) WHERE version = 1").first()[0]
print(f"→ SNAPSHOT AS OF {ts}:")
(
    spark.read.format("delta")
    .option("timestampAsOf", ts)
    .load(path)
    .orderBy("id")
    .show()
)

In [0]:
# Aspect 5 - Merge feature
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
from pyspark.sql.functions import lit

spark = SparkSession.builder.getOrCreate()
path = "dbfs:/FileStore/delta_merge_demo"

# CLEAN UP
# Remove whatever an earlier run left at this location.
dbutils.fs.rm(path, recurse=True)

# ── Step A: Create base table ──
# Four rows: ids 1–4 paired with vals A–D.
(
    spark.createDataFrame(
        [(1, "A"), (2, "B"), (3, "C"), (4, "D")],
        ["id", "val"],
    )
    .write
    .format("delta")
    .mode("overwrite")
    .save(path)
)

print("Initial (version 0):")
display(spark.read.format("delta").load(path).orderBy("id"))

# ── Step B: Prepare updates ──
# id=1 carries a new val, id=2 has a NULL val (which the MERGE below treats
# as a delete marker), and id=5 is not in the base table.
updates = spark.createDataFrame(
    [(1, "A'"), (2, None), (5, "E")],
    ["id", "val"],
)

# ── Step C: MERGE ──
# Target rows are aliased "t" and source rows "s"; they are matched on id.
delta_tbl = DeltaTable.forPath(spark, path)
(
    delta_tbl.alias("t")
    .merge(source=updates.alias("s"), condition="t.id = s.id")
    # matched and the source val is not NULL -> overwrite the target val
    .whenMatchedUpdate(condition="s.val IS NOT NULL", set={"val": "s.val"})
    # matched and the source val is NULL -> remove the target row
    .whenMatchedDelete(condition="s.val IS NULL")
    # no matching target row -> insert the source row
    .whenNotMatchedInsert(values={"id": "s.id", "val": "s.val"})
    .execute()
)

print("After MERGE (version 1):")
display(spark.read.format("delta").load(path).orderBy("id"))

# ── Inspect history ──
print("DESCRIBE HISTORY:")
spark.sql(f"DESCRIBE HISTORY delta.`{path}`").show(truncate=False)

In [0]:
# Aspect 6 - Z-order

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
spark = SparkSession.builder.getOrCreate()

# 1) Build a "wide" table: 1M rows, random country & age
# A dedicated, seeded generator keeps the data identical across reruns
# without touching the global random state.
import random
rng = random.Random(42)
data = [(i,
         rng.choice(["US","CA","MX","IN","DE"]),
         rng.randint(1,100))
        for i in range(1_000_000)]
df = spark.createDataFrame(data, schema=["id","country","age"])

# 2) Write it as Delta (no partitioning)
path = "/tmp/delta_skip_zorder"
df.write.format("delta").mode("overwrite").save(path)

# 3) Read before Z-order
# simple filter on age
filtered = spark.read.format("delta") \
    .load(path) \
    .filter("age BETWEEN 30 AND 40")

print("Files scanned before ZORDER:")
filtered.explain(True)

# Time the baseline query BEFORE running OPTIMIZE. Timing it afterwards
# would measure the already-clustered table and make the comparison meaningless.
# NOTE(review): this first count may also include one-off warm-up cost.
import time
t0 = time.perf_counter()
filtered.count()
print("Before:", time.perf_counter() - t0, "s")

# 4) Now cluster the data files on age
spark.sql(f"OPTIMIZE delta.`{path}` ZORDER BY age")

# 5) Read after Z-order (same filter, fresh DataFrame)
filtered2 = spark.read.format("delta") \
    .load(path) \
    .filter("age BETWEEN 30 AND 40")
print("Files scanned after ZORDER:")
filtered2.explain(True)

t1 = time.perf_counter()
filtered2.count()
print("After ZORDER:", time.perf_counter() - t1, "s")

In [0]:
# Aspect 8 - Audit and Lineage
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col

spark = SparkSession.builder.getOrCreate()

# Locations of the two demo tables: the source and the child created from it
src = "/tmp/delta_audit_src"
dst = "/tmp/delta_audit_child"

# 1) Clean up any previous runs
for _old_table in (src, dst):
    dbutils.fs.rm(_old_table, recurse=True)

# 2) Create source table (version 0): ids 0–2 with val = "alpha"
(
    spark.range(0, 3)
    .withColumn("val", lit("alpha"))
    .write.format("delta")
    .mode("overwrite")
    .save(src)
)

# 3) Append more data (version 1): ids 3–4 with val = "beta"
(
    spark.range(3, 5)
    .withColumn("val", lit("beta"))
    .write.format("delta")
    .mode("append")
    .save(src)
)

# 4) Create a child table via CTAS (child version 0)
# The child is a path-based Delta table filled with every row of the source.
spark.sql(f"CREATE TABLE delta.`{dst}` AS SELECT * FROM delta.`{src}`")

# 5) Inspect audit history on the source table
print("=== SOURCE TABLE HISTORY ===")
spark.sql(f"DESCRIBE HISTORY delta.`{src}`").show(truncate=False)

# 6) Inspect audit history (and lineage) on the child table
print("=== CHILD TABLE HISTORY ===")
spark.sql(f"DESCRIBE HISTORY delta.`{dst}`").show(truncate=False)

In [0]:
# CELL 2: BATCH WRITE
# A single batch holding IDs 0–4 is written to delta_path in overwrite mode.
(
    spark.range(0, 5)
    .write
    .format("delta")
    .mode("overwrite")
    .save(delta_path)
)

# Read the table back to show what was written
print("Batch data:")
display(spark.read.format("delta").load(delta_path))

In [0]:
# Optional - can be skipped, since there is another code sample below
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
delta_path = "dbfs:/FileStore/delta_stream_demo"
checkpoint =  "dbfs:/FileStore/delta_stream_demo_ckpt"

# CLEAN UP
# Remove the table and checkpoint from any earlier run so this cell always
# starts a fresh stream instead of resuming from a stale checkpoint.
dbutils.fs.rm(delta_path, recurse=True)
dbutils.fs.rm(checkpoint, recurse=True)

# 1) start a streaming write:
stream_df = (
    spark.readStream.format("rate")
         .option("rowsPerSecond", 10)   # 10 events/sec
         .load()
         .selectExpr("value AS event_id")
)

stream_query = (
    stream_df
      .writeStream
      .format("delta")
      .option("checkpointLocation", checkpoint)
      .outputMode("append")
      .trigger(availableNow=True)
      .start(delta_path)
)

# Wait for the query to finish rather than sleeping a fixed time and stopping
# it: a fixed sleep can cut the run off before anything is committed, and the
# listings below would then fail. The timeout is in SECONDS and is only an
# upper bound; the call returns as soon as the query terminates.
try:
    stream_query.awaitTermination(60)
finally:
    # Always release the stream, even if the wait is interrupted or times out.
    stream_query.stop()

# 2) Inspect the files & log
print("Files on disk:")
display(dbutils.fs.ls(delta_path))
print("\nDelta log entries:")
display(dbutils.fs.ls(delta_path + "/_delta_log"))
print("\nLatest snapshot:")
display(spark.read.format("delta").load(delta_path).orderBy("event_id"))