# In [0]:
# 06 Z-order: Delta Lake `OPTIMIZE ... ZORDER BY` demo (filter on `age`)

import random
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

spark = SparkSession.builder.getOrCreate()

# 1) Build the demo table on the driver: 1M rows of (id, country, age),
#    with country drawn from five codes and age drawn from 1..100.
countries = ["US", "CA", "MX", "IN", "DE"]
data = [
    (i, random.choice(countries), random.randint(1, 100))
    for i in range(1_000_000)
]
df = spark.createDataFrame(data, schema=["id", "country", "age"])

# 2) Write it as Delta (no partitioning), replacing any previous run's output.
path = "/tmp/delta_skip_zorder"
df.write.format("delta").mode("overwrite").save(path)

# 3) Baseline: range filter on age against the table as originally written.
filtered = (
    spark.read.format("delta")
    .load(path)
    .filter("age BETWEEN 30 AND 40")
)

# explain(True) prints the query plans for the filter.
# NOTE(review): the plan text may not include a per-query count of files read;
# check the scan metrics in the Spark UI if an exact number is needed.
print("Files scanned before ZORDER:")
filtered.explain(True)

# Time the baseline *before* running OPTIMIZE. DataFrames are lazy, so the
# scan only happens when count() is called; timing it after OPTIMIZE (as the
# script previously did) would not measure the pre-Z-order file layout.
# perf_counter() is used because it is a monotonic, high-resolution clock
# intended for measuring intervals.
# NOTE(review): this is the first query in the script, so the number may also
# include one-time warm-up cost; treat the comparison as indicative only.
t0 = time.perf_counter()
filtered.count()
print("Before:", time.perf_counter() - t0, "s")

# 4) Cluster the table's data by age.
spark.sql(f"OPTIMIZE delta.`{path}` ZORDER BY age")

# 5) Same filter, re-read after the table has been optimized.
filtered2 = (
    spark.read.format("delta")
    .load(path)
    .filter("age BETWEEN 30 AND 40")
)
print("Files scanned after ZORDER:")
filtered2.explain(True)

t1 = time.perf_counter()
filtered2.count()
print("After ZORDER:", time.perf_counter() - t1, "s")