In [0]:
#06 Zorder 

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
import random

spark = SparkSession.builder.getOrCreate()

# 1) Build the demo table: 1M rows, each with a random country and age.
# A dedicated, seeded generator makes the generated rows identical on every
# re-run (so before/after numbers are comparable between runs) without
# touching the state of the global `random` module.
rng = random.Random(42)
countries = ["US", "CA", "MX", "IN", "DE"]
data = [
    (i, rng.choice(countries), rng.randint(1, 100))
    for i in range(1_000_000)
]
df = spark.createDataFrame(data, schema=["id", "country", "age"])

# 2) Write it as Delta (no partitioning); "overwrite" replaces any table
# left at this path by a previous run.
path = "/tmp/delta_skip_zorder"
df.write.format("delta").mode("overwrite").save(path)

import time

# The same predicate is used before and after clustering so the two runs
# are directly comparable.
age_filter = "age BETWEEN 30 AND 40"

# --- Before Z-order -------------------------------------------------------
filtered = spark.read.format("delta") \
    .load(path) \
    .filter(age_filter)

# explain(True) prints the logical and physical plans; it does not print a
# count of files, so the label says "plan" rather than "files scanned".
print("Query plan before ZORDER:")
filtered.explain(True)

# Take the baseline timing NOW, while the table still has its original file
# layout. Timing it after OPTIMIZE (as was done previously) measures the
# already-clustered table twice.
# NOTE(review): this first action may also include one-time warm-up cost, so
# treat the numbers as indicative rather than as a benchmark.
t0 = time.perf_counter()
filtered.count()
print("Before:", time.perf_counter() - t0, "s")

# --- Cluster the table ----------------------------------------------------
spark.sql(f"OPTIMIZE delta.`{path}` ZORDER BY age")

# --- After Z-order --------------------------------------------------------
filtered2 = spark.read.format("delta") \
    .load(path) \
    .filter(age_filter)
print("Query plan after ZORDER:")
filtered2.explain(True)

t1 = time.perf_counter()
filtered2.count()
print("After ZORDER:", time.perf_counter() - t1, "s")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
import time
import random                         # ← add this
from datetime import date, timedelta


# ------------------------------------------------------------------
# Demo-wide settings
# ------------------------------------------------------------------
# Storage root for the Delta files, plus the metastore names of the tables.
BASE = "/tmp/zorder_demo"
TABLE_A = "demo.scenario_a"  # low-cardinality test
TABLE_B = "demo.scenario_b"  # high-cardinality test

# First calendar day used when generating event dates in both scenarios.
start = date(2025, 1, 1)

# ------------------------------------------------------------------
# Spark session (created, or reused if one already exists), with Hive
# support enabled
# ------------------------------------------------------------------
spark = SparkSession.builder \
    .appName("ZOrder_BeforeAfter_Demo") \
    .enableHiveSupport() \
    .getOrCreate()

# Utility to print the execution plan of a query
def show_plan(sql):
    """Print the formatted Spark execution plan for a SQL statement.

    Args:
        sql: Text of a single SQL statement to explain. Leading and
            trailing whitespace is ignored.

    Returns:
        None. The plan is written to stdout under a header that echoes
        the statement.

    Spark SQL's EXPLAIN accepts EXTENDED, CODEGEN, COST or FORMATTED; it
    has no ANALYZE mode, so FORMATTED is used here.
    """
    statement = sql.strip()
    print(f"\n=== PLAN: {statement} ===")
    # EXPLAIN yields a single row whose first column is the plan text.
    plan_rows = spark.sql(f"EXPLAIN FORMATTED {statement}").collect()
    print(plan_rows[0][0])


In [0]:
# Scenario A: low-cardinality `region` column
# Simulate 1M rows with only 5 distinct regions spread over 30 dates
import random
from datetime import date, timedelta

regions = ["North","South","East","West","Central"]
start = date(2025,1,1)

# A local seeded generator keeps the generated rows identical across re-runs
# without altering the global `random` state.
rng = random.Random(42)
data = [
    (
        rng.choice(regions),
        (start + timedelta(days=rng.randint(0, 29))).isoformat(),
        rng.random(),
    )
    for _ in range(1_000_000)
]

# event_date is stored as an ISO-8601 string (YYYY-MM-DD), so string
# comparisons on it order the same way as the dates themselves.
dfA = spark.createDataFrame(data, schema=["region","event_date","value"])

# Write as Delta partitioned by event_date
dfA.write.format("delta") \
   .mode("overwrite") \
   .partitionBy("event_date") \
   .save(f"{BASE}/scenario_a")

# Register the table in the metastore.
# 1) Tell Spark/SQL to use the Hive Metastore catalog
spark.sql("USE CATALOG spark_catalog")

spark.sql("CREATE DATABASE IF NOT EXISTS demo")

# 2) spark.sql() runs exactly one statement per call, so DROP and CREATE
#    must be issued separately (a ';'-joined pair fails to parse).
spark.sql(f"DROP TABLE IF EXISTS {TABLE_A}")
spark.sql(f"""
  CREATE TABLE {TABLE_A}
  USING DELTA
  LOCATION '{BASE}/scenario_a'
""")


In [0]:
# Define a representative filter: one region over a week.
# The table name comes from TABLE_A so the query and the OPTIMIZE below
# always target the same table.
queryA = f"""
SELECT sum(value) 
  FROM {TABLE_A}
 WHERE region = 'North'
   AND event_date BETWEEN '2025-01-10' AND '2025-01-16'
"""

# Before Z-order
show_plan(queryA)

# Optimize & Z-Order.
# The table is partitioned by event_date, and Delta rejects Z-ordering on a
# partition column, so only the data column `region` is clustered. The date
# predicate is already served by partition pruning.
spark.sql(f"OPTIMIZE {TABLE_A} ZORDER BY (region)")

# After Z-order
show_plan(queryA)


In [0]:
 #High‐Cardinality User ID
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
import time

# ------------------------------------------------------------------
# 1) Spark session (created, or reused if one already exists), with Hive
#    support enabled
# ------------------------------------------------------------------
spark = SparkSession.builder \
    .appName("ZOrder_BeforeAfter_Demo") \
    .enableHiveSupport() \
    .getOrCreate()

# Storage root for the Delta files and the metastore name of the table.
BASE = "dbfs:/tmp/zorder_demo"
TABLE_B = "demo.scenario_b"

# `start` is the first calendar day used when generating event dates;
# it is (re)defined here so this cell does not rely on an earlier one.
from datetime import date, timedelta
start = date(2025, 1, 1)

# Utility to print the execution plan of a query
def show_plan(sql):
    """Print the formatted Spark execution plan for a SQL statement.

    Args:
        sql: Text of a single SQL statement to explain. Leading and
            trailing whitespace is ignored.

    Returns:
        None. The plan is written to stdout under a header that echoes
        the statement.

    Spark SQL's EXPLAIN accepts EXTENDED, CODEGEN, COST or FORMATTED; it
    has no ANALYZE mode, so FORMATTED is used here.
    """
    statement = sql.strip()
    print(f"\n=== PLAN: {statement} ===")
    # EXPLAIN yields a single row whose first column is the plan text.
    plan_rows = spark.sql(f"EXPLAIN FORMATTED {statement}").collect()
    print(plan_rows[0][0])


# ------------------------------------------------------------------
# 2B) Build & Register Scenario B (High-Cardinality)
# ------------------------------------------------------------------
# This cell re-imports everything else it needs, so import `random` here
# as well instead of relying on an earlier cell having done it.
import random

# Simulate 1M rows with 50,000 distinct user_ids spread over 30 dates.
# A local seeded generator keeps the generated rows identical across re-runs
# without altering the global `random` state.
rng = random.Random(42)
users = [f"user_{i}" for i in range(50_000)]
data = [
    (
        rng.choice(users),
        (start + timedelta(days=rng.randint(0, 29))).isoformat(),
        rng.random(),
    )
    for _ in range(1_000_000)
]

# event_date is stored as an ISO-8601 string (YYYY-MM-DD).
dfB = spark.createDataFrame(data, schema=["user_id","event_date","value"])

# Write as Delta partitioned by event_date
dfB.write.format("delta") \
   .mode("overwrite") \
   .partitionBy("event_date") \
   .save(f"{BASE}/scenario_b")

# Register the table in the metastore.
# 1) Tell Spark/SQL to use the Hive Metastore catalog
spark.sql("USE CATALOG spark_catalog")

# 2) Make sure the database exists
spark.sql("CREATE DATABASE IF NOT EXISTS demo")

# 3) Drop any old registration of the table
spark.sql(f"DROP TABLE IF EXISTS {TABLE_B}")

# 4) Register the Delta files written above. No column list is given: the
#    schema and partitioning are taken from the Delta log at LOCATION. An
#    explicit list declaring event_date as DATE with no partitioning would
#    not match the data just written (string event_date, partitioned by
#    event_date).
spark.sql(f"""
  CREATE TABLE {TABLE_B}
  USING DELTA
  LOCATION '{BASE}/scenario_b'
""")
