In [0]:
# OPTIMIZE compacts a Delta table's many small files into fewer, larger
# files, which generally makes queries against the table faster.

banner = "=" * 55
print(banner, "OPTIMIZING DELTA TABLES", banner, sep="\n")

# Short table names grouped by layer (bronze / silver / gold). They are
# expanded below into fully qualified ecommerce.<layer>.<table> names,
# in the order written here.
tables_by_layer = {
    "bronze": [
        "raw_orders",
        "raw_order_items",
        "raw_customers",
        "raw_products",
        "raw_payments",
        "raw_reviews",
        "raw_sellers",
    ],
    "silver": [
        "orders",
        "order_items",
        "customers",
        "products",
        "payments",
        "reviews",
        "sellers",
    ],
    "gold": [
        "daily_revenue",
        "customer_ltv",
        "product_performance",
        "category_performance",
        "seller_performance",
        "payment_analysis",
    ],
}

tables_to_optimize = [
    f"ecommerce.{layer_name}.{short_name}"
    for layer_name, short_names in tables_by_layer.items()
    for short_name in short_names
]

# One OPTIMIZE statement per table, with a progress line before and after.
for table in tables_to_optimize:
    print(f"Optimizing {table}...")
    spark.sql(f"OPTIMIZE {table}")
    print(f"Done: {table}")

print("\nAll tables optimized!")

OPTIMIZING DELTA TABLES
Optimizing ecommerce.bronze.raw_orders...
Done: ecommerce.bronze.raw_orders
Optimizing ecommerce.bronze.raw_order_items...
Done: ecommerce.bronze.raw_order_items
Optimizing ecommerce.bronze.raw_customers...
Done: ecommerce.bronze.raw_customers
Optimizing ecommerce.bronze.raw_products...
Done: ecommerce.bronze.raw_products
Optimizing ecommerce.bronze.raw_payments...
Done: ecommerce.bronze.raw_payments
Optimizing ecommerce.bronze.raw_reviews...
Done: ecommerce.bronze.raw_reviews
Optimizing ecommerce.bronze.raw_sellers...
Done: ecommerce.bronze.raw_sellers
Optimizing ecommerce.silver.orders...
Done: ecommerce.silver.orders
Optimizing ecommerce.silver.order_items...
Done: ecommerce.silver.order_items
Optimizing ecommerce.silver.customers...
Done: ecommerce.silver.customers
Optimizing ecommerce.silver.products...
Done: ecommerce.silver.products
Optimizing ecommerce.silver.payments...
Done: ecommerce.silver.payments
Optimizing ecommerce.silver.reviews...
Done: ecommer

Z-ORDER (Smart Indexing)

In [0]:
# Z-ORDER rewrites a table's files so that rows with similar values in the
# chosen columns are stored close together. Queries that filter on those
# columns can then skip more files; the actual speed-up depends on the data
# and the query.

print("=" * 55)
print("APPLYING Z-ORDER INDEXING")
print("=" * 55)

# Table (relative to the `ecommerce` catalog) -> columns to Z-order by.
# Both the OPTIMIZE statement and the log line are generated from this one
# mapping, so the message always reports exactly the columns that were used.
zorder_columns = {
    # Orders: filtered by customer and purchase date
    "silver.orders": ["customer_id", "order_purchase_timestamp"],
    # Order items: filtered by order and product
    "silver.order_items": ["order_id", "product_id"],
    # Payments: filtered by order and payment type
    "silver.payments": ["order_id", "payment_type"],
    # Reviews: filtered by order and score
    "silver.reviews": ["order_id", "review_score"],
    # Gold daily revenue: filtered by date
    "gold.daily_revenue": ["order_day"],
    # Gold customer LTV: filtered by lifetime value and customer state
    "gold.customer_ltv": ["lifetime_value", "customer_state"],
}

for table, columns in zorder_columns.items():
    column_list = ", ".join(columns)
    spark.sql(f"OPTIMIZE ecommerce.{table} ZORDER BY ({column_list})")
    print(f"{table} Z-ordered by {column_list}")

print("\nAll Z-ORDER indexing complete!")

APPLYING Z-ORDER INDEXING
silver.orders Z-ordered by customer_id, order_purchase_timestamp
silver.order_items Z-ordered by order_id, product_id
silver.payments Z-ordered by order_id, payment_type
silver.reviews Z-ordered by order_id, review_score
gold.daily_revenue Z-ordered by order_day
gold.customer_ltv Z-ordered by lifetime_value

All Z-ORDER indexing complete!


VACUUM (Clean Up Old Files)

In [0]:
# VACUUM deletes data files that the table no longer references and that
# are older than the retention window.
# Older table versions stay readable via "time travel" only while their
# files still exist, so the retention window bounds how far back we can go.
# We keep 7 days (168 hours) of history.
RETENTION_HOURS = 168  # 7 days

print("=" * 55)
print("VACUUMING DELTA TABLES")
print("=" * 55)

# NOTE: spark.databricks.delta.retentionDurationCheck.enabled is deliberately
# left at its default. That safety check only needs to be turned off to vacuum
# with a window SHORTER than the table's retention threshold (7 days by
# default); 168 hours is exactly 7 days, so disabling it is unnecessary and
# would leave the check off for every later VACUUM in this Spark session.

vacuum_tables = [
    "ecommerce.bronze.raw_orders",
    "ecommerce.bronze.raw_order_items",
    "ecommerce.bronze.raw_customers",
    "ecommerce.bronze.raw_products",
    "ecommerce.bronze.raw_payments",
    "ecommerce.bronze.raw_reviews",
    "ecommerce.bronze.raw_sellers",
    "ecommerce.silver.orders",
    "ecommerce.silver.order_items",
    "ecommerce.silver.customers",
    "ecommerce.silver.products",
    "ecommerce.silver.payments",
    "ecommerce.silver.reviews",
    "ecommerce.silver.sellers",
    "ecommerce.gold.daily_revenue",
    "ecommerce.gold.customer_ltv",
    "ecommerce.gold.product_performance",
    "ecommerce.gold.category_performance",
    "ecommerce.gold.seller_performance",
    "ecommerce.gold.payment_analysis",
]

for table in vacuum_tables:
    print(f"Vacuuming {table}...")
    spark.sql(f"VACUUM {table} RETAIN {RETENTION_HOURS} HOURS")
    print(f"Done: {table}")

print("\nAll tables vacuumed!")

VACUUMING DELTA TABLES
Vacuuming ecommerce.bronze.raw_orders...
Done: ecommerce.bronze.raw_orders
Vacuuming ecommerce.bronze.raw_order_items...
Done: ecommerce.bronze.raw_order_items
Vacuuming ecommerce.bronze.raw_customers...
Done: ecommerce.bronze.raw_customers
Vacuuming ecommerce.bronze.raw_products...
Done: ecommerce.bronze.raw_products
Vacuuming ecommerce.bronze.raw_payments...
Done: ecommerce.bronze.raw_payments
Vacuuming ecommerce.bronze.raw_reviews...
Done: ecommerce.bronze.raw_reviews
Vacuuming ecommerce.bronze.raw_sellers...
Done: ecommerce.bronze.raw_sellers
Vacuuming ecommerce.silver.orders...
Done: ecommerce.silver.orders
Vacuuming ecommerce.silver.order_items...
Done: ecommerce.silver.order_items
Vacuuming ecommerce.silver.customers...
Done: ecommerce.silver.customers
Vacuuming ecommerce.silver.products...
Done: ecommerce.silver.products
Vacuuming ecommerce.silver.payments...
Done: ecommerce.silver.payments
Vacuuming ecommerce.silver.reviews...
Done: ecommerce.silver.revi

Auto Optimize (Set & Forget)

In [0]:
# Turn on the Delta auto-optimize table properties (optimizeWrite and
# autoCompact) for every Silver and Gold table, so that keeping files
# compact does not depend solely on running OPTIMIZE by hand.

banner = "=" * 55
print(banner, "ENABLING AUTO OPTIMIZE", banner, sep="\n")

# Short table names per layer; Bronze tables are intentionally not listed.
silver_table_names = [
    "orders",
    "order_items",
    "customers",
    "products",
    "payments",
    "reviews",
    "sellers",
]
gold_table_names = [
    "daily_revenue",
    "customer_ltv",
    "product_performance",
    "category_performance",
    "seller_performance",
    "payment_analysis",
]

# Fully qualified names: all Silver tables first, then all Gold tables.
auto_optimize_tables = [
    f"ecommerce.silver.{short_name}" for short_name in silver_table_names
] + [
    f"ecommerce.gold.{short_name}" for short_name in gold_table_names
]

# Apply both table properties in a single ALTER TABLE per table.
for table in auto_optimize_tables:
    spark.sql(f"""
        ALTER TABLE {table}
        SET TBLPROPERTIES (
            'delta.autoOptimize.optimizeWrite' = 'true',
            'delta.autoOptimize.autoCompact'   = 'true'
        )
    """)
    print(f"Auto optimize enabled: {table}")

print("\nAuto optimize enabled on all Silver & Gold tables!")

ENABLING AUTO OPTIMIZE
Auto optimize enabled: ecommerce.silver.orders
Auto optimize enabled: ecommerce.silver.order_items
Auto optimize enabled: ecommerce.silver.customers
Auto optimize enabled: ecommerce.silver.products
Auto optimize enabled: ecommerce.silver.payments
Auto optimize enabled: ecommerce.silver.reviews
Auto optimize enabled: ecommerce.silver.sellers
Auto optimize enabled: ecommerce.gold.daily_revenue
Auto optimize enabled: ecommerce.gold.customer_ltv
Auto optimize enabled: ecommerce.gold.product_performance
Auto optimize enabled: ecommerce.gold.category_performance
Auto optimize enabled: ecommerce.gold.seller_performance
Auto optimize enabled: ecommerce.gold.payment_analysis

Auto optimize enabled on all Silver & Gold tables!


Check Table History (Time Travel)

In [0]:
# Delta records each change to a table as a new version ("time travel").
# DESCRIBE HISTORY lists those versions, and VERSION AS OF / TIMESTAMP AS OF
# query the table as it was at an earlier version or point in time.

banner = "=" * 55
print(banner, "TABLE HISTORY — TIME TRAVEL", banner, sep="\n")

# Show the first 5 rows of the silver.orders history, limited to the
# columns that describe what each commit did.
print("History of silver.orders:")
history_columns = ["version", "timestamp", "operation", "operationMetrics"]
orders_history = spark.sql("""
    DESCRIBE HISTORY ecommerce.silver.orders
""")
orders_history.select(*history_columns).show(5, truncate=False)

# Row count of the table at version 0 (first load).
print("silver.orders at Version 0:")
count_at_version_0 = spark.sql("""
    SELECT COUNT(*) as record_count
    FROM ecommerce.silver.orders VERSION AS OF 0
""")
count_at_version_0.show()

# Row count of the table as it was one hour before this cell runs.
print("silver.orders as of 1 hour ago:")
count_one_hour_ago = spark.sql("""
    SELECT COUNT(*) as record_count
    FROM ecommerce.silver.orders
    TIMESTAMP AS OF (current_timestamp() - INTERVAL 1 HOUR)
""")
count_one_hour_ago.show()

TABLE HISTORY — TIME TRAVEL
History of silver.orders:
+-------+-------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------+
|version|timestamp          |operation                        |operationMetrics                                                                                                    |
+-------+-------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------+
|6      |2026-02-21 01:45:45|SET TBLPROPERTIES                |{}                                                                                                                  |
|5      |2026-02-21 00:58:07|VACUUM END                       |{numDeletedFiles -> 0, numVacuumedDirectories -> 1}                                                                 |
|4      |2026-02-21 00:58:03|VACUUM START

Final Storage Stats

In [0]:
# Report row count, on-disk size and file count for the tables listed below.

print("=" * 55)
print("STORAGE STATS AFTER OPTIMIZATION")
print("=" * 55)

# (layer label, fully qualified table name) pairs to report on.
all_tables = [
    ("Bronze", "ecommerce.bronze.raw_orders"),
    ("Bronze", "ecommerce.bronze.raw_order_items"),
    ("Bronze", "ecommerce.bronze.raw_customers"),
    ("Bronze", "ecommerce.bronze.raw_products"),
    ("Silver", "ecommerce.silver.orders"),
    ("Silver", "ecommerce.silver.order_items"),
    ("Silver", "ecommerce.silver.customers"),
    ("Gold",   "ecommerce.gold.daily_revenue"),
    ("Gold",   "ecommerce.gold.customer_ltv"),
    ("Gold",   "ecommerce.gold.product_performance"),
]

# Width of the widest "[Layer]" tag. Padding every tag to this width keeps
# the columns aligned; without it the shorter "[Gold]" rows are shifted left
# relative to the "[Bronze]" / "[Silver]" rows.
tag_width = max((len(label) for label, _ in all_tables), default=0) + 2

for layer, table in all_tables:
    count = spark.table(table).count()
    # DESCRIBE DETAIL returns a single row of table metadata; only the size
    # and file count are needed here.
    details = spark.sql(f"DESCRIBE DETAIL {table}") \
                   .select("sizeInBytes", "numFiles") \
                   .collect()[0]
    size_mb = round(details["sizeInBytes"] / 1024 / 1024, 2)  # bytes -> MB
    files   = details["numFiles"]
    layer_tag = f"[{layer}]"
    print(f"{layer_tag:<{tag_width}} {table.split('.')[-1]:<25} "
          f"{count:>10,} rows | "
          f"{size_mb:>8.2f} MB | "
          f"{files:>3} files")

print("\nOptimization complete!")


STORAGE STATS AFTER OPTIMIZATION
[Bronze] raw_orders                    99,441 rows |     5.92 MB |   1 files
[Bronze] raw_order_items              112,650 rows |     3.86 MB |   1 files
[Bronze] raw_customers                 99,441 rows |     3.87 MB |   1 files
[Bronze] raw_products                  32,951 rows |     0.82 MB |   1 files
[Silver] orders                        99,441 rows |     6.17 MB |   1 files
[Silver] order_items                  112,650 rows |     4.47 MB |   1 files
[Silver] customers                     99,441 rows |     3.87 MB |   1 files
[Gold] daily_revenue                    612 rows |     0.02 MB |   1 files
[Gold] customer_ltv                  93,471 rows |     2.27 MB |   1 files
[Gold] product_performance           32,216 rows |     0.95 MB |   1 files

Optimization complete!
