### Import Libraries

In [2]:
import os

import pyspark
from delta import *
from pyspark.sql.functions import initcap

### Create Spark Session with Delta

In [3]:
# Build a SparkSession whose SQL extension and default catalog are the Delta ones.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DeltaTutorial")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Wrap the builder with the Delta pip helper, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
# Only surface errors; keeps Spark's INFO/WARN chatter out of the cell output.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/Library/Python/3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/sahilnagpal/.ivy2/cache
The jars for the packages stored in: /Users/sahilnagpal/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c224d752-fdc9-4ae8-bcc2-28c23f94591d;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 185ms :: artifacts dl 12ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-storage;3.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |

In [4]:
# Create the demo database if it is not there yet, so the cell is safe to re-run.
# (Plain string: the statement has no placeholders, so no f-string is needed.)
spark.sql("CREATE DATABASE IF NOT EXISTS demo_db")

DataFrame[]

### Reading the data

In [8]:
# Location of the invoices parquet file. Set the INVOICES_PARQUET_PATH environment
# variable to run the notebook on another machine; the default is the original
# author's local path, so existing behaviour is unchanged when it is unset.
invoices_path = os.environ.get(
    "INVOICES_PARQUET_PATH",
    "/Users/sahilnagpal/Desktop/wordsToSpeak/delta_lake/dataset/invoices_201_99457.parquet",
)
df = spark.read.parquet(invoices_path)
# Preview the first five rows without truncating column values.
df.show(5, truncate=False)

                                                                                

+-----------+----------+------+---+--------+--------+------+--------------+------------+-----------------+-------------+
|customer_id|invoice_no|gender|age|category|quantity|price |payment_method|invoice_date|shopping_mall    |_rescued_data|
+-----------+----------+------+---+--------+--------+------+--------------+------------+-----------------+-------------+
|201        |I885979   |Female|26 |Clothing|3       |900.24|Debit Card    |2021-07-04  |Metrocity        |NULL         |
|202        |I810217   |Female|51 |Clothing|3       |900.24|Cash          |2022-01-14  |Metrocity        |NULL         |
|203        |I499170   |Female|38 |Toys    |1       |35.84 |Credit Card   |2022-02-20  |Kanyon           |NULL         |
|204        |I792963   |Female|59 |Clothing|5       |1500.4|Debit Card    |2022-06-18  |Emaar Square Mall|NULL         |
|205        |I311151   |Female|39 |Souvenir|3       |35.19 |Credit Card   |2022-04-27  |Mall of Istanbul |NULL         |
+-----------+----------+------+-

In [9]:
# Inflate the sample data by repeatedly unioning the DataFrame with itself until
# it holds more than `expected_rows` rows.
expected_rows = 20_000_000  # 20,000,000

df_union = df
# Count once. DataFrame.union keeps duplicates, so every self-union exactly
# doubles the row count; tracking it arithmetically avoids launching a Spark job
# over the ever-growing plan on each loop check and each print.
row_count = df_union.count()
if row_count == 0:
    # Doubling zero rows never exceeds the target; fail fast instead of looping forever.
    raise ValueError("df is empty: doubling it can never reach expected_rows")

while row_count <= expected_rows:
    df_union = df_union.union(df_union)
    row_count *= 2
    print(f"count: {row_count}")

print(f"final count: {row_count}")

count: 198514
count: 397028
count: 794056
count: 1588112
count: 3176224
count: 6352448
count: 12704896


                                                                                

count: 25409792




final count: 25409792


                                                                                

In [10]:
# Persist the enlarged DataFrame as the Delta table demo_db.zorder_ex1,
# overwriting whatever an earlier run left behind.
(
    df_union.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("demo_db.zorder_ex1")
)

                                                                                

In [11]:
%%time
spark.sql(
    """
    SELECT category,
        SUM(price * quantity) as total_sales
    FROM demo_db.zorder_ex1
    WHERE customer_id = 201
    GROUP BY category
    """
)

CPU times: user 2.05 ms, sys: 2.88 ms, total: 4.92 ms
Wall time: 156 ms


DataFrame[category: string, total_sales: double]

In [12]:
from delta.tables import DeltaTable

# Look up the table written earlier by name and run OPTIMIZE with Z-Ordering on
# customer_id. The trailing expression is left bare so the notebook displays
# the metrics DataFrame it returns.
table = DeltaTable.forName(spark, "demo_db.zorder_ex1")
(
    table
    .optimize()
    .executeZOrderBy("customer_id")
)

                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterPar

In [14]:
%%time
spark.sql(
    """
    SELECT category,
        SUM(price * quantity) as total_sales
    FROM demo_db.zorder_ex1
    WHERE customer_id = 201
    GROUP BY category
    """
)

CPU times: user 2.12 ms, sys: 2.14 ms, total: 4.26 ms
Wall time: 19.5 ms


DataFrame[category: string, total_sales: double]

In [None]:
# Disabled sketch (never executed): Z-Order only the rows selected by a date
# predicate instead of the whole table.
# NOTE(review): `deltacatalog.deltadb.zorder_ex2` and `current_day` are not
# defined anywhere in the visible notebook; supply both before enabling.
# NOTE(review): OPTIMIZE ... WHERE presumably needs invoice_date to be a
# partition column - confirm against the table definition.
# %sql
# OPTIMIZE deltacatalog.deltadb.zorder_ex2
# WHERE invoice_date = '{current_day - 1}'
# ZORDER BY customer_id;

In [15]:
# Clean up the demo table. IF EXISTS keeps the cell re-runnable (a second run no
# longer fails on a missing table), mirroring the IF NOT EXISTS guard used when
# the database is created.
spark.sql("DROP TABLE IF EXISTS demo_db.zorder_ex1")

DataFrame[]

### Z-Ordering in Delta Lake (Apache Spark)

#### What is Z-Ordering?
- **Z-Ordering** is a technique in Delta Lake to co-locate related data in the same set of files.
- It works by **sorting data across multiple columns** and storing them in the same physical files.
- Helps with **data skipping** – Spark can read only relevant portions of data instead of scanning entire tables.

---

#### Why Use Z-Ordering?
- **Improved Query Performance**: Queries filtering on Z-Ordered columns read fewer files.
- **Efficient Data Skipping**: Delta Lake automatically skips irrelevant data blocks using column statistics.
- **Better Clustering for Analytical Queries**: Especially useful for large tables with frequent range or equality filters.

---

#### Syntax

```sql
OPTIMIZE <table_name>
ZORDER BY (<col1>, <col2>, ...);
```

```sql
-- Optimize and cluster table by customer_id
OPTIMIZE sales_table
ZORDER BY (customer_id);
```

```sql
-- Optimize and cluster data by multiple dimensions
OPTIMIZE sales_table
ZORDER BY (region, product_category);
```
---
#### Best Practices
- Z-Order columns that appear frequently in `WHERE` clauses.
- Prefer high-cardinality columns (such as `customer_id`) for Z-Ordering; for low-cardinality columns, partitioning is usually the better tool.
- Combine with periodic `OPTIMIZE` to handle small files before Z-Ordering.
---
#### Limitations
- Z-Ordering is a performance optimization, **not a guarantee**.
- Requires **Delta Lake** with `OPTIMIZE ... ZORDER BY` support: open-source Delta Lake 2.0+ or **Databricks Runtime** (this notebook runs it on open-source `delta-spark` 3.2.0).
- Requires additional **compute cost** during `OPTIMIZE`.
