# [databricks](https://community.cloud.databricks.com/)


In [None]:
# Sample rows for two small tables that deliberately do not line up perfectly:
# one product points at a brand that does not exist, and one brand has no products.
products = [
    # (product_id, product_name, brand_id)
    (1, "iPhone", 100),
    (2, "Galaxy", 200),
    (3, "Redmi", 300),  # orphan record, no matching brand
    (4, "Pixel", 400),
]

brands = [
    # (brand_id, brand_name)
    (100, "Apple"),
    (200, "Samsung"),
    (400, "Google"),
    (500, "Sony"),  # no matching products
]

# Column names are given as plain lists; Spark infers the column types from the rows.
productDf = spark.createDataFrame(products, ["product_id", "product_name", "brand_id"])
brandDf = spark.createDataFrame(brands, ["brand_id", "brand_name"])

# Print both frames, products first.
for frame in (productDf, brandDf):
    frame.show()

```spark
(6) Spark Jobs
productDf:pyspark.sql.dataframe.DataFrame = [product_id: long, product_name: string ... 1 more fields]
brandDf:pyspark.sql.dataframe.DataFrame = [brand_id: long, brand_name: string]
+----------+------------+--------+
|product_id|product_name|brand_id|
+----------+------------+--------+
|         1|      iPhone|     100|
|         2|      Galaxy|     200|
|         3|       Redmi|     300|
|         4|       Pixel|     400|
+----------+------------+--------+

+--------+----------+
|brand_id|brand_name|
+--------+----------+
|     100|     Apple|
|     200|   Samsung|
|     400|    Google|
|     500|      Sony|
+--------+----------+
```

In [None]:
"""
Write to S3 as a delta lake format to..

/mnt/bond-s3-forspark/delta/products
OR
dbfs:///mnt/bond-s3-forspark/delta/products
"""

productDf.write.format("delta").save("/mnt/bond-s3-forspark/delta/products")

In [None]:
%sql
-- Read the Delta table directly by its storage path (the same path the write cell saved to).
SELECT * FROM delta.`/mnt/bond-s3-forspark/delta/products`

In [None]:
%sql
-- NOTE(review): unlike the path-based query, this resolves `products` as a table
-- inside a schema named `delta`. Nothing visible in this notebook registers such a
-- table -- confirm it exists in the metastore, otherwise this cell fails on a fresh run.
SELECT * FROM delta.products

In [None]:
"""
OPTIMIZE by merging multiple .parquet files into a single file.
"""

%sql
OPTIMIZE delta.`/mnt/bond-s3-forspark/delta/products`

In [None]:
%sql

-- Z-ORDER
-- Compact the table and, while rewriting the files, cluster rows with related
-- brand_id values into the same files.
-- Queries that filter on the Z-order column (brand_id) can then read fewer files.

OPTIMIZE delta.`/mnt/bond-s3-forspark/delta/products` ZORDER BY (brand_id)

In [None]:
%sql

-- Every write to a Delta table produces a new table version (snapshot); the data
-- files of older versions stay on storage so earlier versions remain readable.
-- VACUUM cleans up files that the current table version no longer references.
-- NOTE: only files older than the retention threshold (7 days by default) are
-- removed, so running this right after the writes above may delete nothing.

VACUUM delta.`/mnt/bond-s3-forspark/delta/products`