### Load requirements and start

In [31]:
import os
import sys

# Put ../src/python (relative to the kernel's working directory) on the
# module search path.
sys.path.append("../src/python")

In [1]:
# Switch the kernel's working directory to the project checkout.
%cd "/home/iceberg"


/home/iceberg


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
# Editable install of the project into the kernel's environment.
%pip install -e /home/iceberg

Obtaining file:///home/iceberg
  Installing build dependencies ... [?2done
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25done
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: customer_transaction_etl
  Building editable for customer_transaction_etl (pyproject.toml) ... [?25ldone
[?25h  Created wheel for customer_transaction_etl: filename=customer_transaction_etl-0.1.0-0.editable-py3-none-any.whl size=1441 sha256=65c27dcae07bd9abb8fe071d05821190427c6651a6227da5173a3f9b6d754929
  Stored in directory: /tmp/pip-ephem-wheel-cache-61bscaoi/wheels/47/e4/18/3729c1b70e0dfec9a6b23d35b4183f467ede935f2336901bfb
Successfully built customer_transaction_etl
Installing collected packages: customer_transaction_etl
  Attempting uninstall: customer_transaction_etl
    Found existing installation: customer_transaction_etl 0.1.0
    Uninstalling customer_transacti

## Starting the Data Exploration 

In [2]:
from customer_transaction_etl.session import build_spark
# Label handed to the project's session builder
# (presumably the Spark application name — confirm in session.build_spark).
SESSION_NAME = "data-exploration"
spark = build_spark(SESSION_NAME)

25/09/28 18:15:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
from datetime import datetime

from customer_transaction_etl.config import Paths

# Format the current date once and reuse it, so the printed value and the
# `today` variable always agree.
today = datetime.today().strftime('%d-%m-%Y')
print(today)

# The ingest folder is addressed as ingest_date=YYYY-MM-DD, whereas `today` is
# DD-MM-YYYY, so `today` must not be substituted here as-is.
ingest_date = "2025-09-25"  # or datetime.today().strftime("%Y-%m-%d") for the current day
source_root = "/home/iceberg/data/raw/customer_data_ingest"  # <- adjust if needed
raw_glob = f"{source_root}/ingest_date={ingest_date}/*.json"
# Alternative built from the project config, kept for reference
# (NOTE(review): `paths` is not defined in the visible cells — confirm before enabling):
#raw_glob = os.path.join(paths.raw, f"ingest_date={ingest_date}", "*.json")

28-09-2025


In [4]:
print("Reading:", raw_glob)
# multiLine is enabled so that JSON records spanning several lines are parsed.
df_raw = spark.read.option("multiLine", True).json(raw_glob)

# Inspect the inferred schema and the untruncated rows.
df_raw.printSchema()
df_raw.show(n=20, truncate=False)

Reading: /home/iceberg/data/raw/customer_data_ingest/ingest_date=2025-09-25/*.json
root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- price: string (nullable = true)
 |    |    |-- product_id: string (nullable = true)
 |    |    |-- quantity: string (nullable = true)
 |-- purchase_date: string (nullable = true)

+-----------+--------+---------------------------------------------------------------+-------------+
|customer_id|order_id|products                                                       |purchase_date|
+-----------+--------+---------------------------------------------------------------+-------------+
|C001       |O001    |[{Product A, 10,99, P001, 2,00}, { Product B,  5.99 , P002, 1}]| 2023-01-01  |
|C002       |O002    |[{Product A, 10,99, P001, 3,00}, {Product C, 7.99, P003,  2}]  |2023-01-02 

## Data Transformation

### <span style="color:blue"> ExplodeToLines </span>

In [5]:
from customer_transaction_etl.steps import ExplodeToLines
from pyspark.sql import DataFrame, functions as F
from decimal import Decimal

# Run the project's ExplodeToLines step on the raw orders, then inspect the
# resulting schema and rows.
explode = ExplodeToLines()
df_lines = explode.transform(df_raw)

df_lines.printSchema()
df_lines.show(n=20, truncate=False)


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- purchase_date: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- price: string (nullable = true)

+--------+-----------+-------------+----------+------------+--------+------+
|order_id|customer_id|purchase_date|product_id|product_name|quantity|price |
+--------+-----------+-------------+----------+------------+--------+------+
|O001    |C001       | 2023-01-01  |P001      |Product A   |2,00    |10,99 |
|O001    |C001       | 2023-01-01  |P002      | Product B  |1       | 5.99 |
|O002    |C002       |2023-01-02   |P001      |Product A   |3,00    |10,99 |
|O002    |C002       |2023-01-02   |P003      |Product C   | 2      |7.99  |
|O003    |C003       |2023-01-03   |P002      |Product B   | 1      |5,99  |
|O003    |C003       |2023-01-03   |P003      |Product C   |4,00    | 7,99 |
+--------+

#### How it will look without calling the class

In [52]:

from pyspark.sql import DataFrame, functions as F
# Add an `item` column carrying one element of the `products` array per row.
exploded_exmp = df_raw.withColumn("item", F.explode(F.col("products")))

exploded_exmp.printSchema()
exploded_exmp.show(n=20, truncate=True)

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- price: string (nullable = true)
 |    |    |-- product_id: string (nullable = true)
 |    |    |-- quantity: string (nullable = true)
 |-- purchase_date: string (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- price: string (nullable = true)
 |    |-- product_id: string (nullable = true)
 |    |-- quantity: string (nullable = true)

+-----------+--------+--------------------+-------------+--------------------+
|customer_id|order_id|            products|purchase_date|                item|
+-----------+--------+--------------------+-------------+--------------------+
|       C001|    O001|[{Product A, 10,9...|   2023-01-01|{Product A, 10,99...|
|       C001|    O001|[{Product A, 10,9...|   2023-01-01|

In [54]:
        # Flatten the exploded `item` struct into one top-level column per field,
        # keeping the order-level columns (order_id, customer_id, purchase_date).
        # NOTE(review): this assignment is indented although it sits at cell top
        # level while the calls below are not; the recorded output suggests the
        # cell still ran (presumably IPython tolerates the leading indent) —
        # consider dedenting it.
        lines_exmp = (
            exploded_exmp
            .select(
                F.col("order_id"),
                F.col("customer_id"),
                F.col("purchase_date"),
                F.col("item.product_id").alias("product_id"),
                F.col("item.name").alias("product_name"),
                F.col("item.quantity").alias("quantity"),
                F.col("item.price").alias("price"),
            )
        )   

# Inspect the flattened schema and rows.
lines_exmp.printSchema()
lines_exmp.show()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- purchase_date: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- price: string (nullable = true)

+--------+-----------+-------------+----------+------------+--------+------+
|order_id|customer_id|purchase_date|product_id|product_name|quantity| price|
+--------+-----------+-------------+----------+------------+--------+------+
|    O001|       C001|   2023-01-01|      P001|   Product A|    2,00| 10,99|
|    O001|       C001|   2023-01-01|      P002|   Product B|       1| 5.99 |
|    O002|       C002|  2023-01-02 |      P001|   Product A|    3,00| 10,99|
|    O002|       C002|  2023-01-02 |      P003|   Product C|       2|  7.99|
|    O003|       C003|   2023-01-03|      P002|   Product B|       1|  5,99|
|    O003|       C003|   2023-01-03|      P003|   Product C|    4,00| 7,99 |
+--------+

#### End Example

#### Sanity Checks

In [6]:

# 1) Number of line rows should equal sum of products array sizes per order
products_per_order = df_raw.select(F.size(F.col("products")).alias("n"))
expected_count = products_per_order.agg(F.sum("n").alias("sum_n")).first()["sum_n"]

# Rows actually produced by the explode step.
actual_count = df_lines.count()

print("Expected line count:", expected_count, " | Actual:", actual_count)

Expected line count: 6  | Actual: 6


In [7]:
# 2) No columns lost accidentally
print("Columns:", df_lines.columns)

# 3) Quick peek grouped by order to ensure row multiplication looks right
lines_per_order = df_lines.groupBy("order_id").agg(F.count("*").alias("line_count"))
lines_per_order.show()

Columns: ['order_id', 'customer_id', 'purchase_date', 'product_id', 'product_name', 'quantity', 'price']
+--------+----------+
|order_id|line_count|
+--------+----------+
|    O002|         2|
|    O001|         2|
|    O003|         2|
+--------+----------+



### <span style="color:blue"> CleanseAndCast</span>

In [8]:
from customer_transaction_etl.steps import CleanseAndCast

In [9]:
# Apply the project's CleanseAndCast step to the exploded lines
# (df_lines is the output from ExplodeToLines).
cast_step = CleanseAndCast()
df_cast = cast_step.transform(df_lines)

# Inspect the typed schema and the first ten rows, untruncated.
df_cast.printSchema()
df_cast.show(n=10, truncate=False)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- purchase_date: date (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: decimal(10,2) (nullable = true)

+--------+-----------+-------------+----------+------------+--------+-----+
|order_id|customer_id|purchase_date|product_id|product_name|quantity|price|
+--------+-----------+-------------+----------+------------+--------+-----+
|O001    |C001       |2023-01-01   |P001      |Product A   |2       |10.99|
|O001    |C001       |2023-01-01   |P002      |Product B   |1       |5.99 |
|O002    |C002       |2023-01-02   |P001      |Product A   |3       |10.99|
|O002    |C002       |2023-01-02   |P003      |Product C   |2       |7.99 |
|O003    |C003       |2023-01-03   |P002      |Product B   |1       |5.99 |
|O003    |C003       |2023-01-03   |P003      |Product C   |4       |7.99 |
+--------+---

In [10]:
total_rows = df_cast.count()
print("Rows after cast:", total_rows)

# 1) How many rows failed to parse critical fields (now null)?
critical_cols = ["order_id", "product_id", "purchase_date", "quantity", "price"]
any_critical_null = F.col(critical_cols[0]).isNull()
for col_name in critical_cols[1:]:
    # OR the per-column null tests together, in list order.
    any_critical_null = any_critical_null | F.col(col_name).isNull()

invalid = df_cast.where(any_critical_null)
print("Invalid rows:", invalid.count())
invalid.show(truncate=False)

# 2) Are any quantities non-positive? (optional business rule)
neg_qty = df_cast.filter(F.col("quantity") <= 0)
print("Non-positive quantities:", neg_qty.count())

# 3) Are any prices non-positive (zero or negative)?
neg_price = df_cast.filter(F.col("price") <= 0)
print("Non-positive prices:", neg_price.count())

Rows after cast: 6
Invalid rows: 0
+--------+-----------+-------------+----------+------------+--------+-----+
|order_id|customer_id|purchase_date|product_id|product_name|quantity|price|
+--------+-----------+-------------+----------+------------+--------+-----+
+--------+-----------+-------------+----------+------------+--------+-----+

Non-positive quantities: 0
Non-positive prices: 0


#### Check all transformations in a df

In [66]:
# Show the untouched quantity / price / date strings next to their keys so the
# raw formatting can be compared with the cleaned values.
raw_aliases = {"quantity": "q_raw", "price": "p_raw", "purchase_date": "d_raw"}
raw_columns = [F.col(source).alias(alias) for source, alias in raw_aliases.items()]
df_lines.select("order_id", "product_id", *raw_columns).show(truncate=False)

+--------+----------+-----+------+-----------+
|order_id|product_id|q_raw|p_raw |d_raw      |
+--------+----------+-----+------+-----------+
|O001    |P001      |2,00 |10,99 | 2023-01-01|
|O001    |P002      |1    | 5.99 | 2023-01-01|
|O002    |P001      |3,00 |10,99 |2023-01-02 |
|O002    |P003      | 2   |7.99  |2023-01-02 |
|O003    |P002      | 1   |5,99  |2023-01-03 |
|O003    |P003      |4,00 | 7,99 |2023-01-03 |
+--------+----------+-----+------+-----------+



In [14]:
def _strip_spaces_and_dot(col_name):
    """Drop every whitespace run from `col_name`, then turn commas into dots."""
    without_spaces = F.regexp_replace(F.col(col_name), r"\s+", "")
    return F.regexp_replace(without_spaces, ",", ".")


# Step-by-step view of the cleansing: trimmed, normalised and cast values are
# each kept in their own column so every stage can be inspected.
norm = df_lines
# quantity: trim -> normalise separators -> cast to double
norm = norm.withColumn("q_trim", F.trim("quantity"))
norm = norm.withColumn("q_norm", _strip_spaces_and_dot("q_trim"))
norm = norm.withColumn("q_dbl", F.col("q_norm").cast("double"))
# price: trim -> normalise separators -> cast to decimal(10,2)
norm = norm.withColumn("p_trim", F.trim("price"))
norm = norm.withColumn("p_norm", _strip_spaces_and_dot("p_trim"))
norm = norm.withColumn("p_dec", F.col("p_norm").cast("decimal(10,2)"))
# purchase_date: trim, try both accepted patterns, keep the first that parses
norm = norm.withColumn("d_trim", F.trim("purchase_date"))
norm = norm.withColumn("d1", F.to_date("d_trim", "yyyy-MM-dd"))
norm = norm.withColumn("d2", F.to_date("d_trim", "dd/MM/yyyy"))
norm = norm.withColumn("d_final", F.coalesce("d1", "d2"))

norm.select("*").show(truncate=False)



+--------+-----------+-------------+----------+------------+--------+------+------+------+-----+------+------+-----+----------+----------+----+----------+
|order_id|customer_id|purchase_date|product_id|product_name|quantity|price |q_trim|q_norm|q_dbl|p_trim|p_norm|p_dec|d_trim    |d1        |d2  |d_final   |
+--------+-----------+-------------+----------+------------+--------+------+------+------+-----+------+------+-----+----------+----------+----+----------+
|O001    |C001       | 2023-01-01  |P001      |Product A   |2,00    |10,99 |2,00  |2.00  |2.0  |10,99 |10.99 |10.99|2023-01-01|2023-01-01|NULL|2023-01-01|
|O001    |C001       | 2023-01-01  |P002      | Product B  |1       | 5.99 |1     |1     |1.0  |5.99  |5.99  |5.99 |2023-01-01|2023-01-01|NULL|2023-01-01|
|O002    |C002       |2023-01-02   |P001      |Product A   |3,00    |10,99 |3,00  |3.00  |3.0  |10,99 |10.99 |10.99|2023-01-02|2023-01-02|NULL|2023-01-02|
|O002    |C002       |2023-01-02   |P003      |Product C   | 2      |7

### <span style="color:blue"> DeDuplication step </span>

In [11]:
key_cols = ["order_id", "product_id"]

# Count rows per key, keep only keys seen more than once, largest groups first.
key_counts = df_cast.groupBy(*key_cols).agg(F.count("*").alias("row_count"))
dups = key_counts.where("row_count > 1").orderBy(F.desc("row_count"))

print("Duplicate key groups:", dups.count())
dups.show(n=20, truncate=False)

Duplicate key groups: 0
+--------+----------+---------+
|order_id|product_id|row_count|
+--------+----------+---------+
+--------+----------+---------+



In [12]:
from customer_transaction_etl.steps import Deduplicate

In [13]:
# Row count going into the de-duplication step.
before = df_cast.count()
print("Rows before:", before)

# Apply the project's Deduplicate step on the (order_id, product_id) key.
dedup = Deduplicate(
    key_cols=["order_id","product_id"],
    strategy="prefer_latest",
)
df_dedup = dedup.transform(df_cast)

# Row count coming out, and how many rows the step dropped.
after = df_dedup.count()
print("Rows after:", after, "| Removed:", before - after)

Rows before: 6
Rows after: 6 | Removed: 0


In [14]:
# Sanity: ensure no duplicate keys remain
check = (df_dedup.groupBy("order_id","product_id")
         .agg(F.count("*").alias("row_count"))
         .where("row_count > 1"))
print("Remaining duplicate key groups:", check.count())

Remaining duplicate key groups: 0


### <span style="color:blue"> Add Line Amount  </span>
### <span style="color:silver"> Silver level Analysis </span>

In [15]:
from customer_transaction_etl.steps import WithLineAmount, WithCustomerTotalRevenue

In [16]:
# Run the project's WithLineAmount step on the de-duplicated lines.
line_amount_step = WithLineAmount("with_line_amount")
df_with_amt = line_amount_step.transform(df_dedup)

# Show the new column beside the columns it is expected to derive from,
# then the full schema.
amount_view = df_with_amt.select("order_id","product_id","quantity","price","line_amount")
amount_view.show()
df_with_amt.printSchema()

+--------+----------+--------+-----+-----------+
|order_id|product_id|quantity|price|line_amount|
+--------+----------+--------+-----+-----------+
|    O001|      P001|       2|10.99|      21.98|
|    O001|      P002|       1| 5.99|       5.99|
|    O002|      P001|       3|10.99|      32.97|
|    O002|      P003|       2| 7.99|      15.98|
|    O003|      P002|       1| 5.99|       5.99|
|    O003|      P003|       4| 7.99|      31.96|
+--------+----------+--------+-----+-----------+

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- purchase_date: date (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: decimal(10,2) (nullable = true)
 |-- line_amount: decimal(18,2) (nullable = true)



##### Window Total Revenue per customer

In [17]:
# Run the project's WithCustomerTotalRevenue step, then show each line amount
# next to the customer-level total.
customer_revenue_step = WithCustomerTotalRevenue()
df_with_cust_rev = customer_revenue_step.transform(df_with_amt)

revenue_view = df_with_cust_rev.select("customer_id","line_amount","customer_total_revenue")
revenue_view.show()


+-----------+-----------+----------------------+
|customer_id|line_amount|customer_total_revenue|
+-----------+-----------+----------------------+
|       C001|      21.98|                 27.97|
|       C001|       5.99|                 27.97|
|       C002|      32.97|                 48.95|
|       C002|      15.98|                 48.95|
|       C003|       5.99|                 37.95|
|       C003|      31.96|                 37.95|
+-----------+-----------+----------------------+



In [20]:
# Full silver-level frame (default display: up to 20 rows, truncated cells).
df_with_cust_rev.show(n=20, truncate=True)

+--------+-----------+-------------+----------+------------+--------+-----+-----------+----------------------+
|order_id|customer_id|purchase_date|product_id|product_name|quantity|price|line_amount|customer_total_revenue|
+--------+-----------+-------------+----------+------------+--------+-----+-----------+----------------------+
|    O001|       C001|   2023-01-01|      P001|   Product A|       2|10.99|      21.98|                 27.97|
|    O001|       C001|   2023-01-01|      P002|   Product B|       1| 5.99|       5.99|                 27.97|
|    O002|       C002|   2023-01-02|      P001|   Product A|       3|10.99|      32.97|                 48.95|
|    O002|       C002|   2023-01-02|      P003|   Product C|       2| 7.99|      15.98|                 48.95|
|    O003|       C003|   2023-01-03|      P002|   Product B|       1| 5.99|       5.99|                 37.95|
|    O003|       C003|   2023-01-03|      P003|   Product C|       4| 7.99|      31.96|                 37.95|
+

In [18]:
### Sanity

# A) Recompute customer totals via groupBy and compare to the window column
grp = (df_with_cust_rev
       .groupBy("customer_id")
       .agg(F.sum("line_amount").alias("grp_total")))

# One row per distinct (customer_id, customer_total_revenue) pair.
chk = (df_with_cust_rev
       .select("customer_id","customer_total_revenue")
       .dropDuplicates())
# Does group by match a distinct prune?
joined = chk.join(grp, "customer_id")
# Null-safe comparison: a plain `!=` evaluates to NULL when either side is NULL,
# and where() drops NULL predicates, so a missing total would go unreported.
# eqNullSafe treats NULL vs value as "not equal" and NULL vs NULL as "equal".
mismatches = joined.where(
    ~F.col("customer_total_revenue").eqNullSafe(F.col("grp_total"))
)
print("Customer total mismatches:", mismatches.count())

# B) Spot-check: sum of line_amount overall should equal sum of the grouped totals
total_lines = df_with_cust_rev.agg(F.sum("line_amount").alias("s")).first()["s"]
total_grouped = grp.agg(F.sum("grp_total").alias("s")).first()["s"]
print("Sum(line_amount) overall:", total_lines, "| Sum of per-customer totals:", total_grouped)

# C) Sanity: no nulls in critical numeric fields before/after
print("Null line_amount rows:", df_with_cust_rev.where(F.col("line_amount").isNull()).count())
print("Null customer_total_revenue rows:", df_with_cust_rev.where(F.col("customer_total_revenue").isNull()).count())


Customer total mismatches: 0
Sum(line_amount) overall: 114.87 | Sum of per-customer totals: 114.87
Null line_amount rows: 0
Null customer_total_revenue rows: 0


### <span style="color:orange">Gold</span> <span style="color:blue">level Analysis</span>

- Per-product revenue

- Top-selling products by revenue

- Average Order Value (AOV)

In [19]:
# Import the aggregate helpers by name rather than with `*`, so it is clear
# where each function used in the gold-level cells comes from.
from customer_transaction_etl.aggregates import (
    average_order_value,
    product_revenue,
    top_products_by_revenue,
)

In [20]:
# The gold-level aggregates below all read from the enriched line-level frame.
df_analysis = df_with_cust_rev

In [21]:
# 1) Per-product revenue (overall)
prod_rev = product_revenue(df_analysis, by="overall")
prod_rev.show(10, truncate=False)

+----------+------------+-------+--------------+-----------+
|product_id|product_name|revenue|total_quantity|order_count|
+----------+------------+-------+--------------+-----------+
|P001      |Product A   |54.95  |5             |2          |
|P003      |Product C   |47.94  |6             |2          |
|P002      |Product B   |11.98  |2             |2          |
+----------+------------+-------+--------------+-----------+



In [22]:
# 1.b: per-day product revenue
prod_rev_daily = product_revenue(df_analysis, by="daily")
prod_rev_daily.orderBy("purchase_date", F.desc("revenue")).show(10, truncate=False)

+-------------+----------+------------+-------+--------------+-----------+
|purchase_date|product_id|product_name|revenue|total_quantity|order_count|
+-------------+----------+------------+-------+--------------+-----------+
|2023-01-01   |P001      |Product A   |21.98  |2             |1          |
|2023-01-01   |P002      |Product B   |5.99   |1             |1          |
|2023-01-02   |P001      |Product A   |32.97  |3             |1          |
|2023-01-02   |P003      |Product C   |15.98  |2             |1          |
|2023-01-03   |P003      |Product C   |31.96  |4             |1          |
|2023-01-03   |P002      |Product B   |5.99   |1             |1          |
+-------------+----------+------------+-------+--------------+-----------+



In [23]:
# 2) Top-N products by revenue
topN = top_products_by_revenue(df_analysis, n=5, by="overall")
topN.show(truncate=False)

25/09/28 18:17:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 18:17:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 18:17:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 18:17:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 18:17:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 18:17:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 1

+----------+------------+-------+--------------+-----------+----+
|product_id|product_name|revenue|total_quantity|order_count|rank|
+----------+------------+-------+--------------+-----------+----+
|P001      |Product A   |54.95  |5             |2          |1   |
|P003      |Product C   |47.94  |6             |2          |2   |
|P002      |Product B   |11.98  |2             |2          |3   |
+----------+------------+-------+--------------+-----------+----+



25/09/28 18:17:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 18:17:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 18:17:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [27]:
# 3) Average Order Value
aov = average_order_value(df_analysis)
aov.show(truncate=False)

+-------------+-----------+-------------------+
|total_revenue|order_count|average_order_value|
+-------------+-----------+-------------------+
|114.87       |3          |38.29              |
+-------------+-----------+-------------------+



## <span style="color:Green"> Write Outputs </span>

In [39]:
from pathlib import Path, PurePath

# Scratch check of how pathlib represents a bare relative segment.
PurePath("silver")

PurePosixPath('silver')

In [46]:
from customer_transaction_etl.config import Paths

In [47]:
# Show the gold location configured in the project's Paths.
# NOTE(review): the value recorded for this cell is the relative "./data/gold",
# while the write cells in this notebook use absolute /home/iceberg/data/...
# paths — confirm which location is intended.
print(Paths.gold)

./data/gold


### Write Silver level

In [42]:
# Persist the silver order lines, one folder per purchase_date.
# mode("overwrite") alone uses Spark's default *static* behaviour, which clears
# the whole target directory first. The dynamic option below limits the
# overwrite to the purchase_date partitions present in this DataFrame, which is
# what "overwrite-by-partition" requires.
(df_with_cust_rev
 .write
 .mode("overwrite")
 .option("partitionOverwriteMode", "dynamic")  # overwrite-by-partition
 .partitionBy("purchase_date")
 .parquet("/home/iceberg/data/silver/customer_data/order_lines"))


### Write Gold level

In [43]:
from datetime import date

# ISO-8601 (YYYY-MM-DD) string for the day this notebook is run.
run_day = date.today()
snapshot_date = run_day.isoformat()


In [45]:
# Daily Product Revenue
(prod_rev_daily
 .write
 .mode("overwrite")
 .partitionBy("purchase_date")
 .parquet("/home/iceberg/data/gold/customer_data/product_revenue"))


In [49]:
# Top N (overall) products by Revenue
(topN
 .withColumn("snapshot_date", F.lit(snapshot_date))
 .write
 .mode("overwrite")
 .partitionBy("snapshot_date")
 .parquet("/home/iceberg/data/gold/customer_data/top_products"))


25/09/28 19:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 19:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 19:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 19:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 19:04:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 19:04:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/09/28 1

In [50]:
# Average Order Value as a snapshot table
(aov
 .withColumn("snapshot_date", F.lit(snapshot_date))
 .write
 .mode("overwrite")
 .partitionBy("snapshot_date")
 .parquet("/home/iceberg/data/gold/customer_data/order_kpis"))