In [1]:
# Execute utils.ipynb inside this kernel so the names it defines are available to the cells below.
# NOTE(review): get_spark (called in the next cell) is not defined in any visible cell of this
# notebook -- presumably it comes from utils.ipynb; confirm.
%run utils.ipynb

In [3]:
# Build the Spark session through the get_spark helper, passing the catalog
# name and the storage host as keyword arguments.
# NOTE(review): how get_spark interprets `catalog` / `storage` is not visible
# here -- check the helper's definition.
spark_settings = {
    "catalog": "iceberg",
    "storage": "lakehouse.io",
}
spark = get_spark(**spark_settings)

# Last expression of the cell: display the session object.
spark

In [11]:
# Ask the Spark catalog whether the orders table is already registered;
# the boolean result is displayed as the cell output.
spark.catalog.tableExists(tableName="iceberg.ecommerce.orders")

True

In [12]:
# Drop the orders table if it is present so the notebook can start from a
# clean slate; `if exists` keeps the statement from failing when it is absent.
drop_orders_sql = "drop table if exists iceberg.ecommerce.orders"
spark.sql(drop_orders_sql)

DataFrame[]

In [13]:
# Sanity check: confirm the source DataFrame holds at least one row before it is written below.
# NOTE(review): `df` is not assigned in any cell visible above this one -- presumably it is created
# by utils.ipynb or by a cell that has since been removed. The execution counts are also out of
# order, so confirm the notebook still works under Restart Kernel -> Run All.
df.isEmpty()

False

In [14]:
from pyspark.sql.functions import year, month, dayofmonth

# Derive calendar parts from the purchase timestamp. `year` and `month` are the
# partition columns of the Iceberg table written below; `day` is kept as a
# plain column.
# The chain is parenthesised instead of using backslash continuations: the
# original ended with a dangling `\` that only worked because the next line
# happened to be blank.
# NOTE(review): `df` is reassigned in place and is not created in any visible
# cell; later cells (e.g. the JDBC write) read this enriched `df`.
df = (
    df.withColumn("year", year(df["order_purchase_timestamp"]))
    .withColumn("month", month(df["order_purchase_timestamp"]))
    .withColumn("day", dayofmonth(df["order_purchase_timestamp"]))
)

# Create (or atomically replace) the Iceberg table from the DataFrame.
# `write.parquet.compression-codec` is an Iceberg *table property*, so it is
# set with tableProperty(); passed through option() it is treated as a
# per-write option, where Iceberg expects the key `compression-codec` instead.
(
    df.writeTo("iceberg.ecommerce.orders")
    .using("iceberg")
    .partitionedBy("year", "month")
    .tableProperty("write.parquet.compression-codec", "snappy")
    .createOrReplace()
)

# Expose the same DataFrame to SQL as the MERGE source.
df.createOrReplaceTempView("orders")

# Upsert by order_id: refresh status/approval/delivery columns for orders that
# already exist, insert the rest.
# NOTE(review): the target was just replaced with this very DataFrame, so on
# this run every source row is expected to match and be rewritten with the
# same values -- the MERGE only becomes meaningful when `df` holds a newer
# batch than the table. Confirm whether createOrReplace above is intended to
# run every time.
spark.sql("""
    MERGE INTO iceberg.ecommerce.orders AS target
    USING orders AS source
    ON target.order_id = source.order_id
    WHEN MATCHED THEN UPDATE SET
        target.order_status = source.order_status,
        target.order_approved_at = source.order_approved_at,
        target.order_delivered_carrier_date = source.order_delivered_carrier_date,
        target.order_delivered_customer_date = source.order_delivered_customer_date
    WHEN NOT MATCHED THEN INSERT *
""")

DataFrame[]

In [15]:
# Read the Iceberg orders table back through Spark SQL and print its first
# five rows as a quick visual check of the write above.
orders_preview = spark.sql("select * from iceberg.ecommerce.orders")
orders_preview.show(n=5)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+----+-----+---+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|year|month|day|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+----+-----+---+
|09f58c00f941827ab...|7c94da97db6fe83e1...|   delivered|     2017-01-05 16:05:07|2017-01-07 03:35:34|         2017-01-11 15:47:40|          2017-01-16 15:43:31|          2017-02-13 00:00:00|2017|    1|  5|
|0bda8164c1a12b6a3...|3f402674c608ea670...|   delivered|     2017-01-05 13:36:07|2017-01-07 03:45:47|         2017-01-11 16:09:00|          2017-01-16 17:27:34|          2017-0

In [77]:
import os

# JDBC connection settings for the Postgres warehouse.
# Credentials and URL can be overridden through environment variables so real
# secrets never have to be committed with the notebook; the fallbacks are the
# values that were previously hard-coded, so behaviour is unchanged when the
# variables are unset.
# NOTE(review): the fallback user/password are still literals in the file --
# treat them as local-development defaults only.
properties = {
    "user": os.environ.get("WAREHOUSE_USER", "admin"),
    "password": os.environ.get("WAREHOUSE_PASSWORD", "password"),
    "driver": "org.postgresql.Driver",
}

url = os.environ.get(
    "WAREHOUSE_JDBC_URL",
    "jdbc:postgresql://warehouse.io:5432/ecommerce",
)

# Append the current contents of `df` to the `orders` table.
# NOTE(review): mode="append" is not idempotent -- re-running this cell
# inserts the same rows again.
df.write.jdbc(url=url, table="orders", mode="append", properties=properties)

In [31]:
# Show the column list of the Iceberg orders table together with its
# partition information.
orders_schema = spark.sql("describe iceberg.ecommerce.orders")
orders_schema.show()

+--------------------+-------------+-------+
|            col_name|    data_type|comment|
+--------------------+-------------+-------+
|            order_id|       string|   NULL|
|         customer_id|       string|   NULL|
|        order_status|       string|   NULL|
|order_purchase_ti...|timestamp_ntz|   NULL|
|   order_approved_at|timestamp_ntz|   NULL|
|order_delivered_c...|timestamp_ntz|   NULL|
|order_delivered_c...|timestamp_ntz|   NULL|
|order_estimated_d...|timestamp_ntz|   NULL|
|                year|          int|   NULL|
|               month|          int|   NULL|
|                 day|          int|   NULL|
|# Partition Infor...|             |       |
|          # col_name|    data_type|comment|
|                year|          int|   NULL|
|               month|          int|   NULL|
+--------------------+-------------+-------+



In [10]:
import os

# PyIceberg catalog configuration: catalog URI, S3 endpoint and S3 credentials,
# plus the FileIO implementation to use.
# The S3 credentials can be overridden through environment variables; the
# fallbacks are the values that were previously hard-coded, so behaviour is
# unchanged when the variables are unset.
# NOTE(review): `load_catalog` and `pl` are not imported in any visible cell --
# presumably they are provided by utils.ipynb; confirm.
properties = {
    "uri": "http://iceberg.io",
    "s3.endpoint": "http://lakehouse.io",
    "s3.access-key-id": os.environ.get("LAKEHOUSE_ACCESS_KEY_ID", "admin"),
    "s3.secret-access-key": os.environ.get("LAKEHOUSE_SECRET_ACCESS_KEY", "password"),
    "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
}

catalog = load_catalog("iceberg", **properties)
table = catalog.load_table("ecommerce.orders")

# Scan the whole table and hand the Arrow result straight to Polars. This
# avoids materialising an intermediate pandas DataFrame just to convert it
# again. The resulting Polars frame is the cell's displayed output.
pl.from_arrow(table.scan().to_arrow())

order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
str,str,str,datetime[μs],datetime[μs],datetime[μs],datetime[μs],datetime[μs]
"""1b694ef5b28d3e949a4f0ffeb2c9fc…","""cba12036a88407a37770dc8ab725de…","""delivered""",2017-01-05 13:43:17,2017-01-07 03:35:38,2017-01-11 16:08:55,2017-01-16 16:54:04,2017-02-13 00:00:00
"""17fed53ba6dfef9b594ee2268642e2…","""c0352e94059e3e5a714c9ad0c8306a…","""delivered""",2017-01-05 14:50:54,2017-01-07 03:35:35,2017-01-11 15:59:08,2017-01-16 15:24:03,2017-02-13 00:00:00
"""ca5a215980675471f0cf8199c04190…","""588047d7101d88c333691e47659d70…","""delivered""",2017-01-05 14:23:54,2017-01-07 03:44:16,2017-01-11 15:37:55,2017-01-17 15:14:20,2017-02-01 00:00:00
"""ce86fa5a5108884726a2244bcae51a…","""8fa33a3159dfc303b8aeccf859b9be…","""delivered""",2017-01-05 13:29:03,2017-01-07 03:45:23,2017-01-11 15:35:54,2017-01-16 16:05:21,2017-02-01 00:00:00
"""40599d3d28b75746952ded75566637…","""efdf4a7c78d7c364046efb69035d1d…","""delivered""",2017-01-05 13:01:48,2017-01-07 03:45:49,2017-01-11 16:08:53,2017-01-16 15:43:21,2017-02-13 00:00:00
…,…,…,…,…,…,…,…
"""38bcb524e1c38c2c1b60600a80fc89…","""d2c63ad286e3ca9dd69218008d61ff…","""delivered""",2017-01-05 12:06:36,2017-01-07 03:45:22,2017-01-11 15:35:54,2017-01-16 16:05:22,2017-02-13 00:00:00
"""205d7052a6505124d200f6fea6b423…","""d020d4abe6475a8382f53d763fe24a…","""delivered""",2017-01-05 13:59:30,2017-01-07 03:44:23,2017-01-11 16:09:00,2017-01-17 17:52:31,2017-02-13 00:00:00
"""ec7a019261fce44180373d45b442d7…","""c24fc5f9a446b4d8262041b9c64de7…","""delivered""",2017-01-05 11:56:06,2017-01-05 12:10:17,2017-01-06 12:43:41,2017-01-11 13:14:05,2017-02-01 00:00:00
"""c96209cd1d43d071d3bdf48d299b7a…","""06cd112a475fd4d834eff329681470…","""delivered""",2017-01-05 15:37:58,2017-01-07 03:45:27,2017-01-11 15:47:40,2017-01-16 15:18:08,2017-02-13 00:00:00


In [120]:
# Submit tasks/sync_lakehouse.py to the Kubernetes cluster in cluster deploy mode:
#   - application name "ecommerce-orders", namespace "spark", a single executor
#   - container image registry.io/spark, pulled on every launch (pullPolicy=Always)
#   - the script is read from inside the image (local:// path) and receives
#     --table-name orders --date-cursor 2017-01-05
# NOTE(review): the image reference carries no tag, so the version that runs is whatever the
# registry currently serves for it -- consider pinning a tag.
!spark-submit \
    --master k8s://https://kubernetes.default.svc.cluster.local:443 \
    --deploy-mode cluster \
    --name ecommerce-orders \
    --conf spark.executor.instances=1 \
    --conf spark.kubernetes.namespace=spark \
    --conf spark.kubernetes.container.image.pullPolicy=Always \
    --conf spark.kubernetes.container.image=registry.io/spark \
    local:///opt/spark/work-dir/tasks/sync_lakehouse.py --table-name orders --date-cursor 2017-01-05

24/07/25 03:47:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/07/25 03:47:20 INFO SparkKubernetesClientFactory: Auto-configuring K8S client using current context from users K8S config file
24/07/25 03:47:20 INFO KerberosConfDriverFeatureStep: You have not specified a krb5.conf file locally or via a ConfigMap. Make sure that you have the krb5.conf locally on the driver image.
24/07/25 03:47:21 INFO LoggingPodStatusWatcherImpl: State changed, new state: 
	 pod name: ecommerce-orders-a182d490e8001ebe-driver
	 namespace: spark
	 labels: spark-app-name -> ecommerce-orders, spark-app-selector -> spark-2c21ba7ca31047be8ea17d6b35dc613b, spark-role -> driver, spark-version -> 3.5.1
	 pod uid: 88be27ad-73df-4ba5-be13-9d78177175a8
	 creation time: 2024-07-25T03:47:21Z
	 service account name: default
	 volumes: spark-local-dir-1, spark-conf-volume-driver, kube-api-access-fhwqm
	 node name: master
	 start time: 2024