In [3]:
# Create the Spark session shared by every cell below.
# It registers a catalog named "iceberg" (Iceberg SparkCatalog, type "hive")
# that talks to the metastore at thrift://hivemetastore:9083, switches that
# catalog's cache off, and loads the Iceberg SQL extensions.

from pyspark.sql import SparkSession


spark = (
    SparkSession.builder
    # warehouse location
    .config("spark.sql.warehouse.dir", "/usr/local/hadoop/warehouse")
    # "iceberg" catalog definition
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hivemetastore:9083")
    .config("spark.sql.catalog.iceberg.cache-enabled", False)
    # Iceberg SQL extensions
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)

# Only log errors so the notebook output stays readable.
spark.sparkContext.setLogLevel('ERROR')

In [4]:
# List every snapshot of bank_transfers (id and commit time), oldest first,
# by reading the table's `snapshots` metadata table.

snapshots_df = spark.sql("""
SELECT snapshot_id, committed_at FROM iceberg.default.bank_transfers.snapshots
ORDER BY committed_at           
""")

# truncate=False keeps the long ids and timestamps fully visible.
snapshots_df.show(truncate=False)

+-------------------+-----------------------+
|snapshot_id        |committed_at           |
+-------------------+-----------------------+
|171325975829668396 |2023-03-13 19:23:29.323|
|9048732889309365881|2023-03-13 19:23:31.272|
|2948771710917491439|2023-03-13 19:23:36.083|
+-------------------+-----------------------+



In [None]:
# query a snapshot using its id
#
# The id is looked up in the `snapshots` metadata table instead of being pasted
# in by hand: a hard-coded id that is not in the table's snapshot list makes
# VERSION AS OF fail. The oldest snapshot is used so that it can be compared
# with a later state of the table.

oldest_snapshot = spark.sql("""
SELECT snapshot_id FROM iceberg.default.bank_transfers.snapshots
ORDER BY committed_at
LIMIT 1
""").first()

# first() returns None when the metadata table is empty.
assert oldest_snapshot is not None, "iceberg.default.bank_transfers has no snapshots"
oldest_snapshot_id = oldest_snapshot["snapshot_id"]

df_1 = spark.sql(f"""
SELECT * FROM iceberg.default.bank_transfers VERSION AS OF {oldest_snapshot_id}
""")

df_1.show(truncate=False)

In [None]:
# query a snapshot using its creation date
#
# The timestamp is read from the `snapshots` metadata table instead of being
# pasted in by hand: TIMESTAMP AS OF fails when the given time is earlier than
# the table's first commit. The newest commit time is used here.

# CAST(... AS STRING) formats the time in SQL, so the literal interpolated
# below is rendered and parsed by the same Spark session.
latest_committed_at = spark.sql("""
SELECT CAST(MAX(committed_at) AS STRING) AS latest_committed_at
FROM iceberg.default.bank_transfers.snapshots
""").first()["latest_committed_at"]

# MAX() over an empty metadata table yields NULL.
assert latest_committed_at is not None, "iceberg.default.bank_transfers has no snapshots"

df_2 = spark.sql(f"""
SELECT * FROM iceberg.default.bank_transfers TIMESTAMP AS OF '{latest_committed_at}'
""")

df_2.show(truncate=False)

In [None]:
# Compare the two time-travel results: keep the rows of df_2 that do not
# appear in df_1 and print them.

snapshot_diff = df_2.subtract(df_1)
snapshot_diff.show()

In [5]:
# Partition evolution demo: add a partition field, write one more row, then
# inspect the table's partitions.
# NOTE(review): this cell changes the table. Each run appends another row, and
# re-running the ALTER TABLE presumably fails once the field exists (confirm).

# 1. Add an hours(timestamp) field to the table's partition spec.
spark.sql("""
ALTER TABLE iceberg.default.bank_transfers ADD PARTITION FIELD hours(timestamp)
""")

# 2. Append a single row after the partition spec change.
spark.sql("""
INSERT INTO iceberg.default.bank_transfers VALUES
    (5, 2500, "Charles B.", "GlobCorp Inc.", TIMESTAMP"2022-12-03T11:00:02")
""")

# 3. List the partitions. In the recorded output the earlier rows keep
#    spec_id 0 with null for the new field, and the new row is under spec_id 1.
partitions_df = spark.sql("""
SELECT * FROM iceberg.default.bank_transfers.partitions
""")
partitions_df.show(truncate=False)

+--------------------+------------+----------+-------+
|partition           |record_count|file_count|spec_id|
+--------------------+------------+----------+-------+
|{2022-12-01, null}  |1           |1         |0      |
|{2022-11-15, null}  |1           |1         |0      |
|{2022-12-03, 463907}|1           |1         |1      |
|{2022-11-17, null}  |1           |1         |0      |
+--------------------+------------+----------+-------+



In [None]:
# TODO:
#     - expand partitioning scheme and insert more data, then list partitions
#       (already covered by the previous cell)
#     - expand schema
#     - delete a column and recreate it to show that this won't "bring back" old data