In [None]:
# define Spark client

from pyspark.sql import SparkSession


# Build (or reuse) a SparkSession that exposes an Iceberg catalog named
# "iceberg". The catalog is configured as type "hive" and points at the
# thrift metastore URI below; catalog caching is switched off and the Iceberg
# SQL extensions are registered.
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", "/usr/local/hadoop/warehouse")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hivemetastore:9083")
    .config("spark.sql.catalog.iceberg.cache-enabled", False)
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    .getOrCreate()
)

# Set the Spark log level to ERROR.
spark.sparkContext.setLogLevel("ERROR")

In [None]:
# List the table's snapshots (id and commit time), ordered by commit time
# ascending, without truncating column values in the printed output.
snapshots_sql = """
SELECT snapshot_id, committed_at
FROM iceberg.default.bank_transfers.snapshots
ORDER BY committed_at           
"""
spark.sql(snapshots_sql).show(truncate=False)

In [None]:
# Time travel by snapshot id: read the table as of one specific snapshot.
# The id is hard-coded in the query, so it has to exist in the table's
# snapshot listing for this cell to run.
snapshot_by_id_sql = """
SELECT *
FROM iceberg.default.bank_transfers
VERSION AS OF 4618106655617205880 
"""
df_1 = spark.sql(snapshot_by_id_sql)

# Print the rows of that snapshot without truncating column values.
df_1.show(truncate=False)

In [None]:
# Time travel by timestamp: read the table as of the hard-coded point in time
# given in the query.
snapshot_by_time_sql = """
SELECT *
FROM iceberg.default.bank_transfers
TIMESTAMP AS OF '2023-04-04 19:50:46.929'
"""
df_2 = spark.sql(snapshot_by_time_sql)

# Print the rows of that snapshot without truncating column values.
df_2.show(truncate=False)

In [None]:
# Difference between the two time-travel reads: rows of df_2 (timestamp-based
# read) that do not appear in df_1 (snapshot-id-based read).
(
    df_2
    .subtract(df_1)
    .show()
)

In [None]:
# Show every column of the table's `partitions` metadata table.
partitions_sql = """
SELECT *
FROM iceberg.default.bank_transfers.partitions
"""
spark.sql(partitions_sql).show(truncate=False)

In [None]:
# Extend the partition spec with a bucket(1000, amount) field, then insert one
# more row; the partitions are listed in the following cell.
add_partition_field_sql = """
ALTER TABLE iceberg.default.bank_transfers
ADD PARTITION FIELD bucket(1000, amount)
"""

insert_transfer_sql = """
INSERT INTO iceberg.default.bank_transfers VALUES
    (5, 2500, "Charles B.", "GlobCorp Inc.", TIMESTAMP"2022-12-03T11:00:02")
"""

# Statement order matters: change the spec first, insert afterwards.
spark.sql(add_partition_field_sql)
spark.sql(insert_transfer_sql)

In [None]:
# Show the `partitions` metadata table once more, after the partition-spec
# change and the extra insert.
partitions_sql = """
SELECT *
FROM iceberg.default.bank_transfers.partitions
"""
spark.sql(partitions_sql).show(truncate=False)

In [None]:
# Schema evolution: add a string column named `comment`, then insert a row
# that supplies a value for it.
add_comment_column_sql = """
ALTER TABLE iceberg.default.bank_transfers
ADD COLUMNS (
    comment string comment 'additional information about a transaction'
)
"""

insert_commented_transfer_sql = """
INSERT INTO iceberg.default.bank_transfers VALUES
    (6, 2200, "Ian S.", "GlobCorp Inc.", TIMESTAMP"2022-12-04T21:01:02", "flag this transaction please")
"""

# The column must exist before the six-value row can be inserted.
spark.sql(add_comment_column_sql)
spark.sql(insert_commented_transfer_sql)

In [None]:
# Print the table as of the hard-coded snapshot id first, then the table in
# its current state, so the two outputs can be compared.
earlier_snapshot_sql = """
SELECT *
FROM iceberg.default.bank_transfers
VERSION AS OF 4618106655617205880 
"""
spark.sql(earlier_snapshot_sql).show(truncate=False)

current_table_sql = """
SELECT *
FROM iceberg.default.bank_transfers
"""
spark.sql(current_table_sql).show(truncate=False)

In [None]:
# Drop the `comment` column and immediately add a column with the same name,
# type and description again.
# NOTE(review): presumably meant to show how previously written `comment`
# values behave after a drop/re-add; confirm against the next cell's output.
drop_comment_column_sql = """
ALTER TABLE iceberg.default.bank_transfers
DROP COLUMN comment
"""

readd_comment_column_sql = """
ALTER TABLE iceberg.default.bank_transfers
ADD COLUMNS (
    comment string comment 'additional information about a transaction'
)
"""

spark.sql(drop_comment_column_sql)
spark.sql(readd_comment_column_sql)

In [None]:
# Print the full current contents of the table without truncating values.
current_table_sql = """
SELECT *
FROM iceberg.default.bank_transfers
"""
spark.sql(current_table_sql).show(truncate=False)

In [None]:
# List the snapshots (id and commit time) once more, ordered by commit time
# ascending.
snapshots_sql = """
SELECT snapshot_id, committed_at
FROM iceberg.default.bank_transfers.snapshots
ORDER BY committed_at           
"""
spark.sql(snapshots_sql).show(truncate=False)

In [None]:
# is the data in the last snapshot when called explicitly?
#
# Reads the table as of the hard-coded snapshot id and prints the rows.
# DataFrame.show() only prints and returns None, so its result is deliberately
# not bound to a name: assigning it to `df_1` would overwrite the DataFrame
# created earlier in the notebook with None and break any later re-use of it.
spark.sql("""
SELECT *
FROM iceberg.default.bank_transfers
VERSION AS OF 2493344975659103448  
""").show(truncate=False)

In [None]:
# what is going on with the metadata
#
# Prints the timestamp and latest snapshot id of every entry in the table's
# `metadata_log_entries` metadata table, ordered by timestamp.
# DataFrame.show() only prints and returns None, so its result is deliberately
# not bound to a name: assigning it to `df_1` would overwrite the DataFrame
# created earlier in the notebook with None.
spark.sql("""
SELECT timestamp, latest_snapshot_id
FROM iceberg.default.bank_transfers.metadata_log_entries
ORDER BY timestamp
""").show(truncate=False)