In [4]:
# import polars

import os

import polars as pl

In [19]:
# Open the Iceberg REST catalog and load a handle to the bank_transfers table;
# the handle exposes the table's storage location and metadata used further down.

from pyiceberg.catalog.rest import RestCatalog


catalog_properties = {"uri": "http://nessie:19120/iceberg/main/"}
catalog = RestCatalog("iceberg", **catalog_properties)
table = catalog.load_table("default.bank_transfers")
meta = table.metadata

In [20]:
# Scan the table through its current metadata file. scan_iceberg returns a
# LazyFrame, so no data is read until .collect() is called.

# Kept for reference; the scan below uses the metadata file location instead.
table_path = table.location()

# Connection settings for the S3-compatible object store. Each value can be
# overridden with an environment variable so that real credentials never have
# to be written into the notebook. The fallbacks are the original demo values,
# so the notebook behaves exactly as before when nothing is set.
storage_options = {
    "s3.endpoint": os.environ.get("ICEBERG_S3_ENDPOINT", "http://minio:9000"),
    "s3.region": os.environ.get("ICEBERG_S3_REGION", "eu-central-1"),
    "s3.access-key-id": os.environ.get("ICEBERG_S3_ACCESS_KEY_ID", "minioadmin"),
    "s3.secret-access-key": os.environ.get("ICEBERG_S3_SECRET_ACCESS_KEY", "minioadmin"),
}

bank_transfers = pl.scan_iceberg(table.metadata_location, storage_options=storage_options)

In [21]:
# Query the scan with SQL: the table name in the statement is the Python
# variable `bank_transfers` created in the previous cell.

select_all_query = """
    SELECT * FROM bank_transfers
    """
pl.sql(select_all_query).collect()

id,amount,transferred_from,transferred_to,timestamp
i64,i32,str,str,"datetime[μs, UTC]"
1,12000,"""ACME INC""","""ASTROCORP""",2022-11-14 00:55:00 UTC
2,24000,"""John Doe""","""Jane Doe""",2022-11-15 02:11:00 UTC
3,500,"""Deborah S.""","""Michael C.""",2022-11-17 16:25:07 UTC


In [22]:
# Run SQL directly on the frame; inside the statement the frame is called `self`.

self_query = """
    SELECT * FROM self
    """
bank_transfers.sql(self_query).collect()

id,amount,transferred_from,transferred_to,timestamp
i64,i32,str,str,"datetime[μs, UTC]"
1,12000,"""ACME INC""","""ASTROCORP""",2022-11-14 00:55:00 UTC
2,24000,"""John Doe""","""Jane Doe""",2022-11-15 02:11:00 UTC
3,500,"""Deborah S.""","""Michael C.""",2022-11-17 16:25:07 UTC


In [23]:
# The same kind of question asked through the expression API instead of SQL:
# keep only the transfers sent by one particular account holder.

sender_matches = pl.col('transferred_from').eq("Charles B.")
bank_transfers.filter(sender_matches).collect()

id,amount,transferred_from,transferred_to,timestamp
i64,i32,str,str,"datetime[μs, UTC]"


In [24]:
# Aggregate per recipient: how many transfers each one received and their sum.

per_recipient_query = """
    SELECT
        transferred_to,
        COUNT(*) AS count,
        SUM(amount) AS total_transfers
    FROM bank_transfers
    GROUP BY transferred_to
    """
pl.sql(per_recipient_query).collect()

transferred_to,count,total_transfers
str,u32,i32
"""ASTROCORP""",1,12000
"""Michael C.""",1,500
"""Jane Doe""",1,24000


In [25]:
# Build the aggregate query again but do not collect it, then inspect the type
# of the returned object to check that nothing has been evaluated yet.

lazy_totals_query = """
    SELECT
        transferred_to,
        COUNT(*) AS count,
        SUM(amount) AS total_transfers
    FROM bank_transfers
    GROUP BY transferred_to
    """
df = pl.sql(lazy_totals_query)
type(df)

polars.lazyframe.frame.LazyFrame

In [None]:
# What does it contain, then? A query plan rather than data.
# explain() returns the plan as a multi-line string, so it is printed: as a
# bare last expression it would be shown as a repr with escaped "\n" sequences.

print(df.explain())

In [None]:
# Execute the lazy query now and display the resulting frame.

transfer_totals = df.collect()
transfer_totals

In [None]:
# Time travel, step 1: list the entries of the table's metadata log. Each entry
# carries a `metadata_file` reference, which the next cell scans.

table_metadata = table.metadata
table_log = table_metadata.metadata_log
table_log

In [None]:
# Time travel, step 2: scan the table as described by one entry of the metadata
# log instead of the current metadata file.

# Position in the metadata log to read. It must be smaller than len(table_log),
# otherwise the lookup below raises IndexError.
log_index = 4
historical_entry = table_log[log_index]

# Pass the same storage options as the scan of the current metadata file, so
# this read uses the same endpoint and credentials.
pl.scan_iceberg(
    historical_entry.metadata_file,
    storage_options=storage_options,
).collect()