In [1]:
pip install pyiceberg[sql-sqlite]

Collecting pyiceberg[sql-sqlite]
  Downloading pyiceberg-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting mmh3<6.0.0,>=4.0.0 (from pyiceberg[sql-sqlite])
  Downloading mmh3-5.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting sortedcontainers==2.4.0 (from pyiceberg[sql-sqlite])
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting strictyaml<2.0.0,>=1.7.0 (from pyiceberg[sql-sqlite])
  Downloading strictyaml-1.7.3-py3-none-any.whl.metadata (11 kB)
Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Downloading mmh3-5.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading strictyaml-1.7.3-py3-none-any.whl (123 kB)
[2K   [90m━━━━━

In [None]:
# Mount Google Drive at /content/drive (Google Colab only). Later cells read
# the source Parquet file from, and keep the Iceberg catalog/warehouse under,
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyiceberg.catalog.sql import SqlCatalog

import os

# Folder on the mounted Drive that holds both the SQLite catalog database and
# the Iceberg warehouse files.
warehouse_path = "/content/drive/MyDrive/Lecture2Iceberg/"

# SQLite creates the database file on demand but not its parent directory, so
# make sure the folder exists before the catalog connects.
os.makedirs(warehouse_path, exist_ok=True)

# Drop the trailing slash before building URIs so they do not contain "//".
warehouse_root = warehouse_path.rstrip("/")

# SQL catalog named "default":
#   uri       -> SQLite file that stores the catalog's bookkeeping tables
#   warehouse -> root location under which table data/metadata are written
catalog = SqlCatalog(
    "default",
    **{
        "uri": f"sqlite:///{warehouse_root}/pyiceberg_catalog.db",
        "warehouse": f"file://{warehouse_root}",
    },
)

In [None]:
import pyarrow.parquet as pq

# Location of the source Parquet file on the mounted Drive.
parquet_file = "/content/drive/MyDrive/Lecture2/financial.parquet"

# Load the file into an in-memory pyarrow Table.
df = pq.read_table(parquet_file)

# Bare expression: the notebook displays the table's schema and a value preview.
df

pyarrow.Table
Transaction ID: int64
Account ID: string
Transaction Amount: double
Transaction Type: string
Merchant/Counterparty: string
Location: string
Date and Time: string
----
Transaction ID: [[1000000000,1000000001,1000000002,1000000003,1000000004,...,1000009995,1000009996,1000009997,1000009998,1000009999]]
Account ID: [["ACC456789","ACC567890","ACC345678","ACC567890","ACC567890",...,"ACC234567","ACC345678","ACC234567","ACC456789","ACC345678"]]
Transaction Amount: [[86.8,1758.97,636.72,266.67,942.24,...,1770.98,1512.92,653.26,1876.98,162.44]]
Transaction Type: [["Deposit","Withdrawal","Purchase","Deposit","Deposit",...,"Deposit","Purchase","Trade","Withdrawal","Deposit"]]
Merchant/Counterparty: [["Employer","ABC Store","XYZ Electronics","Employer","ATM",...,"ABC Store","Supermarket","Employer","Online Store","Supermarket"]]
Location: [["Los Angeles, CA","Chicago, IL","New York, NY","New York, NY","Chicago, IL",...,"Miami, FL","New York, NY","Los Angeles, CA","Online","Chicago, IL

In [None]:
# The catalog database persists on Drive between sessions, so a plain
# create_namespace() raises NamespaceAlreadyExistsError on every run after the
# first. The *_if_not_exists variant keeps the cell safe to re-run.
catalog.create_namespace_if_not_exists("default")

In [None]:
# Create the Iceberg table from the Arrow schema, or load it if an earlier run
# already created it (the catalog persists on Drive, so a plain create_table()
# raises TableAlreadyExistsError on re-run).
# NOTE: when the table already exists, `schema` is ignored and the stored
# table - including any data and snapshots from earlier runs - is returned.
table = catalog.create_table_if_not_exists(
    "default.financial",
    schema=df.schema,
)

In [None]:
# Write the Arrow table's rows into the Iceberg table; this is recorded as an
# APPEND snapshot. Not idempotent: running the cell again appends the rows again.
table.append(df)

Now let's look at Iceberg's schema evolution capability: we add a new column to the data and evolve the table schema to match.

In [None]:
import pyarrow.compute as pc

# Add a derived column with the transaction amount expressed in cents.
# `df = df.append_column(...)` is self-referential, and pyarrow allows
# duplicate column names, so re-running the cell unguarded would append a
# second "amount_in_cents" column. Only add it when it is not there yet.
if "amount_in_cents" not in df.column_names:
    df = df.append_column(
        "amount_in_cents",
        pc.multiply(df["Transaction Amount"], 100),
    )

In [None]:
# Evolve the Iceberg table schema so it also covers the fields of the widened
# Arrow table: union_by_name merges df's schema into the table schema by field
# name. The pending change is applied when the `with` block exits.
with table.update_schema() as schema_update:
    schema_update.union_by_name(df.schema)

In [None]:
# Replace the table's contents with the widened data ...
table.overwrite(df)

# ... then read the whole table back and print it to confirm that the new
# column is present.
current_rows = table.scan().to_arrow()
print(current_rows)

pyarrow.Table
Transaction ID: int64
Account ID: large_string
Transaction Amount: double
Transaction Type: large_string
Merchant/Counterparty: large_string
Location: large_string
Date and Time: large_string
amount_in_cents: double
----
Transaction ID: [[1000000000,1000000001,1000000002,1000000003,1000000004,...,1000009995,1000009996,1000009997,1000009998,1000009999]]
Account ID: [["ACC456789","ACC567890","ACC345678","ACC567890","ACC567890",...,"ACC234567","ACC345678","ACC234567","ACC456789","ACC345678"]]
Transaction Amount: [[86.8,1758.97,636.72,266.67,942.24,...,1770.98,1512.92,653.26,1876.98,162.44]]
Transaction Type: [["Deposit","Withdrawal","Purchase","Deposit","Deposit",...,"Deposit","Purchase","Trade","Withdrawal","Deposit"]]
Merchant/Counterparty: [["Employer","ABC Store","XYZ Electronics","Employer","ATM",...,"ABC Store","Supermarket","Employer","Online Store","Supermarket"]]
Location: [["Los Angeles, CA","Chicago, IL","New York, NY","New York, NY","Chicago, IL",...,"Miami, FL",

Now let's time travel! Every write to the table created a snapshot, and we can read the table as it was at any of them.

In [None]:
# Collect the table's snapshots (kept in `snapshots` for the next cell) and
# print each one followed by its id on a separate line.
snapshots = table.snapshots()
print("Snapshots:")
for snap in snapshots:
    print(snap, f"Snapshot ID: {snap.snapshot_id}", sep="\n")


Snapshots:
Operation.APPEND: id=4274749927063933794, schema_id=0
Snapshot ID: 4274749927063933794
Operation.DELETE: id=7501133203571401911, parent_id=4274749927063933794, schema_id=1
Snapshot ID: 7501133203571401911
Operation.APPEND: id=3877388678237865632, parent_id=7501133203571401911, schema_id=1
Snapshot ID: 3877388678237865632


In [None]:
# Time travel: pick the first snapshot in the list and scan the table as of
# that snapshot instead of its current state.
first_snapshot = snapshots[0]
snapshot_id = first_snapshot.snapshot_id

historical_scan = table.scan(snapshot_id=snapshot_id)
table_at_snapshot = historical_scan.to_arrow()

print("Data at Snapshot ID:", snapshot_id)
print(table_at_snapshot)

Data at Snapshot ID: 4274749927063933794
pyarrow.Table
Transaction ID: int64
Account ID: large_string
Transaction Amount: double
Transaction Type: large_string
Merchant/Counterparty: large_string
Location: large_string
Date and Time: large_string
----
Transaction ID: [[1000000000,1000000001,1000000002,1000000003,1000000004,...,1000009995,1000009996,1000009997,1000009998,1000009999]]
Account ID: [["ACC456789","ACC567890","ACC345678","ACC567890","ACC567890",...,"ACC234567","ACC345678","ACC234567","ACC456789","ACC345678"]]
Transaction Amount: [[86.8,1758.97,636.72,266.67,942.24,...,1770.98,1512.92,653.26,1876.98,162.44]]
Transaction Type: [["Deposit","Withdrawal","Purchase","Deposit","Deposit",...,"Deposit","Purchase","Trade","Withdrawal","Deposit"]]
Merchant/Counterparty: [["Employer","ABC Store","XYZ Electronics","Employer","ATM",...,"ABC Store","Supermarket","Employer","Online Store","Supermarket"]]
Location: [["Los Angeles, CA","Chicago, IL","New York, NY","New York, NY","Chicago, IL"