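This notebook reads `financial.parquet` from the `Lecture2` folder of your Google Drive and writes an Iceberg table to the `Lecture2Iceberg` folder of the same Drive. To do so, it needs access to your Drive: when prompted, grant all of the requested permissions.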

In [1]:
pip install pyiceberg[sql-sqlite]

Collecting pyiceberg[sql-sqlite]
  Downloading pyiceberg-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting mmh3<6.0.0,>=4.0.0 (from pyiceberg[sql-sqlite])
  Downloading mmh3-5.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting strictyaml<2.0.0,>=1.7.0 (from pyiceberg[sql-sqlite])
  Downloading strictyaml-1.7.3-py3-none-any.whl.metadata (11 kB)
Downloading mmh3-5.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading strictyaml-1.7.3-py3-none-any.whl (123 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.9/123.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyiceberg-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB

In [2]:
# Mount the user's Google Drive at /content/drive. The cells below read the
# input Parquet file and write the Iceberg warehouse under
# /content/drive/MyDrive, so this cell has to run (and be authorized) first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os

from pyiceberg.catalog.sql import SqlCatalog

# Directory on the mounted Drive that holds both the catalog database and the
# table files written by this notebook.
warehouse_path = "/content/drive/MyDrive/Lecture2Iceberg/"

# exist_ok=True creates the directory when it is missing and is a no-op when it
# is already there, without a separate existence check.
os.makedirs(warehouse_path, exist_ok=True)

# warehouse_path ends with "/", so build the database path with os.path.join
# rather than string concatenation to avoid a doubled slash in the SQLite URI.
catalog_db_path = os.path.join(warehouse_path, "pyiceberg_catalog.db")

# SQL catalog named "default": table metadata pointers live in a SQLite file,
# table data and metadata files are written below the warehouse directory.
catalog = SqlCatalog(
    "default",
    **{
        "uri": f"sqlite:///{catalog_db_path}",
        "warehouse": f"file://{warehouse_path}",
    },
)

In [6]:
import pyarrow.parquet as pq

# Input data set: a Parquet file stored on the mounted Google Drive.
parquet_file = "/content/drive/MyDrive/Lecture2/financial.parquet"

# Despite the name, `df` is a pyarrow.Table (see the output below), not a
# pandas DataFrame. Later cells use it as the source for the Iceberg table.
df = pq.read_table(parquet_file)

# Bare last expression: displays the table's schema and a preview of each column.
df

pyarrow.Table
Transaction ID: int64
Account ID: string
Transaction Amount: double
Transaction Type: string
Merchant/Counterparty: string
Location: string
Date and Time: string
----
Transaction ID: [[1000000000,1000000001,1000000002,1000000003,1000000004,...,1000009995,1000009996,1000009997,1000009998,1000009999]]
Account ID: [["ACC456789","ACC567890","ACC345678","ACC567890","ACC567890",...,"ACC234567","ACC345678","ACC234567","ACC456789","ACC345678"]]
Transaction Amount: [[86.8,1758.97,636.72,266.67,942.24,...,1770.98,1512.92,653.26,1876.98,162.44]]
Transaction Type: [["Deposit","Withdrawal","Purchase","Deposit","Deposit",...,"Deposit","Purchase","Trade","Withdrawal","Deposit"]]
Merchant/Counterparty: [["Employer","ABC Store","XYZ Electronics","Employer","ATM",...,"ABC Store","Supermarket","Employer","Online Store","Supermarket"]]
Location: [["Los Angeles, CA","Chicago, IL","New York, NY","New York, NY","Chicago, IL",...,"Miami, FL","New York, NY","Los Angeles, CA","Online","Chicago, IL

In [7]:
# Make sure the "default" namespace exists. The catalog database is stored on
# Google Drive and therefore survives between sessions; the *_if_not_exists
# variant lets this cell be re-run without raising because the namespace was
# already created by an earlier run.
catalog.create_namespace_if_not_exists("default")

In [8]:
# Create the table "financial" in the "default" namespace, using the Arrow
# schema of the loaded data. The catalog persists on Google Drive, so on a
# re-run the table may already exist; in that case the existing table is
# returned instead of raising an "already exists" error.
# NOTE(review): when the table already exists, re-running the append cell
# below adds the same rows a second time.
table = catalog.create_table_if_not_exists(
    "default.financial",
    schema=df.schema,
)

In [9]:
# Write all rows of `df` into the Iceberg table. In the snapshot listing
# further down this shows up as the first snapshot (Operation.APPEND, schema_id=0).
table.append(df)

Now let's look at its schema evolution capability

In [10]:
import pyarrow.compute as pc

# Derive a new column from "Transaction Amount" (amount * 100) and attach it to
# the in-memory Arrow table. The guard keeps the cell idempotent: without it,
# running the cell a second time would append another column with the same name.
if "amount_in_cents" not in df.column_names:
    df = df.append_column("amount_in_cents", pc.multiply(df["Transaction Amount"], 100))

In [11]:
# Evolve the Iceberg table schema so that it also contains the columns of `df`
# (here: the new "amount_in_cents" column), matching fields by name.
# No explicit commit is called; the snapshots written after this cell carry
# schema_id=1, i.e. the change is applied when the `with` block exits.
with table.update_schema() as update_schema:
    update_schema.union_by_name(df.schema)

In [12]:
# Replace the table's contents with the widened `df`. In the snapshot listing
# below this single call appears as two snapshots: Operation.DELETE followed
# by Operation.APPEND, both with schema_id=1.
table.overwrite(df)
# Read the current state of the table back as an Arrow table and print it;
# the output now includes the "amount_in_cents" column.
print(table.scan().to_arrow())

pyarrow.Table
Transaction ID: int64
Account ID: large_string
Transaction Amount: double
Transaction Type: large_string
Merchant/Counterparty: large_string
Location: large_string
Date and Time: large_string
amount_in_cents: double
----
Transaction ID: [[1000000000,1000000001,1000000002,1000000003,1000000004,...,1000009995,1000009996,1000009997,1000009998,1000009999]]
Account ID: [["ACC456789","ACC567890","ACC345678","ACC567890","ACC567890",...,"ACC234567","ACC345678","ACC234567","ACC456789","ACC345678"]]
Transaction Amount: [[86.8,1758.97,636.72,266.67,942.24,...,1770.98,1512.92,653.26,1876.98,162.44]]
Transaction Type: [["Deposit","Withdrawal","Purchase","Deposit","Deposit",...,"Deposit","Purchase","Trade","Withdrawal","Deposit"]]
Merchant/Counterparty: [["Employer","ABC Store","XYZ Electronics","Employer","ATM",...,"ABC Store","Supermarket","Employer","Online Store","Supermarket"]]
Location: [["Los Angeles, CA","Chicago, IL","New York, NY","New York, NY","Chicago, IL",...,"Miami, FL",

Now let's time travel!

In [13]:
# Collect every snapshot recorded for the table; `snapshots` is reused by the
# time-travel cell that follows.
snapshots = table.snapshots()
print("Snapshots:")
for snap in snapshots:
    # Two output lines per snapshot: its string form, then its bare id.
    print(snap, f"Snapshot ID: {snap.snapshot_id}", sep="\n")


Snapshots:
Operation.APPEND: id=7195230645844944471, schema_id=0
Snapshot ID: 7195230645844944471
Operation.DELETE: id=6516950793692017886, parent_id=7195230645844944471, schema_id=1
Snapshot ID: 6516950793692017886
Operation.APPEND: id=239807549678947714, parent_id=6516950793692017886, schema_id=1
Snapshot ID: 239807549678947714


In [14]:
# Time travel: pick the first snapshot in the list (the initial append, written
# before the schema change) and read the table as it was at that point.
first_snapshot = snapshots[0]
snapshot_id = first_snapshot.snapshot_id

# Scanning with an explicit snapshot_id reads that snapshot instead of the
# current table state.
historical_scan = table.scan(snapshot_id=snapshot_id)
table_at_snapshot = historical_scan.to_arrow()

print("Data at Snapshot ID:", snapshot_id)
print(table_at_snapshot)

Data at Snapshot ID: 7195230645844944471
pyarrow.Table
Transaction ID: int64
Account ID: large_string
Transaction Amount: double
Transaction Type: large_string
Merchant/Counterparty: large_string
Location: large_string
Date and Time: large_string
----
Transaction ID: [[1000000000,1000000001,1000000002,1000000003,1000000004,...,1000009995,1000009996,1000009997,1000009998,1000009999]]
Account ID: [["ACC456789","ACC567890","ACC345678","ACC567890","ACC567890",...,"ACC234567","ACC345678","ACC234567","ACC456789","ACC345678"]]
Transaction Amount: [[86.8,1758.97,636.72,266.67,942.24,...,1770.98,1512.92,653.26,1876.98,162.44]]
Transaction Type: [["Deposit","Withdrawal","Purchase","Deposit","Deposit",...,"Deposit","Purchase","Trade","Withdrawal","Deposit"]]
Merchant/Counterparty: [["Employer","ABC Store","XYZ Electronics","Employer","ATM",...,"ABC Store","Supermarket","Employer","Online Store","Supermarket"]]
Location: [["Los Angeles, CA","Chicago, IL","New York, NY","New York, NY","Chicago, IL"