In [None]:
#
# PREPARATIONS
#

In [None]:
# create the 'warehouse' S3 bucket

import boto3


s3_resource = boto3.resource('s3', 
    endpoint_url='http://minio:9000',
    aws_access_key_id='minioadmin',
    aws_secret_access_key='minioadmin',
    aws_session_token=None,
    config=boto3.session.Config(signature_version='s3v4'),
    verify=False,
)

# if it fails with BucketAlreadyOwnedByYou, it means the bucket is already there
try:
    s3_resource.Bucket("warehouse").create()
except Exception as e:
    print(e)

In [None]:
# bootstrap the catalog

import requests


r = requests.post("http://lakekeeper:8181/management/v1/bootstrap", json={"accept-terms-of-use": True})
r.json()

In [None]:
# initialise the 'iceberg' warehouse in Lakekeeper

import requests


payload = {
  "warehouse-name": "iceberg",
  "project-id": "00000000-0000-0000-0000-000000000000",
  "storage-profile": {
    "type": "s3",
    "bucket": "warehouse",
    "key-prefix": "iceberg",
    "assume-role-arn": None,
    "endpoint": "http://minio:9000",
    "region": "eu-central-1",
    "path-style-access": True,
    "flavor": "minio",
    "sts-enabled": True,
  },
  "storage-credential": {
    "type": "s3",
    "credential-type": "access-key",
    "aws-access-key-id": "minioadmin",
    "aws-secret-access-key": "minioadmin"
  }
}

r = requests.post("http://lakekeeper:8181/management/v1/warehouse", json=payload)
r.json()

In [None]:
# check the config of the created warehouse

import requests


r = requests.get("http://lakekeeper:8181/catalog/v1/config?warehouse=iceberg")
r.json()

In [5]:
# check that Spark client works

from pyspark.sql import SparkSession


spark = (
    SparkSession.builder
        .config(
            "spark.sql.extensions",
            "org.projectnessie.spark.extensions.NessieSparkSessionExtensions, org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
        )
        .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
        .config("spark.sql.catalog.iceberg.type", "rest")
        .config("spark.sql.catalog.iceberg.uri", "http://lakekeeper:8181/catalog/")
        .config("spark.sql.catalog.iceberg.warehouse", "iceberg")
        .config("spark.sql.catalog.iceberg.ref", "main")
        .config("spark.sql.catalog.iceberg.cache-enabled", False)
        .getOrCreate()
)
spark.sparkContext.setLogLevel('ERROR')

spark.sql("""
    SHOW NAMESPACES FROM iceberg
""").toPandas()

Unnamed: 0,namespace
0,workshop


In [None]:
# check existence of pyiceberg config

!cat .pyiceberg.yaml

In [4]:
# check if pyiceberg works

from pyiceberg.catalog import load_catalog


catalog = load_catalog("rest")

# recreate
catalog.create_namespace("workshop")
catalog.list_namespaces()

NamespaceAlreadyExistsError: NamespaceAlreadyExists: Namespace already exists

In [None]:
#
# TABLE DEFINITION
#

In [None]:
# cleanup if required

spark.sql("""
DROP TABLE IF EXISTS iceberg.workshop.atable
""")

In [None]:
spark.sql("""
CREATE TABLE IF NOT EXISTS iceberg.workshop.atable (
    id string COMMENT 'random id',
    name string COMMENT 'a name',
    value int COMMENT 'integer with some numerical value',
    timestamp timestamp COMMENT 'a timestamp column'
)
PARTITIONED BY (years(timestamp))
TBLPROPERTIES (
    'format-version'='2'
)
""")

In [None]:
spark.sql("""
CREATE TABLE IF NOT EXISTS iceberg.workshop.atable (
    id string COMMENT 'random id',
    name string COMMENT 'a name',
    value int COMMENT 'integer with some numerical value',
    timestamp timestamp COMMENT 'a timestamp column'
)
PARTITIONED BY (years(timestamp))
TBLPROPERTIES (
    'format-version'='2'
)
""")

In [6]:
# describe the created table

spark.sql("""
DESCRIBE EXTENDED iceberg.workshop.atable
""").toPandas()

Unnamed: 0,col_name,data_type,comment
0,id,string,random id
1,name,string,a name
2,value,int,integer with some numerical value
3,timestamp,timestamp,a timestamp column
4,,,
5,# Partitioning,,
6,Part 0,years(timestamp),
7,,,
8,# Metadata Columns,,
9,_spec_id,int,


In [8]:
# insert some data

import random
import uuid

from datetime import datetime
from pyspark.sql import Row
from pyspark.sql.functions import to_timestamp


df = spark.createDataFrame([
    Row(
        id=str(uuid.uuid4()),
        name=random.choice(["Ariel", "Georgina", "Tom", "Ulma"]),
        value=random.randint(100, 999),
        timestamp=datetime.fromisoformat("2024-12-01T07:32:18")
    ),
    Row(
        id=str(uuid.uuid4()),
        name=random.choice(["Ariel", "Georgina", "Tom", "Ulma"]),
        value=random.randint(100, 999),
        timestamp=datetime.fromisoformat("2025-01-02T09:11:23")
    ),
])

df.writeTo("iceberg.workshop.atable").append()

In [13]:
spark.sql("""
SELECT *
FROM iceberg.workshop.atable
ORDER BY name
""").toPandas()

Unnamed: 0,id,name,value,timestamp
0,f032952a-a596-446e-9b03-9ad6384692ca,Ariel,831,2024-12-01 06:32:18
1,a7a074c7-5a57-4636-8c1f-085575de1151,Ariel,438,2024-12-01 06:32:18
2,119b87de-0819-44fd-b96a-9af0a123fd42,Georgina,705,2025-01-02 08:11:23
3,d9b26e31-ca6b-4374-a49b-f2aabd272ab6,Georgina,505,2024-12-01 06:32:18
4,6f8a90e9-30ee-4f80-baaa-62927761816e,Georgina,804,2025-01-02 08:11:23
5,272fc3c2-01c1-4a33-a8a1-6c470c02ceb7,Georgina,812,2025-01-02 08:11:23
6,8331eed8-b0c9-46b1-bda4-409f80abf30f,Tom,922,2024-12-01 06:32:18
7,d8f2f224-df67-4662-9549-48cae9d62b83,Ulma,818,2025-01-02 08:11:23
8,8346efe4-8b1d-4985-b6ec-8c7599f16c27,Ulma,823,2024-12-01 06:32:18
9,63302b61-fc98-418c-abe3-73b072a56697,Ulma,131,2025-01-02 08:11:23


In [17]:
spark.sql("""
SELECT *
FROM iceberg.workshop.atable.history
""").show(1000, False)

+-----------------------+-------------------+-------------------+-------------------+
|made_current_at        |snapshot_id        |parent_id          |is_current_ancestor|
+-----------------------+-------------------+-------------------+-------------------+
|2025-06-17 03:51:04.407|3343137357946927937|NULL               |true               |
|2025-06-17 03:51:08.115|3030987006270639493|3343137357946927937|true               |
|2025-06-17 03:51:12.698|7130290621727084349|3030987006270639493|true               |
|2025-06-17 04:00:27.258|4931056771337407708|7130290621727084349|true               |
|2025-06-17 04:00:28.662|7052107035016213932|4931056771337407708|true               |
+-----------------------+-------------------+-------------------+-------------------+



In [25]:
spark.sql("""
--SELECT *
SELECT content, file_path, record_count, partition
FROM iceberg.workshop.atable.data_files
""").toPandas()

Unnamed: 0,content,file_path,record_count,partition
0,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54,)"
1,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55,)"
2,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54,)"
3,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55,)"
4,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54,)"
5,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55,)"
6,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54,)"
7,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55,)"
8,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54,)"
9,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55,)"


In [None]:
#
# PARTITIONING
#

In [37]:
# change partition layout

spark.sql("""
ALTER TABLE iceberg.workshop.atable
DROP PARTITION FIELD years(timestamp)
""")

IllegalArgumentException: Cannot find partition field to remove: year(ref(name="timestamp"))

In [None]:
spark.sql("""
ALTER TABLE iceberg.workshop.atable
ADD PARTITION FIELD bucket(2, id)
""")

In [35]:
spark.sql("""
ALTER TABLE iceberg.workshop.atable
ADD PARTITION FIELD truncate(5, name)
""")

DataFrame[]

In [38]:
spark.sql("""
DESCRIBE EXTENDED iceberg.workshop.atable
""").toPandas()

Unnamed: 0,col_name,data_type,comment
0,id,string,random id
1,name,string,a name
2,value,int,integer with some numerical value
3,timestamp,timestamp,a timestamp column
4,,,
5,# Partitioning,,
6,Part 0,"bucket(2, id)",
7,Part 1,"truncate(5, name)",
8,,,
9,# Metadata Columns,,


In [39]:
# anything changed?

spark.sql("""
--SELECT *
SELECT content, file_path, record_count, partition
FROM iceberg.workshop.atable.data_files
""").toPandas()

Unnamed: 0,content,file_path,record_count,partition
0,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54, None, None)"
1,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55, None, None)"
2,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54, None, None)"
3,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55, None, None)"
4,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54, None, None)"
5,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55, None, None)"
6,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54, None, None)"
7,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55, None, None)"
8,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54, None, None)"
9,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55, None, None)"


In [40]:
# write more data

df = spark.createDataFrame([
    Row(
        id=str(uuid.uuid4()),
        name=random.choice(["Ariel", "Georgina", "Tom", "Ulma"]),
        value=random.randint(100, 999),
        timestamp=datetime.fromisoformat("2024-12-01T07:32:18")
    ),
    Row(
        id=str(uuid.uuid4()),
        name=random.choice(["Ariel", "Georgina", "Tom", "Ulma"]),
        value=random.randint(100, 999),
        timestamp=datetime.fromisoformat("2025-01-02T09:11:23")
    ),
])

df.writeTo("iceberg.workshop.atable").append()

In [41]:
# what about now?

spark.sql("""
--SELECT *
SELECT content, file_path, record_count, partition
FROM iceberg.workshop.atable.data_files
""").toPandas()

Unnamed: 0,content,file_path,record_count,partition
0,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(None, 0, Ariel)"
1,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(None, 1, Ulma)"
2,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54, None, None)"
3,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55, None, None)"
4,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54, None, None)"
5,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55, None, None)"
6,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54, None, None)"
7,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55, None, None)"
8,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(54, None, None)"
9,0,s3://warehouse/iceberg/01977bf1-69fb-7bd2-8c0c...,1,"(55, None, None)"
