In [None]:
#
# PREPARATIONS
#

In [1]:
# create the 'warehouse' S3 bucket

import boto3


# Resource handle for the local MinIO instance (S3-compatible API).
s3_resource = boto3.resource('s3', 
    endpoint_url='http://minio:9000',
    aws_access_key_id='minioadmin',
    aws_secret_access_key='minioadmin',
    aws_session_token=None,
    config=boto3.session.Config(signature_version='s3v4'),
    verify=False,
)

# Creating the bucket has to be safe to re-run, so tolerate exactly one error:
# "the bucket already exists and is ours". Every other failure (MinIO
# unreachable, wrong credentials, ...) must fail the cell instead of being
# reduced to a printed message that later cells then trip over.
try:
    s3_resource.Bucket("warehouse").create()
except s3_resource.meta.client.exceptions.BucketAlreadyOwnedByYou:
    print("Bucket 'warehouse' already exists - nothing to do.")

An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [2]:
# bootstrap the catalog

import requests


# The timeout keeps the cell from hanging forever when Lakekeeper is unreachable.
r = requests.post(
    "http://lakekeeper:8181/management/v1/bootstrap",
    json={"accept-terms-of-use": True},
    timeout=30,
)
# Re-running against an already bootstrapped catalog yields a JSON error body,
# which is worth displaying. A response without a body cannot be parsed, so
# show the status code in that case instead of letting r.json() raise.
r.json() if r.content else r.status_code

{'error': {'message': 'Catalog already bootstrapped',
  'type': 'CatalogAlreadyBootstrapped',
  'code': 400,
  'stack': ['01977be4-ffdc-74f1-a2b3-0cce24c6f998']}}

In [3]:
# initialise the 'iceberg' warehouse in Lakekeeper

import requests


# Warehouse definition sent to Lakekeeper's management API: the 'iceberg'
# warehouse stores its data under the 'iceberg' prefix of the 'warehouse'
# bucket on the local MinIO instance.
payload = {
  "warehouse-name": "iceberg",
  "project-id": "00000000-0000-0000-0000-000000000000",
  "storage-profile": {
    "type": "s3",
    "bucket": "warehouse",
    "key-prefix": "iceberg",
    "assume-role-arn": None,
    "endpoint": "http://minio:9000",
    "region": "eu-central-1",
    "path-style-access": True,
    "flavor": "minio",
    "sts-enabled": True,
  },
  "storage-credential": {
    "type": "s3",
    "credential-type": "access-key",
    "aws-access-key-id": "minioadmin",
    "aws-secret-access-key": "minioadmin",
  },
}

# The timeout keeps the cell from hanging forever when Lakekeeper is unreachable.
# On re-runs the server answers with a 'storage profile overlaps' error body,
# so the parsed response is displayed rather than raised.
r = requests.post(
    "http://lakekeeper:8181/management/v1/warehouse",
    json=payload,
    timeout=30,
)
r.json()

{'error': {'message': 'Storage profile overlaps with existing warehouse iceberg',
  'type': 'CreateWarehouseStorageProfileOverlap',
  'code': 400,
  'stack': ['01977be5-a93d-7ad1-b001-4b6fd84e9b0d']}}

In [4]:
# check the config of the created warehouse

import requests


# Let requests build (and URL-encode) the query string, and bound the wait so
# the cell cannot hang when Lakekeeper is unreachable.
r = requests.get(
    "http://lakekeeper:8181/catalog/v1/config",
    params={"warehouse": "iceberg"},
    timeout=30,
)
# This cell is a check: a non-2xx answer should stop the notebook here rather
# than show an error dict and let later cells fail in less obvious ways.
r.raise_for_status()
r.json()

{'overrides': {'uri': 'http://lakekeeper:8181/catalog'},
 'defaults': {'rest-page-size': '100',
  'prefix': 'edb09648-4a32-11f0-a0ca-d33695f1b551'},
 'endpoints': ['GET /v1/config',
  'GET /v1/{prefix}/namespaces',
  'HEAD /v1/{prefix}/namespaces/{namespace}',
  'POST /v1/{prefix}/namespaces',
  'GET /v1/{prefix}/namespaces/{namespace}',
  'DELETE /v1/{prefix}/namespaces/{namespace}',
  'POST /v1/{prefix}/namespaces/{namespace}/properties',
  'GET /v1/{prefix}/namespaces/{namespace}/tables',
  'POST /v1/{prefix}/namespaces/{namespace}/tables',
  'GET /v1/{prefix}/namespaces/{namespace}/tables/{table}',
  'POST /v1/{prefix}/namespaces/{namespace}/tables/{table}',
  'DELETE /v1/{prefix}/namespaces/{namespace}/tables/{table}',
  'HEAD /v1/{prefix}/namespaces/{namespace}/tables/{table}',
  'GET /v1/{prefix}/namespaces/{namespace}/tables/{table}/credentials',
  'POST /v1/{prefix}/tables/rename',
  'POST /v1/{prefix}/namespaces/{namespace}/register',
  'POST /v1/{prefix}/namespaces/{namespac

In [5]:
# check that Spark client works

from pyspark.sql import SparkSession


# Settings for the Spark session; the `iceberg` catalog is an Iceberg REST
# catalog pointing at Lakekeeper. Insertion order is the order the options are
# applied in.
# NOTE(review): the Nessie session extension and the `ref` option look like
# leftovers from a Nessie-based setup - confirm whether they are still needed.
iceberg_session_conf = {
    "spark.sql.extensions":
        "org.projectnessie.spark.extensions.NessieSparkSessionExtensions, org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.catalog.iceberg": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.iceberg.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.catalog.iceberg.type": "rest",
    "spark.sql.catalog.iceberg.uri": "http://lakekeeper:8181/catalog/",
    "spark.sql.catalog.iceberg.warehouse": "iceberg",
    "spark.sql.catalog.iceberg.ref": "main",
    "spark.sql.catalog.iceberg.cache-enabled": False,
}

session_builder = SparkSession.builder
for conf_key, conf_value in iceberg_session_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()
spark.sparkContext.setLogLevel('ERROR')

# Smoke test: listing schemas only succeeds if Spark can reach the catalog.
spark.sql("""
    SHOW SCHEMAS
""").show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


+---------+
|namespace|
+---------+
|  default|
+---------+



In [7]:
# Print the PyIceberg config file from the current working directory so its
# catalog settings (URI, warehouse, S3 endpoint and credentials) can be
# inspected; `cat` reports an error if the file is missing.

!cat .pyiceberg.yaml

catalog:
  rest:
    uri: http://lakekeeper:8181/catalog
    warehouse: iceberg
    s3.endpoint: http://minio:9000
    s3.region: eu-central-1
    s3.access-key-id: minioadmin
    s3.secret-access-key: minioadmin


In [13]:
# check if pyiceberg works

from pyiceberg.catalog import load_catalog


# Load the catalog registered under the name "rest" in the PyIceberg config.
catalog = load_catalog("rest")
# Plain create_namespace() raises once "workshop" exists, which breaks every
# re-run of this cell; the *_if_not_exists variant keeps the cell idempotent.
catalog.create_namespace_if_not_exists("workshop")
catalog.list_namespaces()

In [None]:
#
# TABLE DEFINITION
#