### Load env variables

In this example we load the environment variables, including all the secrets, from a local `.env` file. The `.env` file includes the following variables:

cz_username: Username for connecting to the Lakehouse service 

cz_password: Password for connecting to the Lakehouse service

cz_service: Name of the Lakehouse service to connect to

cz_instance: Instance name of the Lakehouse service to connect to

cz_workspace: Workspace name of the Lakehouse service to connect to

cz_schema: Schema name of the Lakehouse service to connect to

cz_vcluster: Virtual cluster name of the Lakehouse service to connect to

AWS_KEY: Key for connecting to AWS services

AWS_SECRET: Secret key for connecting to AWS services

AWS_S3_NAME: Bucket name for connecting to AWS S3 service

UNSTRUCTURED_API_KEY: API key for connecting to the UNSTRUCTURED API

UNSTRUCTURED_URL: URL for connecting to the UNSTRUCTURED API


In [12]:
import os
import dotenv

# Read the variables listed above from the .env file.
dotenv.load_dotenv(dotenv_path="./.env")  # replace with the path to your .env file

True

In [13]:
!pip install pyiceberg boto3 pandas

Looking in indexes: https://pypi.org/simple/


In [15]:
import boto3
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType, IntegerType
import pandas as pd

In [16]:
# 1. Create the S3 bucket and configure the Iceberg catalog
# S3 bucket names may only contain lowercase letters, digits, dots and
# hyphens, so the suffix is lowercase and hyphen-separated (the previous
# "_IcebergTable" suffix can never form a valid bucket name).
_bucket_prefix = os.getenv("AWS_S3_NAME")
if not _bucket_prefix:
    # Surface the real cause instead of an opaque "NoneType + str" TypeError.
    print("WARNING: AWS_S3_NAME is not set; BUCKET_NAME will not be a valid S3 bucket name")
BUCKET_NAME = f"{_bucket_prefix or ''}-iceberg-table"
key = os.getenv("AWS_KEY")
secret = os.getenv("AWS_SECRET")
TABLE_NAME = "demo_iceberg_table"
REGION = "us-east-1"



In [17]:
key  # NOTE(review): displays the AWS access key in the cell output — clear the output before sharing this notebook

''

In [None]:
# Use explicit credentials, and pin the client to REGION so the bucket is
# created in the region the rest of the notebook assumes.
s3 = boto3.client(
    "s3",
    region_name=REGION,
    aws_access_key_id=key,
    aws_secret_access_key=secret,
)

# us-east-1 is S3's default region and must NOT be sent as a
# LocationConstraint (S3 rejects it); every other region requires one.
create_bucket_kwargs = {"Bucket": BUCKET_NAME}
if REGION != "us-east-1":
    create_bucket_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": REGION}
s3.create_bucket(**create_bucket_kwargs)

# NOTE(review): "s3.endpoint" is normally an HTTP(S) endpoint URL, not an
# s3:// bucket URI, and no catalog "type"/"uri" is supplied here — confirm
# which catalog implementation this notebook is meant to use.
catalog = load_catalog("s3", {"s3.endpoint": f"s3://{BUCKET_NAME}"})

In [None]:
# 2. Define the Iceberg table schema
# PyIceberg builds fields with the NestedField constructor and a `required`
# flag; the required()/optional() factory methods belong to the Java API.
schema = Schema(
    NestedField(field_id=1, name="id", field_type=IntegerType(), required=True),
    NestedField(field_id=2, name="name", field_type=StringType(), required=False),
)

# 3. Create the Iceberg table
# partition_spec is left at its default (unpartitioned); passing None
# explicitly replaces that default with an invalid value.
# NOTE(review): TABLE_NAME has no namespace part — most catalogs expect
# "namespace.table"; confirm against the catalog in use.
try:
    catalog.create_table(
        identifier=TABLE_NAME,
        schema=schema,
        properties={"format-version": "2"},
    )
    print(f"表 {TABLE_NAME} 创建成功！")
except Exception as e:
    # Best-effort demo cell: report the failure and let the notebook continue.
    print(f"表创建失败：{e}")

# 4. Write the sample rows to S3 as a Parquet file
# NOTE(review): this writes a standalone Parquet object with pandas; nothing
# in this step goes through `catalog` or an Iceberg table object.
data_file = "s3://{}/data.parquet".format(BUCKET_NAME)
data = pd.DataFrame(
    {
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
    }
)
data.to_parquet(path=data_file)

print("数据插入成功！")

# 5. Query: load the table and print each of its snapshots, one per line
try:
    table = catalog.load_table(TABLE_NAME)
    snapshots = table.snapshots()
    print("当前数据快照：", *snapshots, sep="\n")
except Exception as err:
    # Best-effort demo cell: report the failure and let the notebook continue.
    print(f"数据查询失败：{err}")

# 6. Demonstrate Iceberg features (e.g. snapshots, transactions)
# NOTE(review): this step only writes another standalone Parquet object with
# pandas; no call here goes through `catalog` or an Iceberg table object.
try:
    new_data_file = "s3://{}/new_data.parquet".format(BUCKET_NAME)
    new_data = pd.DataFrame(
        {
            "id": [4],
            "name": ["Diana"],
        }
    )
    new_data.to_parquet(path=new_data_file)
    print("新增数据成功，支持事务操作！")
except Exception as err:
    # Best-effort demo cell: report the failure and let the notebook continue.
    print(f"操作失败：{err}")
