In [144]:
# Make the local Spark installation importable before pyspark is imported.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session configured for the Iceberg REST
# catalog registered under the name "demo".
#
# NOTE: if a session already exists in this kernel, getOrCreate() returns it
# and Spark warns that only runtime SQL configurations take effect. Restart
# the kernel for changes to the settings below to apply.
spark = (
    SparkSession.builder.master("local")
    .appName("IcebergPySpark")
    # NOTE(review): absolute, machine-specific path to the Iceberg runtime jar;
    # consider making this configurable.
    .config("spark.jars", "/home/asus/Downloads/iceberg-spark-runtime-3.2_2.12-1.2.0.jar")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Catalog "demo": Iceberg SparkCatalog backed by a REST catalog service.
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.demo.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.demo.uri", "http://rest:8181")
    .config("spark.sql.catalog.demo.warehouse", "s3a://warehouse/wh/")
    .config("spark.sql.catalog.demo.s3.endpoint", "http://minio:9000")
    .config("spark.sql.defaultCatalog", "demo")
    # Event logging, written where the history server is told to read from.
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/home/iceberg/spark-events")
    .config("spark.history.fs.logDirectory", "/home/iceberg/spark-events")
    # Valid values are "hive" and "in-memory"; a filesystem path is not an
    # accepted value for this setting.
    .config("spark.sql.catalogImplementation", "in-memory")
    .getOrCreate()
)

23/04/23 05:10:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [145]:
# Show the current database of the Spark session catalog.
spark.catalog.currentDatabase()

'default'

In [146]:
# List the databases visible through the Spark session catalog.
# NOTE(review): the recorded output points at a local spark-warehouse
# directory; confirm this is the catalog you meant to inspect.
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/home/iceberg/notebooks/spark-warehouse')]

In [147]:
# List the tables in the current database of the Spark session catalog.
spark.catalog.listTables()

[Table(name='iceberg_pyspark_example', database='default', description=None, tableType='MANAGED', isTemporary=False)]

In [105]:
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import GreaterThanOrEqual

# Open the PyIceberg catalog named 'default'. No connection properties are
# passed here, so presumably they come from PyIceberg's config file or
# environment variables — TODO confirm.
iceberg_catalog = load_catalog('default')
iceberg_catalog

<pyiceberg.catalog.rest.RestCatalog at 0x7fe08e2b3c10>

In [148]:
# Namespaces present in the catalog before a new one is created.
iceberg_catalog.list_namespaces()

[('default',), ('nyc',), ('sample',)]

In [149]:
# Create the "substrait" namespace.
# NOTE(review): not guarded — presumably raises if the namespace already
# exists; confirm before re-running this cell.
iceberg_catalog.create_namespace("substrait")

In [150]:
# Check that the newly created namespace is now listed.
iceberg_catalog.list_namespaces()

[('default',), ('nyc',), ('sample',), ('substrait',)]

In [151]:
# Load the existing nyc.taxis table through the PyIceberg catalog handle
# created earlier in this notebook (`iceberg_catalog`).
nyc_taxi_table = iceberg_catalog.load_table(("nyc", "taxis"))

In [152]:
# Show the storage location of the loaded nyc.taxis table.
nyc_taxi_table.location()

's3://warehouse/nyc/taxis'

In [153]:
from pyiceberg.catalog import load_catalog
from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.schema import Schema
from pyiceberg.table.sorting import SortOrder, SortField
from pyiceberg.transforms import DayTransform, IdentityTransform
from pyiceberg.types import TimestampType, DoubleType, StringType, NestedField

# Table schema: a timestamp column plus bid/ask prices and a symbol.
# All fields are optional (required=False).
schema = Schema(
    NestedField(
        field_id=1, name="id", field_type=TimestampType(), required=False
    ),
    NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False),
    NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False),
    NestedField(field_id=4, name="symbol", field_type=StringType(), required=False),
)

# Partition by day of the timestamp column (source_id=1 is the "id" field).
# NOTE(review): the partition field is named "datetime_day" although its
# source column is "id"; confirm the name is intended.
partition_spec = PartitionSpec(
    PartitionField(
        source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
    )
)

# Sort by the "symbol" column (source_id=4), untransformed.
sort_order = SortOrder(SortField(source_id=4, transform=IdentityTransform()))

iceberg_catalog = load_catalog("default")

# Store the table under a path that matches its own namespace and name,
# instead of a path belonging to an unrelated "sample/mytable".
# The create_table(...) call stays the last expression so the returned table
# object is displayed as the cell output.
iceberg_catalog.create_table(
    identifier="default.substrait.bids",
    location="s3://warehouse/substrait/bids",
    schema=schema,
    partition_spec=partition_spec,
    sort_order=sort_order,
)

<pyiceberg.table.Table at 0x7fe089771b50>

In [154]:
# Re-list the namespaces after the table has been created.
iceberg_catalog.list_namespaces()

[('default',), ('nyc',), ('sample',), ('substrait',)]

In [155]:
import datetime

In [156]:
from pyiceberg.schema import Schema
from pyiceberg.types import TimestampType, DoubleType, StringType, NestedField

# Name of the table to write; same identifier that is passed to the PyIceberg
# catalog elsewhere in this notebook.
iceberg_table_name = "default.substrait.bids"

# Column names for the demo row; Spark infers the column types from the data.
write_schema = ["id", "bid", "ask", "symbol"]

df = spark.createDataFrame(
    [(datetime.datetime(2020, 5, 17), 10.2, 10.3, "gold")], write_schema
)

# Choose the data source with .format(); .option("format", ...) only adds a
# writer option and does not select the source.
# mode("overwrite") replaces whatever the table currently holds.
df.write.format("iceberg").mode("overwrite").saveAsTable(iceberg_table_name)

In [157]:
# Read the table back through PyIceberg and display it as a pandas DataFrame.
tbl = iceberg_catalog.load_table('default.substrait.bids')
# Scan with no arguments passed. NOTE(review): `sc` is a PyIceberg scan object
# here, not a SparkContext, despite the conventional meaning of that name.
sc = tbl.scan()
# NOTE(review): this rebinds `df`, which the previous cell bound to a Spark DataFrame.
df = sc.to_arrow().to_pandas()
df

Unnamed: 0,id,bid,ask,symbol
0,2020-05-17 00:00:00+00:00,10.2,10.3,gold


In [158]:
# Namespaces present before cleanup starts.
iceberg_catalog.list_namespaces()

[('default',), ('nyc',), ('sample',), ('substrait',)]

In [159]:
# Remove the 'substrait' namespace.
# NOTE(review): the "default.substrait.bids" table is only dropped in a later
# cell; confirm that dropping the namespace first is the intended order.
iceberg_catalog.drop_namespace('substrait')

In [160]:
# Check that 'substrait' is no longer listed.
iceberg_catalog.list_namespaces()

[('default',), ('nyc',), ('sample',)]

In [161]:
# Drop the demo table.
# NOTE(review): this runs after drop_namespace('substrait') in an earlier
# cell; confirm which namespace the table actually lives in and whether the
# cleanup order should be table first, then namespace.
iceberg_catalog.drop_table("default.substrait.bids")