In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.master("local")
    .appName("IcebergPySpark")
    .config("spark.jars", "/home/asus/Downloads/iceberg-spark-runtime-3.2_2.12-1.2.0.jar")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.demo.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.demo.uri", "http://rest:8181")
    .config("spark.sql.catalog.demo.warehouse", "s3a://warehouse/wh/")
    .config("spark.sql.catalog.demo.s3.endpoint", "http://minio:9000")
    .config("spark.sql.defaultCatalog", "demo")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/home/iceberg/spark-events")
    .config("spark.history.fs.logDirectory", "/home/iceberg/spark-events")
    .config("spark.sql.catalogImplementation", "/home/iceberg/spark-events")
    .getOrCreate()
)

23/04/23 04:05:52 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [28]:
spark.catalog.currentDatabase()

'default'

In [32]:
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/home/iceberg/notebooks/spark-warehouse')]

In [27]:
spark.catalog.listTables()

[Table(name='iceberg_pyspark_example', database='default', description=None, tableType='MANAGED', isTemporary=False)]

In [33]:
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import GreaterThanOrEqual

iceberg_catalog = load_catalog('default')
iceberg_catalog

<pyiceberg.catalog.rest.RestCatalog at 0x7fe08c1733d0>

In [34]:
iceberg_catalog.list_namespaces()

[('nyc',)]

In [40]:
iceberg_catalog.create_namespace("sample")

In [41]:
iceberg_catalog.list_namespaces()

[('nyc',), ('sample',)]

In [43]:
nyc_taxi_table = catalog.load_table(("nyc", "taxis"))

In [46]:
nyc_taxi_table.location()

's3://warehouse/nyc/taxis'

In [82]:
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import TimestampType, DoubleType, StringType, NestedField

schema = Schema(
    NestedField(
        field_id=1, name="id", field_type=TimestampType(), required=False
    ),
    NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False),
    NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False),
    NestedField(field_id=4, name="symbol", field_type=StringType(), required=False),
)

from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.transforms import DayTransform

partition_spec = PartitionSpec(
    PartitionField(
        source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
    )
)

from pyiceberg.table.sorting import SortOrder, SortField
from pyiceberg.transforms import IdentityTransform

sort_order = SortOrder(SortField(source_id=4, transform=IdentityTransform()))

iceberg_catalog = load_catalog("default")

iceberg_catalog.create_table(
    identifier="sample.bids",
    location="s3://warehouse/sample/mytable",
    schema=schema,
    partition_spec=partition_spec,
    sort_order=sort_order,
)

TableAlreadyExistsError: AlreadyExistsException: Table already exists: sample.bids

In [54]:
iceberg_catalog.list_namespaces()

[('default',), ('nyc',), ('sample',)]

In [55]:
iceberg_catalog.list_tables('sample')

[('sample', 'bids')]

In [70]:
tbl = iceberg_catalog.load_table('sample.bids')
tbl.name()

('default', 'sample', 'bids')

In [76]:
import datetime

datetime.datetime(2020, 5, 17, 0, 0)

In [86]:
from pyiceberg.schema import Schema
from pyiceberg.types import TimestampType, DoubleType, StringType, NestedField

iceberg_table_name = "default.sample.bids"
#spark.catalog.createTable(iceberg_table_name, schema=schema)

write_schema = ["id", "bid", "ask", "symbol"]


df = spark.createDataFrame([(datetime.datetime(2020, 5, 17), 10.2, 10.3, "gold")], write_schema)
df.write.option("format", "iceberg").mode("overwrite").saveAsTable(iceberg_table_name)

In [87]:
tbl = iceberg_catalog.load_table('default.sample.bids')
sc = tbl.scan()
df = sc.to_arrow().to_pandas()
df

Unnamed: 0,id,bid,ask,symbol
0,2020-05-17 00:00:00+00:00,10.2,10.3,gold


In [None]:
tbl = iceberg_catalog.load_table('nyc.taxis')
sc = tbl.scan()
df = sc.to_arrow().to_pandas()
df

In [30]:
tbl = iceberg_catalog.load_table('nyc.taxis')
sc = tbl.scan()
df = sc.to_arrow().to_pandas()
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:21:54+00:00,1.0,8.40,1.0,N,79,116,1,25.50,3.0,0.5,5.85,0.0,0.3,35.15,2.5,0.0
1,1,2021-04-01 00:42:37+00:00,2021-04-01 00:46:23+00:00,1.0,0.90,1.0,N,75,236,2,5.00,3.0,0.5,0.00,0.0,0.3,8.80,2.5,0.0
2,1,2021-04-01 00:57:56+00:00,2021-04-01 01:08:22+00:00,1.0,3.40,1.0,N,236,168,2,11.50,3.0,0.5,0.00,0.0,0.3,15.30,2.5,0.0
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:54:27+00:00,1.0,0.00,1.0,N,47,61,1,44.20,0.0,0.5,0.00,0.0,0.3,45.00,0.0,0.0
4,2,2021-04-01 00:24:55+00:00,2021-04-01 00:34:33+00:00,1.0,1.96,1.0,N,238,152,1,9.00,0.5,0.5,3.09,0.0,0.3,13.39,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2171182,2,2021-04-30 23:39:00+00:00,2021-04-30 23:56:00+00:00,,4.17,,,158,142,0,16.91,0.0,0.5,4.83,0.0,0.3,25.04,,
2171183,1,2021-04-30 23:20:32+00:00,2021-04-30 23:23:05+00:00,,0.90,,,141,229,0,4.50,0.0,0.5,1.56,0.0,0.3,9.36,,
2171184,2,2021-04-30 23:33:00+00:00,2021-04-30 23:55:00+00:00,,6.20,,,90,75,0,21.86,0.0,0.5,6.08,0.0,0.3,31.24,,
2171185,2,2021-04-30 23:31:38+00:00,2021-04-30 23:45:18+00:00,,3.71,,,75,116,0,16.63,0.0,0.5,3.20,0.0,0.3,20.63,,


Py4JJavaError: An error occurred while calling o139.saveAsTable.
: org.apache.iceberg.exceptions.NoSuchTableException: Invalid table identifier: iceberg_pyspark_example
	at org.apache.iceberg.rest.RESTSessionCatalog.checkIdentifierIsValid(RESTSessionCatalog.java:866)
	at org.apache.iceberg.rest.RESTSessionCatalog.access$100(RESTSessionCatalog.java:88)
	at org.apache.iceberg.rest.RESTSessionCatalog$Builder.<init>(RESTSessionCatalog.java:544)
	at org.apache.iceberg.rest.RESTSessionCatalog$Builder.<init>(RESTSessionCatalog.java:534)
	at org.apache.iceberg.rest.RESTSessionCatalog.buildTable(RESTSessionCatalog.java:392)
	at org.apache.iceberg.catalog.BaseSessionCatalog$AsCatalog.buildTable(BaseSessionCatalog.java:84)
	at org.apache.iceberg.rest.RESTCatalog.buildTable(RESTCatalog.java:103)
	at org.apache.iceberg.CachingCatalog$CachingTableBuilder.<init>(CachingCatalog.java:219)
	at org.apache.iceberg.CachingCatalog$CachingTableBuilder.<init>(CachingCatalog.java:214)
	at org.apache.iceberg.CachingCatalog.buildTable(CachingCatalog.java:211)
	at org.apache.iceberg.spark.SparkCatalog.newBuilder(SparkCatalog.java:788)
	at org.apache.iceberg.spark.SparkCatalog.stageCreate(SparkCatalog.java:241)
	at org.apache.spark.sql.execution.datasources.v2.AtomicCreateTableAsSelectExec.run(WriteToDataSourceV2Exec.scala:130)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:636)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:566)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
