In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.master("local")
    .appName("IcebergPySpark")
    .config("spark.jars", "/home/asus/Downloads/iceberg-spark-runtime-3.2_2.12-1.2.0.jar")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.demo.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.demo.uri", "http://rest:8181")
    .config("spark.sql.catalog.demo.warehouse", "s3a://warehouse/wh/")
    .config("spark.sql.catalog.demo.s3.endpoint", "http://minio:9000")
    .config("spark.sql.defaultCatalog", "demo")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/home/iceberg/spark-events")
    .config("spark.history.fs.logDirectory", "/home/iceberg/spark-events")
    .config("spark.sql.catalogImplementation", "/home/iceberg/spark-events")
    .getOrCreate()
)

23/04/26 06:31:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
def cleanup(namespace="substrait", table_id="default.substrait.bids"):
    iceberg_catalog.drop_namespace(namespace)
    iceberg_catalog.list_namespaces()
    iceberg_catalog.drop_table(table_id)

In [3]:
spark.catalog.currentDatabase()

'default'

In [4]:
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/home/iceberg/notebooks/spark-warehouse')]

In [5]:
spark.catalog.listTables()

[]

In [6]:
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import GreaterThanOrEqual

iceberg_catalog = load_catalog('default')
iceberg_catalog

<pyiceberg.catalog.rest.RestCatalog at 0x7f822a05f670>

In [7]:
iceberg_catalog.list_namespaces()

[('default',), ('substrait',)]

In [8]:
#iceberg_catalog.create_namespace("substrait")

In [9]:
iceberg_catalog.list_namespaces()

[('default',), ('substrait',)]

In [10]:
#nyc_taxi_table = iceberg_catalog.load_table(("nyc", "taxis"))

In [11]:
#nyc_taxi_table.location()

In [12]:
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import TimestampType, DoubleType, StringType, NestedField

schema = Schema(
    NestedField(
        field_id=1, name="id", field_type=TimestampType(), required=False
    ),
    NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False),
    NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False),
    NestedField(field_id=4, name="symbol", field_type=StringType(), required=False),
)

from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.transforms import DayTransform

partition_spec = PartitionSpec(
    PartitionField(
        source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
    )
)

from pyiceberg.table.sorting import SortOrder, SortField
from pyiceberg.transforms import IdentityTransform

sort_order = SortOrder(SortField(source_id=4, transform=IdentityTransform()))

iceberg_catalog = load_catalog("default")

iceberg_catalog.create_table(
    identifier="default.substrait.bids",
    location="s3://warehouse/sample/mytable",
    schema=schema,
    partition_spec=partition_spec,
    sort_order=sort_order,
)

TableAlreadyExistsError: AlreadyExistsException: Table already exists: default.substrait.bids

In [13]:
iceberg_catalog.list_namespaces()

[('default',), ('substrait',)]

In [14]:
import datetime

In [15]:
from pyiceberg.schema import Schema
from pyiceberg.types import TimestampType, DoubleType, StringType, NestedField

iceberg_table_name = "default.substrait.bids"
#spark.catalog.createTable(iceberg_table_name, schema=schema)

write_schema = ["id", "bid", "ask", "symbol"]


df = spark.createDataFrame([(datetime.datetime(2020, 5, 17), 10.2, 10.3, "gold")], write_schema)
df.write.option("format", "iceberg").mode("overwrite").saveAsTable(iceberg_table_name)

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

In [17]:
tbl = iceberg_catalog.load_table('default.substrait.bids')
sc = tbl.scan()
df = sc.to_arrow().to_pandas()
df

Unnamed: 0,id,bid,ask,symbol
0,2020-05-17 00:00:00+00:00,10.2,10.3,gold


In [18]:
iceberg_catalog.list_tables("default.substrait")

[('default', 'substrait', 'bids')]

In [19]:
tbl = iceberg_catalog.load_table("default.substrait.bids")
tbl.location()

's3://warehouse/sample/mytable'

In [20]:
type(iceberg_catalog)

pyiceberg.catalog.rest.RestCatalog

In [21]:
iceberg_catalog.list_namespaces()

[('default',), ('substrait',)]

In [22]:
iceberg_catalog.list_tables('default')

[]

In [23]:
file_paths = []
for file in sc.plan_files():
    print(file.file.file_path)
    file_paths.append(file.file.file_path)

s3://warehouse/sample/mytable/data/00023-24-191d1ccf-68b0-47fd-9d2b-a75cbfb32728-00001.parquet


In [24]:
tbl.schema()

Schema(NestedField(field_id=1, name='id', field_type=TimestamptzType(), required=False), NestedField(field_id=2, name='bid', field_type=DoubleType(), required=False), NestedField(field_id=3, name='ask', field_type=DoubleType(), required=False), NestedField(field_id=4, name='symbol', field_type=StringType(), required=False), schema_id=1, identifier_field_ids=[])

In [25]:
from pyiceberg.io.pyarrow import PyArrowFileIO

In [26]:
scheme, path = PyArrowFileIO.parse_location(tbl.location())
schema, path

(Schema(NestedField(field_id=1, name='id', field_type=TimestampType(), required=False), NestedField(field_id=2, name='bid', field_type=DoubleType(), required=False), NestedField(field_id=3, name='ask', field_type=DoubleType(), required=False), NestedField(field_id=4, name='symbol', field_type=StringType(), required=False), schema_id=0, identifier_field_ids=[]),
 'warehouse/sample/mytable')

In [27]:
fs = tbl.io.get_fs(scheme)

In [28]:
iceberg_catalog.properties

{'uri': 'http://rest:8181',
 's3.endpoint': 'http://minio:9000',
 's3.access-key-id': 'admin',
 's3.secret-access-key': 'password'}

In [29]:
import ibis
from ibis_substrait.compiler.core import SubstraitCompiler
from icetrait.iceberg.process import ProcessSubstrait

In [30]:
ibis_tb = ibis.table([("id", "date"), ("bid", "float"), ("ask", "int32"), ("symbol", "string")], "t",)
query = ibis_tb.select(["id", "bid", "ask", "symbol"])
compiler = SubstraitCompiler()
protobuf_msg = compiler.compile(query).SerializeToString()

psb = ProcessSubstrait(protobuf_msg)

In [31]:
file_formats = ["parquet"]
file_formats, file_paths

(['parquet'],
 ['s3://warehouse/sample/mytable/data/00023-24-191d1ccf-68b0-47fd-9d2b-a75cbfb32728-00001.parquet'])

In [32]:
psb.update_local_files(file_paths, file_formats)

In [33]:
psb.plan

relations {
  root {
    input {
      project {
        common {
          emit {
            output_mapping: 4
            output_mapping: 5
            output_mapping: 6
            output_mapping: 7
          }
        }
        input {
          read {
            common {
              direct {
              }
            }
            base_schema {
              names: "id"
              names: "bid"
              names: "ask"
              names: "symbol"
              struct {
                types {
                  date {
                    nullability: NULLABILITY_NULLABLE
                  }
                }
                types {
                  fp64 {
                    nullability: NULLABILITY_NULLABLE
                  }
                }
                types {
                  i32 {
                    nullability: NULLABILITY_NULLABLE
                  }
                }
                types {
                  string {
                    nullability: NUL

In [34]:
psb.plan

relations {
  root {
    input {
      project {
        common {
          emit {
            output_mapping: 4
            output_mapping: 5
            output_mapping: 6
            output_mapping: 7
          }
        }
        input {
          read {
            common {
              direct {
              }
            }
            base_schema {
              names: "id"
              names: "bid"
              names: "ask"
              names: "symbol"
              struct {
                types {
                  date {
                    nullability: NULLABILITY_NULLABLE
                  }
                }
                types {
                  fp64 {
                    nullability: NULLABILITY_NULLABLE
                  }
                }
                types {
                  i32 {
                    nullability: NULLABILITY_NULLABLE
                  }
                }
                types {
                  string {
                    nullability: NUL

In [None]:
!pip install duckdb==0.7.1

In [35]:
import duckdb

In [36]:
duckdb.__version__

'0.7.1'

In [37]:
con = duckdb.connect()
con.install_extension("substrait")
con.load_extension("substrait")

In [None]:
!pip install httpfs

In [38]:
con.install_extension("httpfs")

In [39]:
con.load_extension("httpfs")

In [40]:
iceberg_catalog.properties

{'uri': 'http://rest:8181',
 's3.endpoint': 'http://minio:9000',
 's3.access-key-id': 'admin',
 's3.secret-access-key': 'password'}

In [41]:
con.execute(query="SET s3_endpoint='minio:9000';")

<duckdb.DuckDBPyConnection at 0x7f8212f90b70>

In [42]:
con.execute(query="SET s3_region='us-east-1';")

<duckdb.DuckDBPyConnection at 0x7f8212f90b70>

In [43]:
con.execute(query="SET s3_access_key_id='admin';")

<duckdb.DuckDBPyConnection at 0x7f8212f90b70>

In [44]:
con.execute(query="SET s3_secret_access_key='password';")

<duckdb.DuckDBPyConnection at 0x7f8212f90b70>

In [45]:
con.execute(query="SET s3_use_ssl=false;")

<duckdb.DuckDBPyConnection at 0x7f8212f90b70>

In [58]:
con.execute(query="SET s3_url_style='path';")

<duckdb.DuckDBPyConnection at 0x7f8212f90b70>

In [59]:
query_result = con.from_substrait(proto=psb.plan.SerializeToString())

IOException: IO Error: Unable to connect to URL "https://warehouse.s3.amazonaws.com/sample/mytable/data/00023-24-191d1ccf-68b0-47fd-9d2b-a75cbfb32728-00001.parquet": 400 (Bad Request)

In [60]:
from google.protobuf.json_format import MessageToJson

json_obj = MessageToJson(psb.plan)
print(json_obj)

{
  "relations": [
    {
      "root": {
        "input": {
          "project": {
            "common": {
              "emit": {
                "outputMapping": [
                  4,
                  5,
                  6,
                  7
                ]
              }
            },
            "input": {
              "read": {
                "common": {
                  "direct": {}
                },
                "baseSchema": {
                  "names": [
                    "id",
                    "bid",
                    "ask",
                    "symbol"
                  ],
                  "struct": {
                    "types": [
                      {
                        "date": {
                          "nullability": "NULLABILITY_NULLABLE"
                        }
                      },
                      {
                        "fp64": {
                          "nullability": "NULLABILITY_NULLABLE"
                        }
    

In [61]:
duckdb.__version__

'0.7.1'

In [62]:
sql_query=f"SELECT * FROM '{file_paths[0]}';"
sql_query

"SELECT * FROM 's3://warehouse/sample/mytable/data/00023-24-191d1ccf-68b0-47fd-9d2b-a75cbfb32728-00001.parquet';"

In [63]:
sql_query_2 = "SELECT * FROM "

In [64]:
val = con.execute(query=sql_query)

In [65]:
val.df()

Unnamed: 0,id,bid,ask,symbol
0,2020-05-17 00:00:00+00:00,10.2,10.3,gold
