In [1]:
from icetrait.substrait.visitor import SubstraitPlanEditor, visit_and_update, RelVisitor, RelUpdateVisitor
from icetrait.duckdb.wrapper import DuckdbSubstrait
import duckdb

## Initialize Spark Environment

In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a local Spark session whose default
# catalog `demo` is an Iceberg SparkCatalog.
spark = (
    SparkSession.builder.master("local")
    .appName("IcebergPySpark")
    # Enable Iceberg's Spark SQL extensions.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Register `demo` as an Iceberg catalog implemented by the REST catalog at http://rest:8181.
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.demo.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.demo.uri", "http://rest:8181")
    # S3 endpoint handed to the `demo` catalog (a MinIO host in this setup).
    .config("spark.sql.catalog.demo.s3.endpoint", "http://minio:9000")
    .config("spark.sql.defaultCatalog", "demo")
    # Event logging / history-server directories.
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/home/iceberg/spark-events")
    .config("spark.history.fs.logDirectory", "/home/iceberg/spark-events")
    # FIX: this key previously carried the event-log path (copy-paste slip).
    # Spark only accepts "hive" or "in-memory" here; "in-memory" is the default.
    .config("spark.sql.catalogImplementation", "in-memory")
    .getOrCreate()
)

23/05/11 13:35:09 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
%%sql

CREATE DATABASE IF NOT EXISTS nyc_demo;

23/05/11 13:35:09 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
%%sql

show databases;

23/05/11 13:35:13 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


namespace
nyc_demo


In [6]:
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import GreaterThanOrEqual

# Load the PyIceberg catalog registered under the name 'default'.
# NOTE(review): no connection properties are passed here, so they presumably
# come from PyIceberg's config file / environment — confirm.
iceberg_catalog = load_catalog('default')
# Bare expression: display the catalog object as the cell output.
iceberg_catalog

<pyiceberg.catalog.rest.RestCatalog at 0x7f3caa4c1280>

In [7]:
# Sanity check: list the namespaces visible through the PyIceberg catalog.
iceberg_catalog.list_namespaces()

[('nyc_demo',)]

In [8]:
# Load the April 2021 yellow-taxi trip file and persist it as the table
# nyc_demo.taxis_sample.
df = spark.read.parquet("/home/iceberg/data/yellow_tripdata_2021-04.parquet")
# mode("overwrite") keeps this cell re-runnable: the default save mode raises
# when the table already exists (e.g. on Restart & Run All).
df.write.mode("overwrite").saveAsTable("nyc_demo.taxis_sample")

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

## Simulating Iceberg and Icetrait on a Simple Query

In [9]:
# Handle to nyc_demo.taxis_sample obtained through the PyIceberg catalog.
iceberg_table = iceberg_catalog.load_table("nyc_demo.taxis_sample")

In [21]:
# Reference result: scan the table with no arguments and materialize it as a
# pandas DataFrame.
iceberg_df = iceberg_table.scan().to_pandas()
# Show only the first rows rather than the whole frame.
iceberg_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:21:54+00:00,1.0,8.4,1.0,N,79,116,1,25.5,3.0,0.5,5.85,0.0,0.3,35.15,2.5,0.0
1,1,2021-04-01 00:42:37+00:00,2021-04-01 00:46:23+00:00,1.0,0.9,1.0,N,75,236,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5,0.0
2,1,2021-04-01 00:57:56+00:00,2021-04-01 01:08:22+00:00,1.0,3.4,1.0,N,236,168,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5,0.0
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:54:27+00:00,1.0,0.0,1.0,N,47,61,1,44.2,0.0,0.5,0.0,0.0,0.3,45.0,0.0,0.0
4,2,2021-04-01 00:24:55+00:00,2021-04-01 00:34:33+00:00,1.0,1.96,1.0,N,238,152,1,9.0,0.5,0.5,3.09,0.0,0.3,13.39,0.0,0.0


In [11]:
# The simple full-table query this section is about.
sql_query = "SELECT * FROM nyc_demo.taxis_sample;"

In [12]:
def setup_duckdb():
    """Open a DuckDB connection with the ``substrait`` extension ready to use.

    Returns:
        The connection returned by ``duckdb.connect()``, after the
        ``substrait`` extension has been installed and then loaded on it.
    """
    connection = duckdb.connect()
    # Same order as before: install first, then load.
    for prepare in (connection.install_extension, connection.load_extension):
        prepare("substrait")
    return connection

In [13]:
# Connection from setup_duckdb() (substrait extension installed and loaded).
con = setup_duckdb()
# DuckDB-side schema and table using the same qualified name,
# nyc_demo.taxis_sample, that the SQL query refers to.
create_schema = "CREATE SCHEMA nyc_demo;"
# Column list matches the columns shown in the Iceberg scan output above.
# The table is only created here; no rows are inserted in this cell.
# NOTE(review): presumably it exists just so DuckDB can plan the query — confirm.
creation_query = """
CREATE TABLE nyc_demo.taxis_sample (
    VendorID              bigint,
    tpep_pickup_datetime  timestamp,
    tpep_dropoff_datetime timestamp,
    passenger_count       double,
    trip_distance         double,
    RatecodeID            double,
    store_and_fwd_flag    string,
    PULocationID          bigint,
    DOLocationID          bigint,
    payment_type          bigint,
    fare_amount           double,
    extra                 double,
    mta_tax               double,
    tip_amount            double,
    tolls_amount          double,
    improvement_surcharge double,
    total_amount          double,
    congestion_surcharge  double,
    airport_fee           double
);
"""
# The schema must exist before the table that lives in it.
con.execute(create_schema)
con.execute(creation_query)

<duckdb.DuckDBPyConnection at 0x7f3ca96c8df0>

In [14]:
# Ask DuckDB for the Substrait plan of the query and keep the first column of
# the single fetched row. Reuses `sql_query` (defined in an earlier cell)
# instead of repeating the SQL literal, so the two cannot drift apart.
proto_bytes = con.get_substrait(sql_query).fetchone()[0]

In [15]:
!mkdir /home/iceberg/notebooks/s3

In [16]:
class NamedTableUpdateVisitor(RelVisitor):
    """Visitor that installs a fixed table name on read relations.

    Only ``visit_read`` does any work; the hooks for every other relation
    kind are intentional no-ops.
    """

    def __init__(self, table_name):
        """Store the table name that ``visit_read`` will write."""
        self._table_name = table_name

    def visit_aggregate(self, rel):
        """No-op for aggregate relations."""

    def visit_cross(self, rel):
        """No-op for cross relations."""

    def visit_fetch(self, rel):
        """No-op for fetch relations."""

    def visit_filter(self, rel):
        """No-op for filter relations."""

    def visit_join(self, rel):
        """No-op for join relations."""

    def visit_hashjoin(self, rel):
        """No-op for hash-join relations."""

    def visit_merge(self, rel):
        """No-op for merge relations."""

    def visit_project(self, rel):
        """No-op for project relations."""

    def visit_read(self, read_rel):
        """Overwrite ``read_rel.named_table`` with one holding the stored name.

        A new ``NamedTable`` is built from ``read_rel``, the stored table
        name is appended to its ``names``, and the result is copied into
        ``read_rel.named_table``.
        """
        replacement = read_rel.NamedTable()
        replacement.names.append(self._table_name)
        read_rel.named_table.CopyFrom(replacement)

    def visit_set(self, rel):
        """No-op for set relations."""

    def visit_sort(self, rel):
        """No-op for sort relations."""
        
# Wrap the DuckDB-produced plan bytes in an editor.
# NOTE(review): presumably this parses them into a Substrait plan message — confirm.
editor = SubstraitPlanEditor(proto_bytes)
# Visitor that writes "nyc_demo.taxis_sample" into read relations' named_table.
update_visitor = NamedTableUpdateVisitor("nyc_demo.taxis_sample")
# Apply the visitor starting from the editor's relation.
visit_and_update(editor.rel, update_visitor)
# Re-serialize the plan; note this overwrites the original `proto_bytes`.
proto_bytes = editor.plan.SerializeToString()

In [17]:
# Run the rewritten plan through the icetrait DuckDB wrapper.
# NOTE(review): the second argument looks like the catalog name (same 'default'
# used with load_catalog) and the third is the directory created with mkdir
# earlier — confirm both against DuckdbSubstrait's signature.
duckdb_substrait = DuckdbSubstrait(proto_bytes, "default", "/home/iceberg/notebooks/s3")
results = duckdb_substrait.execute()

In [22]:
# Convert the execution result to an Arrow table and then to pandas, so it can
# be compared with the frame produced by the PyIceberg scan.
df_duckdb_substrait = results.to_arrow_table().to_pandas()
# Show only the first rows rather than the whole frame.
df_duckdb_substrait.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:21:54+00:00,1.0,8.4,1.0,N,79,116,1,25.5,3.0,0.5,5.85,0.0,0.3,35.15,2.5,0.0
1,1,2021-04-01 00:42:37+00:00,2021-04-01 00:46:23+00:00,1.0,0.9,1.0,N,75,236,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5,0.0
2,1,2021-04-01 00:57:56+00:00,2021-04-01 01:08:22+00:00,1.0,3.4,1.0,N,236,168,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5,0.0
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:54:27+00:00,1.0,0.0,1.0,N,47,61,1,44.2,0.0,0.5,0.0,0.0,0.3,45.0,0.0,0.0
4,2,2021-04-01 00:24:55+00:00,2021-04-01 00:34:33+00:00,1.0,1.96,1.0,N,238,152,1,9.0,0.5,0.5,3.09,0.0,0.3,13.39,0.0,0.0


In [24]:
# The Substrait/DuckDB path must return exactly the same frame as the direct
# PyIceberg scan; the message makes a failure self-explanatory.
assert iceberg_df.equals(df_duckdb_substrait), (
    "DuckDB/Substrait result differs from the PyIceberg scan of nyc_demo.taxis_sample"
)

## Simulating Iceberg and Icetrait on Rename

## Simulating Iceberg and Icetrait on Add

## Simulating Iceberg and Icetrait on Drop

## Simulating Iceberg and Icetrait on Update