## Installing Required Icetrait

In [None]:
!pip install git+https://github.com/vibhatha/pyiceberg_substrait@feat-schema-evolution#egg=icetrait

**Warning**

Make sure to restart the kernel after installation

In [12]:
from icetrait.substrait.visitor import SubstraitPlanEditor, visit_and_update, RelVisitor, RelUpdateVisitor
from icetrait.duckdb.wrapper import DuckdbSubstrait
import duckdb

## Initialize Spark Environment

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.master("local")
    .appName("IcebergPySpark")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.demo.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.demo.uri", "http://rest:8181")
    .config("spark.sql.catalog.demo.s3.endpoint", "http://minio:9000")
    .config("spark.sql.defaultCatalog", "demo")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/home/iceberg/spark-events")
    .config("spark.history.fs.logDirectory", "/home/iceberg/spark-events")
    .config("spark.sql.catalogImplementation", "/home/iceberg/spark-events")
    .getOrCreate()
)
spark

In [1]:
%%sql

CREATE DATABASE IF NOT EXISTS nyc_demo;

23/05/16 06:16:02 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
%%sql

show databases;

namespace
nyc_demo


In [3]:
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import GreaterThanOrEqual

iceberg_catalog = load_catalog('default')
iceberg_catalog

<pyiceberg.catalog.rest.RestCatalog at 0x7f9dc7e33400>

In [4]:
iceberg_catalog.list_namespaces()

[('nyc_demo',)]

In [5]:
df = spark.read.parquet("/home/iceberg/data/yellow_tripdata_2021-04.parquet")
df.write.saveAsTable("nyc_demo.taxis_sample")

AnalysisException: Table nyc_demo.taxis_sample already exists

## Simulating Iceberg and Icetrait on Simple Query

In [6]:
iceberg_table = iceberg_catalog.load_table("nyc_demo.taxis_sample")

In [7]:
iceberg_df = iceberg_table.scan().to_pandas()
head = iceberg_df.head()
head.dtypes

VendorID                                int64
tpep_pickup_datetime      datetime64[ns, UTC]
tpep_dropoff_datetime     datetime64[ns, UTC]
passenger_count                       float64
trip_distance                         float64
fare_per_distance_unit                float32
RatecodeID                            float64
store_and_fwd_flag                     object
PULocationID                            int64
DOLocationID                            int64
payment_type                            int64
fare                                  float64
extra                                 float64
mta_tax                               float64
tip_amount                            float64
tolls_amount                          float64
improvement_surcharge                 float64
total_amount                          float64
congestion_surcharge                  float64
airport_fee                           float64
dtype: object

In [13]:
sql_query = "SELECT * FROM nyc_demo.taxis_sample;"

In [57]:
## TODO: Think about the following

"""
How should we get the query from the user? 
Should we assume that query would have the column names in the format of evolved schema?
Or should we expect it to have column names as in the original schema?
"""

def setup_duckdb():
    con = duckdb.connect()
    create_schema = "CREATE SCHEMA nyc_demo;"
    creation_query = """
    CREATE TABLE nyc_demo.taxis_sample (
        VendorID              bigint,
        tpep_pickup_datetime  timestamp,
        tpep_dropoff_datetime timestamp,
        passenger_count       double,
        trip_distance         double,
        fare_per_distance_unit float,
        RatecodeID            double,
        store_and_fwd_flag    string,
        PULocationID          bigint,
        DOLocationID          bigint,
        payment_type          bigint,
        fare_amount           double,
        extra                 double,
        mta_tax               double,
        tip_amount            double,
        tolls_amount          double,
        improvement_surcharge double,
        total_amount          double,
        congestion_surcharge  double,
        airport_fee           double
    );
    """
    con.execute(create_schema)
    con.execute(creation_query)
    return con

In [41]:
con = setup_duckdb()
type(con)

duckdb.DuckDBPyConnection

In [30]:
q1 = "SELECT * FROM nyc_demo.taxis_sample;"

In [31]:
!mkdir /home/iceberg/notebooks/s3

mkdir: cannot create directory ‘/home/iceberg/notebooks/s3’: File exists


In [32]:
wrapper = DuckdbSubstrait("default", "/home/iceberg/notebooks/s3", "nyc_demo", q1, setup_duckdb)
wrapper.update_named_table_with_schema()
wrapper.update_with_local_file_paths()

Table before update
pyarrow.Table
VendorID: int64
tpep_pickup_datetime: timestamp[us, tz=UTC]
tpep_dropoff_datetime: timestamp[us, tz=UTC]
passenger_count: double
trip_distance: double
fare_per_distance_unit: float
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
----
VendorID: [[]]
tpep_pickup_datetime: [[]]
tpep_dropoff_datetime: [[]]
passenger_count: [[]]
trip_distance: [[]]
fare_per_distance_unit: [[]]
RatecodeID: [[]]
store_and_fwd_flag: [[]]
PULocationID: [[]]
DOLocationID: [[]]
...
Name is not none:  VendorID
Name is not none:  tpep_pickup_datetime
Name is not none:  tpep_dropoff_datetime
Name is not none:  passenger_count
Name is not none:  trip_distance
Name is not none:  fare_per_distance_unit
Name is not none:  RatecodeID
Name is not none:

In [33]:
wrapper.plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
  

In [34]:
wrapper.updated_plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
  

In [35]:
duckdb_res = wrapper.execute()

In [36]:
df_duckdb_substrait = duckdb_res.to_df()
df_duckdb_substrait.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_per_distance_unit,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:21:54+00:00,1.0,8.4,3.035714,1.0,N,79,116,1,25.5,3.0,0.5,5.85,0.0,0.3,35.15,2.5,0.0
1,1,2021-04-01 00:42:37+00:00,2021-04-01 00:46:23+00:00,1.0,0.9,5.555555,1.0,N,75,236,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5,0.0
2,1,2021-04-01 00:57:56+00:00,2021-04-01 01:08:22+00:00,1.0,3.4,3.382353,1.0,N,236,168,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5,0.0
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:54:27+00:00,1.0,0.0,,1.0,N,47,61,1,44.2,0.0,0.5,0.0,0.0,0.3,45.0,0.0,0.0
4,2,2021-04-01 00:24:55+00:00,2021-04-01 00:34:33+00:00,1.0,1.96,4.591837,1.0,N,238,152,1,9.0,0.5,0.5,3.09,0.0,0.3,13.39,0.0,0.0


In [37]:
# TODO: create separate table per evaluating action so that we can do the validations properly
assert iceberg_df.equals(df_duckdb_substrait)

## Simulating Iceberg and Icetrait on Rename

In [None]:
%%sql

ALTER TABLE nyc_demo.taxis_sample RENAME COLUMN fare_amount TO fare

## Simulating Iceberg and Icetrait on Add

In [None]:
%%sql

ALTER TABLE nyc_demo.taxis_sample
ADD COLUMN fare_per_distance_unit float AFTER trip_distance

In [None]:
%%sql

UPDATE nyc_demo.taxis_sample
SET fare_per_distance_unit = fare/trip_distance

In [None]:
iceberg_table = iceberg_catalog.load_table("nyc_demo.taxis_sample")
add_pending_table = iceberg_table.scan().to_pandas()
add_pending_table.dtypes

## Simulating Iceberg and Icetrait on Drop

## Simulating Iceberg and Icetrait on Update

## Simulating Iceberg and Icetrait on Select Query

In [49]:
%%sql
SELECT fare FROM nyc_demo.taxis_sample LIMIT 5

fare
25.5
5.0
11.5
44.2
9.0


In [42]:
con.install_extension("substrait")
con.load_extension("substrait")

select_query = "SELECT fare_amount as fare FROM nyc_demo.taxis_sample;"
proto_bytes = con.get_substrait(select_query).fetchone()[0]
editor = SubstraitPlanEditor(proto_bytes)
editor.plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare_amount"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
               

In [43]:
wrapper = DuckdbSubstrait("default", "/home/iceberg/notebooks/s3", "nyc_demo", select_query, setup_duckdb)
wrapper.update_named_table_with_schema()
wrapper.update_with_local_file_paths()

Table before update
pyarrow.Table
VendorID: int64
tpep_pickup_datetime: timestamp[us, tz=UTC]
tpep_dropoff_datetime: timestamp[us, tz=UTC]
passenger_count: double
trip_distance: double
fare_per_distance_unit: float
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
----
VendorID: [[]]
tpep_pickup_datetime: [[]]
tpep_dropoff_datetime: [[]]
passenger_count: [[]]
trip_distance: [[]]
fare_per_distance_unit: [[]]
RatecodeID: [[]]
store_and_fwd_flag: [[]]
PULocationID: [[]]
DOLocationID: [[]]
...
Name is not none:  VendorID
Name is not none:  tpep_pickup_datetime
Name is not none:  tpep_dropoff_datetime
Name is not none:  passenger_count
Name is not none:  trip_distance
Name is not none:  fare_per_distance_unit
Name is not none:  RatecodeID
Name is not none:

In [44]:
wrapper.plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare_amount"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
               

In [45]:
wrapper.updated_plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
  

In [46]:
res = wrapper.execute()

In [52]:
duckdb_substrait_df = res.to_df().head()
duckdb_substrait_df

Unnamed: 0,fare
0,25.5
1,5.0
2,11.5
3,44.2
4,9.0


In [48]:
wrapper.updated_plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
  

In [54]:
iceberg_df = iceberg_table.scan(selected_fields=["fare"]).to_pandas().head()
iceberg_df

Unnamed: 0,fare
0,25.5
1,5.0
2,11.5
3,44.2
4,9.0


In [55]:
duckdb_substrait_df

Unnamed: 0,fare
0,25.5
1,5.0
2,11.5
3,44.2
4,9.0


In [56]:
assert iceberg_df.equals(duckdb_substrait_df)