## Installing the Required Icetrait Package

In [52]:
!pip install git+https://github.com/vibhatha/pyiceberg_substrait@feat-schema-evolution-s1#egg=icetrait

Collecting icetrait
  Cloning https://github.com/vibhatha/pyiceberg_substrait (to revision feat-schema-evolution-s1) to /tmp/pip-install-n362u8hi/icetrait_f6596f467ce34e0da96f7b5041863816
  Running command git clone --filter=blob:none --quiet https://github.com/vibhatha/pyiceberg_substrait /tmp/pip-install-n362u8hi/icetrait_f6596f467ce34e0da96f7b5041863816
  Running command git checkout -b feat-schema-evolution-s1 --track origin/feat-schema-evolution-s1
  Switched to a new branch 'feat-schema-evolution-s1'
  Branch 'feat-schema-evolution-s1' set up to track remote branch 'feat-schema-evolution-s1' from 'origin'.
  Resolved https://github.com/vibhatha/pyiceberg_substrait to commit 8a86fbd1af85905af818566e5963954c818b44c4
  Running command git submodule update --init --recursive -q
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... 

**Warning**

Make sure to restart the kernel after the installation completes.

In [1]:
from icetrait.substrait.visitor import SubstraitPlanEditor, visit_and_update, RelVisitor, RelUpdateVisitor
from icetrait.duckdb.wrapper import DuckdbSubstrait
import duckdb

## Initialize Spark Environment

In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session whose default catalog `demo` is an Iceberg REST catalog.
# `spark.sql.catalogImplementation` is intentionally not set here: that key only accepts
# `hive` or `in-memory`, so a filesystem path is not a valid value for it.
spark = (
    SparkSession.builder.master("local")
    .appName("IcebergPySpark")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.demo.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.demo.uri", "http://rest:8181")
    .config("spark.sql.catalog.demo.s3.endpoint", "http://minio:9000")
    .config("spark.sql.defaultCatalog", "demo")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/home/iceberg/spark-events")
    .config("spark.history.fs.logDirectory", "/home/iceberg/spark-events")
    .getOrCreate()
)
spark

23/05/24 19:22:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
%%sql

DROP TABLE IF EXISTS nyc_demo.taxis_sample;

23/05/24 19:22:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
%%sql

CREATE DATABASE IF NOT EXISTS nyc_demo;

23/05/24 19:22:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
%%sql

show databases;

23/05/24 19:22:43 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


namespace
nyc_demo


In [6]:
from pyiceberg.catalog import load_catalog

iceberg_catalog = load_catalog('default')
iceberg_catalog

<pyiceberg.catalog.rest.RestCatalog at 0x7fb78d091fd0>

In [7]:
iceberg_catalog.list_namespaces()

[('nyc_demo',)]

In [8]:
# Read the yellow-taxi trip data for 2021-04 from the local parquet file with Spark and
# persist it as the table `nyc_demo.taxis_sample`.
df = spark.read.parquet("/home/iceberg/data/yellow_tripdata_2021-04.parquet")
df.write.saveAsTable("nyc_demo.taxis_sample")

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

## Simulating Iceberg and Icetrait on Simple Query

In [9]:
iceberg_table = iceberg_catalog.load_table("nyc_demo.taxis_sample")

In [10]:
# Scan the whole Iceberg table through PyIceberg into a pandas DataFrame; this frame is the
# reference that the DuckDB/Substrait result is compared against later.
iceberg_df = iceberg_table.scan().to_pandas()
head = iceberg_df.head()
# Show the pandas dtypes of the columns.
head.dtypes

VendorID                               int64
tpep_pickup_datetime     datetime64[ns, UTC]
tpep_dropoff_datetime    datetime64[ns, UTC]
passenger_count                      float64
trip_distance                        float64
RatecodeID                           float64
store_and_fwd_flag                    object
PULocationID                           int64
DOLocationID                           int64
payment_type                           int64
fare_amount                          float64
extra                                float64
mta_tax                              float64
tip_amount                           float64
tolls_amount                         float64
improvement_surcharge                float64
total_amount                         float64
congestion_surcharge                 float64
airport_fee                          float64
dtype: object

In [11]:
"""
We expect the user to pass the evolved schema.
"""
## TODO: we can probably keep a single DuckDB connection and apply the schema-update query that is
## sent to Spark to that connection as well, updating the table in place.
def setup_duckdb():
    """Create a DuckDB connection that contains an empty `nyc_demo.taxis_sample` table.

    A new connection is opened with `duckdb.connect()` (no database path is passed), the
    `nyc_demo` schema is created, and the taxi table is declared with the original column
    layout (including `fare_amount` and `improvement_surcharge`).

    Returns:
        The connection on which both statements were executed.
    """
    table_ddl = """
    CREATE TABLE nyc_demo.taxis_sample (
        VendorID              bigint,
        tpep_pickup_datetime  timestamp,
        tpep_dropoff_datetime timestamp,
        passenger_count       double,
        trip_distance         double,
        RatecodeID            double,
        store_and_fwd_flag    string,
        PULocationID          bigint,
        DOLocationID          bigint,
        payment_type          bigint,
        fare_amount           double,
        extra                 double,
        mta_tax               double,
        tip_amount            double,
        tolls_amount          double,
        improvement_surcharge double,
        total_amount          double,
        congestion_surcharge  double,
        airport_fee           double
    );
    """
    connection = duckdb.connect()
    # The schema has to exist before the table that lives in it can be created.
    for statement in ("CREATE SCHEMA nyc_demo;", table_ddl):
        connection.execute(statement)
    return connection

In [12]:
q1 = "SELECT * FROM nyc_demo.taxis_sample;"

In [13]:
!mkdir /home/iceberg/notebooks/s3

mkdir: cannot create directory ‘/home/iceberg/notebooks/s3’: File exists


In [14]:
# Build the Icetrait wrapper for query `q1`; `setup_duckdb` is passed as a callable and is not
# invoked in this cell.
# NOTE(review): the meaning of the positional arguments is defined by `DuckdbSubstrait`, which is
# not visible here — presumably catalog name, local data directory and namespace; confirm.
wrapper = DuckdbSubstrait("default", "/home/iceberg/notebooks/s3", "nyc_demo", q1, setup_duckdb)
# The two update calls print the schema/plan diagnostics captured in the output below.
wrapper.update_named_table_with_schema()
wrapper.update_with_local_file_paths()

Table before update
pyarrow.Table
VendorID: int64
tpep_pickup_datetime: timestamp[us, tz=UTC]
tpep_dropoff_datetime: timestamp[us, tz=UTC]
passenger_count: double
trip_distance: double
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
----
VendorID: [[]]
tpep_pickup_datetime: [[]]
tpep_dropoff_datetime: [[]]
passenger_count: [[]]
trip_distance: [[]]
RatecodeID: [[]]
store_and_fwd_flag: [[]]
PULocationID: [[]]
DOLocationID: [[]]
payment_type: [[]]
...
********************************************************************************
Projected Schema
table {
  1: VendorID: optional long
  2: tpep_pickup_datetime: optional timestamptz
  3: tpep_dropoff_datetime: optional timestamptz
  4: passenger_count: optional double
  5: trip_distance: optional 

In [15]:
duckdb_res = wrapper.execute()

In [16]:
df_duckdb_substrait = duckdb_res.to_df()
df_duckdb_substrait.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:21:54+00:00,1.0,8.4,1.0,N,79,116,1,25.5,3.0,0.5,5.85,0.0,0.3,35.15,2.5,0.0
1,1,2021-04-01 00:42:37+00:00,2021-04-01 00:46:23+00:00,1.0,0.9,1.0,N,75,236,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5,0.0
2,1,2021-04-01 00:57:56+00:00,2021-04-01 01:08:22+00:00,1.0,3.4,1.0,N,236,168,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5,0.0
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:54:27+00:00,1.0,0.0,1.0,N,47,61,1,44.2,0.0,0.5,0.0,0.0,0.3,45.0,0.0,0.0
4,2,2021-04-01 00:24:55+00:00,2021-04-01 00:34:33+00:00,1.0,1.96,1.0,N,238,152,1,9.0,0.5,0.5,3.09,0.0,0.3,13.39,0.0,0.0


In [17]:
# TODO: create separate table per evaluating action so that we can do the validations properly
assert iceberg_df.equals(df_duckdb_substrait)

## Simulating Iceberg and Icetrait on Rename

In [18]:
%%sql

ALTER TABLE nyc_demo.taxis_sample RENAME COLUMN fare_amount TO fare

23/05/24 19:23:01 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/05/24 19:23:01 WARN BaseTransaction: Failed to load metadata for a committed snapshot, skipping clean-up


### Since the schema has changed, we need to update the DuckDB Substrait interface to reflect that change

In [19]:
# instead of `fare_amount` we use `fare` since it is the renamed field. 
def setup_duckdb_for_rename():
    """Create a DuckDB connection with `nyc_demo.taxis_sample` declared after the column rename.

    The table layout uses the column name `fare` where the pre-rename layout had
    `fare_amount`. A new connection is opened with `duckdb.connect()`, the `nyc_demo` schema
    is created and the empty table is declared.

    Returns:
        The connection on which both statements were executed.
    """
    table_ddl = """
    CREATE TABLE nyc_demo.taxis_sample (
        VendorID              bigint,
        tpep_pickup_datetime  timestamp,
        tpep_dropoff_datetime timestamp,
        passenger_count       double,
        trip_distance         double,
        RatecodeID            double,
        store_and_fwd_flag    string,
        PULocationID          bigint,
        DOLocationID          bigint,
        payment_type          bigint,
        fare                  double,
        extra                 double,
        mta_tax               double,
        tip_amount            double,
        tolls_amount          double,
        improvement_surcharge double,
        total_amount          double,
        congestion_surcharge  double,
        airport_fee           double
    );
    """
    connection = duckdb.connect()
    # The schema has to exist before the table that lives in it can be created.
    for statement in ("CREATE SCHEMA nyc_demo;", table_ddl):
        connection.execute(statement)
    return connection

In [20]:
query_rename = "SELECT fare FROM nyc_demo.taxis_sample;"

In [21]:
# Rebuild the wrapper for the rename scenario: the query selects the renamed column `fare`, and
# the DuckDB setup callable declares the table with that new column name.
# NOTE(review): argument semantics are defined by `DuckdbSubstrait` (not visible here) — confirm.
wrapper = DuckdbSubstrait("default", "/home/iceberg/notebooks/s3", "nyc_demo", query_rename, setup_duckdb_for_rename)
wrapper.update_named_table_with_schema()
wrapper.update_with_local_file_paths()

Table before update
pyarrow.Table
VendorID: int64
tpep_pickup_datetime: timestamp[us, tz=UTC]
tpep_dropoff_datetime: timestamp[us, tz=UTC]
passenger_count: double
trip_distance: double
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
----
VendorID: [[]]
tpep_pickup_datetime: [[]]
tpep_dropoff_datetime: [[]]
passenger_count: [[]]
trip_distance: [[]]
RatecodeID: [[]]
store_and_fwd_flag: [[]]
PULocationID: [[]]
DOLocationID: [[]]
payment_type: [[]]
...
fare not in ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount',

In [22]:
wrapper.plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
                }
                types {
      

In [23]:
wrapper.updated_plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare_amount"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
                }
                types {

In [24]:
duckdb_rename_res = wrapper.execute()

In [25]:
df_duckdb_substrait_rename = duckdb_rename_res.to_df()
df_duckdb_substrait_rename.head()

Unnamed: 0,fare
0,25.5
1,5.0
2,11.5
3,44.2
4,9.0


In [28]:
# Load the catalog and table again before scanning — presumably so the scan reflects the schema
# after the rename; confirm whether the reload is required.
iceberg_catalog = load_catalog('default')
iceberg_table = iceberg_catalog.load_table("nyc_demo.taxis_sample")
# Read only the renamed column `fare` through PyIceberg; this frame is compared with the
# DuckDB/Substrait result in the assertion that follows.
iceberg_df_rename = iceberg_table.scan(selected_fields=["fare"]).to_pandas()
head_rename = iceberg_df_rename.head()
head_rename

Unnamed: 0,fare
0,25.5
1,5.0
2,11.5
3,44.2
4,9.0


In [29]:
assert iceberg_df_rename.equals(df_duckdb_substrait_rename)

## Simulating Iceberg and Icetrait on Add

In [30]:
%%sql

ALTER TABLE nyc_demo.taxis_sample
ADD COLUMN fare_per_distance_unit float AFTER trip_distance

23/05/24 19:23:11 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/05/24 19:23:12 WARN BaseTransaction: Failed to load metadata for a committed snapshot, skipping clean-up


In [31]:
%%sql

UPDATE nyc_demo.taxis_sample
SET fare_per_distance_unit = fare/trip_distance

23/05/24 19:23:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

### Since the schema has changed, we need to update the DuckDB Substrait interface to reflect that change

In [32]:
# adding field `fare_per_distance_unit`
def setup_duckdb_for_update():
    """Create a DuckDB connection with `nyc_demo.taxis_sample` declared after the column addition.

    The table layout contains the added `fare_per_distance_unit float` column directly after
    `trip_distance`, and uses the column name `fare`. A new connection is opened with
    `duckdb.connect()`, the `nyc_demo` schema is created and the empty table is declared.

    Returns:
        The connection on which both statements were executed.
    """
    table_ddl = """
    CREATE TABLE nyc_demo.taxis_sample (
        VendorID               bigint,
        tpep_pickup_datetime   timestamp,
        tpep_dropoff_datetime  timestamp,
        passenger_count        double,
        trip_distance          double,
        fare_per_distance_unit float, 
        RatecodeID             double,
        store_and_fwd_flag     string,
        PULocationID           bigint,
        DOLocationID           bigint,
        payment_type           bigint,
        fare                   double,
        extra                  double,
        mta_tax                double,
        tip_amount             double,
        tolls_amount           double,
        improvement_surcharge  double,
        total_amount           double,
        congestion_surcharge   double,
        airport_fee            double
    );
    """
    connection = duckdb.connect()
    # The schema has to exist before the table that lives in it can be created.
    for statement in ("CREATE SCHEMA nyc_demo;", table_ddl):
        connection.execute(statement)
    return connection

In [33]:
query_add = "SELECT fare_per_distance_unit, fare FROM nyc_demo.taxis_sample;"

In [34]:
# Rebuild the wrapper for the add-column scenario: the query selects the new
# `fare_per_distance_unit` column together with `fare`.
# NOTE(review): argument semantics are defined by `DuckdbSubstrait` (not visible here) — confirm.
wrapper = DuckdbSubstrait("default", "/home/iceberg/notebooks/s3", "nyc_demo", query_add, setup_duckdb_for_update)
wrapper.update_named_table_with_schema()
wrapper.update_with_local_file_paths()

Table before update
pyarrow.Table
VendorID: int64
tpep_pickup_datetime: timestamp[us, tz=UTC]
tpep_dropoff_datetime: timestamp[us, tz=UTC]
passenger_count: double
trip_distance: double
fare_per_distance_unit: float
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
----
VendorID: [[]]
tpep_pickup_datetime: [[]]
tpep_dropoff_datetime: [[]]
passenger_count: [[]]
trip_distance: [[]]
fare_per_distance_unit: [[]]
RatecodeID: [[]]
store_and_fwd_flag: [[]]
PULocationID: [[]]
DOLocationID: [[]]
...
********************************************************************************
Projected Schema
table {
  20: fare_per_distance_unit: optional float
  11: fare: optional double
}
********************************************************************************
Proj

In [35]:
wrapper.plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
  

In [36]:
wrapper.updated_plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
  

In [37]:
duckdb_add_res = wrapper.execute()

In [38]:
df_duckdb_substrait_add = duckdb_add_res.to_df()
df_duckdb_substrait_add.head()

Unnamed: 0,fare_per_distance_unit,fare
0,3.035714,25.5
1,5.555555,5.0
2,3.382353,11.5
3,,44.2
4,4.591837,9.0


In [39]:
# Load the catalog and table again before scanning — presumably so the scan reflects the schema
# after the column was added; confirm whether the reload is required.
iceberg_catalog = load_catalog('default')
iceberg_table = iceberg_catalog.load_table("nyc_demo.taxis_sample")
# Read the same two columns as `query_add` through PyIceberg; this frame is compared with the
# DuckDB/Substrait result in the assertion that follows.
iceberg_df_add = iceberg_table.scan(selected_fields=["fare_per_distance_unit", "fare"]).to_pandas()
head_add = iceberg_df_add.head()
head_add

Unnamed: 0,fare_per_distance_unit,fare
0,3.035714,25.5
1,5.555555,5.0
2,3.382353,11.5
3,,44.2
4,4.591837,9.0


In [41]:
assert iceberg_df_add.equals(df_duckdb_substrait_add)

## Simulating Iceberg and Icetrait on Drop

In [42]:
%%sql
ALTER TABLE nyc_demo.taxis_sample
DROP COLUMN improvement_surcharge;

23/05/24 19:23:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/05/24 19:23:40 WARN BaseTransaction: Failed to load metadata for a committed snapshot, skipping clean-up


### Since the schema has changed, we need to update the DuckDB Substrait interface to reflect that change

In [50]:
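# NOTE(review): the table declared below omits the dropped field `improvement_surcharge`, although
# this note originally said the field is included because DuckDB assumes the full schema to be
# available and handles selection via projection -- confirm which of the two is intended.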
def setup_duckdb_for_drop():
    """Create a DuckDB connection with `nyc_demo.taxis_sample` declared after the column drop.

    The table layout has no `improvement_surcharge` column; it keeps `fare_per_distance_unit`
    and the column name `fare`. A new connection is opened with `duckdb.connect()`, the
    `nyc_demo` schema is created and the empty table is declared.

    Returns:
        The connection on which both statements were executed.
    """
    table_ddl = """
    CREATE TABLE nyc_demo.taxis_sample (
        VendorID               bigint,
        tpep_pickup_datetime   timestamp,
        tpep_dropoff_datetime  timestamp,
        passenger_count        double,
        trip_distance          double,
        fare_per_distance_unit float, 
        RatecodeID             double,
        store_and_fwd_flag     string,
        PULocationID           bigint,
        DOLocationID           bigint,
        payment_type           bigint,
        fare                   double,
        extra                  double,
        mta_tax                double,
        tip_amount             double,
        tolls_amount           double,
        total_amount           double,
        congestion_surcharge   double,
        airport_fee            double
    );
    """
    connection = duckdb.connect()
    # The schema has to exist before the table that lives in it can be created.
    for statement in ("CREATE SCHEMA nyc_demo;", table_ddl):
        connection.execute(statement)
    return connection

In [51]:
query_drop = "SELECT * FROM nyc_demo.taxis_sample;"

In [52]:
# Rebuild the wrapper for the drop-column scenario: the query is `SELECT *` and the DuckDB setup
# callable declares the table without `improvement_surcharge`.
# NOTE(review): argument semantics are defined by `DuckdbSubstrait` (not visible here) — confirm.
wrapper = DuckdbSubstrait("default", "/home/iceberg/notebooks/s3", "nyc_demo", query_drop, setup_duckdb_for_drop)
wrapper.update_named_table_with_schema()
wrapper.update_with_local_file_paths()

Table before update
pyarrow.Table
VendorID: int64
tpep_pickup_datetime: timestamp[us, tz=UTC]
tpep_dropoff_datetime: timestamp[us, tz=UTC]
passenger_count: double
trip_distance: double
fare_per_distance_unit: float
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
----
VendorID: [[]]
tpep_pickup_datetime: [[]]
tpep_dropoff_datetime: [[]]
passenger_count: [[]]
trip_distance: [[]]
fare_per_distance_unit: [[]]
RatecodeID: [[]]
store_and_fwd_flag: [[]]
PULocationID: [[]]
DOLocationID: [[]]
...
********************************************************************************
Projected Schema
table {
  1: VendorID: optional long
  2: tpep_pickup_datetime: optional timestamptz
  3: tpep_dropoff_datetime: optional timestamptz
  4: passenger_count: optional dou

In [53]:
wrapper.plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
                }
                types {
     

In [54]:
wrapper.updated_plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
  

In [57]:
duckdb_drop_res = wrapper.execute()

In [58]:
df_duckdb_substrait_drop = duckdb_drop_res.to_df()
df_duckdb_substrait_drop.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_per_distance_unit,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare,extra,mta_tax,tip_amount,tolls_amount,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:21:54+00:00,1.0,8.4,3.035714,1.0,N,79,116,1,25.5,3.0,0.5,5.85,0.0,0.3,35.15,2.5
1,1,2021-04-01 00:42:37+00:00,2021-04-01 00:46:23+00:00,1.0,0.9,5.555555,1.0,N,75,236,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5
2,1,2021-04-01 00:57:56+00:00,2021-04-01 01:08:22+00:00,1.0,3.4,3.382353,1.0,N,236,168,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:54:27+00:00,1.0,0.0,,1.0,N,47,61,1,44.2,0.0,0.5,0.0,0.0,0.3,45.0,0.0
4,2,2021-04-01 00:24:55+00:00,2021-04-01 00:34:33+00:00,1.0,1.96,4.591837,1.0,N,238,152,1,9.0,0.5,0.5,3.09,0.0,0.3,13.39,0.0


In [56]:
iceberg_catalog = load_catalog('default')
iceberg_table = iceberg_catalog.load_table("nyc_demo.taxis_sample")

In [57]:
# Keep the full PyIceberg scan under the name that the drop-scenario comparison cells
# (`assert iceberg_df_drop.equals(...)` and the dtype loop) read; it was never assigned before.
iceberg_df_drop = iceberg_table.scan().to_pandas()
iceberg_df_drop.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_per_distance_unit,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:21:54+00:00,1.0,8.4,3.035714,1.0,N,79,116,1,25.5,3.0,0.5,5.85,0.0,0.3,35.15,2.5,0.0
1,1,2021-04-01 00:42:37+00:00,2021-04-01 00:46:23+00:00,1.0,0.9,5.555555,1.0,N,75,236,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5,0.0
2,1,2021-04-01 00:57:56+00:00,2021-04-01 01:08:22+00:00,1.0,3.4,3.382353,1.0,N,236,168,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5,0.0
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:54:27+00:00,1.0,0.0,,1.0,N,47,61,1,44.2,0.0,0.5,0.0,0.0,0.3,45.0,0.0,0.0
4,2,2021-04-01 00:24:55+00:00,2021-04-01 00:34:33+00:00,1.0,1.96,4.591837,1.0,N,238,152,1,9.0,0.5,0.5,3.09,0.0,0.3,13.39,0.0,0.0


In [59]:
print(iceberg_table.schema())

table {
  1: VendorID: optional long
  2: tpep_pickup_datetime: optional timestamptz
  3: tpep_dropoff_datetime: optional timestamptz
  4: passenger_count: optional double
  5: trip_distance: optional double
  20: fare_per_distance_unit: optional float
  6: RatecodeID: optional double
  7: store_and_fwd_flag: optional string
  8: PULocationID: optional long
  9: DOLocationID: optional long
  10: payment_type: optional long
  11: fare: optional double
  12: extra: optional double
  13: mta_tax: optional double
  14: tip_amount: optional double
  15: tolls_amount: optional double
  17: total_amount: optional double
  18: congestion_surcharge: optional double
  19: airport_fee: optional double
}


In [61]:
print(iceberg_table.scan().projection())

table {
  1: VendorID: optional long
  2: tpep_pickup_datetime: optional timestamptz
  3: tpep_dropoff_datetime: optional timestamptz
  4: passenger_count: optional double
  5: trip_distance: optional double
  20: fare_per_distance_unit: optional float
  6: RatecodeID: optional double
  7: store_and_fwd_flag: optional string
  8: PULocationID: optional long
  9: DOLocationID: optional long
  10: payment_type: optional long
  11: fare: optional double
  12: extra: optional double
  13: mta_tax: optional double
  14: tip_amount: optional double
  15: tolls_amount: optional double
  16: improvement_surcharge: optional double
  17: total_amount: optional double
  18: congestion_surcharge: optional double
  19: airport_fee: optional double
}


In [None]:
assert iceberg_df_drop.equals(df_duckdb_substrait_drop)

In [None]:
for a, b in zip(iceberg_df_drop.dtypes, df_duckdb_substrait_drop.dtypes):
    print(a, b)

In [None]:
from google.protobuf.json_format import MessageToJson

json_obj = MessageToJson(wrapper.updated_plan)

In [None]:
# Open a separate DuckDB connection and install/load the `substrait` extension on it.
con = duckdb.connect()
con.install_extension("substrait")
con.load_extension("substrait")
# NOTE(review): `substrait_plan` is not defined in any cell visible in this notebook, and
# `json_obj` from the preceding cell is never used — confirm which value was meant to be passed.
res = con.get_substrait_json(substrait_plan)
res

In [None]:
current_schema = iceberg_table.schema()
print(current_schema)

In [None]:
# Column names to look up in the current Iceberg schema. The earlier two-column assignment
# (`["fare", "fare_per_distance_unit"]`) was dead code: it was overwritten immediately by this list.
selected_cols = [
    'VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count',
    'trip_distance', 'fare_per_distance_unit', 'RatecodeID', 'store_and_fwd_flag',
    'PULocationID', 'DOLocationID', 'payment_type', 'fare', 'extra', 'mta_tax',
    'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount',
    'congestion_surcharge', 'airport_fee',
]

In [None]:
for col in selected_cols:
    res = current_schema.find_field(col)
    print(res.field_id, res.name, col)

In [None]:
current_schema.find_field?

In [None]:
for field in current_schema.fields:
    print(field.field_id, field.name)

In [51]:
# Read one parquet data file straight from the local directory with PyArrow (no Iceberg catalog
# involved) to inspect the columns physically stored in the file.
# NOTE(review): the file name looks specific to one run of the notebook — confirm before re-running.
import pyarrow.parquet as pq
file_path = "/home/iceberg/notebooks/s3/00009-11-e4aef848-a02b-4358-b01d-8340f42dbedb-00001.parquet"
table = pq.read_table(file_path)
table.to_pandas().head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_per_distance_unit,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:21:54+00:00,1.0,8.4,3.035714,1.0,N,79,116,1,25.5,3.0,0.5,5.85,0.0,0.3,35.15,2.5,0.0
1,1,2021-04-01 00:42:37+00:00,2021-04-01 00:46:23+00:00,1.0,0.9,5.555555,1.0,N,75,236,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5,0.0
2,1,2021-04-01 00:57:56+00:00,2021-04-01 01:08:22+00:00,1.0,3.4,3.382353,1.0,N,236,168,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5,0.0
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:54:27+00:00,1.0,0.0,,1.0,N,47,61,1,44.2,0.0,0.5,0.0,0.0,0.3,45.0,0.0,0.0
4,2,2021-04-01 00:24:55+00:00,2021-04-01 00:34:33+00:00,1.0,1.96,4.591837,1.0,N,238,152,1,9.0,0.5,0.5,3.09,0.0,0.3,13.39,0.0,0.0


abc.ABCMeta