## Installing Required Icetrait

In [35]:
!pip install git+https://github.com/vibhatha/pyiceberg_substrait@feat-schema-evolution#egg=icetrait

Collecting icetrait
  Cloning https://github.com/vibhatha/pyiceberg_substrait (to revision feat-schema-evolution) to /tmp/pip-install-wrid3_i1/icetrait_87e321b5352546e9b0b528c010db918b
  Running command git clone --filter=blob:none --quiet https://github.com/vibhatha/pyiceberg_substrait /tmp/pip-install-wrid3_i1/icetrait_87e321b5352546e9b0b528c010db918b
  Running command git checkout -b feat-schema-evolution --track origin/feat-schema-evolution
  Switched to a new branch 'feat-schema-evolution'
  Branch 'feat-schema-evolution' set up to track remote branch 'feat-schema-evolution' from 'origin'.
  Resolved https://github.com/vibhatha/pyiceberg_substrait to commit 256765ae95aeb31b721a4ef96a56d800458e2eb3
  Running command git submodule update --init --recursive -q
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
You sho

**Warning**

Make sure to restart the kernel after installation

In [1]:
from icetrait.substrait.visitor import SubstraitPlanEditor, visit_and_update, RelVisitor, RelUpdateVisitor
from icetrait.duckdb.wrapper import DuckdbSubstrait
import duckdb

## Initialize Spark Environment

In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.master("local")
    .appName("IcebergPySpark")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.demo.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.demo.uri", "http://rest:8181")
    .config("spark.sql.catalog.demo.s3.endpoint", "http://minio:9000")
    .config("spark.sql.defaultCatalog", "demo")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/home/iceberg/spark-events")
    .config("spark.history.fs.logDirectory", "/home/iceberg/spark-events")
    .config("spark.sql.catalogImplementation", "/home/iceberg/spark-events")
    .getOrCreate()
)
spark

23/05/15 19:05:31 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
%%sql

CREATE DATABASE IF NOT EXISTS nyc_demo;

23/05/15 19:05:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
%%sql

show databases;

23/05/15 19:05:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


namespace
nyc_demo


In [5]:
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import GreaterThanOrEqual

iceberg_catalog = load_catalog('default')
iceberg_catalog

<pyiceberg.catalog.rest.RestCatalog at 0x7fc7d3a83e80>

In [6]:
iceberg_catalog.list_namespaces()

[('nyc_demo',)]

In [7]:
df = spark.read.parquet("/home/iceberg/data/yellow_tripdata_2021-04.parquet")
df.write.saveAsTable("nyc_demo.taxis_sample")

AnalysisException: Table nyc_demo.taxis_sample already exists

## Simulating Iceberg and Icetrait on Simple Query

In [8]:
iceberg_table = iceberg_catalog.load_table("nyc_demo.taxis_sample")

In [9]:
iceberg_df = iceberg_table.scan().to_pandas()
iceberg_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_per_distance_unit,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:21:54+00:00,1.0,8.4,3.035714,1.0,N,79,116,1,25.5,3.0,0.5,5.85,0.0,0.3,35.15,2.5,0.0
1,1,2021-04-01 00:42:37+00:00,2021-04-01 00:46:23+00:00,1.0,0.9,5.555555,1.0,N,75,236,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5,0.0
2,1,2021-04-01 00:57:56+00:00,2021-04-01 01:08:22+00:00,1.0,3.4,3.382353,1.0,N,236,168,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5,0.0
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:54:27+00:00,1.0,0.0,,1.0,N,47,61,1,44.2,0.0,0.5,0.0,0.0,0.3,45.0,0.0,0.0
4,2,2021-04-01 00:24:55+00:00,2021-04-01 00:34:33+00:00,1.0,1.96,4.591837,1.0,N,238,152,1,9.0,0.5,0.5,3.09,0.0,0.3,13.39,0.0,0.0


In [10]:
sql_query = "SELECT * FROM nyc_demo.taxis_sample;"

In [11]:
def setup_duckdb():
    con = duckdb.connect()
    con.install_extension("substrait")
    con.load_extension("substrait")
    return con

In [12]:
con = setup_duckdb()
create_schema = "CREATE SCHEMA nyc_demo;"
creation_query = """
CREATE TABLE nyc_demo.taxis_sample (
    VendorID              bigint,
    tpep_pickup_datetime  timestamp,
    tpep_dropoff_datetime timestamp,
    passenger_count       double,
    trip_distance         double,
    RatecodeID            double,
    store_and_fwd_flag    string,
    PULocationID          bigint,
    DOLocationID          bigint,
    payment_type          bigint,
    fare_amount           double,
    extra                 double,
    mta_tax               double,
    tip_amount            double,
    tolls_amount          double,
    improvement_surcharge double,
    total_amount          double,
    congestion_surcharge  double,
    airport_fee           double
);
"""
con.execute(create_schema)
con.execute(creation_query)

<duckdb.DuckDBPyConnection at 0x7fc7807933f0>

In [13]:
proto_bytes = con.get_substrait("SELECT * FROM nyc_demo.taxis_sample;").fetchone()[0]

In [14]:
!mkdir /home/iceberg/notebooks/s3

mkdir: cannot create directory ‘/home/iceberg/notebooks/s3’: File exists


In [16]:
wrapper = DuckdbSubstrait(proto_bytes, "default", "/home/iceberg/notebooks/s3", "nyc_demo")
wrapper.update_named_table_with_schema()
wrapper.update_with_local_file_paths()

Table before update
pyarrow.Table
VendorID: int64
tpep_pickup_datetime: timestamp[us, tz=UTC]
tpep_dropoff_datetime: timestamp[us, tz=UTC]
passenger_count: double
trip_distance: double
fare_per_distance_unit: float
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
----
VendorID: [[]]
tpep_pickup_datetime: [[]]
tpep_dropoff_datetime: [[]]
passenger_count: [[]]
trip_distance: [[]]
fare_per_distance_unit: [[]]
RatecodeID: [[]]
store_and_fwd_flag: [[]]
PULocationID: [[]]
DOLocationID: [[]]
...
Name is not none:  VendorID
Name is not none:  tpep_pickup_datetime
Name is not none:  tpep_dropoff_datetime
Name is not none:  passenger_count
Name is not none:  trip_distance
Name is not none:  fare_per_distance_unit
Name is not none:  RatecodeID
Name is not none:

In [16]:
duckdb_res = wrapper.execute()

In [17]:
df_duckdb_substrait = duckdb_res.to_df()
df_duckdb_substrait.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_per_distance_unit,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:21:54+00:00,1.0,8.4,3.035714,1.0,N,79,116,1,25.5,3.0,0.5,5.85,0.0,0.3,35.15,2.5,0.0
1,1,2021-04-01 00:42:37+00:00,2021-04-01 00:46:23+00:00,1.0,0.9,5.555555,1.0,N,75,236,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5,0.0
2,1,2021-04-01 00:57:56+00:00,2021-04-01 01:08:22+00:00,1.0,3.4,3.382353,1.0,N,236,168,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5,0.0
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:54:27+00:00,1.0,0.0,,1.0,N,47,61,1,44.2,0.0,0.5,0.0,0.0,0.3,45.0,0.0,0.0
4,2,2021-04-01 00:24:55+00:00,2021-04-01 00:34:33+00:00,1.0,1.96,4.591837,1.0,N,238,152,1,9.0,0.5,0.5,3.09,0.0,0.3,13.39,0.0,0.0


In [18]:
# TODO: create separate table per evaluating action so that we can do the validations properly
assert iceberg_df.equals(df_duckdb_substrait)

## Simulating Iceberg and Icetrait on Rename

In [24]:
%%sql

ALTER TABLE nyc_demo.taxis_sample RENAME COLUMN fare_amount TO fare

23/05/15 05:10:31 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/05/15 05:10:31 WARN BaseTransaction: Failed to load metadata for a committed snapshot, skipping clean-up


In [15]:
import pyarrow.dataset as ds
from pyiceberg.io.pyarrow import PyArrowFileIO
from pyiceberg.schema import Schema, prune_columns
from pyiceberg.types import MapType, ListType



ONE_MEGABYTE = 1024 * 1024
ICEBERG_SCHEMA = b"iceberg.schema"

class SchemaEvolutionUtil:
    
    def __init__(self, catalog_name, table_name, database_schema) -> None:
        self._catalog_name = catalog_name
        self._table_name = table_name
        self._database_schema = database_schema
        self._iceberg_catalog = None
        self._iceberg_table = None
        self._output_names = None
        self._physical_schema = None
        self._file_project_col_names = None
        self._file_project_schema = None
        self._projected_schema = None
        
    def load_catalog(self):
        self._iceberg_catalog = load_catalog(self._catalog_name)
        return self._iceberg_catalog
        
    def load_table(self):
        full_table_name = None
        if self._database_schema:
            full_table_name = f"{self._database_schema}.{self._table_name}"
        else:
            full_table_name = self._table_name
        self._iceberg_table = self._iceberg_catalog.load_table(full_table_name)
        return self._iceberg_table
        
    def update_plan(self):
        if self._iceberg_catalog is None:
            self.load_catalog()
        if self._iceberg_table is None:
            self.load_table()
            
        sc = self._iceberg_table.scan()
        table = sc.table
        tasks = sc.plan_files()
        scheme, _ = PyArrowFileIO.parse_location(table.location())
        
        projected_schema = sc.projection()
        
        if isinstance(table.io, PyArrowFileIO):
            fs = table.io.get_fs(scheme)
        
        if fs is None:
            raise ValueError(f"Couldn't load file system for the provide catalog {self._catalog_name}, table {self._table_name}")
        
        
        file_project_col_names = []
        root_rel_names = []
        file_schema = None
        physical_schema = None
        
        for task in tasks:    
            _, parquet_file_path = PyArrowFileIO.parse_location(task.file.file_path)
            arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
            with fs.open_input_file(parquet_file_path) as fin:
                fragment = arrow_format.make_fragment(fin)
                physical_schema = fragment.physical_schema
                schema_raw = None
                if metadata := physical_schema.metadata:
                    schema_raw = metadata.get(ICEBERG_SCHEMA)
                if schema_raw is None:
                    raise ValueError(
                        "Iceberg schema is not embedded into the Parquet file, see https://github.com/apache/iceberg/issues/6505"
                    )
                file_schema = Schema.parse_raw(schema_raw)
                # note that the find_type(id) would retrieve the field based on the unique id
                # schema evolution is guaranteed by the unique id definition for each column 
                # irrespective of the RUD operation (READ, UPDATE, DELETE)
                projected_field_ids = {id for id in projected_schema.field_ids \
                                       if not isinstance(projected_schema.find_type(id), (MapType, ListType))}
                file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
                # print(type(projected_schema), type(file_schema), type(physical_schema))
                # we use physical_schema as the executable Substrait plan's base_schema
                # we use columns=[col.name for col in file_project_schema.columns] as file column names of the
                # loading data stage
                self._file_project_schema = file_project_schema
                # for each file the names should be the same so just extract values for the first file
                if len(file_project_col_names) == 0:
                    file_project_col_names = [col.name for col in file_project_schema.columns]
                
                if len(root_rel_names) == 0:
                    for field in projected_schema.fields:
                        root_rel_names.append(field.name)
                print(type(metadata))
        self._physical_schema = physical_schema
        self._output_names = root_rel_names
        self._file_project_col_names = file_project_col_names
        self._projected_schema = projected_schema
         

In [16]:
se_util = SchemaEvolutionUtil(catalog_name='default', database_schema='nyc_demo', table_name='taxis_sample')
se_util.update_plan()

<class 'dict'>


In [17]:
type(se_util._file_project_schema)

pyiceberg.schema.Schema

In [18]:
from pyiceberg.schema import visit_with_partner
from pyiceberg.io.pyarrow import ArrowProjectionVisitor, ArrowAccessor
empty_table = pa.Table.from_pylist([], se_util._physical_schema)
struct_array = visit_with_partner(se_util._projected_schema, \
                                  empty_table, \
                                  ArrowProjectionVisitor(se_util._file_project_schema), \
                                  ArrowAccessor(se_util._file_project_schema))



NameError: name 'pa' is not defined

In [20]:
print(se_util._file_project_schema.field_ids)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}


In [26]:
projected_schema = se_util._projected_schema
struct = projected_schema.as_struct()
for field in struct.fields:
    field_id = field.field_id
    print(field_id, field.name, field.field_type)
    name = se_util._file_project_schema.find_field(field_id).name
    
    
    # from empty_table extract the column



1 VendorID long
2 tpep_pickup_datetime timestamptz
3 tpep_dropoff_datetime timestamptz
4 passenger_count double
5 trip_distance double
20 fare_per_distance_unit float


ValueError: Could not find field with id: 20

## Simulating Iceberg and Icetrait on Add

In [25]:
%%sql

ALTER TABLE nyc_demo.taxis_sample
ADD COLUMN fare_per_distance_unit float AFTER trip_distance

23/05/15 10:58:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/05/15 10:58:16 WARN BaseTransaction: Failed to load metadata for a committed snapshot, skipping clean-up


In [23]:
%%sql

UPDATE nyc_demo.taxis_sample
SET fare_per_distance_unit = fare/trip_distance

23/05/15 17:54:03 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

In [30]:
iceberg_table = iceberg_catalog.load_table("nyc_demo.taxis_sample")
add_pending_table = iceberg_table.scan().to_pandas()
add_pending_table.dtypes

VendorID                                int64
tpep_pickup_datetime      datetime64[ns, UTC]
tpep_dropoff_datetime     datetime64[ns, UTC]
passenger_count                       float64
trip_distance                         float64
fare_per_distance_unit                float32
RatecodeID                            float64
store_and_fwd_flag                     object
PULocationID                            int64
DOLocationID                            int64
payment_type                            int64
fare                                  float64
extra                                 float64
mta_tax                               float64
tip_amount                            float64
tolls_amount                          float64
improvement_surcharge                 float64
total_amount                          float64
congestion_surcharge                  float64
airport_fee                           float64
dtype: object

In [27]:
wrapper = DuckdbSubstrait(proto_bytes, "default", "/home/iceberg/notebooks/s3", "nyc_demo")
wrapper.update_named_table_with_schema()
wrapper.update_with_local_file_paths()

ValueError: Could not find field with id: 20

## Simulating Iceberg and Icetrait on Drop

## Simulating Iceberg and Icetrait on Update

## Simulating Iceberg and Icetrait on Select Query

In [34]:
%%sql
SELECT
VendorID
,tpep_pickup_datetime
,tpep_dropoff_datetime
,fare
,trip_distance
,fare_per_distance_unit
FROM nyc_demo.taxis_sample

23/05/15 18:12:13 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,fare,trip_distance,fare_per_distance_unit
1,2021-04-01 00:00:18,2021-04-01 00:21:54,25.5,8.4,3.0357143878936768
1,2021-04-01 00:42:37,2021-04-01 00:46:23,5.0,0.9,5.55555534362793
1,2021-04-01 00:57:56,2021-04-01 01:08:22,11.5,3.4,3.382352828979492
1,2021-04-01 00:01:58,2021-04-01 00:54:27,44.2,0.0,
2,2021-04-01 00:24:55,2021-04-01 00:34:33,9.0,1.96,4.591836929321289
2,2021-04-01 00:19:16,2021-04-01 00:21:46,4.5,0.77,5.844155788421631
2,2021-04-01 00:25:11,2021-04-01 00:31:53,11.5,3.65,3.1506848335266118
1,2021-04-01 00:27:53,2021-04-01 00:47:03,26.5,8.9,2.9775280952453613
2,2021-04-01 00:24:24,2021-04-01 00:37:50,12.0,2.98,4.026845455169678
1,2021-04-01 00:19:18,2021-04-01 00:41:25,28.0,8.9,3.146067380905152


In [28]:
select_query = "SELECT fare_amount as fare FROM nyc_demo.taxis_sample;"
proto_bytes = con.get_substrait(select_query).fetchone()[0]
editor = SubstraitPlanEditor(proto_bytes)
editor.plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare_amount"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
                }
                types {

In [29]:
wrapper = DuckdbSubstrait(proto_bytes, "default", "/home/iceberg/notebooks/s3", "nyc_demo")
wrapper.update_named_table_with_schema()
wrapper.update_with_local_file_paths()

Table before update
pyarrow.Table
VendorID: int64
tpep_pickup_datetime: timestamp[us, tz=UTC]
tpep_dropoff_datetime: timestamp[us, tz=UTC]
passenger_count: double
trip_distance: double
fare_per_distance_unit: float
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
----
VendorID: [[]]
tpep_pickup_datetime: [[]]
tpep_dropoff_datetime: [[]]
passenger_count: [[]]
trip_distance: [[]]
fare_per_distance_unit: [[]]
RatecodeID: [[]]
store_and_fwd_flag: [[]]
PULocationID: [[]]
DOLocationID: [[]]
...
Name is not none:  VendorID
Name is not none:  tpep_pickup_datetime
Name is not none:  tpep_dropoff_datetime
Name is not none:  passenger_count
Name is not none:  trip_distance
Name is not none:  fare_per_distance_unit
Name is not none:  RatecodeID
Name is not none:

In [30]:
res = wrapper.execute()

In [31]:
res.to_df().head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,fare_per_distance_unit,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00,2021-04-01 00:00:18+00:00
1,2,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00,2021-04-01 00:42:37+00:00
2,2,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00,2021-04-01 00:57:56+00:00
3,1,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00,2021-04-01 00:01:58+00:00
4,1,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00,2021-04-01 00:24:55+00:00


In [21]:
wrapper.updated_plan

relations {
  root {
    input {
      project {
        input {
          read {
            base_schema {
              names: "VendorID"
              names: "tpep_pickup_datetime"
              names: "tpep_dropoff_datetime"
              names: "passenger_count"
              names: "trip_distance"
              names: "fare_per_distance_unit"
              names: "RatecodeID"
              names: "store_and_fwd_flag"
              names: "PULocationID"
              names: "DOLocationID"
              names: "payment_type"
              names: "fare"
              names: "extra"
              names: "mta_tax"
              names: "tip_amount"
              names: "tolls_amount"
              names: "improvement_surcharge"
              names: "total_amount"
              names: "congestion_surcharge"
              names: "airport_fee"
              struct {
                types {
                  i64 {
                    nullability: NULLABILITY_NULLABLE
                  }
  

In [24]:
iceberg_table.scan(selected_fields=["fare"]).to_pandas()

Unnamed: 0,fare
0,25.50
1,5.00
2,11.50
3,44.20
4,9.00
...,...
2171182,16.91
2171183,4.50
2171184,21.86
2171185,16.63


In [29]:
import sqlparse

sql_statement = "SELECT A, B, C FROM TABLE;"
parsed = sqlparse.parse(sql_statement)

# Assuming the first statement is the one we want
statement = parsed[0]

# Extract column names
col_names = None

for token in statement.tokens:
    if isinstance(token, sqlparse.sql.IdentifierList):
        str_token = str(token)
        col_names = str_token.split(",")
        
trimmed_cols = []
for col_name in col_names:
    trimmed_cols.append(col_name.strip())

In [30]:
trimmed_cols

['A', 'B', 'C']