# `virtual`: Taxi Demo

In [1]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import virtual

# Remote Parquet file (fetched over HTTPS) used as the input table by the cells below.
parquet_filepath = 'https://blobs.duckdb.org/data/taxi_2019_04.parquet'

### One-shot table virtualization

In [2]:
# Virtualize the source table and write the result as a Parquet file.
# Note: k-regression is one of the enabled model types (see our TRL'24 paper for more details).
virtual.to_format(
    parquet_filepath,
    'taxi_virtual.parquet',
    model_types=['sparse-lr', 'k-regression', 'custom'],
    prefix='demo-debug/',
)

Running schema inference..
Drilling functions..
We found 10 function candidate(s) in your table.
Let's see how many benefit virtualization..
It seems that 2 function(s) can indeed be used for virtualization.
Creating the virtual file..
Done.


### Compare to vanilla `Parquet`

In [6]:
import os

# Table name = source file name without its directory and extension.
table = os.path.splitext(os.path.basename(parquet_filepath))[0]

# NOTE(review): 'taxi.parquet' is not created anywhere in the visible part of this
# notebook -- presumably a local copy of `parquet_filepath`; confirm it exists before running.
#
# The file paths are kept out of the f-string on purpose: reusing the enclosing quote
# character inside a replacement field is a SyntaxError on Python versions before 3.12.
for label, path in (('Parquet', 'taxi.parquet'), ('Virtual', 'taxi_virtual.parquet')):
    size_mb = os.path.getsize(path) / 1_000_000  # bytes -> decimal megabytes
    print(f'[{table}] {label}: {size_mb} MB')

[taxi_2019_04] Parquet: 137.726973 MB
[taxi_2019_04] Virtual: 109.026684 MB


### Query the vanilla Parquet file

#### `TIMESTAMP`-valued column

In [12]:
import duckdb

# Earliest drop-off timestamp, computed by DuckDB directly on the original Parquet file.
vanilla_min_dropoff_sql = f"select min(dropoff_at) from read_parquet('{parquet_filepath}')"
duckdb.sql(vanilla_min_dropoff_sql).df()

Unnamed: 0,min(dropoff_at)
0,2008-08-08 20:25:45


#### Numeric column

In [7]:
import duckdb

# Mean of `total_amount`, computed by DuckDB directly on the original Parquet file.
vanilla_avg_total_sql = f"select avg(total_amount) from read_parquet('{parquet_filepath}')"
duckdb.sql(vanilla_avg_total_sql).df()

Unnamed: 0,avg(total_amount)
0,19.229813


### Query the virtualized Parquet file

#### `TIMESTAMP`-valued column

In [14]:
# Same aggregate as on the vanilla file, issued through `virtual` against the virtualized file.
virtual_min_dropoff_sql = 'select min(dropoff_at) from read_parquet("taxi_virtual.parquet")'
virtual.query(virtual_min_dropoff_sql, engine='duckdb')

Unnamed: 0,min((pickup_at + (CAST('1 second' AS INTERVAL) * dropoff_at_offset)))
0,2008-08-08 20:25:45


#### Numeric column

In [15]:
# Same aggregate as on the vanilla file, issued through `virtual` against the virtualized file.
virtual_avg_total_sql = 'select avg(total_amount) from read_parquet("taxi_virtual.parquet")'
virtual.query(virtual_avg_total_sql, engine='duckdb')

Unnamed: 0,"avg((CASE WHEN ((total_amount_switch = 0)) THEN (round((((1.0353 * fare_amount) + (1.2448 * tip_amount)) + 2.6879), 2)) WHEN ((total_amount_switch = 1)) THEN (round(((((1.0025 * fare_amount) + (0.9973 * tip_amount)) + (0.9109 * tolls_amount)) + 3.7439), 2)) WHEN ((total_amount_switch = 2)) THEN (round((((1.2921 * fare_amount) + (1.3004 * tolls_amount)) + -0.6258), 2)) WHEN ((total_amount_switch = 3)) THEN (round(((((1 * fare_amount) + (1 * tip_amount)) + (1 * tolls_amount)) + 3.8), 2)) WHEN ((total_amount_switch = 4)) THEN (round(((1.4433 * fare_amount) + 7.2944), 2)) WHEN ((total_amount_switch = 5)) THEN (round(((((1.0819 * fare_amount) + (0.9819 * tip_amount)) + (1.0188 * tolls_amount)) + 3.5008), 2)) WHEN ((total_amount_switch = 6)) THEN (round(((((1 * fare_amount) + (0.9855 * tip_amount)) + (1.097 * tolls_amount)) + 3.4855), 2)) WHEN ((total_amount_switch = 7)) THEN (24.36) WHEN ((total_amount_switch = 8)) THEN (round((((((1.0026 * fare_amount) + (0.9972 * tip_amount)) + (0.9608 * tolls_amount)) + (1.3321 * congestion_surcharge)) + 0.9671), 2)) WHEN ((total_amount_switch = 9)) THEN (round(((((0.9389 * fare_amount) + (1.2687 * tip_amount)) + (1.6065 * tolls_amount)) + 4.0021), 2)) WHEN ((total_amount_switch = 10)) THEN (round(((0.9998 * fare_amount) + 0.7622), 2)) WHEN ((total_amount_switch = 11)) THEN (round(((((1 * fare_amount) + (1 * tip_amount)) + (1 * tolls_amount)) + 3.3), 2)) WHEN ((total_amount_switch = 12)) THEN (round((((1.0049 * fare_amount) + (1.0574 * tip_amount)) + 3.6742), 2)) WHEN ((total_amount_switch = 13)) THEN (round(((((0.9648 * fare_amount) + (1.0903 * tip_amount)) + (0.9751 * tolls_amount)) + 3.8456), 2)) ELSE round(((((0.9982 * fare_amount) + (0.9933 * tip_amount)) + (0.9943 * tolls_amount)) + 1.8484), 2) END + total_amount_offset))"
0,19.229813
