# `virtual`: `lineitem` Demo

In [1]:
%load_ext autoreload
%autoreload 2

import virtual

# Input table for the demo: a TPC-H `lineitem` table stored as Parquet.
# You have to generate this file yourself before running the notebook.
parquet_filepath = "../../demo/data/lineitem.parquet"

### One-shot table virtualization

In [2]:
# Virtualize the input table in one call and store the result as a Parquet
# file named `lineitem_virtual.parquet` in the current working directory.
# NOTE(review): `prefix` presumably names a directory for intermediate
# artifacts of the run — confirm against the `virtual` documentation.
virtual.to_format(
    parquet_filepath,
    'lineitem_virtual.parquet',
    prefix='demo-debug/',
)

Running schema inference..
Drilling functions..
We found 9 function candidate(s) in your table.
Let's see how many benefit virtualization..
It seems that 2 function(s) can indeed be used for virtualization.
Creating the virtual file..
Done.


### Compare to vanilla `Parquet`

In [3]:
import os

# Label both report lines with the table name, i.e. the input file's base name
# without its extension.
table = os.path.splitext(os.path.basename(parquet_filepath))[0]

# Report on-disk sizes in decimal megabytes (bytes / 1_000_000).
# The outer f-strings are double-quoted so that the single-quoted file name
# inside the replacement field is valid on every Python 3 version; reusing the
# enclosing quote character inside an f-string only parses on Python >= 3.12.
print(f"[{table}] Parquet: {os.path.getsize(parquet_filepath) / 1_000_000} MB")
print(f"[{table}] Virtual: {os.path.getsize('lineitem_virtual.parquet') / 1_000_000} MB")

[lineitem] Parquet: 255.970446 MB
[lineitem] Virtual: 201.748749 MB


### Query the vanilla Parquet file

In [4]:
import duckdb

# Baseline: the latest ship, commit and receipt dates, read straight from the
# original Parquet file with DuckDB. The two adjacent literals concatenate into
# a single one-line SQL statement.
vanilla_sql = (
    "select max(l_shipdate), max(l_commitdate), max(l_receiptdate) "
    f"from read_parquet('{parquet_filepath}')"
)

# Last expression of the cell: the result is shown as a pandas DataFrame.
duckdb.sql(vanilla_sql).df()

Unnamed: 0,max(l_shipdate),max(l_commitdate),max(l_receiptdate)
0,1998-12-01,1998-10-31,1998-12-31


### Query the virtualized Parquet file

In [5]:
# Same aggregate as for the vanilla file, but issued through `virtual.query`
# against the virtualized Parquet file, with DuckDB as the execution engine.
virtual_sql = 'select max(l_shipdate), max(l_commitdate), max(l_receiptdate) from read_parquet("lineitem_virtual.parquet")'

virtual.query(virtual_sql, engine='duckdb')

Unnamed: 0,max((l_commitdate + l_shipdate_offset)),max(l_commitdate),max(((l_commitdate + l_shipdate_offset) + l_receiptdate_offset))
0,1998-12-01,1998-10-31,1998-12-31
