# `virtual`: `lineitem` Demo

In [1]:
%load_ext autoreload
%autoreload 2

import virtual

# Path to the input table used throughout this demo: a TPC-H `lineitem`
# Parquet file.
# Note: You need to create your own TPC-H lineitem Parquet file at this
# location. The path is relative, so it is resolved against the kernel's
# current working directory.
parquet_filepath = '../../demo/data/lineitem.parquet'

### One-shot table virtualization

In [2]:
# Virtualize the source table and store the result in Parquet format as
# 'lineitem_virtual.parquet' (a relative path; later cells read it back
# under the same name).
virtual.to_format(
    parquet_filepath,
    'lineitem_virtual.parquet',
    prefix='demo-debug/',
)

Running schema inference..
Drilling functions..
We found 9 function candidate(s) in your table.
Let's see how many benefit virtualization..

 ---- @@@ after create basetable @@@ ---- 

| l_comment   | l_commitdate   | l_discount   | l_discount_offset   | l_extendedprice   | l_linenumber   | l_linenumber_offset   | l_linestatus   | l_orderkey   | l_partkey   | l_quantity   | l_receiptdate   | l_returnflag   | l_shipdate   | l_shipinstruct   | l_shipmode   | l_suppkey   | l_tax   | l_tax_offset   |
|-------------|----------------|--------------|---------------------|-------------------|----------------|-----------------------|----------------|--------------|-------------|--------------|-----------------|----------------|--------------|------------------|--------------|-------------|---------|----------------|
['"l_linenumber_offset" = coalesce(round("l_linenumber" - (round(-1.2044 * "l_tax" + 3.0494, 0)::BIGINT), 0), 0)']

 ---- @@@ l_linenumber @after updates @@@ ---- 

|    | l_comment

### Compare to vanilla `Parquet`

In [6]:
import os

# Derive a display label from the input file name: strip the directory
# part and the extension (e.g. '.../lineitem.parquet' -> 'lineitem').
table = os.path.splitext(os.path.basename(parquet_filepath))[0]

# Report the on-disk size of both files in decimal megabytes
# (bytes / 1_000_000) so the vanilla and virtualized files can be compared.
# The inner path literal uses double quotes: reusing the enclosing quote
# character inside an f-string replacement field is a SyntaxError on
# Python versions before 3.12 (PEP 701).
print(f'[{table}] Parquet: {os.path.getsize(parquet_filepath) / 1_000_000} MB')
print(f'[{table}] Virtual: {os.path.getsize("lineitem_virtual.parquet") / 1_000_000} MB')

[lineitem] Parquet: 255.970446 MB
[lineitem] Virtual: 201.748746 MB


### Query the vanilla Parquet file

In [7]:
import duckdb

# Compute the latest ship / commit / receipt dates directly over the
# original Parquet file with DuckDB; `.df()` turns the result into a
# pandas DataFrame, which is displayed as the cell's output.
duckdb.sql(
    "select max(l_shipdate), max(l_commitdate), max(l_receiptdate) "
    f"from read_parquet('{parquet_filepath}')"
).df()

Unnamed: 0,max(l_shipdate),max(l_commitdate),max(l_receiptdate)
0,1998-12-01,1998-10-31,1998-12-31


### Query the virtualized Parquet file

In [8]:
# Run the same max-date aggregate against the virtualized Parquet file,
# going through `virtual.query` with DuckDB selected as the engine.
virtual.query(
    'select max(l_shipdate), max(l_commitdate), max(l_receiptdate) '
    'from read_parquet("lineitem_virtual.parquet")',
    engine='duckdb',
)

what??
{'header': ['l_discount', 'l_shipdate_offset', 'l_comment', 'l_suppkey', 'l_orderkey', 'l_extendedprice', 'l_commitdate', 'l_shipinstruct', 'l_receiptdate_offset', 'l_returnflag', 'l_shipmode', 'l_linestatus', 'l_linenumber', 'l_partkey', 'l_quantity', 'l_tax'], 'schema': [{'name': 'l_orderkey', 'type': 'BIGINT', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'l_partkey', 'type': 'BIGINT', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'l_suppkey', 'type': 'BIGINT', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'l_linenumber', 'type': 'BIGINT', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'l_quantity', 'type': 'DECIMAL(15,2)', 'null': {'any': False, 'all': False}, 'scale': 2, 'precision': 4}, {'name': 'l_extendedprice', 'type': 'DECIMAL(15,2)', 'null': {'any': False, 'all': False}, 'scale': 2, 'precision': 8}, {'name': 'l_discount', 'type': 'DECIMAL(15,2)', 'nul

Unnamed: 0,max((l_commitdate + l_shipdate_offset)),max(l_commitdate),max(((l_commitdate + l_shipdate_offset) + l_receiptdate_offset))
0,1998-12-01,1998-10-31,1998-12-31
