# `virtual`: `lineitem` Demo

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (including `virtual`) are re-imported before each cell executes and
# source edits take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

import virtual

# Relative path to the source Parquet table that every later cell reads.
parquet_filepath = '../../demo/data/lineitem.parquet'

### One-shot table compression

In [2]:
# One-shot compression: read the source table and write the virtualized
# result, still as a Parquet file, to `lineitem_virtual.parquet`.
# NOTE(review): `schema` and `prefix` both point into `demo-debug/` --
# presumably where the schema and intermediate artifacts are kept; confirm
# against the `virtual` documentation.
virtual.to_format(
    parquet_filepath,
    'lineitem_virtual.parquet',
    schema='./demo-debug/schema_lineitem.json',
    model_types=['sparse-lr', 'custom'],
    prefix='demo-debug/',
)

Drilling functions..
{'num': {'indices': [0, 1, 2, 3, 4, 5, 6, 7], 'names': ['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax']}, 'time': {'indices': [], 'names': []}, 'date': {'indices': [10, 11, 12], 'names': ['l_shipdate', 'l_commitdate', 'l_receiptdate']}, 'string': {'indices': [8, 9, 13, 14, 15], 'names': ['l_returnflag', 'l_linestatus', 'l_shipinstruct', 'l_shipmode', 'l_comment']}, 'boolean': {'indices': [], 'names': []}}
8
We found 9 function(s) in your table.
[{'target_index': 3, 'target_name': 'l_linenumber', 'target_stats': {'mean': np.float64(3.0007), 'max': 7.0, 'min': 1.0}, 'models': {'sparse-lr': {'mse': 1.735666816958929, 'intercept': np.float64(3.049352576319866), 'coeffs': [{'col-index': 7, 'col-name': 'l_tax', 'coeff': np.float64(-1.2043611238425027)}]}}}, {'target_index': 6, 'target_name': 'l_discount', 'target_stats': {'mean': np.float64(0.05014400000000001), 'max': 0.1, 'min': 0.0}, 'models': {'sparse-lr

### Compare to vanilla `Parquet`

In [3]:
import os

# Table name used as the label in both report lines: the source file's base
# name with its extension stripped (e.g. ".../lineitem.parquet" -> "lineitem").
table = os.path.splitext(os.path.basename(parquet_filepath))[0]

# Keep the output path in a variable rather than quoting it inside the
# f-string: reusing the enclosing quote character within a replacement field
# is a SyntaxError on Python versions before 3.12.
virtual_filepath = 'lineitem_virtual.parquet'

# Report on-disk sizes in megabytes (1 MB = 1_000_000 bytes).
print(f'[{table}] Parquet: {os.path.getsize(parquet_filepath) / 1_000_000} MB')
print(f'[{table}] Virtual: {os.path.getsize(virtual_filepath) / 1_000_000} MB')

[lineitem] Parquet: 255.970446 MB
[lineitem] Virtual: 201.748569 MB


### Query the vanilla Parquet file

In [6]:
import duckdb

# Baseline: run the aggregate directly against the original Parquet file.
# The SQL is assembled from adjacent literals purely to keep lines short;
# the resulting statement text is unchanged.
vanilla_sql = (
    "select max(l_shipdate), max(l_commitdate), max(l_receiptdate) "
    f"from read_parquet('{parquet_filepath}') limit 5"
)

# Materialize the result as a pandas DataFrame so the notebook renders it.
duckdb.sql(vanilla_sql).df()

Unnamed: 0,max(l_shipdate),max(l_commitdate),max(l_receiptdate)
0,1998-12-01,1998-10-31,1998-12-31


### Query the virtualized Parquet file

In [7]:
# Same aggregate as the baseline query, but pointed at the virtualized file
# and routed through `virtual.query` with DuckDB as the execution engine.
# The result's column headers show the date columns being expressed as
# `l_commitdate` plus stored `*_offset` columns.
virtual_sql = 'select max(l_shipdate), max(l_commitdate), max(l_receiptdate) from read_parquet("lineitem_virtual.parquet") limit 5'

virtual.query(virtual_sql, engine='duckdb')

Unnamed: 0,max((l_commitdate + l_shipdate_offset)),max(l_commitdate),max(((l_commitdate + l_shipdate_offset) + l_receiptdate_offset))
0,1998-12-01,1998-10-31,1998-12-31
