# `virtual`: `lineitem` Demo

In [6]:
%load_ext autoreload
%autoreload 2

import virtual

parquet_filepath = '../../demo/data/lineitem.parquet'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### One-shot table compression

In [7]:
# Convert the input table and write the result as a Parquet file named
# 'lineitem_virtual.parquet' (queried in the cells further down).
# NOTE(review): `model_types` and `prefix` are passed through to `virtual` as-is;
# their exact semantics are defined by the library — confirm in its docs.
virtual.to_format(
  parquet_filepath,
  'lineitem_virtual.parquet',
  model_types=['sparse-lr'],
  prefix='demo-debug/',
)

Running schema inference..
Drilling functions..
      l_orderkey  l_partkey  l_suppkey  l_linenumber  l_quantity  \
0        4784742      87435       2452             3        45.0   
1        4689443      17063       2066             5        38.0   
2          38757     177936       5488             3        22.0   
3        5163044     113853       1387             5        32.0   
4        5509472      61717       4224             1        49.0   
...          ...        ...        ...           ...         ...   
9995     5531970     154086       6602             1        23.0   
9996     2219719      42379       2380             1        44.0   
9997     1744868     155108       7624             3         2.0   
9998     1259810      66576       9083             5        39.0   
9999     1226337     160499       3016             3        18.0   

      l_extendedprice  l_discount  l_tax  
0            64009.35        0.00   0.00  
1            37242.28        0.02   0.07  
2     

### Compare to vanilla `Parquet`

In [8]:
import os

# Table name = input file name without directory or extension (used as a label).
table = os.path.splitext(os.path.basename(parquet_filepath))[0]

# Report the on-disk size of both files in MB (10^6 bytes).
# The path is bound to a loop variable rather than written inline in the
# f-string: reusing the enclosing quote character inside an f-string
# replacement field is a SyntaxError on Python < 3.12.
for label, path in (('Parquet', parquet_filepath), ('Virtual', 'lineitem_virtual.parquet')):
  size_mb = os.path.getsize(path) / 1_000_000
  print(f'[{table}] {label}: {size_mb} MB')

[lineitem] Parquet: 255.970446 MB
[lineitem] Virtual: 210.116663 MB


### Query the vanilla Parquet file

In [9]:
import duckdb

# Preview the first five rows of the original Parquet file straight through
# DuckDB; the trailing expression is rendered as a DataFrame by the notebook.
vanilla_preview_sql = f"select * from read_parquet('{parquet_filepath}') limit 5"
duckdb.sql(vanilla_preview_sql).df()

Unnamed: 0,l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
0,1,155190,7706,1,17.0,21168.23,0.04,0.02,N,O,1996-03-13,1996-02-12,1996-03-22,DELIVER IN PERSON,TRUCK,to beans x-ray carefull
1,1,67310,7311,2,36.0,45983.16,0.09,0.06,N,O,1996-04-12,1996-02-28,1996-04-20,TAKE BACK RETURN,MAIL,according to the final foxes. qui
2,1,63700,3701,3,8.0,13309.6,0.1,0.02,N,O,1996-01-29,1996-03-05,1996-01-31,TAKE BACK RETURN,REG AIR,ourts cajole above the furiou
3,1,2132,4633,4,28.0,28955.64,0.09,0.06,N,O,1996-04-21,1996-03-30,1996-05-16,NONE,AIR,s cajole busily above t
4,1,24027,1534,5,24.0,22824.48,0.1,0.04,N,O,1996-03-30,1996-03-14,1996-04-01,NONE,FOB,"the regular, regular pa"


### Query the virtualized Parquet file

In [10]:
# Same five-row preview, but issued through `virtual.query` against the
# virtualized file, with DuckDB selected as the execution engine.
virtual_preview_sql = 'select * from read_parquet("lineitem_virtual.parquet") limit 5'
virtual.query(virtual_preview_sql, engine='duckdb')

Unnamed: 0,l_linenumber,l_discount,l_comment,l_commitdate,l_extendedprice,l_quantity,l_receiptdate,l_shipdate,l_partkey,l_shipmode,l_linestatus,l_returnflag,l_shipinstruct,l_suppkey,l_orderkey,l_tax
0,1,0.04,to beans x-ray carefull,1996-02-12,21168.23,17.0,1996-03-22,1996-03-13,155190,TRUCK,O,N,DELIVER IN PERSON,7706,1,0.02
1,2,0.09,according to the final foxes. qui,1996-02-28,45983.16,36.0,1996-04-20,1996-04-12,67310,MAIL,O,N,TAKE BACK RETURN,7311,1,0.06
2,3,0.1,ourts cajole above the furiou,1996-03-05,13309.6,8.0,1996-01-31,1996-01-29,63700,REG AIR,O,N,TAKE BACK RETURN,3701,1,0.02
3,4,0.09,s cajole busily above t,1996-03-30,28955.64,28.0,1996-05-16,1996-04-21,2132,AIR,O,N,NONE,4633,1,0.06
4,5,0.1,"the regular, regular pa",1996-03-14,22824.48,24.0,1996-04-01,1996-03-30,24027,FOB,O,N,NONE,1534,1,0.04
