# `virtual`: Parquet Demo

In [1]:
# Reload edited modules automatically before each cell executes, so changes
# to the `virtual` package are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import virtual

# Input Parquet file used by the compression, training and size-comparison cells below.
parquet_filepath = 'example.parquet'

### One-shot table compression

In [2]:
# One-shot path: write the virtualized Parquet file directly from the input,
# restricting the model search to sparse linear regression.
virtual.to_format(
  parquet_filepath,
  'example_virtual.parquet',
  model_types=['sparse-lr'],
  prefix='demo-debug/',
)

Running schema inference..
Drilling functions..
     Earnings Year  Regular Earnings  Overtime Earnings  Total Earnings
0             2020            125.11               0.00          125.11
1             2020          38703.38           11743.82        50447.20
2             2020          10431.20               0.00        10431.20
3             2020          37306.10             492.61        37798.71
4             2020          86453.24               0.00        86453.24
..             ...               ...                ...             ...
994           2020          38740.58            5333.90        44074.48
995           2020          25286.04               0.00        25286.04
996           2010          45259.79               0.00        45259.79
997           2020          40130.06             159.78        40289.84
998           2020           5232.00               0.00         5232.00

[999 rows x 4 columns]

BBBBBBBBB >>>Earnings Year
[7, 8, 9]
['Earnings Year', 'Regular

### Compare to vanilla `Parquet`

In [3]:
import os

# Table label = input file name without directory or extension.
table = os.path.splitext(os.path.basename(parquet_filepath))[0]

# Measure the sizes up front instead of inside the f-strings: reusing the same
# quote character inside an f-string expression is a SyntaxError before
# Python 3.12. The original file is taken from `parquet_filepath` so the label
# and the measured file cannot drift apart.
parquet_size = os.path.getsize(parquet_filepath)
virtual_size = os.path.getsize('example_virtual.parquet')

print(f'[{table}] Parquet: {parquet_size} bytes')
print(f'[{table}] Virtual: {virtual_size} bytes')

Parquet: 44078 bytes
Virtual: 36806 bytes


### Exploring functions

In [4]:
# Run the function-discovery step on its own so the result can be inspected
# before it is used for compression.
functions = virtual.train(
  parquet_filepath,
  model_types=['sparse-lr'],
  prefix='./demo-debug/',
)

# Bare last expression: display the discovered functions.
functions

Drilling functions..
     Earnings Year  Regular Earnings  Overtime Earnings  Total Earnings
0             2020            125.11               0.00          125.11
1             2020          38703.38           11743.82        50447.20
2             2020          10431.20               0.00        10431.20
3             2020          37306.10             492.61        37798.71
4             2020          86453.24               0.00        86453.24
..             ...               ...                ...             ...
994           2020          38740.58            5333.90        44074.48
995           2020          25286.04               0.00        25286.04
996           2010          45259.79               0.00        45259.79
997           2020          40130.06             159.78        40289.84
998           2020           5232.00               0.00         5232.00

[999 rows x 4 columns]

BBBBBBBBB >>>Earnings Year
[7, 8, 9]
['Earnings Year', 'Regular Earnings', 'Overtime Earni

[{'target_index': 0,
  'target_name': 'Earnings Year',
  'target_stats': {'mean': np.float64(2019.6546546546547),
   'max': 2022.0,
   'min': 2010.0},
  'models': {'sparse-lr': {'mse': 1.8572998848441455,
    'intercept': np.float64(2019.553519802266),
    'coeffs': [{'col-index': 7,
      'col-name': 'Regular Earnings',
      'coeff': np.float64(3.2051956960837913e-06)}]}}},
 {'target_index': 9,
  'target_name': 'Total Earnings',
  'target_stats': {'mean': np.float64(32326.906086086085),
   'max': 202038.13,
   'min': 0.78},
  'models': {'sparse-lr': {'mse': 5.6050884714671134e-11,
    'intercept': np.float64(-6.184563972055912e-11),
    'coeffs': [{'col-index': 7,
      'col-name': 'Regular Earnings',
      'coeff': np.float64(1.000000000000002)},
     {'col-index': 8,
      'col-name': 'Overtime Earnings',
      'coeff': np.float64(1.0000000000000004)}]}}}]

### Compress with functions

In [5]:
# Compress again, this time passing the functions obtained from
# `virtual.train` instead of letting `to_format` search for them.
virtual.to_format(
  parquet_filepath,
  'example_virtual.parquet',
  functions,
)

Running schema inference..
Earnings Year sparse-lr -25
Total Earnings sparse-lr 6916
Creating the virtual file..


### Read the file

In [6]:
# Load the virtualized file produced by `virtual.to_format` back into memory.
# NOTE(review): presumably this returns a pandas DataFrame (hence `df`) — confirm against the `virtual` API.
df = virtual.from_format('example_virtual.parquet')

### Query the vanilla Parquet file

In [7]:
import duckdb

# Baseline: run the aggregate directly against the original Parquet file
# and show the result as a DataFrame.
duckdb.sql(
  'select avg("Total Earnings") from read_parquet("example.parquet")'
).df()

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086


### Query the virtualized Parquet file

In [8]:
# Same aggregate as the baseline query, but issued through `virtual.query`
# against the virtualized file, with DuckDB selected as the engine.
virtual.query(
  'select avg("Total Earnings") from read_parquet("example_virtual.parquet")',
  engine='duckdb',
)

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086
