# `virtual`: Parquet Demo

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes (edits to their source
# are picked up without restarting the kernel).
%reload_ext autoreload
%autoreload 2

import virtual

# Input Parquet file; passed to `virtual.to_format` and `virtual.train` below.
parquet_filepath = 'example.parquet'

### One-shot table compression

In [2]:
# One call does everything: the input table is analysed and the result is
# written out as a virtualized Parquet file.
virtual.to_format(
    parquet_filepath,
    'example_virtual.parquet',
    model_types=['sparse-lr'],
    prefix='demo-debug/',
)

Running schema inference..
Drilling functions..
We found 4 function(s) in your table.
Creating the virtual file..


### Compare to vanilla `Parquet`

In [3]:
import os

# Report the on-disk size of the vanilla file and of the virtualized file.
# The path literals are double-quoted on purpose: reusing the enclosing quote
# character inside an f-string replacement field is a SyntaxError on every
# Python version before 3.12, so the mixed quoting keeps this cell portable.
print(f'Parquet: {os.path.getsize("example.parquet")} bytes')
print(f'Virtual: {os.path.getsize("example_virtual.parquet")} bytes')

Parquet: 44078 bytes
Virtual: 36814 bytes


### Exploring functions

In [4]:
# Run only the function-discovery step and keep its result, so it can be
# inspected here and handed to `virtual.to_format` later on.
functions = virtual.train(
    parquet_filepath,
    model_types=['sparse-lr'],
    prefix='./demo-debug/',
)

functions

Drilling functions..
We found 4 function(s) in your table.


[{'target_index': 0,
  'target_name': 'Earnings Year',
  'target_stats': {'mean': np.float64(2019.6546546546547),
   'max': 2022.0,
   'min': 2010.0},
  'models': {'sparse-lr': {'mse': 1.8555809279793827,
    'intercept': np.float64(2019.5465662140489),
    'coeffs': [{'col-index': 8,
      'col-name': 'Overtime Earnings',
      'coeff': np.float64(3.5580390904300555e-05)},
     {'col-index': 9,
      'col-name': 'Total Earnings',
      'coeff': np.float64(2.492262333726554e-06)}]}}},
 {'target_index': 7,
  'target_name': 'Regular Earnings',
  'target_stats': {'mean': np.float64(31553.40951951952),
   'max': 202038.13,
   'min': 0.78},
  'models': {'sparse-lr': {'mse': 2.6357838956405258e-11,
    'intercept': np.float64(-3.637978807091713e-12),
    'coeffs': [{'col-index': 8,
      'col-name': 'Overtime Earnings',
      'coeff': np.float64(-0.9999999999999871)},
     {'col-index': 9,
      'col-name': 'Total Earnings',
      'coeff': np.float64(0.9999999999999998)}]}}},
 {'target_index

### Compress with functions

In [5]:
# Write the virtualized file again, this time supplying the functions that
# were already drilled instead of asking for model types.
virtual.to_format(
    parquet_filepath,
    'example_virtual.parquet',
    functions,
)

Running schema inference..
Creating the virtual file..


### Read the file

In [6]:
# Load the virtualized file back through `virtual`.
# NOTE(review): presumably returns a pandas DataFrame (hence `df`) — confirm.
df = virtual.from_format('example_virtual.parquet')

### Query the vanilla Parquet file

In [7]:
import duckdb

# Baseline: average "Total Earnings" computed by DuckDB directly on the
# original Parquet file, returned as a DataFrame for display.
duckdb.sql(
    'select avg("Total Earnings") from read_parquet("example.parquet")'
).df()

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086


### Query the virtualized Parquet file

In [8]:
# Same aggregate as the vanilla query, but issued through `virtual.query`
# against the virtualized file, with DuckDB selected as the engine.
virtual.query(
    'select avg("Total Earnings") from read_parquet("example_virtual.parquet")',
    engine='duckdb',
)

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086
