# `virtual`: Parquet Demo

In [1]:
%reload_ext autoreload
%autoreload 2

import virtual

# Path of the input (vanilla) Parquet file that the demo virtualizes.
parquet_filepath = 'example.parquet'

### One-shot table virtualization

In [2]:
# One-shot: find functions in the table and write the result in Parquet format.
# ('sparse-lr' is presumably sparse linear regression — TODO confirm.)
virtual.to_format(
  parquet_filepath,
  'example_virtual.parquet',
  model_types=['sparse-lr'],
  prefix='demo-debug/',
)

Running schema inference..
Drilling functions..
{'num': {'indices': [0, 7, 8, 9], 'names': ['Earnings Year', 'Regular Earnings', 'Overtime Earnings', 'Total Earnings']}, 'date': {'indices': [], 'names': []}, 'timestamp': {'indices': [], 'names': []}, 'time': {'indices': [], 'names': []}, 'other': {'indices': [], 'names': []}, 'string': {'indices': [1, 2, 3, 4, 5, 6], 'names': ['Department', 'Employee Name', 'Position Title', 'Union Name', 'Regular or Temporary', 'Full or Part Time']}, 'boolean': {'indices': [], 'names': []}}
4
We found 2 function(s) in your table.
[{'target-index': 0, 'target-name': 'Earnings Year', 'target-stats': {'mean': np.float64(2019.6546546546547), 'max': 2022.0, 'min': 2010.0}, 'models': {'sparse-lr': {'mse': 1.8572998848441455, 'intercept': np.float64(2019.553519802266), 'coeffs': [{'col-index': 7, 'col-name': 'Regular Earnings', 'coeff': np.float64(3.2051956960837913e-06)}]}}}, {'target-index': 9, 'target-name': 'Total Earnings', 'target-stats': {'mean': np.f

### Compare to vanilla `Parquet`

In [3]:
import os

# Output path written by the `virtual.to_format(...)` call earlier in the notebook.
virtual_filepath = 'example_virtual.parquet'

# Label both lines with the input file's base name (no directory, no extension).
table = os.path.splitext(os.path.basename(parquet_filepath))[0]

# Measure the same path that was virtualized (`parquet_filepath`) instead of a
# second hard-coded copy of it, so the label and the measured file cannot diverge.
# The sizes are bound to locals first: reusing the enclosing quote character
# inside an f-string replacement field is a SyntaxError before Python 3.12.
parquet_size = os.path.getsize(parquet_filepath)
virtual_size = os.path.getsize(virtual_filepath)

print(f'[{table}] Parquet: {parquet_size} bytes')
print(f'[{table}] Virtual: {virtual_size} bytes')

[example] Parquet: 44078 bytes
[example] Virtual: 36815 bytes


### Exploring functions

In [4]:
# Run function discovery on its own and keep the result for inspection and reuse.
functions = virtual.train(
  parquet_filepath,
  model_types=['sparse-lr'],
  prefix='./demo-debug/',
)

# Bare last expression so the notebook renders what was found.
functions

Drilling functions..
{'num': {'indices': [0, 7, 8, 9], 'names': ['Earnings Year', 'Regular Earnings', 'Overtime Earnings', 'Total Earnings']}, 'date': {'indices': [], 'names': []}, 'timestamp': {'indices': [], 'names': []}, 'time': {'indices': [], 'names': []}, 'other': {'indices': [], 'names': []}, 'string': {'indices': [1, 2, 3, 4, 5, 6], 'names': ['Department', 'Employee Name', 'Position Title', 'Union Name', 'Regular or Temporary', 'Full or Part Time']}, 'boolean': {'indices': [], 'names': []}}
4
We found 2 function(s) in your table.


[{'target-index': 0,
  'target-name': 'Earnings Year',
  'target-stats': {'mean': np.float64(2019.6546546546547),
   'max': 2022.0,
   'min': 2010.0},
  'models': {'sparse-lr': {'mse': 1.8572998848441455,
    'intercept': np.float64(2019.553519802266),
    'coeffs': [{'col-index': 7,
      'col-name': 'Regular Earnings',
      'coeff': np.float64(3.2051956960837913e-06)}]}}},
 {'target-index': 9,
  'target-name': 'Total Earnings',
  'target-stats': {'mean': np.float64(32326.906086086085),
   'max': 202038.13,
   'min': 0.78},
  'models': {'sparse-lr': {'mse': 5.6050884714671134e-11,
    'intercept': np.float64(-6.184563972055912e-11),
    'coeffs': [{'col-index': 7,
      'col-name': 'Regular Earnings',
      'coeff': np.float64(1.000000000000002)},
     {'col-index': 8,
      'col-name': 'Overtime Earnings',
      'coeff': np.float64(1.0000000000000004)}]}}}]

### Compress with functions

In [5]:
# Write the virtualized file again, this time passing the trained `functions` explicitly.
virtual.to_format(
  parquet_filepath,
  'example_virtual.parquet',
  functions,
)

Running schema inference..
[{'target-index': 0, 'target-name': 'Earnings Year', 'target-stats': {'mean': np.float64(2019.6546546546547), 'max': 2022.0, 'min': 2010.0}, 'models': {'sparse-lr': {'mse': 1.8572998848441455, 'intercept': np.float64(2019.553519802266), 'coeffs': [{'col-index': 7, 'col-name': 'Regular Earnings', 'coeff': np.float64(3.2051956960837913e-06)}]}}}, {'target-index': 9, 'target-name': 'Total Earnings', 'target-stats': {'mean': np.float64(32326.906086086085), 'max': 202038.13, 'min': 0.78}, 'models': {'sparse-lr': {'mse': 5.6050884714671134e-11, 'intercept': np.float64(-6.184563972055912e-11), 'coeffs': [{'col-index': 7, 'col-name': 'Regular Earnings', 'coeff': np.float64(1.000000000000002)}, {'col-index': 8, 'col-name': 'Overtime Earnings', 'coeff': np.float64(1.0000000000000004)}]}}}]
{'target-index': 0, 'target-name': 'Earnings Year', 'target-stats': {'mean': np.float64(2019.6546546546547), 'max': 2022.0, 'min': 2010.0}, 'models': {'sparse-lr': {'mse': 1.85729988

### Read the file

In [6]:
# Read the virtualized file back and preview its first five rows.
# NOTE(review): the saved output of this cell ends in `KeyError: 'schema'`, and the
# printed metadata has 'header' and 'functions' keys but no 'schema' key.
# TODO: re-run on a fresh kernel and confirm `from_format` succeeds before sharing.
df = virtual.from_format('example_virtual.parquet')

df.head(5)

{'header': ['Employee Name', 'Regular Earnings', 'Position Title', 'Overtime Earnings', 'Full or Part Time', 'Union Name', 'Earnings Year', 'Department', 'Regular or Temporary'], 'functions': {'greedy': {'obj-value': 6916, 'chosen': [{'target-index': 9, 'target-name': 'Total Earnings', 'model-type': 'sparse-lr', 'sparse-lr': {'mse': 5.6050884714671134e-11, 'intercept': -6.184563972055912e-11, 'coeffs': [{'col-index': 7, 'col-name': 'Regular Earnings', 'coeff': 1.000000000000002}, {'col-index': 8, 'col-name': 'Overtime Earnings', 'coeff': 1.0000000000000004}]}, 'left': [7, 8], 'right': 9, 'gain': 6916}], 'time': 9.059906005859375e-06}}}


KeyError: 'schema'

### Query the vanilla Parquet file

In [7]:
import duckdb

# Baseline: average of "Total Earnings" computed by DuckDB directly on the
# original Parquet file; the result is converted to a DataFrame for display.
duckdb.sql(
  'select avg("Total Earnings") from read_parquet("example.parquet")'
).df()

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086


### Query the virtualized Parquet file

In [8]:
# Same aggregate as the baseline, but against the virtualized file.
# "Total Earnings" is not listed in that file's header (see the output below), so
# the query goes through `virtual.query` — presumably it rewrites the column in
# terms of the stored ones before handing the SQL to the chosen engine.
virtual.query(
  'select avg("Total Earnings") from read_parquet("example_virtual.parquet")',
  engine='duckdb',
)

{'header': ['Employee Name', 'Regular Earnings', 'Position Title', 'Overtime Earnings', 'Full or Part Time', 'Union Name', 'Earnings Year', 'Department', 'Regular or Temporary'], 'schema': [{'name': 'Earnings Year', 'type': 'BIGINT', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Department', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Employee Name', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Position Title', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Union Name', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Regular or Temporary', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Full or Part Time', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Regular Ear

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086
