# `virtual`: CSV Demo

In [1]:
# (Re)load IPython's autoreload extension; mode 2 re-imports modules before
# each cell runs, so local edits to `virtual` are picked up without a restart.
%reload_ext autoreload
%autoreload 2

import virtual

# Input table used by every cell below.
csv_filepath = 'example.csv'

### One-shot table virtualization

In [2]:
# One-shot virtualization: find functions with the 'sparse-lr' model type and
# write the virtualized table to 'example_virtual.csv' (a CSV file, not Parquet).
# NOTE(review): `prefix` presumably names where intermediate artifacts go — confirm.
virtual.to_format(csv_filepath, 'example_virtual.csv', model_types=['sparse-lr'], prefix='demo-debug/')

Running schema inference..
Drilling functions..
{'num': {'indices': [0, 7, 8, 9], 'names': ['Earnings Year', 'Regular Earnings', 'Overtime Earnings', 'Total Earnings']}, 'date': {'indices': [], 'names': []}, 'timestamp': {'indices': [], 'names': []}, 'time': {'indices': [], 'names': []}, 'other': {'indices': [], 'names': []}, 'string': {'indices': [1, 2, 3, 4, 5, 6], 'names': ['Department', 'Employee Name', 'Position Title', 'Union Name', 'Regular or Temporary', 'Full or Part Time']}, 'boolean': {'indices': [], 'names': []}}
4
We found 2 function(s) in your table.
[{'target-index': 0, 'target-name': 'Earnings Year', 'target-stats': {'mean': np.float64(2019.6546546546547), 'max': 2022.0, 'min': 2010.0}, 'models': {'sparse-lr': {'mse': 1.8572998848441455, 'intercept': np.float64(2019.553519802266), 'coeffs': [{'col-index': 7, 'col-name': 'Regular Earnings', 'coeff': np.float64(3.205195696083787e-06)}]}}}, {'target-index': 9, 'target-name': 'Total Earnings', 'target-stats': {'mean': np.fl

### Compare to vanilla CSV

In [3]:
import os

# Compare the on-disk size of the original CSV with its virtualized counterpart.
# The table label is the input file name without directory or extension.
table = os.path.splitext(os.path.basename(csv_filepath))[0]

# Use `csv_filepath` rather than repeating the literal, and avoid nesting the
# same quote character inside the f-strings: that only parses on Python 3.12+
# (PEP 701) and is a SyntaxError on earlier interpreters.
print(f'[{table}] CSV: {os.path.getsize(csv_filepath) / 1_000_000} MB')
print(f'[{table}] Virtual: {os.path.getsize("example_virtual.csv") / 1_000_000} MB')

[example] CSV: 0.122375 MB
[example] Virtual: 0.112833 MB


### Exploring functions

In [4]:
# Drill the table for functions using the 'sparse-lr' model type.
functions = virtual.train(
  csv_filepath,
  model_types=['sparse-lr'],
  prefix='./demo-debug/',
)

# Bare last expression so the notebook displays the drilled functions.
functions

Drilling functions..
{'num': {'indices': [0, 7, 8, 9], 'names': ['Earnings Year', 'Regular Earnings', 'Overtime Earnings', 'Total Earnings']}, 'date': {'indices': [], 'names': []}, 'timestamp': {'indices': [], 'names': []}, 'time': {'indices': [], 'names': []}, 'other': {'indices': [], 'names': []}, 'string': {'indices': [1, 2, 3, 4, 5, 6], 'names': ['Department', 'Employee Name', 'Position Title', 'Union Name', 'Regular or Temporary', 'Full or Part Time']}, 'boolean': {'indices': [], 'names': []}}
4
We found 2 function(s) in your table.


[{'target-index': 0,
  'target-name': 'Earnings Year',
  'target-stats': {'mean': np.float64(2019.6546546546547),
   'max': 2022.0,
   'min': 2010.0},
  'models': {'sparse-lr': {'mse': 1.8572998848441455,
    'intercept': np.float64(2019.553519802266),
    'coeffs': [{'col-index': 7,
      'col-name': 'Regular Earnings',
      'coeff': np.float64(3.205195696083787e-06)}]}}},
 {'target-index': 9,
  'target-name': 'Total Earnings',
  'target-stats': {'mean': np.float64(32326.906086086085),
   'max': 202038.13,
   'min': 0.78},
  'models': {'sparse-lr': {'mse': 1.277106715403009e-11,
    'intercept': np.float64(-1.4551915228366852e-11),
    'coeffs': [{'col-index': 7,
      'col-name': 'Regular Earnings',
      'coeff': np.float64(1.0000000000000004)},
     {'col-index': 8,
      'col-name': 'Overtime Earnings',
      'coeff': np.float64(1.0000000000000007)}]}}}]

### Compress with functions

In [5]:
# Virtualize the table again, this time handing over the functions drilled
# above instead of asking `to_format` for model types.
virtual.to_format(
  csv_filepath,
  'example_virtual.csv',
  functions,
)

Running schema inference..
[{'target-index': 0, 'target-name': 'Earnings Year', 'target-stats': {'mean': np.float64(2019.6546546546547), 'max': 2022.0, 'min': 2010.0}, 'models': {'sparse-lr': {'mse': 1.8572998848441455, 'intercept': np.float64(2019.553519802266), 'coeffs': [{'col-index': 7, 'col-name': 'Regular Earnings', 'coeff': np.float64(3.205195696083787e-06)}]}}}, {'target-index': 9, 'target-name': 'Total Earnings', 'target-stats': {'mean': np.float64(32326.906086086085), 'max': 202038.13, 'min': 0.78}, 'models': {'sparse-lr': {'mse': 1.277106715403009e-11, 'intercept': np.float64(-1.4551915228366852e-11), 'coeffs': [{'col-index': 7, 'col-name': 'Regular Earnings', 'coeff': np.float64(1.0000000000000004)}, {'col-index': 8, 'col-name': 'Overtime Earnings', 'coeff': np.float64(1.0000000000000007)}]}}}]
{'target-index': 0, 'target-name': 'Earnings Year', 'target-stats': {'mean': np.float64(2019.6546546546547), 'max': 2022.0, 'min': 2010.0}, 'models': {'sparse-lr': {'mse': 1.85729988

### Query the vanilla CSV file

In [6]:
import duckdb

# Baseline: average "Total Earnings" computed by DuckDB directly on the
# original CSV, returned as a pandas DataFrame for display.
duckdb.sql(
  'select avg("Total Earnings") from read_csv("example.csv")'
).df()

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086


### Query the virtualized CSV file

In [7]:
# Same aggregate as the baseline, but issued through `virtual` against the
# virtualized CSV, with DuckDB selected as the engine.
virtual.query(
  'select avg("Total Earnings") from read_csv("example_virtual.csv")',
  engine='duckdb',
)

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086
