# `virtual`: CSV Demo

In [1]:
# (Re)load IPython's autoreload extension; `reload_ext` is safe to re-run in a live kernel.
%reload_ext autoreload
# Mode 2 re-imports modules before each cell runs, so local edits to `virtual` take effect.
%autoreload 2

import virtual

# Path of the input CSV that the compression and training cells below operate on.
csv_filepath = 'example.csv'

### One-shot table compression

In [2]:
# Compress the CSV in a single call: the functions are drilled and the
# virtualized table is written to 'example_virtual.csv' (a CSV target, not Parquet).
# NOTE(review): `prefix` looks like a directory for debug artifacts -- confirm against `virtual`'s docs.
virtual.to_format(
    csv_filepath,
    'example_virtual.csv',
    model_types=['sparse-lr'],
    prefix='demo-debug/',
)

Running schema inference..
Drilling functions..
     Earnings Year  Regular Earnings  Overtime Earnings  Total Earnings
722           2020          53353.99            2598.24        55952.23
77            2020           6403.80               0.00         6403.80
877           2020          15061.67               0.00        15061.67
613           2020           2583.70               0.00         2583.70
903           2020          39549.48               0.00        39549.48
..             ...               ...                ...             ...
835           2020          17824.52               0.00        17824.52
192           2020          76032.78             235.71        76268.49
629           2020          52957.48            8044.19        61001.67
559           2020          15074.50               0.00        15074.50
684           2020          42466.97            1971.51        44438.48

[999 rows x 4 columns]
BBBBBBBBB
[7, 8, 9]
['Earnings Year', 'Regular Earnings', 'Overt

### Compare to vanilla CSV

In [3]:
import os

# Report the on-disk size of the original CSV next to its virtualized counterpart.
# The f-strings are double-quoted on the outside so the single-quoted paths inside
# also parse on Python versions before 3.12 (reusing the same quote needs PEP 701).
print(f"CSV: {os.path.getsize('example.csv')} bytes")
print(f"Virtual: {os.path.getsize('example_virtual.csv')} bytes")

CSV: 122375 bytes
Virtual: 112833 bytes


### Exploring functions

In [4]:
# Train ("drill") the functions on their own so they can be inspected and reused.
functions = virtual.train(
    csv_filepath,
    model_types=['sparse-lr'],
    prefix='./demo-debug/',
)

# Keep the result as the cell's last expression so the notebook displays it.
functions

Drilling functions..
     Earnings Year  Regular Earnings  Overtime Earnings  Total Earnings
722           2020          53353.99            2598.24        55952.23
77            2020           6403.80               0.00         6403.80
877           2020          15061.67               0.00        15061.67
613           2020           2583.70               0.00         2583.70
903           2020          39549.48               0.00        39549.48
..             ...               ...                ...             ...
835           2020          17824.52               0.00        17824.52
192           2020          76032.78             235.71        76268.49
629           2020          52957.48            8044.19        61001.67
559           2020          15074.50               0.00        15074.50
684           2020          42466.97            1971.51        44438.48

[999 rows x 4 columns]
BBBBBBBBB
[7, 8, 9]
['Earnings Year', 'Regular Earnings', 'Overtime Earnings', 'Total Earni

[{'target_index': 0,
  'target_name': 'Earnings Year',
  'target_stats': {'mean': np.float64(2019.6546546546547),
   'max': 2022.0,
   'min': 2010.0},
  'models': {'sparse-lr': {'mse': 1.8555809279793825,
    'intercept': np.float64(2019.5465662140489),
    'coeffs': [{'col-index': 7,
      'col-name': 'Regular Earnings',
      'coeff': np.float64(-1.10293761901915e-05)},
     {'col-index': 8,
      'col-name': 'Overtime Earnings',
      'coeff': np.float64(2.4551014714109545e-05)},
     {'col-index': 9,
      'col-name': 'Total Earnings',
      'coeff': np.float64(1.3521638523918047e-05)}]}}},
 {'target_index': 7,
  'target_name': 'Regular Earnings',
  'target_stats': {'mean': np.float64(31553.40951951952),
   'max': 202038.13,
   'min': 0.78},
  'models': {'sparse-lr': {'mse': 3.160278351151207e-11,
    'intercept': np.float64(1.0913936421275139e-11),
    'coeffs': [{'col-index': 8,
      'col-name': 'Overtime Earnings',
      'coeff': np.float64(-0.9999999999999851)},
     {'col-ind

### Compress with functions

In [5]:
# Compress the same CSV again, this time handing over the already-trained functions.
virtual.to_format(
    csv_filepath,
    'example_virtual.csv',
    functions,
)

Running schema inference..
Creating the virtual file..


### Query the vanilla CSV file

In [6]:
import duckdb

# Baseline: average of "Total Earnings" computed by DuckDB directly on the original CSV.
(
    duckdb
    .sql('select avg("Total Earnings") from read_csv("example.csv")')
    .df()
)

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086


### Query the virtualized CSV file

In [7]:
# Same aggregate as the baseline query, but issued through `virtual` against the
# virtualized file, with DuckDB selected as the execution engine.
virtual.query(
    'select avg("Total Earnings") from read_csv("example_virtual.csv")',
    engine='duckdb',
)

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086
