# `virtual`: CSV Demo

In [1]:
%reload_ext autoreload
%autoreload 2

import virtual

# Input CSV for the demo; passed to the `virtual.to_format` and `virtual.train` calls below.
csv_filepath = 'example.csv'

### One-Shot Table Compression

In [2]:
# One-shot compression: write the virtualized table to a CSV file
# (the target path ends in .csv, so the output here is CSV, not Parquet).
virtual.to_format(
    csv_filepath,
    'example_virtual.csv',
    model_types=['sparse-lr'],
    prefix='demo-debug/',
)

Schema
[{'name': 'Earnings Year', 'type': 'BIGINT', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Department', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Employee Name', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Position Title', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Union Name', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Regular or Temporary', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Full or Part Time', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Regular Earnings', 'type': 'DOUBLE', 'null': {'any': False, 'all': False}, 'scale': 2, 'precision': 8}, {'name': 'Overtime Earnings', 'type': 'DOUBLE', 'null': {'any': False, 'all': False}, 'sc

### Compare to vanilla CSV

In [3]:
import os

# Print the on-disk size of the original CSV next to the virtualized one.
# The outer f-string quotes are double quotes so the nested single-quoted
# literal stays valid on Python < 3.12 (reusing the same quote type inside
# an f-string replacement field is only allowed from 3.12 onwards).
# The input path comes from `csv_filepath` so it stays in sync with the
# file that was actually compressed.
print(f"CSV: {os.path.getsize(csv_filepath)} bytes")
print(f"Virtual: {os.path.getsize('example_virtual.csv')} bytes")

CSV: 122375 bytes
Virtual: 112833 bytes


### Exploring functions

In [4]:
# Train on the CSV with the 'sparse-lr' model type and keep the returned
# functions so they can be inspected and reused in later cells.
functions = virtual.train(
    csv_filepath,
    model_types=['sparse-lr'],
    prefix='./demo-debug/',
)

# Bare last expression: the notebook displays the trained functions.
functions

start virtualize
[{'name': 'Earnings Year', 'type': 'BIGINT'}, {'name': 'Department', 'type': 'VARCHAR'}, {'name': 'Employee Name', 'type': 'VARCHAR'}, {'name': 'Position Title', 'type': 'VARCHAR'}, {'name': 'Union Name', 'type': 'VARCHAR'}, {'name': 'Regular or Temporary', 'type': 'VARCHAR'}, {'name': 'Full or Part Time', 'type': 'VARCHAR'}, {'name': 'Regular Earnings', 'type': 'DOUBLE'}, {'name': 'Overtime Earnings', 'type': 'DOUBLE'}, {'name': 'Total Earnings', 'type': 'DOUBLE'}]
{'date': [], 'string': ['Department', 'Employee Name', 'Position Title', 'Union Name', 'Regular or Temporary', 'Full or Part Time'], 'boolean': []}
['Earnings Year', 'Department', 'Employee Name', 'Position Title', 'Union Name', 'Regular or Temporary', 'Full or Part Time', 'Regular Earnings', 'Overtime Earnings', 'Total Earnings']
[0, 7, 8, 9]
(999, 4)
We found 4 function(s) in your table.
[{'target_index': 0, 'target_name': 'Earnings Year', 'target_stats': {'mean': np.float64(2019.6546546546547), 'max': 20

[{'target_index': 0,
  'target_name': 'Earnings Year',
  'target_stats': {'mean': np.float64(2019.6546546546547),
   'max': 2022.0,
   'min': 2010.0},
  'models': {'sparse-lr': {'mse': 1.8555809279793825,
    'intercept': np.float64(2019.5465662140489),
    'coeffs': [{'col-index': 8,
      'col-name': 'Overtime Earnings',
      'coeff': np.float64(3.558039090430056e-05)},
     {'col-index': 9,
      'col-name': 'Total Earnings',
      'coeff': np.float64(2.4922623337265483e-06)}]}}},
 {'target_index': 7,
  'target_name': 'Regular Earnings',
  'target_stats': {'mean': np.float64(31553.40951951952),
   'max': 202038.13,
   'min': 0.78},
  'models': {'sparse-lr': {'mse': 3.160278351151207e-11,
    'intercept': np.float64(1.0913936421275139e-11),
    'coeffs': [{'col-index': 8,
      'col-name': 'Overtime Earnings',
      'coeff': np.float64(-0.9999999999999851)},
     {'col-index': 9,
      'col-name': 'Total Earnings',
      'coeff': np.float64(0.9999999999999993)}]}}},
 {'target_index'

### Compress with functions

In [5]:
# Compress again, this time handing over the `functions` returned by
# `virtual.train` above as the third argument.
virtual.to_format(
    csv_filepath,
    'example_virtual.csv',
    functions,
)

Schema
[{'name': 'Earnings Year', 'type': 'BIGINT', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Department', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Employee Name', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Position Title', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Union Name', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Regular or Temporary', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Full or Part Time', 'type': 'VARCHAR', 'null': {'any': False, 'all': False}, 'scale': 0, 'precision': 0}, {'name': 'Regular Earnings', 'type': 'DOUBLE', 'null': {'any': False, 'all': False}, 'scale': 2, 'precision': 8}, {'name': 'Overtime Earnings', 'type': 'DOUBLE', 'null': {'any': False, 'all': False}, 'sc

### Query the vanilla CSV file

In [6]:
import duckdb

# Baseline: average "Total Earnings" computed by DuckDB directly on the
# original CSV; `.df()` turns the result into a DataFrame for display.
duckdb.sql(
    'select avg("Total Earnings") from read_csv("example.csv")'
).df()

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086


### Query the virtualized CSV file

In [7]:
# Same aggregate as the baseline query, but issued through `virtual.query`
# against the virtualized file, with DuckDB selected as the engine.
virtual.query(
    'select avg("Total Earnings") from read_csv("example_virtual.csv")',
    engine='duckdb',
)

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086
