# `virtual`: DataFrame Demo

In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import virtual

# Load the example CSV into a DataFrame; the cells below hand it to `virtual`.
df = pd.read_csv('example.csv')

### One-shot table virtualization

In [2]:
# Save in virtualized CSV format (the target file has a .csv extension, not Parquet).
# NOTE(review): `prefix` presumably names a directory for intermediate/debug
# artifacts -- confirm against the `virtual` documentation.
virtual.to_format(df, 'example_virtual.csv', prefix='demo-debug/')

Running schema inference..
Drilling functions..
We found 2 function candidate(s) in your table.
Let's see how many benefit virtualization..
It seems that 1 function(s) can indeed be used for virtualization.
Creating the virtual file..
Done.


### Compare to vanilla CSV

In [3]:
import os

# Report the on-disk size of the plain CSV and of its virtualized counterpart
# so the two can be compared directly.
# The outer f-string quotes deliberately differ from the quotes used inside the
# replacement fields: reusing the same quote character there is a SyntaxError
# on Python versions before 3.12.
print(f"CSV: {os.path.getsize('example.csv')} bytes")
print(f"Virtual: {os.path.getsize('example_virtual.csv')} bytes")

CSV: 122375 bytes
Virtual: 112833 bytes


### Exploring functions

In [4]:
# Drill the functions, restricting the search to the 'sparse-lr' model type.
# NOTE(review): 'sparse-lr' presumably stands for sparse linear regression -- confirm.
functions = virtual.train(df, model_types=['sparse-lr'], prefix='./demo-debug/')

# Bare last expression: show the returned candidates as the cell's output.
functions

Drilling functions..
We found 2 function candidate(s) in your table.


[{'category': 'num',
  'target-index': 0,
  'target-name': 'Earnings Year',
  'target-stats': {'mean': np.float64(2019.6546546546547),
   'max': 2022.0,
   'min': 2010.0},
  'models': {'sparse-lr': {'mse': 1.8572998848441455,
    'intercept': np.float64(2019.553519802266),
    'coeffs': [{'col-index': 7,
      'col-name': 'Regular Earnings',
      'coeff': np.float64(3.205195696083787e-06)}]}}},
 {'category': 'num',
  'target-index': 9,
  'target-name': 'Total Earnings',
  'target-stats': {'mean': np.float64(32326.906086086085),
   'max': 202038.13,
   'min': 0.78},
  'models': {'sparse-lr': {'mse': 1.277106715403009e-11,
    'intercept': np.float64(-1.4551915228366852e-11),
    'coeffs': [{'col-index': 7,
      'col-name': 'Regular Earnings',
      'coeff': np.float64(1.0000000000000004)},
     {'col-index': 8,
      'col-name': 'Overtime Earnings',
      'coeff': np.float64(1.0000000000000007)}]}}}]

### Compress with functions

In [5]:
# Use them for compression: the functions returned by `virtual.train` are passed
# in explicitly, and the log printed by this cell contains no
# "Drilling functions.." step. The call targets the same output path as the
# one-shot call earlier in the notebook.
virtual.to_format(df, 'example_virtual.csv', functions)

Running schema inference..
Let's see how many benefit virtualization..
It seems that 1 function(s) can indeed be used for virtualization.
Creating the virtual file..
Done.


### Read the file

In [6]:
# Read the virtualized file back through `virtual`.
# NOTE: this rebinds `df`, so the frame loaded from example.csv is no longer
# reachable under that name. Re-running the cells that consume `df` without
# first re-running the load cell would operate on this round-tripped frame.
df = virtual.from_format('example_virtual.csv')

# Preview the first five rows.
df.head(5)

Unnamed: 0,Earnings Year,Department,Employee Name,Position Title,Union Name,Regular or Temporary,Full or Part Time,Regular Earnings,Overtime Earnings,"""Total Earnings"""
0,2010,Law,"Allen,Marcy E",Secretary,Excluded Personnel (Admin I),R,F,38578.75,0.0,38578.75
1,2010,Health-Administration,"Peeling,Carol A",Spvg Public Health Nurse,Br Admin & Professional Assoc,R,P,45259.79,0.0,45259.79
2,2010,Health-Administration,"Picciano,Cindy L",Sr Accountant,Non Union Equivalent - 07,T,P,1058.94,0.0,1058.94
3,2010,Health-Administration,"Picciano,Cindy L",Sr Accountant,Br Admin & Professional Assoc,R,F,43137.16,7.67,43144.83
4,2010,Health-Administration,"Riese,Nadine K",Nutrition Assistant,"CSEA Local 6150, Full-time",R,F,30905.73,0.0,30905.73


### Query the vanilla CSV file

In [7]:
import duckdb

# Baseline: average of "Total Earnings" computed by DuckDB directly over the
# plain CSV; `.df()` materializes the result as a pandas DataFrame for display.
duckdb.sql('select avg("Total Earnings") from read_csv("example.csv")').df()

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086


### Query the virtualized CSV file

In [8]:
# Same aggregate as the plain-CSV query, but issued through `virtual.query`
# against the virtualized file, with DuckDB selected as the engine.
virtual.query(
  'select avg("Total Earnings") from read_csv("example_virtual.csv")',
  engine = 'duckdb'
)

Unnamed: 0,"avg(""Total Earnings"")"
0,32326.906086
