# `virtual`: How to virtualize Redset

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local packages take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

import virtual

# S3 location of the source Parquet file that the cells below virtualize and query.
parquet_filepath = 's3://redshift-downloads/redset/serverless/full.parquet'

### One-shot table virtualization

In [2]:
# Virtualize the source table in a single call and write the result to a local file.
# NOTE(review): the meaning of `prefix` is not visible in this notebook — confirm
# against the `virtual` documentation.
virtual.to_format(
  parquet_filepath,
  'redset_virtual.parquet',
  prefix='demo-debug/',
)

Running schema inference..
Drilling functions..
We found 9 function candidate(s) in your table.
Let's see how many benefit virtualization..
It seems that 1 function(s) can indeed be used for virtualization.
Creating the virtual file..
Done.


### Compare to vanilla `Parquet`

In [3]:
import os
import demo_util
%pip install requests

table = os.path.splitext(os.path.basename(parquet_filepath))[0]
print(f'[{table}] Parquet size: {demo_util.get_file_size(parquet_filepath) / 1_000_000} MB')
print(f'[{table}] Virtual size: {os.path.getsize('redset_virtual.parquet') / 1_000_000} MB')

Note: you may need to restart the kernel to use updated packages.
[full] Parquet size: 322.619676 MB
[full] Virtual size: 314.556479 MB


### Query the vanilla Parquet file

In [4]:
import duckdb

# Baseline: run the aggregation directly on the original Parquet file.
# Adjacent string literals are concatenated by Python into one single-line statement.
vanilla_query = (
  'select num_scans, count(*) '
  f'from read_parquet("{parquet_filepath}") '
  'group by num_scans order by num_scans;'
)
duckdb.sql(vanilla_query).fetchdf()

Unnamed: 0,num_scans,count_star()
0,0,4992205
1,1,2191086
2,2,910544
3,3,37527
4,4,33871
...,...,...
123,226,63
124,246,4
125,261,3
126,262,22


### Query the virtualized Parquet file

In [5]:
# Same aggregation as the baseline, this time issued through `virtual` against the
# virtualized file, with DuckDB as the execution engine.
virtual_query = (
  'select num_scans, count(*) '
  'from read_parquet("redset_virtual.parquet") '
  'group by num_scans order by num_scans'
)
virtual.query(virtual_query, engine='duckdb')

Unnamed: 0,"(CAST(round(((0.8423 * num_joins) + 0.285), 0) AS BIGINT) + num_scans_offset)",count_star()
0,0,4992205
1,1,2191086
2,2,910544
3,3,37527
4,4,33871
...,...,...
123,226,63
124,246,4
125,261,3
126,262,22
