# `virtual`: How to virtualize Redset

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and local source edits take effect
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import virtual

# Input Parquet file read by the cells below (Redset serverless sample on S3).
s3_parquet_filepath = 's3://redshift-downloads/redset/serverless/sample_0.01.parquet'
# Alternative input: a local Parquet file; uncomment to use it instead of the S3 object.
# s3_parquet_filepath = '../../demo/data/redset-serverless.parquet'

### One-shot table compression

In [2]:
# Build the virtualized counterpart of the S3-hosted Parquet file in one call.
# The second positional argument is the output path queried later on.
virtual.to_format(
    s3_parquet_filepath,
    'redset_virtual.parquet',
    schema='./demo-debug/schema_redset-serverless.json',
    model_types=['sparse-lr'],
    prefix='demo-debug/',
)

Drilling functions..
[[104.   0.   0. ...   1.   2.   3.]
 [104.   0.   0. ...   1.   2.   3.]
 [104.   0.   0. ...   0.   0.   2.]
 ...
 [104.   0.   0. ...   0.   0.   2.]
 [ 19.   0.   0. ...   1.   2.   2.]
 [134.   0.   0. ...   0.   0.   2.]]
We found 9 function(s) in your table.
Creating the virtual file..


### Compare to vanilla `Parquet`

In [3]:
import os
import demo_util
%pip install requests

print(f'Redset parquet size: {demo_util.get_file_size(s3_parquet_filepath) / 1_000_000} MB')
print(f'Redset virtual size: {os.path.getsize('redset_virtual.parquet') / 1_000_000} MB')

Note: you may need to restart the kernel to use updated packages.
Redset parquet size: 4.90608 MB
Redset virtual size: 4.762818 MB


### Query the vanilla Parquet file

In [4]:
import duckdb

# Baseline: run the aggregation straight on the original (non-virtualized)
# Parquet file -- one row per distinct `num_scans` value with its row count.
vanilla_sql = f'select num_scans, count(*) from read_parquet("{s3_parquet_filepath}") group by num_scans order by num_scans;'

# Materialize the result as a DataFrame; as the cell's last expression it is
# rendered as the cell output.
duckdb.sql(vanilla_sql).fetchdf()

Unnamed: 0,num_scans,count_star()
0,0,49983
1,1,21967
2,2,9025
3,3,387
4,4,323
5,5,74
6,6,39
7,7,80
8,8,33
9,9,22


### Query the virtualized Parquet file

In [5]:
# Run the `num_scans` aggregation against the virtualized file, going through
# `virtual.query` with DuckDB selected as the execution engine.
virtual_sql = 'select num_scans, count(*) from read_parquet("redset_virtual.parquet") group by num_scans order by num_scans'
virtual.query(virtual_sql, engine='duckdb')

Unnamed: 0,num_scans,count_star()
0,0,49983
1,1,21967
2,2,9025
3,3,387
4,4,323
5,5,74
6,6,39
7,7,80
8,8,33
9,9,22
