# `virtual`: How to virtualize Redset

In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import virtual

# Input Parquet file used by every cell below. The commented-out line is an
# alternative remote (S3) source; the active assignment points at a local file.
# parquet_filepath = 's3://redshift-downloads/redset/serverless/sample_0.01.parquet'
parquet_filepath = '../../demo/data/redset-serverless.parquet'

### One-shot table virtualization

In [2]:
# Virtualize the Parquet file at `parquet_filepath` (a local path, unless the
# S3 URL in the first cell is enabled instead). Later cells read the result
# from 'redset_virtual.parquet'.
# NOTE(review): the semantics of `model_types` and `prefix` are not visible in
# this notebook — confirm against the `virtual.to_format` documentation.
virtual.to_format(parquet_filepath, 'redset_virtual.parquet', model_types=['sparse-lr'], prefix='demo-debug/')

Running schema inference..
Drilling functions..
      instance_id  cluster_size  user_id  database_id  query_id  \
0              19             0        0            0    210701   
1              19             0        0            0    590465   
2             127             0        1            0     87545   
3               0             0        0            0     60070   
4              55             0        0            0     34184   
...           ...           ...      ...          ...       ...   
9995          153             0        0            0     43016   
9996          104             0        0            0    414893   
9997          104             0        0            0    865100   
9998            1             0        1            0    118892   
9999            1             0        3            0     89427   

      compile_duration_ms  queue_duration_ms  execution_duration_ms  \
0                    57.0                  0                     62   
1    

### Compare to vanilla `Parquet`

In [3]:
%load_ext autoreload
%autoreload 2

import os
import demo_util
%pip install requests

table = os.path.splitext(os.path.basename(parquet_filepath))[0]
print(f'[{table}] Parquet size: {demo_util.get_file_size(parquet_filepath) / 1_000_000} MB')
print(f'[{table}] Virtual size: {os.path.getsize('redset_virtual.parquet') / 1_000_000} MB')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Note: you may need to restart the kernel to use updated packages.
[redset-serverless] Parquet size: 322.619676 MB
[redset-serverless] Virtual size: 314.556389 MB


### Query the vanilla Parquet file

In [4]:
import duckdb

# Baseline: count queries per `num_joins` value directly on the original
# Parquet file, using plain DuckDB. The cell's last expression is shown as a
# DataFrame.
vanilla_sql = f'select num_joins, count(*) from read_parquet("{parquet_filepath}") group by num_joins order by num_joins;'
duckdb.sql(vanilla_sql).fetchdf()

Unnamed: 0,num_joins,count_star()
0,0,6118504
1,1,1433559
2,2,544264
3,3,41863
4,4,6899
...,...,...
128,199,14
129,206,1
130,207,223
131,210,3


### Query the virtualized Parquet file

In [5]:
# Same aggregation as the vanilla query, but issued through `virtual.query`
# against the virtualized file, with DuckDB selected as the engine. The result
# is expected to match the baseline output.
virtual_sql = 'select num_joins, count(*) from read_parquet("redset_virtual.parquet") group by num_joins order by num_joins'
virtual.query(virtual_sql, engine='duckdb')

Unnamed: 0,num_joins,count_star()
0,0,6118504
1,1,1433559
2,2,544264
3,3,41863
4,4,6899
...,...,...
128,199,14
129,206,1
130,207,223
131,210,3
