In [48]:
import os
import time
import pandas as pd
from slippi_db import make_compression_datasets
from slippi_db import upload_lib

In [15]:
# Environment name: passed to the upload_lib calls below and used to build
# the local data directory path for the downloaded test files.
env = 'compression_test'

In [16]:
# Pull every record of the raw .slp collection once, then total the
# stored (compressed) and original sizes recorded on those records.
slp_db = upload_lib.get_db(env, upload_lib.SLP)
slp_infos = [*slp_db.find({})]
slp_compressed_size = sum(record['stored_size'] for record in slp_infos)
slp_size = sum(record['original_size'] for record in slp_infos)

In [17]:
def get_dataset_size(dataset: str):
    """Sum the 'size' field over all non-failed records of a dataset.

    Args:
        dataset: Name of the collection to query via `upload_lib.get_db`
            (in the notebook-level `env`).

    Returns:
        The sum of `record['size']` for every record whose 'failed' flag is
        missing or falsy; 0 when there are no such records.
    """
    total = 0
    for record in upload_lib.get_db(env, dataset).find({}):
        # Records flagged as failed are excluded from the total.
        if record.get('failed', False):
            continue
        total += record['size']
    return total

In [18]:
# Every .slp record is expected to share one compression scheme: take the
# first record's value as the reference and check all records against it.
compression = slp_infos[0]['compression']
assert all(info['compression'] == compression for info in slp_infos)
print(compression)

zlib


In [19]:
# One entry per configured dataset, followed by the two raw-.slp totals
# (as stored with `compression`, and at original size).
sizes = {
    **{
        dataset: get_dataset_size(dataset)
        for dataset in make_compression_datasets.configurations
    },
    f'slp_{compression}_compressed': slp_compressed_size,
    'slp': slp_size,
}

In [20]:
# Tabulate the sizes and express each one relative to the 'uncompressed'
# dataset's total.
names, sizes_ = tuple(sizes.keys()), tuple(sizes.values())
df = pd.DataFrame({'name': names, 'size': sizes_})
df['relative_size'] = df['size'].div(sizes['uncompressed'])

In [21]:
# Order rows from smallest to largest relative size, then show the two
# columns of interest.
df = df.sort_values(by='relative_size')
df.loc[:, ['name', 'relative_size']]

Unnamed: 0,name,relative_size
6,zlib,0.537411
3,brotli,0.700552
5,zstd,0.74474
1,gzip,0.749642
4,lz4,0.7506
2,snappy,0.825933
0,uncompressed,1.0
7,slp_zlib_compressed,3.017508
8,slp,13.652784


In [4]:
# Local directory for the test files; it is created and filled by the
# download cell that follows.
slp_dir = f'data/{env}'

In [None]:
# Make sure the target directory exists, then fetch each record's file
# into it by key.
os.makedirs(slp_dir, exist_ok=True)
for slp_key in (info['key'] for info in slp_infos):
    upload_lib.download_slp_locally(env, slp_key, slp_dir)

In [5]:
import pickle, zlib, os
from slippi_db import parse_libmelee, parse_peppi
from slippi_ai.types import InvalidGameError, array_to_nest

In [6]:
# os.listdir returns entries in arbitrary order; sort them so the parse order
# (and therefore the order of anything built from `paths`) is reproducible.
paths = [os.path.join(slp_dir, f) for f in sorted(os.listdir(slp_dir))]

In [10]:
%%time

pa_arrays = []
errors = []

for path in paths:
    print(path)
    try:
#         pa_arrays.append(parse_peppi.get_slp(path))
        pa_arrays.append(parse_libmelee.get_slp(path))
    except InvalidGameError as e:
        errors.append((path, e))
len(errors)

data/compression_test/Doc vs Samus [DL] Game_20190820T123018.slp
data/compression_test/Game_20210124T214030.slp
data/compression_test/2020-07_Game_20200720T165745.slp
data/compression_test/20200205 - HNC 16 - PM 1046 - HELP Peach (Blue) vs SUWA Fox (Default) - Fountain of Dreams.slp
data/compression_test/2021-07_Game_20210709T184131.slp
data/compression_test/15_48_46 Fox + Mario (YS).slp
data/compression_test/19_37_31 Fox + Falco (BF).slp
data/compression_test/17_42_36 [C2] Marth + [GRRL] Falco + [JAZZ] Fox + [POP] Fox (YS).slp
data/compression_test/18_40_08 Fox + Falco (FoD).slp
data/compression_test/Game_20190706T211019.slp
CPU times: user 10.6 s, sys: 99 ms, total: 10.7 s
Wall time: 10.7 s


1

In [44]:
def nested_size(array, level: int = zlib.Z_DEFAULT_COMPRESSION) -> int:
    """Return the zlib-compressed size, in bytes, of a pickled game nest.

    Args:
        array: Parsed game array; converted with `array_to_nest` before
            being pickled.
        level: Compression level forwarded to `zlib.compress`. Defaults to
            `zlib.Z_DEFAULT_COMPRESSION` so the function can also be called
            with a single argument (e.g. through `map`).

    Returns:
        Length of the compressed byte string.
    """
    nest = array_to_nest(array)
    pickled = pickle.dumps(nest)
    compressed = zlib.compress(pickled, level=level)
    return len(compressed)

In [49]:
%%time

totals = {}
times = {}
for level in range(zlib.Z_BEST_COMPRESSION + 1):
    start = time.perf_counter()
    totals[level] = sum(nested_size(a, level) for a in pa_arrays)
    times[level] = time.perf_counter() - start

CPU times: user 990 ms, sys: 18.9 ms, total: 1.01 s
Wall time: 1.01 s


In [50]:
# Elapsed seconds spent compressing all parsed games, keyed by zlib level.
times

{0: 0.02037558400115813,
 1: 0.05806408399803331,
 2: 0.04424483300317661,
 3: 0.044978833000641316,
 4: 0.06588908299818286,
 5: 0.07144233400322264,
 6: 0.08604466700126068,
 7: 0.09655908399872715,
 8: 0.17153379199953633,
 9: 0.3478453329989861}

In [47]:
# Total compressed size in bytes of all parsed games, keyed by zlib level.
totals

{0: 8278850,
 1: 1085447,
 2: 1070725,
 3: 1048364,
 4: 1004004,
 5: 987325,
 6: 969468,
 7: 965385,
 8: 956067,
 9: 952457}

In [12]:
# nested_size takes a compression level, so it must not be handed bare to
# map(); use zlib's default level explicitly for the pickle+zlib total.
total_nested_size = sum(
    nested_size(a, level=zlib.Z_DEFAULT_COMPRESSION) for a in pa_arrays
)

In [39]:
# Add the pickle+zlib total to the size comparison under its own label.
sizes['pickle_zlib'] = total_nested_size

In [40]:
# List each entry from smallest to largest size together with the ratio of
# the 'slp' total to that entry's size.
for name, size in sorted(sizes.items(), key=lambda item: item[1]):
    print(name, '%.3f' % (sizes['slp'] / size))

pickle_zlib 27.040
zlib 25.405
brotli 19.489
zstd 18.332
gzip 18.212
lz4 18.189
snappy 16.530
uncompressed 13.653
slp_zlib_compressed 4.525
slp 1.000
