# CPG Compression Benchmark

This notebook benchmarks compression ratio (compressed size ÷ original size, so lower is better), compression speed, and decompression speed on the `.pyd` binaries in `phantom/common/distribute/cpg`.

It uses stdlib codecs by default (`zlib`, `gzip`, `bz2`, `lzma`) and will also benchmark optional codecs (`brotli`, `zstandard`, `lz4`) if installed.

In [1]:
from __future__ import annotations

import bz2
import gzip
import importlib
import lzma
import statistics
import time
import zlib
from pathlib import Path


def find_repo_root(start: Path | None = None) -> Path:
    """Walk upwards until a directory containing `phantom/` is found."""
    current = (start or Path.cwd()).resolve()
    for candidate in [current, *current.parents]:
        if (candidate / "phantom").exists():
            return candidate
    raise FileNotFoundError("Could not locate repository root containing `phantom/`.")


# Resolve the benchmark inputs relative to the repository root so the notebook
# works regardless of the directory the kernel was started in.
REPO_ROOT = find_repo_root()
CPG_DIR = REPO_ROOT.joinpath("phantom", "common", "distribute", "cpg")
# Sorted so that the processing and report order is stable between runs.
PYD_FILES = sorted(CPG_DIR.glob("*.pyd"))

# Fail fast: every later cell assumes there is at least one input file.
if len(PYD_FILES) == 0:
    raise FileNotFoundError(f"No .pyd files found in {CPG_DIR}")

print(f"Repo root: {REPO_ROOT}")
print(f"CPG dir:   {CPG_DIR}")
print("\nInputs:")
for p in PYD_FILES:
    # File sizes are reported in MiB.
    print(f"- {p.name:35} {p.stat().st_size / (1024 * 1024):8.3f} MiB")


Repo root: C:\Users\volke\PycharmProjects\phantom-sc2
CPG dir:   C:\Users\volke\PycharmProjects\phantom-sc2\phantom\common\distribute\cpg

Inputs:
- harvest1.cp312-win_amd64.pyd           0.260 MiB
- harvest2.cp312-win_amd64.pyd           0.268 MiB
- harvest3.cp312-win_amd64.pyd           0.295 MiB
- harvest4.cp312-win_amd64.pyd           0.397 MiB
- harvest5.cp312-win_amd64.pyd           0.812 MiB
- harvest6.cp312-win_amd64.pyd           2.495 MiB
- harvest7.cp312-win_amd64.pyd           9.256 MiB


In [2]:
def _build_algorithms():
    """Build the list of codecs to benchmark.

    Returns:
        A list of dicts, one per codec configuration, each with the keys:
          - "name": label used in the reports,
          - "compress": callable taking raw bytes and returning compressed bytes,
          - "decompress": callable taking compressed bytes and returning raw bytes.

    The stdlib codecs (zlib, gzip, bz2, lzma) are always included. The optional
    codecs (brotli, zstandard, lz4) are included only when they can be imported;
    a missing optional package is skipped rather than raising.
    """

    def optional_module(module_name: str):
        """Return the imported module, or None when it is not available."""
        # Attempt the import instead of probing with importlib.util.find_spec():
        #  * find_spec("lz4.frame") imports the parent package first and raises
        #    ModuleNotFoundError when `lz4` is absent, so it cannot be used to
        #    test for an optional dotted module;
        #  * `import importlib` alone does not guarantee that the `importlib.util`
        #    submodule is available as an attribute.
        try:
            return importlib.import_module(module_name)
        except ImportError:
            return None

    def with_options(compress, **options):
        """Bind codec options so the returned callable takes only the payload."""
        return lambda b: compress(b, **options)

    algos = []

    def add(name, compress, decompress):
        """Register one codec configuration."""
        algos.append({"name": name, "compress": compress, "decompress": decompress})

    # Stdlib codecs
    for level in (1, 6, 9):
        add(f"zlib-l{level}", with_options(zlib.compress, level=level), zlib.decompress)
    add("gzip-l6", with_options(gzip.compress, compresslevel=6), gzip.decompress)
    add("bz2-l9", with_options(bz2.compress, compresslevel=9), bz2.decompress)
    add("lzma-p6", with_options(lzma.compress, preset=6), lzma.decompress)

    # Optional codecs
    brotli = optional_module("brotli")
    if brotli is not None:
        for quality in (5, 11):
            add(
                f"brotli-q{quality}",
                with_options(brotli.compress, quality=quality),
                brotli.decompress,
            )

    zstd = optional_module("zstandard")
    if zstd is not None:
        # Create the (de)compressor objects once, outside the timed callables, so
        # that both directions are measured without per-call context setup.
        zstd_decompressor = zstd.ZstdDecompressor()
        for level in (3, 10):
            add(
                f"zstd-l{level}",
                zstd.ZstdCompressor(level=level).compress,
                zstd_decompressor.decompress,
            )

    lz4_frame = optional_module("lz4.frame")
    if lz4_frame is not None:
        add("lz4-default", lz4_frame.compress, lz4_frame.decompress)

    return algos


# Build the codec list once; the benchmark cell iterates over it for every file.
ALGORITHMS = _build_algorithms()
# Emit the heading followed by one bullet per registered codec.
print("Algorithms:", *(f"- {a['name']}" for a in ALGORITHMS), sep="\n")


Algorithms:
- zlib-l1
- zlib-l6
- zlib-l9
- gzip-l6
- bz2-l9
- lzma-p6
- lz4-default


In [3]:
def benchmark_throughput(func, payload: bytes, min_seconds: float = 0.25, min_runs: int = 3):
    """Time repeated calls of `func(payload)`.

    The call is repeated until at least `min_runs` runs have completed and at
    least `min_seconds` of wall-clock time have passed since the start.

    Args:
        func: Callable invoked with `payload` as its only argument.
        payload: Input handed to `func`; its length is the byte count per run.
        min_seconds: Minimum wall-clock duration of the whole measurement.
        min_runs: Minimum number of timed calls.

    Returns:
        A `(mib_per_second, median_ms)` tuple: throughput computed from the
        bytes of `payload` processed over the summed run times, and the median
        duration of a single run in milliseconds.
    """
    durations = []
    processed_bytes = 0
    started = time.perf_counter()

    while len(durations) < min_runs or (time.perf_counter() - started) < min_seconds:
        tick = time.perf_counter()
        func(payload)
        durations.append(time.perf_counter() - tick)
        processed_bytes += len(payload)

    # Only time spent inside `func` counts; the floor avoids a division by zero.
    busy_seconds = max(sum(durations), 1e-9)
    throughput = (processed_bytes / (1024 * 1024)) / busy_seconds
    return throughput, statistics.median(durations) * 1000


# One row per (file, codec) pair; the reporting cell reads this list.
results = []

for pyd_path in PYD_FILES:
    raw = pyd_path.read_bytes()
    raw_size = len(raw)

    for algo in ALGORITHMS:
        # Verify the codec reproduces its input before timing it.
        compressed = algo["compress"](raw)
        round_trip = algo["decompress"](compressed)
        if round_trip != raw:
            raise ValueError(f"Round-trip mismatch for {algo['name']} on {pyd_path.name}")
        ratio = len(compressed) / raw_size

        c_mib_s, c_ms = benchmark_throughput(algo["compress"], raw)
        d_mib_s_by_compressed, d_ms = benchmark_throughput(algo["decompress"], compressed)

        # benchmark_throughput counts the bytes it was fed (compressed bytes here);
        # dividing by the ratio re-expresses the rate in original (uncompressed) bytes.
        d_mib_s = d_mib_s_by_compressed / max(ratio, 1e-9)

        results.append(
            dict(
                file=pyd_path.name,
                algo=algo["name"],
                raw_bytes=raw_size,
                compressed_bytes=len(compressed),
                ratio=ratio,
                compress_mib_s=c_mib_s,
                decompress_mib_s=d_mib_s,
                compress_median_ms=c_ms,
                decompress_median_ms=d_ms,
            )
        )

print(f"Collected {len(results)} rows ({len(PYD_FILES)} files x {len(ALGORITHMS)} algos).")


Collected 49 rows (7 files x 7 algos).


In [4]:
from collections import defaultdict


def fmt_size(n):
    """Render a byte count `n` as mebibytes with three decimals (no unit suffix)."""
    mebibytes = n / (1024 * 1024)
    return format(mebibytes, ".3f")


def print_table(rows, headers):
    """Print `rows` as a left-aligned text table with a header and separator line.

    Args:
        rows: Re-iterable collection of row sequences; each cell is rendered
            with `str()`. It is traversed twice (once to size the columns,
            once to print).
        headers: Sequence of column title strings; determines the column count.

    Each column is as wide as its longest cell or header. Columns are joined
    with " | " and the separator line uses "-+-" at the column boundaries.
    """
    col_widths = list(map(len, headers))
    for record in rows:
        for col, cell in enumerate(record):
            cell_len = len(str(cell))
            if cell_len > col_widths[col]:
                col_widths[col] = cell_len

    header_line = " | ".join(title.ljust(col_widths[col]) for col, title in enumerate(headers))
    rule = "-+-".join("-" * width for width in col_widths)
    print(header_line)
    print(rule)
    for record in rows:
        cells = (str(cell).ljust(col_widths[col]) for col, cell in enumerate(record))
        print(" | ".join(cells))


# For every input file, report the codec whose output was smallest relative to the input.
print("Per-file best compression ratio (lower is better):")
for p in PYD_FILES:
    file_rows = list(filter(lambda r: r["file"] == p.name, results))
    best = min(file_rows, key=lambda r: r["ratio"])
    print(
        f"- {p.name:35} {best['algo']:12} ratio={best['ratio']:.4f} "
        f"size={fmt_size(best['compressed_bytes'])} MiB"
    )


# Per-codec accumulators over all files. Throughputs are summed pre-multiplied by
# the file's raw size so that dividing by the total raw size below yields a
# byte-weighted mean.
agg = defaultdict(
    lambda: dict(
        raw_total=0,
        compressed_total=0,
        compress_weighted=0.0,
        decompress_weighted=0.0,
        compress_ms=[],
        decompress_ms=[],
    )
)

for r in results:
    a = agg[r["algo"]]
    a["raw_total"] += r["raw_bytes"]
    a["compressed_total"] += r["compressed_bytes"]
    a["compress_weighted"] += r["compress_mib_s"] * r["raw_bytes"]
    a["decompress_weighted"] += r["decompress_mib_s"] * r["raw_bytes"]
    a["compress_ms"].append(r["compress_median_ms"])
    a["decompress_ms"].append(r["decompress_median_ms"])

# Collapse the accumulators into one summary record per codec.
summary = []
for algo, a in agg.items():
    raw_total = a["raw_total"]
    # Floor of 1 keeps the divisions defined even if no bytes were processed.
    denominator = max(raw_total, 1)
    summary.append(
        dict(
            algo=algo,
            ratio=a["compressed_total"] / denominator,
            compress_mib_s=a["compress_weighted"] / denominator,
            decompress_mib_s=a["decompress_weighted"] / denominator,
            compress_median_ms=statistics.median(a["compress_ms"]),
            decompress_median_ms=statistics.median(a["decompress_ms"]),
        )
    )

print("\nAggregate (weighted by input bytes):")
# One pre-formatted row per codec, best (lowest) ratio first.
rows = []
for s in sorted(summary, key=lambda entry: entry["ratio"]):
    rows.append(
        (
            s["algo"],
            f"{s['ratio']:.4f}",
            f"{s['compress_mib_s']:.1f}",
            f"{s['decompress_mib_s']:.1f}",
            f"{s['compress_median_ms']:.2f}",
            f"{s['decompress_median_ms']:.2f}",
        )
    )
print_table(
    rows,
    [
        "algo",
        "ratio",
        "compress MiB/s",
        "decompress MiB/s",
        "compress median ms",
        "decompress median ms",
    ],
)


# Leaderboards: the three best codecs for each metric.

# Smallest ratio first (smaller output is better).
print("\nTop 3 by ratio:")
for s in sorted(summary, key=lambda entry: entry["ratio"])[:3]:
    print(f"- {s['algo']:12} ratio={s['ratio']:.4f}")

# Highest throughput first for both speed rankings.
print("\nTop 3 by compression speed:")
for s in sorted(summary, key=lambda entry: entry["compress_mib_s"], reverse=True)[:3]:
    print(f"- {s['algo']:12} compress={s['compress_mib_s']:.1f} MiB/s")

print("\nTop 3 by decompression speed:")
for s in sorted(summary, key=lambda entry: entry["decompress_mib_s"], reverse=True)[:3]:
    print(f"- {s['algo']:12} decompress={s['decompress_mib_s']:.1f} MiB/s")


Per-file best compression ratio (lower is better):
- harvest1.cp312-win_amd64.pyd        lzma-p6      ratio=0.3624 size=0.094 MiB
- harvest2.cp312-win_amd64.pyd        lzma-p6      ratio=0.3571 size=0.096 MiB
- harvest3.cp312-win_amd64.pyd        lzma-p6      ratio=0.3339 size=0.098 MiB
- harvest4.cp312-win_amd64.pyd        lzma-p6      ratio=0.2664 size=0.106 MiB
- harvest5.cp312-win_amd64.pyd        lzma-p6      ratio=0.1653 size=0.134 MiB
- harvest6.cp312-win_amd64.pyd        lzma-p6      ratio=0.0912 size=0.227 MiB
- harvest7.cp312-win_amd64.pyd        lzma-p6      ratio=0.0643 size=0.595 MiB

Aggregate (weighted by input bytes):
algo        | ratio  | compress MiB/s | decompress MiB/s | compress median ms | decompress median ms
------------+--------+----------------+------------------+--------------------+---------------------
lzma-p6     | 0.0980 | 11.1           | 160.0            | 67.12              | 5.66                
bz2-l9      | 0.1589 | 8.6            | 88.7           