This script cleans up Boltz outputs to free disk space. Below is a single cleanup function that processes an entire run directory (all chunks).

This will:

1. rewrite embeddings to “z-only” (skipping already-done)
2. delete the heavy extra files
3. delete any stray *.tmp.npz files left behind by earlier, interrupted attempts
4. give you a summary of space reclaimed

In [20]:
from pathlib import Path
import numpy as np
import os

# Glob patterns, evaluated inside each predictions/pair_* directory, for the
# files that cleanup_boltz_predictions deletes unconditionally.
# NOTE(review): presumably the per-model PAE / PDE / pLDDT arrays and the
# predicted structure written by Boltz -- confirm before extending this list.
HEAVY_PATTERNS = [
    "pae_*_model_0.npz",
    "pde_*_model_0.npz",
    "plddt_*_model_0.npz",
    "*_model_0.cif",
]

def keep_only_z_in_npz_skip(npz_path: Path) -> str:
    """Rewrite an ``.npz`` archive in place so that it holds only its ``z`` array.

    The ``z`` array is saved (compressed) to a sibling temporary file named
    ``<original name>.tmp.npz`` and then moved over the original with
    ``os.replace``, so the original is never left half-written.

    Args:
        npz_path: Path to the ``.npz`` archive to shrink.

    Returns:
        One of:
          * ``"rewritten"`` -- the archive now contains only ``z``.
          * ``"already_z"`` -- the archive already contained only ``z``;
            nothing was written.
          * ``"missing_z"`` -- the archive has no ``z`` entry; it is left
            untouched.
          * ``"fail"``      -- loading or writing raised; the error is printed.
    """
    npz_path = Path(npz_path)
    try:
        with np.load(npz_path, allow_pickle=False) as arr:
            names = arr.files
            # Already cleaned on a previous run: skip the expensive rewrite.
            if names == ["z"]:
                return "already_z"
            if "z" not in names:
                return "missing_z"
            # Materialise the array before the archive is closed.
            z = arr["z"]
    except Exception as e:
        print("FAIL load:", npz_path, "err:", repr(e))
        return "fail"

    # The name must end in ".npz", otherwise numpy appends the suffix itself
    # and the file would not be where os.replace expects it.
    tmp = npz_path.with_name(npz_path.name + ".tmp.npz")
    try:
        np.savez_compressed(tmp, z=z)
        os.replace(tmp, npz_path)
        return "rewritten"
    except Exception as e:
        print("FAIL write:", npz_path, "err:", repr(e))
        return "fail"
    finally:
        # Runs on every exit path, including KeyboardInterrupt in the middle
        # of a long write, so a partial temp file is not left on disk.
        # After a successful os.replace the temp file no longer exists and
        # the resulting FileNotFoundError (an OSError) is simply ignored.
        try:
            tmp.unlink()
        except OSError:
            pass


def _delete_paths(paths, *, dry_run, warn_prefix="WARN could not delete:"):
    """Delete each path (or only tally it when ``dry_run``); return ``(count, bytes)``."""
    count = 0
    nbytes = 0
    for f in paths:
        try:
            sz = f.stat().st_size
            if not dry_run:
                f.unlink()
        except OSError as e:
            print(warn_prefix, f, "err:", repr(e))
            continue
        # Only tally files that were actually removed (or, in a dry run,
        # could at least be stat'ed).
        count += 1
        nbytes += sz
    return count, nbytes


def cleanup_boltz_predictions(run_root: Path, *, keep_json=True, dry_run=False):
    """Shrink every ``predictions/pair_*`` directory found below ``run_root``.

    run_root example:
      /home/natasha/multimodal_model/outputs/train

    Steps, applied across all chunks under ``run_root``:
      1. Rewrite each ``embeddings_pair_*.npz`` so it keeps only ``z``
         (archives that are already z-only are skipped by the rewriter).
      2. Delete the files matching ``HEAVY_PATTERNS`` in the same directories,
         plus ``confidence_*.json`` when ``keep_json`` is false.
      3. Delete every ``*.tmp.npz`` file anywhere below ``run_root``.
      4. Print rewrite statistics, deletion totals and the embeddings' total
         size before and after.

    Args:
        run_root: Directory to search recursively.
        keep_json: When true (default) ``confidence_*.json`` files are kept.
        dry_run: When true nothing is rewritten or deleted; the deletion
            totals then report what *would* be removed and the rewrite
            statistics stay at zero.

    Returns:
        None. The summary is printed.
    """
    run_root = Path(run_root)

    # The rewriter's temporary files are named "<embedding>.npz.tmp.npz" and
    # therefore match the embeddings glob as well. Exclude them so leftovers
    # from an interrupted run are not rewritten or counted as embeddings;
    # step 3 removes them.
    emb_files = [
        p
        for p in run_root.rglob("predictions/pair_*/embeddings_pair_*.npz")
        if not p.name.endswith(".tmp.npz")
    ]
    print("Found embeddings:", len(emb_files))

    def total_size(paths):
        # Best-effort sum: files that cannot be stat'ed contribute nothing.
        s = 0
        for p in paths:
            try:
                s += p.stat().st_size
            except OSError:
                pass
        return s

    before = total_size(emb_files)

    stats = {"rewritten": 0, "already_z": 0, "missing_z": 0, "fail": 0}
    deleted_files = 0
    deleted_bytes = 0

    # 1) rewrite embeddings to z-only (skipped entirely in a dry run)
    if not dry_run:
        for p in emb_files:
            status = keep_only_z_in_npz_skip(p)
            if status in stats:
                stats[status] += 1

    # 2) delete heavy extras in the same pair dirs
    pair_dirs = sorted({p.parent for p in emb_files})  # predictions/pair_xxx
    for d in pair_dirs:
        for pat in HEAVY_PATTERNS:
            n, b = _delete_paths(d.glob(pat), dry_run=dry_run)
            deleted_files += n
            deleted_bytes += b

        # optional: if you *don't* want json
        if not keep_json:
            n, b = _delete_paths(d.glob("confidence_*.json"), dry_run=dry_run)
            deleted_files += n
            deleted_bytes += b

    # 3) remove any stray temp files from previous attempts
    n, b = _delete_paths(
        list(run_root.rglob("*.tmp.npz")),
        dry_run=dry_run,
        warn_prefix="WARN could not delete tmp:",
    )
    deleted_files += n
    deleted_bytes += b

    # size after (embeddings only)
    after = total_size(emb_files)

    print("\n=== Embedding rewrite stats ===")
    for k, v in stats.items():
        print(f"{k:>10}: {v}")

    print("\n=== Deletions ===")
    print("files deleted:", deleted_files)
    print("GB deleted (approx):", deleted_bytes / (1024**3))

    print("\n=== Embeddings total size ===")
    print("before (GB):", before / (1024**3))
    print("after  (GB):", after / (1024**3))


In [21]:
# Dry run: nothing is rewritten or deleted; the summary only reports how many
# files (and how many GB) the deletion steps would remove.
cleanup_boltz_predictions(
    Path("/home/natasha/multimodal_model/outputs/train"),
    keep_json=True,
    dry_run=True
)


Found embeddings: 4288

=== Embedding rewrite stats ===
 rewritten: 0
 already_z: 0
 missing_z: 0
      fail: 0

=== Deletions ===
files deleted: 17152
GB deleted (approx): 11.51417211163789

=== Embeddings total size ===
before (GB): 653.1688089426607
after  (GB): 653.1688089426607


In [22]:
# Real run: rewrites the embeddings to z-only and deletes the heavy extras and
# stray temp files, keeping the confidence_*.json files (keep_json=True).
cleanup_boltz_predictions(
    Path("/home/natasha/multimodal_model/outputs/train"),
    keep_json=True,
    dry_run=False
)


Found embeddings: 4288

=== Embedding rewrite stats ===
 rewritten: 4286
 already_z: 1
 missing_z: 1
      fail: 0

=== Deletions ===
files deleted: 17152
GB deleted (approx): 11.51417211163789

=== Embeddings total size ===
before (GB): 653.1688089426607
after  (GB): 649.8201619535685


In [23]:
# testing