In [None]:
from pathlib import Path
import subprocess, shlex
import pandas as pd

# === Configure paths ===
BASE_DIR = Path("/home/natasha/multimodal_model")

# Every boltz run writes somewhere below this folder.
RUN_ROOT = BASE_DIR / "outputs"
RUN_ROOT.mkdir(parents=True, exist_ok=True)

# One output folder per dataset split, created up front.
BOLTZ_OUT_TRAIN = RUN_ROOT / "train"
BOLTZ_OUT_VAL   = RUN_ROOT / "val"
BOLTZ_OUT_TEST  = RUN_ROOT / "test"
for _split_out in (BOLTZ_OUT_TRAIN, BOLTZ_OUT_VAL, BOLTZ_OUT_TEST):
    _split_out.mkdir(parents=True, exist_ok=True)

# Input YAML folders, one per split.
YAML_DIR_TRAIN = BASE_DIR / "data" / "train"
YAML_DIR_VAL   = BASE_DIR / "data" / "val"
YAML_DIR_TEST  = BASE_DIR / "data" / "test"

# Where your chunks live (created by your chunking helper)
TRAIN_CHUNKS_ROOT = YAML_DIR_TRAIN / "_chunks"
TEST_CHUNKS_ROOT  = YAML_DIR_TEST / "_chunks"

# One GPU => run chunks sequentially
NPROC = 1

# Command line used for every boltz invocation; {input_path} and {outdir} are
# filled in by run_cli. The pieces are joined with single spaces.
BOLTZ_CMD_TEMPLATE = " ".join([
    "conda run -n boltz-env --no-capture-output boltz predict {input_path}",
    "--out_dir {outdir}",
    "--accelerator gpu",
    "--devices 1",
    "--model boltz2",
    "--recycling_steps 1",
    "--sampling_steps 10",
    "--diffusion_samples 1",
    "--max_parallel_samples 1",
    "--max_msa_seqs 64",
    "--num_subsampled_msa 34",
    # NOTE: --override is not passed here; add it only when every input should be rerun.
    "--write_embeddings",
])

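# --override: only pass this flag when you want to rerun on all inputs.
# To speed things up, --num_subsampled_msa was reduced to 34.
# NOTE(review): this note originally also said the number of recycling steps was reduced to 10 (from 20),
# but the template passes --recycling_steps 1 and --sampling_steps 10; it presumably meant the sampling steps — confirm.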

def run_cli(input_path: Path, outdir: Path) -> int:
    """
    Run the boltz command once and return the process exit code.

    input_path can be:
      - a single YAML file, or
      - a directory containing many YAMLs (chunk)

    Both paths are resolved to absolute paths and `outdir` is created if it
    does not exist. The child's stdout and stderr are written to `stdout.log`
    and `stderr.log` inside `outdir`; both files are truncated on every call.
    A non-zero exit code is returned to the caller, not raised.
    """
    input_path = Path(input_path).resolve()
    outdir = Path(outdir).resolve()
    outdir.mkdir(parents=True, exist_ok=True)

    # Quote the paths before substituting them: the formatted string is split
    # with shlex.split below, so an unquoted path containing whitespace or
    # shell metacharacters would be broken into several arguments.
    cmd = BOLTZ_CMD_TEMPLATE.format(
        input_path=shlex.quote(str(input_path)),
        outdir=shlex.quote(str(outdir)),
    )
    print("CMD:", cmd)

    with open(outdir / "stdout.log", "w") as so, open(outdir / "stderr.log", "w") as se:
        proc = subprocess.run(
            shlex.split(cmd),
            stdout=so,
            stderr=se,
            text=True,
            cwd=str(BASE_DIR),
        )

    print("Return code:", proc.returncode)
    return proc.returncode


def has_any_embeddings(outdir: Path) -> bool:
    """
    Report whether at least one `embeddings_pair_*.npz` file exists anywhere
    below `<outdir>/predictions`.

    Returns False when the `predictions` folder does not exist.
    """
    predictions_dir = Path(outdir) / "predictions"
    if predictions_dir.exists():
        # Stop at the first match instead of walking the whole tree.
        first_hit = next(predictions_dir.rglob("embeddings_pair_*.npz"), None)
        return first_hit is not None
    return False



In [2]:
# RUN TRAIN CHUNKS

def list_chunk_dirs(chunks_root: Path):
    """
    Return the chunk directories directly inside `chunks_root`, sorted by path.

    Only sub-directories whose name starts with "chunk_" (chunk_000,
    chunk_001, ...) are returned, so stray folders such as
    `.ipynb_checkpoints` are never handed to boltz as if they were a chunk.

    Raises:
        FileNotFoundError: `chunks_root` does not exist.
        ValueError: `chunks_root` contains no chunk directories.
    """
    chunks_root = Path(chunks_root).resolve()
    if not chunks_root.exists():
        raise FileNotFoundError(f"Chunks root not found: {chunks_root}")

    # chunk_000, chunk_001, ...
    chunk_dirs = sorted(
        p for p in chunks_root.iterdir()
        if p.is_dir() and p.name.startswith("chunk_")
    )
    if not chunk_dirs:
        raise ValueError(f"No chunk directories found in: {chunks_root}")
    return chunk_dirs

# Discover the train chunk directories under TRAIN_CHUNKS_ROOT
# (list_chunk_dirs raises if the root is missing or holds no chunk directories).
train_chunk_dirs = list_chunk_dirs(TRAIN_CHUNKS_ROOT)

# Run the chunks one after another; each chunk gets its own output folder
# (named after the chunk) under BOLTZ_OUT_TRAIN.
for chunk_dir in train_chunk_dirs:
    chunk_name = chunk_dir.name
    outdir = BOLTZ_OUT_TRAIN / chunk_name
    print(f"\n=== TRAIN {chunk_name} ===")
    rc = run_cli(chunk_dir, outdir)
    if rc != 0:
        # Stop at the first failing chunk; the remaining chunks are not attempted.
        print(f"[STOP] Train chunk failed: {chunk_name}. See logs in {outdir}")
        break



=== TRAIN chunk_000 ===
CMD: conda run -n boltz-env --no-capture-output boltz predict /home/natasha/multimodal_model/data/train/_chunks/chunk_000 --out_dir /data_ssd/boltz_outputs/train/chunk_000 --accelerator gpu --devices 1 --model boltz2 --recycling_steps 1 --sampling_steps 10 --diffusion_samples 1 --max_parallel_samples 1 --max_msa_seqs 64 --num_subsampled_msa 34 --write_embeddings
Return code: 1
[STOP] Train chunk failed: chunk_000. See logs in /home/natasha/multimodal_model/outputs/train/chunk_000


In [None]:
# RUN TEST CHUNKS
# Relies on list_chunk_dirs and run_cli already being defined in the kernel
# (list_chunk_dirs is defined in the train cell above).

test_chunk_dirs = list_chunk_dirs(TEST_CHUNKS_ROOT)

# Same sequential scheme as the train cell: one output folder per chunk under
# BOLTZ_OUT_TEST.
for chunk_dir in test_chunk_dirs:
    chunk_name = chunk_dir.name
    outdir = BOLTZ_OUT_TEST / chunk_name
    print(f"\n=== TEST {chunk_name} ===")
    rc = run_cli(chunk_dir, outdir)
    if rc != 0:
        # Stop at the first failing chunk; the remaining chunks are not attempted.
        print(f"[STOP] Test chunk failed: {chunk_name}. See logs in {outdir}")
        break


In [None]:
# RUN VAL CHUNK
# The validation YAML folder is passed to boltz as a single directory (it is not
# chunked); outputs and logs go to BOLTZ_OUT_VAL / "val_all".
# NOTE: run_val_folder in the resume cell further down uses "val_full" instead,
# so that runner does not see outputs produced by this cell.

print("\n=== VAL (single directory) ===")
rc = run_cli(YAML_DIR_VAL, BOLTZ_OUT_VAL / "val_all")
if rc != 0:
    print(f"[FAIL] Val run failed. See logs in {BOLTZ_OUT_VAL / 'val_all'}")


Run With Resume from Chunks

In [None]:
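# NOTE: everything in this cell is commented out. It is an earlier draft of the resume runner
# that launches boltz through `conda run -n boltz-env`; the next cell holds the active version,
# which uses `conda activate` with a different environment.
# from pathlib import Path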
# import subprocess, shlex, time

# BASE_DIR = Path("/home/natasha/multimodal_model")

# # outputs is a symlink -> /data_ssd/boltz_outputs (good)
# RUN_ROOT = BASE_DIR / "outputs"
# RUN_ROOT.mkdir(parents=True, exist_ok=True)

# BOLTZ_OUT_TRAIN = RUN_ROOT / "train"
# BOLTZ_OUT_VAL   = RUN_ROOT / "val"
# BOLTZ_OUT_TEST  = RUN_ROOT / "test"
# for p in (BOLTZ_OUT_TRAIN, BOLTZ_OUT_VAL, BOLTZ_OUT_TEST):
#     p.mkdir(parents=True, exist_ok=True)

# YAML_DIR_TRAIN = BASE_DIR / "data" / "train"
# YAML_DIR_VAL   = BASE_DIR / "data" / "val"
# YAML_DIR_TEST  = BASE_DIR / "data" / "test"

# TRAIN_CHUNKS_ROOT = YAML_DIR_TRAIN / "_chunks"
# TEST_CHUNKS_ROOT  = YAML_DIR_TEST  / "_chunks"

# BOLTZ_CMD_TEMPLATE = (
#     "conda run -n boltz-env --no-capture-output boltz predict {input_path} "
#     "--out_dir {outdir} "
#     "--accelerator gpu "
#     "--devices 1 "
#     "--model boltz2 "
#     "--recycling_steps 1 "
#     "--sampling_steps 10 "
#     "--diffusion_samples 1 "
#     "--max_parallel_samples 1 "
#     "--max_msa_seqs 64 "
#     "--num_subsampled_msa 34 "
#     "--write_embeddings"
# )

# DONE_MARKER = ".DONE"
# FAIL_LOG = "failed_yamls.txt"     # append-only
# STAMP = lambda: time.strftime("%Y%m%d_%H%M%S")


# def run_cli(input_path: Path, outdir: Path) -> int:
#     input_path = Path(input_path).resolve()
#     outdir = Path(outdir).resolve()
#     outdir.mkdir(parents=True, exist_ok=True)

#     cmd = BOLTZ_CMD_TEMPLATE.format(input_path=str(input_path), outdir=str(outdir))
#     so_path = outdir / f"stdout_{STAMP()}.log"
#     se_path = outdir / f"stderr_{STAMP()}.log"

#     print("CMD:", cmd)
#     with open(so_path, "w") as so, open(se_path, "w") as se:
#         proc = subprocess.run(
#             shlex.split(cmd),
#             stdout=so,
#             stderr=se,
#             text=True,
#             cwd=str(BASE_DIR),
#         )
#     print("Return code:", proc.returncode)
#     return proc.returncode


# def list_chunk_dirs(chunks_root: Path):
#     chunks_root = Path(chunks_root).resolve()
#     if not chunks_root.exists():
#         raise FileNotFoundError(f"Chunks root not found: {chunks_root}")
#     chunk_dirs = sorted([p for p in chunks_root.iterdir() if p.is_dir() and p.name.startswith("chunk_")])
#     if not chunk_dirs:
#         raise ValueError(f"No chunk directories found in: {chunks_root}")
#     return chunk_dirs


# def list_yamls(dir_path: Path):
#     d = Path(dir_path)
#     return sorted(list(d.glob("*.yml")) + list(d.glob("*.yaml")))


# def yaml_to_pair_dirname(yaml_path: Path) -> str:
#     """
#     Assumption that matches your structure: pair_000.yaml -> pair_000
#     If your YAMLs are named differently, tweak this one function only.
#     """
#     return yaml_path.stem


# def embeddings_exist_for_yaml(yaml_path: Path, outdir: Path) -> bool:
#     pair_dir = Path(outdir) / "predictions" / yaml_to_pair_dirname(yaml_path)
#     return any(pair_dir.glob("embeddings_pair_*.npz"))


# def all_embeddings_exist_for_dir(input_dir: Path, outdir: Path) -> bool:
#     yamls = list_yamls(input_dir)
#     if not yamls:
#         return False
#     return all(embeddings_exist_for_yaml(y, outdir) for y in yamls)


# def mark_done(outdir: Path):
#     (Path(outdir) / DONE_MARKER).write_text(f"done_at={time.strftime('%Y-%m-%d %H:%M:%S')}\n")


# def append_fail(outdir: Path, yaml_path: Path, rc: int):
#     p = Path(outdir) / FAIL_LOG
#     with open(p, "a") as f:
#         f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')}\trc={rc}\t{yaml_path}\n")


# def run_dir_with_safe_resume(input_dir: Path, outdir: Path, label: str):
#     """
#     Runs a chunk/val dir with:
#       - hard skip if DONE exists
#       - hard skip if all embeddings already present
#       - attempt fast directory run
#       - on failure, fall back to per-YAML runs with per-YAML skip
#       - continue past failures, record them, do not overwrite existing embeddings
#     """
#     input_dir = Path(input_dir)
#     outdir = Path(outdir)

#     if (outdir / DONE_MARKER).exists():
#         print(f"[SKIP] {label} already has {DONE_MARKER}: {outdir}")
#         return

#     if all_embeddings_exist_for_dir(input_dir, outdir):
#         print(f"[SKIP] {label} already complete by embeddings check: {outdir}")
#         mark_done(outdir)
#         return

#     # 1) Try the fast path: run boltz on the directory
#     print(f"\n=== {label} (dir run) ===")
#     rc = run_cli(input_dir, outdir)

#     if rc == 0 and all_embeddings_exist_for_dir(input_dir, outdir):
#         mark_done(outdir)
#         return

#     # 2) Fallback: per-YAML safe resume (skip if embeddings exist)
#     print(f"[FALLBACK] {label}: switching to per-YAML runs (safe resume).")
#     yamls = list_yamls(input_dir)
#     for y in yamls:
#         if embeddings_exist_for_yaml(y, outdir):
#             print(f"[SKIP-YAML] embeddings exist: {y.name}")
#             continue

#         rc_y = run_cli(y, outdir)
#         if rc_y != 0:
#             print(f"[FAIL-YAML] {y.name} (rc={rc_y}) — logged, continuing.")
#             append_fail(outdir, y, rc_y)
#             continue

#         # After successful run, ensure embeddings exist; if not, log it
#         if not embeddings_exist_for_yaml(y, outdir):
#             print(f"[WARN] {y.name} returned 0 but embeddings not found — logged.")
#             append_fail(outdir, y, 999)

#     # Done if all embeddings exist for all yamls (ignoring failures)
#     if all_embeddings_exist_for_dir(input_dir, outdir):
#         mark_done(outdir)
#     else:
#         print(f"[INFO] {label} not fully complete; check {outdir}/{FAIL_LOG} and rerun later.")


# def run_chunked_dataset(chunks_root: Path, out_root: Path, label: str):
#     for chunk_dir in list_chunk_dirs(chunks_root):
#         outdir = Path(out_root) / chunk_dir.name
#         run_dir_with_safe_resume(chunk_dir, outdir, f"{label} {chunk_dir.name}")


# def run_val_folder(val_yaml_dir: Path, out_root: Path):
#     outdir = Path(out_root) / "val_full"
#     run_dir_with_safe_resume(Path(val_yaml_dir), outdir, "VAL val_full")


# # ---- Execute ----
# run_chunked_dataset(TRAIN_CHUNKS_ROOT, BOLTZ_OUT_TRAIN, "TRAIN")
# run_val_folder(YAML_DIR_VAL, BOLTZ_OUT_VAL)
# run_chunked_dataset(TEST_CHUNKS_ROOT, BOLTZ_OUT_TEST, "TEST")


In [None]:
from pathlib import Path
import subprocess, time, shlex, os

BASE_DIR = Path("/home/natasha/multimodal_model")

# outputs is a symlink -> /data_ssd/boltz_outputs (good)
RUN_ROOT = BASE_DIR / "outputs"
RUN_ROOT.mkdir(parents=True, exist_ok=True)

# One output folder per dataset split, created up front.
BOLTZ_OUT_TRAIN = RUN_ROOT / "train"
BOLTZ_OUT_VAL   = RUN_ROOT / "val"
BOLTZ_OUT_TEST  = RUN_ROOT / "test"
for p in (BOLTZ_OUT_TRAIN, BOLTZ_OUT_VAL, BOLTZ_OUT_TEST):
    p.mkdir(parents=True, exist_ok=True)

# Input YAML folders, one per split.
YAML_DIR_TRAIN = BASE_DIR / "data" / "train"
YAML_DIR_VAL   = BASE_DIR / "data" / "val"
YAML_DIR_TEST  = BASE_DIR / "data" / "test"

# Pre-chunked inputs for the two chunked splits.
TRAIN_CHUNKS_ROOT = YAML_DIR_TRAIN / "_chunks"
TEST_CHUNKS_ROOT  = YAML_DIR_TEST  / "_chunks"

# ---- LOCKED ENV ----
BOLTZ_ENV = "boltz-env-torchfix"
CONDA_BASE = Path("/home/natasha/miniconda3")  # adjust if yours differs
CONDA_SH = CONDA_BASE.joinpath("etc", "profile.d", "conda.sh")

# kernels ON => do NOT pass --no_kernels
# If you want production settings, tweak sampling/recycling here.
# The flags are joined with single spaces and appended to `boltz predict ...`.
BOLTZ_ARGS = " ".join([
    "--accelerator gpu",
    "--devices 1",
    "--model boltz2",
    "--recycling_steps 1",
    "--sampling_steps 10",
    "--diffusion_samples 1",
    "--max_parallel_samples 1",
    "--max_msa_seqs 64",
    "--num_subsampled_msa 34",
    "--write_embeddings",
])

DONE_MARKER = ".DONE"             # file written into an output folder once it is complete
FAIL_LOG = "failed_yamls.txt"     # append-only


def STAMP():
    """Return the current local time formatted as YYYYMMDD_HHMMSS (used in log file names)."""
    return time.strftime("%Y%m%d_%H%M%S")


def build_boltz_command(input_path: Path, outdir: Path) -> list[str]:
    """
    Build the argv that runs `boltz predict` inside an activated conda env.

    The command is a bash login shell (`bash -lc`) running three steps chained
    with `&&`: source conda.sh, `conda activate` the configured environment,
    then `boltz predict` with BOLTZ_ARGS. A real `conda activate` is used so
    that the environment's activate.d hooks (e.g. an LD_PRELOAD /
    LD_LIBRARY_PATH fix) are applied.

    Both paths are resolved to absolute paths and shell-quoted.
    """
    resolved_input = str(Path(input_path).resolve())
    resolved_outdir = str(Path(outdir).resolve())

    # Each entry is one shell step; a failing step stops the chain.
    steps = [
        f"source {shlex.quote(str(CONDA_SH))}",
        f"conda activate {shlex.quote(BOLTZ_ENV)}",
        f"boltz predict {shlex.quote(resolved_input)} --out_dir {shlex.quote(resolved_outdir)} {BOLTZ_ARGS}",
    ]

    # Run via bash -lc so conda activation behaves properly
    return ["bash", "-lc", " && ".join(steps)]


def run_cli(input_path: Path, outdir: Path) -> int:
    """
    Run boltz once on `input_path` (a YAML file or a directory of YAMLs) and
    return the process exit code.

    Both paths are resolved to absolute paths and `outdir` is created if it
    does not exist. The child's stdout and stderr are written to
    `stdout_<stamp>.log` and `stderr_<stamp>.log` inside `outdir`, where
    `<stamp>` is the same timestamp for both files. A non-zero exit code is
    returned to the caller, not raised.
    """
    input_path = Path(input_path).resolve()
    outdir = Path(outdir).resolve()
    outdir.mkdir(parents=True, exist_ok=True)

    cmd_list = build_boltz_command(input_path, outdir)

    # Take the timestamp once: two separate STAMP() calls can straddle a
    # second boundary and give the two logs of one run different suffixes.
    stamp = STAMP()
    so_path = outdir / f"stdout_{stamp}.log"
    se_path = outdir / f"stderr_{stamp}.log"

    # shlex.join keeps the `bash -lc` payload quoted, so the printed command
    # can be pasted into a shell as-is.
    print("CMD:", shlex.join(cmd_list))
    with open(so_path, "w") as so, open(se_path, "w") as se:
        proc = subprocess.run(
            cmd_list,
            stdout=so,
            stderr=se,
            text=True,
            cwd=str(BASE_DIR),
        )
    print("Return code:", proc.returncode)
    return proc.returncode


def list_chunk_dirs(chunks_root: Path):
    """
    Return the sub-directories of `chunks_root` whose name starts with
    "chunk_", sorted by path.

    Raises:
        FileNotFoundError: `chunks_root` does not exist.
        ValueError: no matching chunk directory was found.
    """
    root = Path(chunks_root).resolve()
    if not root.exists():
        raise FileNotFoundError(f"Chunks root not found: {root}")

    found = []
    for entry in root.iterdir():
        if entry.name.startswith("chunk_") and entry.is_dir():
            found.append(entry)
    found.sort()

    if not found:
        raise ValueError(f"No chunk directories found in: {root}")
    return found


def list_yamls(dir_path: Path):
    """
    Return the entries directly inside `dir_path` that match `*.yml` or
    `*.yaml`, as a list sorted by path. Sub-directories are not searched.
    """
    folder = Path(dir_path)
    matches = [*folder.glob("*.yml"), *folder.glob("*.yaml")]
    matches.sort()
    return matches


def yaml_to_pair_dirname(yaml_path: Path) -> str:
    """
    Return the file name of `yaml_path` without its last suffix
    (e.g. pair_000.yaml -> pair_000).

    embeddings_exist_for_yaml uses this value as the name of the per-input
    folder under `predictions`; adjust this function if the YAMLs are named
    differently.
    """
    pair_name = yaml_path.stem
    return pair_name


def embeddings_exist_for_yaml(yaml_path: Path, outdir: Path) -> bool:
    """
    Report whether `<outdir>/predictions/<pair dir>` contains at least one
    `embeddings_pair_*.npz` file, where `<pair dir>` comes from
    yaml_to_pair_dirname(yaml_path).

    NOTE(review): this assumes boltz writes predictions directly below
    `outdir`; confirm that it does not nest them in an extra results folder,
    otherwise this check can never succeed.
    """
    pair_dir = Path(outdir).joinpath("predictions", yaml_to_pair_dirname(yaml_path))
    # Return on the first match; an absent folder simply yields no matches.
    for _match in pair_dir.glob("embeddings_pair_*.npz"):
        return True
    return False


def all_embeddings_exist_for_dir(input_dir: Path, outdir: Path) -> bool:
    """
    Report whether every YAML found by list_yamls(input_dir) already has
    embeddings in `outdir`.

    Returns False when `input_dir` contains no YAML files, so an empty input
    folder is never considered complete.
    """
    yaml_files = list_yamls(input_dir)
    if not yaml_files:
        return False

    # Bail out at the first YAML that still lacks embeddings.
    for yaml_file in yaml_files:
        if not embeddings_exist_for_yaml(yaml_file, outdir):
            return False
    return True


def mark_done(outdir: Path):
    """
    Write the DONE_MARKER file into `outdir`, containing a single
    `done_at=<local timestamp>` line. An existing marker is overwritten.
    """
    marker_path = Path(outdir) / DONE_MARKER
    finished_at = time.strftime('%Y-%m-%d %H:%M:%S')
    marker_path.write_text(f"done_at={finished_at}\n")


def append_fail(outdir: Path, yaml_path: Path, rc: int):
    """
    Append one tab-separated line (`<local timestamp>`, `rc=<rc>`,
    `<yaml_path>`) to the FAIL_LOG file inside `outdir`.
    """
    fail_log_path = Path(outdir) / FAIL_LOG
    with fail_log_path.open("a") as log:
        logged_at = time.strftime('%Y-%m-%d %H:%M:%S')
        log.write(f"{logged_at}\trc={rc}\t{yaml_path}\n")


def run_dir_with_safe_resume(input_dir: Path, outdir: Path, label: str):
    """
    Run boltz over one input directory, skipping work that is already done.

    Steps, in order:
      1. Skip if `outdir` already contains the DONE_MARKER file.
      2. Skip (and write the marker) if every YAML already has embeddings.
      3. Run boltz once on the whole directory; write the marker if it exits
         with 0 and every YAML now has embeddings.
      4. Otherwise run each YAML that still lacks embeddings on its own.
         Failures are recorded with append_fail and the loop continues; a run
         that exits with 0 but leaves no embeddings is recorded with rc 999.
      5. Write the marker only if every YAML has embeddings at the end.

    `label` is only used in the progress messages.
    """
    input_dir, outdir = Path(input_dir), Path(outdir)

    def _complete() -> bool:
        # True once every YAML in input_dir has embeddings under outdir.
        return all_embeddings_exist_for_dir(input_dir, outdir)

    done_marker_path = outdir / DONE_MARKER
    if done_marker_path.exists():
        print(f"[SKIP] {label} already has {DONE_MARKER}: {outdir}")
        return

    if _complete():
        print(f"[SKIP] {label} already complete by embeddings check: {outdir}")
        mark_done(outdir)
        return

    # 1) Try fast path: run boltz on the directory
    print(f"\n=== {label} (dir run) ===")
    dir_rc = run_cli(input_dir, outdir)
    if dir_rc == 0 and _complete():
        mark_done(outdir)
        return

    # 2) Fallback: per-YAML safe resume
    print(f"[FALLBACK] {label}: switching to per-YAML runs (safe resume).")
    for yaml_file in list_yamls(input_dir):
        if embeddings_exist_for_yaml(yaml_file, outdir):
            print(f"[SKIP-YAML] embeddings exist: {yaml_file.name}")
            continue

        yaml_rc = run_cli(yaml_file, outdir)
        if yaml_rc == 0:
            # Exit code 0 is not trusted on its own: verify the embeddings landed.
            if not embeddings_exist_for_yaml(yaml_file, outdir):
                print(f"[WARN] {yaml_file.name} returned 0 but embeddings not found — logged.")
                append_fail(outdir, yaml_file, 999)
        else:
            print(f"[FAIL-YAML] {yaml_file.name} (rc={yaml_rc}) — logged, continuing.")
            append_fail(outdir, yaml_file, yaml_rc)

    if not _complete():
        print(f"[INFO] {label} not fully complete; check {outdir}/{FAIL_LOG} and rerun later.")
        return
    mark_done(outdir)


def run_chunked_dataset(chunks_root: Path, out_root: Path, label: str):
    """
    Run run_dir_with_safe_resume on every chunk directory under `chunks_root`,
    in the order returned by list_chunk_dirs.

    Each chunk writes to `<out_root>/<chunk name>` and is labelled
    "<label> <chunk name>" in the progress messages.
    """
    out_root = Path(out_root)
    for chunk in list_chunk_dirs(chunks_root):
        chunk_label = f"{label} {chunk.name}"
        run_dir_with_safe_resume(chunk, out_root / chunk.name, chunk_label)


def run_val_folder(val_yaml_dir: Path, out_root: Path):
    """
    Run run_dir_with_safe_resume on the validation YAML folder as a single
    directory, writing to `<out_root>/val_full` with the label "VAL val_full".
    """
    val_outdir = Path(out_root).joinpath("val_full")
    val_input = Path(val_yaml_dir)
    run_dir_with_safe_resume(val_input, val_outdir, "VAL val_full")


# ---- Execute ----
# Process the train chunks, then the validation folder, then the test chunks.
# Each call goes through run_dir_with_safe_resume, so folders that already have
# a .DONE marker or a complete set of embeddings are skipped.
run_chunked_dataset(TRAIN_CHUNKS_ROOT, BOLTZ_OUT_TRAIN, "TRAIN")
run_val_folder(YAML_DIR_VAL, BOLTZ_OUT_VAL)
run_chunked_dataset(TEST_CHUNKS_ROOT, BOLTZ_OUT_TEST, "TEST")
