In [1]:
# Pin this kernel to a single physical GPU. These variables are read when CUDA
# initialises, so this cell has to run before anything touches the GPU.
# PCI_BUS_ID asks CUDA to enumerate devices by PCI bus id.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
# Expose only device 5 to this process; it is the sole visible device, which is
# why later cells address it as "cuda:0".
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=5


In [2]:
import shutil
import sys
from datetime import datetime
from pathlib import Path

from IPython.display import clear_output, display

from add_linker import convert_fasta
from create_empty_msas import main as create_empty_msas
from debug_openfold import run_prediction as run_openfold
from extract_best_seq import parse_and_extract
from pmpnn import main as run_protein_mpnn

[2024-09-03 20:41:14,215] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
# Run configuration: every artefact produced by this notebook lives below OUTPUT_DIR.
PREFIX = "dna_design_0.4"
# True  -> run folder is named "<today as DDMonYY>_<PREFIX>"
# False -> reuse the fixed "30Apr24_looping" folder
auto_generate = True

# Single definition of the shared parent folder (was duplicated in both branches).
_EXAMPLE_OUTPUTS = Path("/nfs/homedirs/hetzell/code/protein_design/example_outputs")

if auto_generate:
    current_date = datetime.now()
    # "%d%b%y" renders e.g. "03Sep24"; %b is the locale's abbreviated month name.
    ddmmmyy_prefix = current_date.strftime("%d%b%y")
    OUTPUT_DIR = _EXAMPLE_OUTPUTS / f"{ddmmmyy_prefix}_{PREFIX}"
else:
    OUTPUT_DIR = _EXAMPLE_OUTPUTS / "30Apr24_looping"

# exist_ok=True makes the cell idempotent without a separate exists() check, and
# raises instead of silently continuing if the path is an existing regular file.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(OUTPUT_DIR)

/nfs/homedirs/hetzell/code/protein_design/example_outputs/03Sep24_dna_design_0.4


In [5]:
# Stage the input backbones: copy every *.pdb from INPUT_PDB into OUTPUT_DIR / "pdbs".
# Input locations used for earlier runs, kept for reference:
# C8_designs = Path("/nfs/homedirs/hetzell/code/protein_project/protein-backbone-MCTS/outputs/C_small_chunks")
# INPUT_PDB = C8_designs / "Subfolder_3"
# INPUT_PDB = Path("/nfs/homedirs/hetzell/code/protein-frame-flow/inference_outputs/hallucination_pdb/2024-07-25_11-48-32/last/unconditional/run_2024-08-23_12-46-11/pdbs")
# NOTE(review): this path embeds a fixed date, while OUTPUT_DIR may be derived from
# today's date - confirm this is the intended input when re-running on another day.
INPUT_PDB = Path("/nfs/homedirs/hetzell/code/protein_design/example_outputs/03Sep24_dna_design_0.4/input")
OUTPUT_PDB = OUTPUT_DIR / "pdbs"

# Idempotent creation; no separate exists() check needed.
OUTPUT_PDB.mkdir(parents=True, exist_ok=True)

# Copy (the originals stay in place) each PDB file into OUTPUT_PDB. Files that
# are already present are skipped, so the cell can be re-run safely.
# sorted() only makes the processing/log order deterministic.
for pdb in sorted(INPUT_PDB.iterdir()):
    if not (pdb.is_file() and pdb.suffix.lower() == ".pdb"):
        continue  # ignore sub-directories and non-PDB files
    target_path = OUTPUT_PDB / pdb.name
    if target_path.exists():
        continue
    try:
        shutil.copy(pdb, target_path)
    except OSError as e:
        # Best effort: report the failing file and carry on with the rest.
        # shutil's own errors (e.g. SameFileError) are OSError subclasses.
        print(f"Failed to copy {pdb} to {target_path}: {e}")


In [5]:
# Run ProteinMPNN on the staged PDBs; results are written below
# OUTPUT_DIR / "protein_mpnn". Requests 1000 sequences with symmetry disabled.
run_protein_mpnn(
    str(OUTPUT_PDB),
    str(OUTPUT_DIR / "protein_mpnn"),
    symmetry=False,
    seqs=1000,
)

----------------------------------------
chain_id_jsonl is NOT loaded
----------------------------------------
fixed_positions_jsonl is NOT loaded
----------------------------------------
pssm_jsonl is NOT loaded
----------------------------------------
omit_AA_jsonl is NOT loaded
----------------------------------------
bias_AA_jsonl is NOT loaded
----------------------------------------
tied_positions_jsonl is NOT loaded
----------------------------------------
bias by residue dictionary is not loaded, or not provided
----------------------------------------
discarded {'bad_chars': 0, 'too_long': 0, 'bad_seq_length': 0}
----------------------------------------
Number of edges: 48
Training noise level: 0.2A
Generating sequences for: template_dna
1000 sequences of length 602 generated in 142.185 seconds


In [6]:
# Reduce each ProteinMPNN FASTA file to a selection of 250 sequences under
# "seqs_best". Presumably the selection is by ProteinMPNN score - confirm in
# extract_best_seq.parse_and_extract.
FASTA_DIR = OUTPUT_DIR / "protein_mpnn" / "seqs"
OUTPUT_SEQS = FASTA_DIR.parent / "seqs_best"
fasta_files = [
    candidate
    for candidate in FASTA_DIR.iterdir()
    if candidate.suffix in (".fasta", ".fa")
]

for done, fasta_path in enumerate(fasta_files, start=1):
    parse_and_extract(
        fasta_path,
        OUTPUT_SEQS,
        overwrite=True,
        n_seqs=250,
        add_monomer=True,
        add_trimer=False,
    )
    print(f"Sequences {done:5}/{len(fasta_files):5} done.")
    # wait=True defers the clear until the next output, so only the latest
    # progress line stays visible.
    clear_output(wait=True)

Sequences     1/    1 done.


In [7]:
# Convert every trimer FASTA into a "<stem>_linker.fasta" file under
# seqs_best/trimer_linker using add_linker.convert_fasta.
# NOTE(review): the extraction cell passes add_trimer=False, so the "trimer"
# folder read here presumably comes from an earlier run - confirm it exists.
fasta_files = [
    candidate
    for candidate in (OUTPUT_DIR / "protein_mpnn" / "seqs_best" / "trimer").iterdir()
    if candidate.suffix in (".fasta", ".fa")
]

OUTPUT_LINKER = OUTPUT_DIR / "protein_mpnn" / "seqs_best" / "trimer_linker"

OUTPUT_LINKER.mkdir(parents=True, exist_ok=True)

for done, fasta_path in enumerate(fasta_files, start=1):
    linker_fasta = OUTPUT_LINKER / f"{fasta_path.stem}_linker.fasta"
    convert_fasta(fasta_path, linker_fasta)
    print(f"Sequences {done:5}/{len(fasta_files):5} done.\n")
    # Keep only the most recent progress line in the cell output.
    clear_output(wait=True)

Sequences   300/  300 done.



In [7]:
# Select which sequence set to fold. The alternative used before was
# 'trimer_linker', in which case FASTA_DIR was set to OUTPUT_LINKER.
setting = "monomer"

FASTA_DIR = OUTPUT_DIR / "protein_mpnn" / "seqs_best" / setting
OUTPUT_OPENFOLD = OUTPUT_DIR / f"openfold_{setting}"

# Populate <OUTPUT_OPENFOLD>/alignments for the FASTA files; judging by the
# helper's name these are empty MSAs - confirm in create_empty_msas.
create_empty_msas(
    str(FASTA_DIR),
    str(OUTPUT_OPENFOLD / "alignments"),
)

In [10]:
# Debug echo of the FASTA directory currently held by the kernel.
# NOTE(review): the saved output shows a path from a different run/setting,
# i.e. stale kernel state - re-run from a fresh kernel or drop this cell.
FASTA_DIR

PosixPath('/nfs/homedirs/hetzell/code/protein_design/example_outputs/24Aug24_frameflow_design/protein_mpnn/seqs_best/homomer')

In [19]:
# Fold the sequences in FASTA_DIR with OpenFold ("model_1" preset), reusing the
# alignments generated under OUTPUT_OPENFOLD / "alignments".
# For a quick smoke test on a subset, a directory such as
# FASTA_DIR.parent / "trimer_linker_test" was used as fasta_dir instead.
run_openfold(
    fasta_dir=str(FASTA_DIR),
    output_dir=str(OUTPUT_OPENFOLD),
    use_precomputed_alignments=str(OUTPUT_OPENFOLD / "alignments"),
    config_preset="model_1",
    # CUDA_VISIBLE_DEVICES exposes a single GPU, so it is index 0 here.
    model_device="cuda:0",
)

KeyboardInterrupt: 

In [7]:
# Homomer folding setup: choose the sequence set and derive both the OpenFold
# output folder and the FASTA input folder from the same `setting`, so the two
# cannot drift apart when the setting is changed.
setting = "homomer"

OUTPUT_OPENFOLD = OUTPUT_DIR / f"openfold_{setting}"
# Full set of sequences for this setting (not a subset).
_FASTA_DIR = OUTPUT_DIR / "protein_mpnn" / "seqs_best" / setting

# Populate <OUTPUT_OPENFOLD>/alignments for the FASTA files; judging by the
# helper's name these are empty MSAs - confirm in create_empty_msas.
create_empty_msas(str(_FASTA_DIR), str(OUTPUT_OPENFOLD / "alignments"))

In [6]:
# Fold the homomer sequences with the OpenFold multimer preset.
# Relies on OUTPUT_OPENFOLD from the homomer setup cell, which must run first.
_FASTA_DIR = OUTPUT_DIR / "protein_mpnn" / "seqs_best" / "homomer"

run_openfold(
    fasta_dir=str(_FASTA_DIR),
    output_dir=str(OUTPUT_OPENFOLD),
    use_precomputed_alignments=str(OUTPUT_OPENFOLD / "alignments"),
    config_preset="model_1_multimer_v3",
    # CUDA_VISIBLE_DEVICES exposes a single GPU, so it is index 0 here.
    model_device="cuda:0",
)

INFO:/nfs/staff-ssd/hetzell/code/protein_design/modules/openfold/openfold/utils/script_utils.py:Successfully loaded JAX parameters at modules/openfold/openfold/resources/params/params_model_1_multimer_v3.npz...
INFO:/nfs/staff-ssd/hetzell/code/protein_design/modules/openfold/run_pretrained_openfold.py:Skipping C8_oligo_14_0__A-B-C-D-E-F-G-H_model_1_multimer_v3 as it already exists...
INFO:/nfs/staff-ssd/hetzell/code/protein_design/modules/openfold/run_pretrained_openfold.py:Skipping C8_oligo_1_1__A-B-C-D-E-F-G-H_model_1_multimer_v3 as it already exists...
INFO:/nfs/staff-ssd/hetzell/code/protein_design/modules/openfold/run_pretrained_openfold.py:Skipping C8_oligo_11_0__A-B-C-D-E-F-G-H_model_1_multimer_v3 as it already exists...
INFO:/nfs/staff-ssd/hetzell/code/protein_design/modules/openfold/run_pretrained_openfold.py:Skipping C8_oligo_1_2__A-B-C-D-E-F-G-H_model_1_multimer_v3 as it already exists...
INFO:/nfs/staff-ssd/hetzell/code/protein_design/modules/openfold/run_pretrained_openfol