# prepare_protein — Run a batch protein preparation in parallel

This notebook shows how to load a large amount of data and execute runs on it in parallel.

# 0) Code Sample
See the sections that follow for a detailed breakdown of each step.

```python
#...import the dependencies and set your configuration (see 1.0, 1.1), then:

# ensure workdirs exist
os.makedirs(WORK_DIR, exist_ok=True)
os.makedirs(PROTEIN_PDB_FOLDER_PATH, exist_ok=True)

# 1.2 Build the client
client = await rush.build_provider_with_functions(
    access_token=TOKEN, url=URL, batch_tags=TAGS
)

# fetch datafiles
for pdb in ["1B39", "4QXI", "8FSU"]:
    complex = list(pdb_fetch.fetch_structure(pdb))
    protein = pdb_delhetatm.remove_hetatm(pdb_selchain.select_chain(complex, "A"))

    with open(PROTEIN_PDB_FOLDER_PATH / f"{pdb}_protein.pdb", "w") as f:
        for l in protein:
            f.write(str(l))

# 2 Prepare each protein
proteins = map(lambda x: Path(x), glob(str(PROTEIN_PDB_FOLDER_PATH / "*.pdb")))

protein_outputs = []

for protein_path in proteins:
    print(protein_path)
    name = protein_path.stem
    (prepped_protein_qdxf, prepped_protein_pdb) = await client.prepare_protein(
        protein_path,
        tags=[name],
        target=TARGET
    )
    protein_outputs.append((name, prepped_protein_qdxf, prepped_protein_pdb))
```

# 1) Setup

## 1.0) Imports

In [None]:
import os
import asyncio
from glob import glob
from datetime import datetime
from pathlib import Path

from pdbtools import (
    pdb_fetch,
    pdb_delhetatm,
    pdb_selchain,
    pdb_rplresname,
    pdb_keepcoord,
    pdb_selresname,
)
import py3Dmol

import rush

## 1.1) Configuration

In [None]:
# Read the API token and endpoint URL from the RUSH_TOKEN / RUSH_URL
# environment variables (export them in your shell before starting the
# notebook, or substitute your own values for the lookups below).
# Either name is None when its variable is not set.
TOKEN = os.environ.get("RUSH_TOKEN")
URL = os.environ.get("RUSH_URL")

In [None]:
# Project information; TAGS is handed to the client as its batch tags
DESCRIPTION = "rush-py batch notebook"
TAGS = ["qdx", "rush-py-v2", "demo", "batch-prep"]

# Target passed to every prepare_protein call
TARGET = "NIX_SSH_2"

# Local workspace under the user's home directory, and the folder inside it
# that holds the input protein PDB files
WORK_DIR = Path.home().joinpath("qdx", "rush-py-batch-prep")
PROTEIN_PDB_FOLDER_PATH = WORK_DIR.joinpath("proteins")

In [None]:
# |hide
# Hidden housekeeping cell: when the working directory already exists, open it
# as a provider workspace and call nuke(remote=False) before the walkthrough
# starts.
# NOTE(review): presumably this clears only the local workspace state and
# leaves remote data alone — confirm against the rush Provider.nuke docs.
if WORK_DIR.exists():
    client = rush.Provider(workspace=WORK_DIR)
    await client.nuke(remote=False)

Ensure your workdir exists

In [None]:
# Create the workspace and the protein input folder; directories that are
# already present are left untouched.
for required_dir in (WORK_DIR, PROTEIN_PDB_FOLDER_PATH):
    os.makedirs(required_dir, exist_ok=True)

## 1.2) Build your client

Initialize our rush client and fetch available module paths.

In [None]:
# Get our client, for calling modules and using the rush API.
# TOKEN and URL come from the configuration cell; TAGS is supplied as
# batch_tags (the run log further down shows these tags listed next to the
# per-call tag of each job).
client = await rush.build_provider_with_functions(
    access_token=TOKEN, url=URL, batch_tags=TAGS
)

In [None]:
# |hide
# Hidden variant of the client build: same credentials and batch tags, but
# bound to the WORK_DIR workspace and with restore_by_default=True.
# NOTE(review): the run log further down shows jobs being restored from
# previous runs by tags and module path, which is presumably what
# restore_by_default enables — confirm against the rush docs.
client = await rush.build_provider_with_functions(
    access_token=TOKEN, url=URL, workspace=WORK_DIR, batch_tags=TAGS, restore_by_default=True
)

In [None]:
# Fetch datafiles: for each PDB ID, download the structure, keep chain "A",
# drop the HETATM records and write the result to the protein input folder as
# "<ID>_protein.pdb".
for pdb_id in ["1B39", "4QXI", "8FSU"]:
    # Materialise the fetched lines once. The variable is deliberately not
    # called `complex`: notebook cells share one global namespace, so that
    # name would shadow the builtin for every later cell.
    structure_lines = list(pdb_fetch.fetch_structure(pdb_id))
    protein_lines = pdb_delhetatm.remove_hetatm(
        pdb_selchain.select_chain(structure_lines, "A")
    )

    with open(PROTEIN_PDB_FOLDER_PATH / f"{pdb_id}_protein.pdb", "w") as f:
        f.writelines(str(line) for line in protein_lines)

In [None]:
# Print the signature and docstring of the client's prepare_protein function
help(client.prepare_protein)

Help on function prepare_protein in module rush.provider:

async prepare_protein(*args: *tuple[RushObject[bytes]], target: Optional[Target] = None, resources: Optional[Resources] = {'storage': 138, 'storage_units': 'MB', 'gpus': 1}, tags: list[str] | None = None, restore: bool | None = None) -> tuple[RushObject[list[Conformer]], RushObject[bytes]]
    Prepare a PDB for downstream tasks: protonate, fill missing atoms, etc.
    
    Module version:  
    `github:talo/prepare_protein/83bed2ad1f01f495c94518717f9f5b1bd7fe855c#prepare_protein_tengu`
    
    QDX Type Description:
    
        input_pdb: @bytes
        ->
        output_qdxf: @[Conformer];
        output_pdb: @bytes
    
    :param input_pdb: An input protein as a file; one PDB file
    :return output_qdxf: An output protein a vec: one qdxf per model in pdb
    :return output_pdb: An output protein as a file: one PDB file



# 2) Prepare each protein

In [None]:
proteins = map(lambda x: Path(x), glob(str(PROTEIN_PDB_FOLDER_PATH / "*.pdb")))

protein_outputs = []

for protein_path in proteins:
    print(protein_path)
    name = protein_path.stem
    (prepped_protein_qdxf, prepped_protein_pdb) = await client.prepare_protein(
        protein_path,
        tags=[name],
        target=TARGET
    )
    protein_outputs.append((name, prepped_protein_qdxf, prepped_protein_pdb))

protein_outputs

/home/machineer/qdx/rush-py-batch-prep/proteins/1B39_protein.pdb
2024-02-10 18:00:20,666 - rush - INFO - Trying to restore job with tags: ['1B39_protein', 'qdx', 'rush-py-v2', 'demo', 'batch-prep'] and path: github:talo/prepare_protein/83bed2ad1f01f495c94518717f9f5b1bd7fe855c#prepare_protein_tengu
2024-02-10 18:00:21,021 - rush - INFO - Restoring job from previous run with id 15fb3f47-3ccf-4f8d-9f4e-6e2a34d8f6f6
/home/machineer/qdx/rush-py-batch-prep/proteins/8FSU_protein.pdb
2024-02-10 18:00:21,021 - rush - INFO - Trying to restore job with tags: ['8FSU_protein', 'qdx', 'rush-py-v2', 'demo', 'batch-prep'] and path: github:talo/prepare_protein/83bed2ad1f01f495c94518717f9f5b1bd7fe855c#prepare_protein_tengu
2024-02-10 18:00:21,306 - rush - INFO - Restoring job from previous run with id 68fa9383-a88d-4da4-b27f-2cb160d41290
/home/machineer/qdx/rush-py-batch-prep/proteins/4QXI_protein.pdb
2024-02-10 18:00:21,307 - rush - INFO - Trying to restore job with tags: ['4QXI_protein', 'qdx', 'rush-

[('1B39_protein',
  Arg(id=0a513d96-bb81-4e41-948f-5e49ee489dca, value=None),
  Arg(id=0debb2d8-9b19-4bc7-a64d-c33886f3332e, value=None)),
 ('8FSU_protein',
  Arg(id=4a2565bf-e07b-4a09-9533-55ea0ed74888, value=None),
  Arg(id=a61cccf2-0959-4977-8281-53695d2c48d3, value=None)),
 ('4QXI_protein',
  Arg(id=9291347d-dd2e-45b3-918c-02a9198cdab2, value=None),
  Arg(id=7988aa2c-7610-4af7-a609-d8331f20bc3c, value=None))]

# 3) Report progress
This will show the status of all of your runs

In [None]:
status = await client.status(group_by="path")
print(f"{'Module':<32} | {'Status':<32} | Count")
print("-" * 75)
for module, (status, path, count) in status.items():
    print(f"{path:<32} | {status:<32} | {count:>5}")

Module                           | Status                           | Count
---------------------------------------------------------------------------


# 4) Download Results
This will retrieve the results of your completed module instances.

In [None]:

await asyncio.gather(
    *[
        output[1].download(filename=f"protein_{output[0]}_prepared.qdxf.json")
        for output in protein_outputs
    ] + [
        output[2].download(filename=f"protein_{output[0]}_prepared.pdb")
        for output in protein_outputs
    ],
)

[None, None, None, None, None, None]

# 5) Visualize Results

In [None]:
# Collect the PDB files found under the workspace's "objects" folder. They are
# materialised and sorted so the count is known before the viewer is created
# and the panel order is stable between runs.
prepared_proteins = sorted(
    Path(p) for p in glob(str(client.workspace / "objects" / "*.pdb"))
)

# One panel per structure in a single row. The grid is sized from the number
# of files found instead of being fixed at three, so extra files do not address
# panels that do not exist; at least one panel is always created.
view = py3Dmol.view(viewergrid=(1, max(1, len(prepared_proteins))))

for i, pdb_path in enumerate(prepared_proteins):
    panel = (0, i)
    view.addModel(pdb_path.read_text(), "pdb", viewer=panel)
    view.setStyle({"cartoon": {"color": "spectrum"}}, viewer=panel)
    view.zoomTo(viewer=panel)
view.show()

# 6) Check failures
This will retrieve failed runs in your workspace history

In [None]:
for instance_id, (status, name, count) in (await client.status()).items():
    if status.value == "FAILED":
        async for log_page in client.logs(instance_id, "stderr"):
            for log in log_page:
                print(log)