# Batch Protein Prep Example

This notebook shows how to load a large amount of data and execute runs on it in parallel.

In [None]:
import os
import asyncio
from glob import glob
from datetime import datetime
from pathlib import Path

from pdbtools import pdb_fetch, pdb_delhetatm, pdb_selchain, pdb_rplresname, pdb_keepcoord, pdb_selresname

import tengu

### 0) Setup

In [None]:
# Credentials come from the environment: export TENGU_TOKEN in your shell before
# starting Jupyter, or replace the lookup below with your token.
# TENGU_URL is read the same way; both values are None when the variable is unset.
TOKEN = os.environ.get("TENGU_TOKEN")
URL = os.environ.get("TENGU_URL")

In [None]:
# Project information: a free-text description plus the tags for this batch
DESCRIPTION = "tengu-py batch notebook"
TAGS = [
    "qdx",
    "tengu-py-v2",
    "demo",
    "batch-prep",
]

# Workspace root: ~/qdx/tengu-py-batch-prep
WORK_DIR = Path.home().joinpath("qdx", "tengu-py-batch-prep")

# Inputs: the protein PDB files live in a "proteins" subfolder of the workspace
PROTEIN_PDB_FOLDER_PATH = WORK_DIR.joinpath("proteins")

# Value passed as `target=` to the module calls further down
TARGET = "NIX_SSH"

In [None]:
#|hide
# Housekeeping cell: when the workspace folder already exists, build a Provider
# on it and call nuke(remote=True) before the rest of the notebook runs.
# NOTE(review): `#|hide` looks like an nbdev directive that keeps this cell out of
# rendered docs — confirm; it must stay on the first line of the cell.
if WORK_DIR.exists():
    client = tengu.Provider(workspace=WORK_DIR)
    # NOTE(review): presumably wipes remote state as well as the local workspace —
    # confirm before running this against a workspace you care about.
    await client.nuke(remote=True)

Ensure your workdir exists

In [None]:
# Create the workspace and the protein input folder, including any missing
# parents; an already-existing directory is left as it is.
WORK_DIR.mkdir(parents=True, exist_ok=True)
PROTEIN_PDB_FOLDER_PATH.mkdir(parents=True, exist_ok=True)

## Initialize our tengu client and fetch available module paths

In [None]:
# Get our client, for calling modules and using the tengu API
client = await tengu.build_provider_with_functions(
    access_token=TOKEN, url=URL, workspace=WORK_DIR, batch_tags=TAGS
)

In [None]:
# Fetch the example structures and write one "<ID>_protein.pdb" file per entry
# into the protein input folder.
# Loop names are chosen so that nothing shadows the `complex` builtin (or the
# stdlib `pdb` module name) in the notebook's global namespace.
for pdb_id in ["1B39", "4QXI", "8FSU"]:
    # Materialise the fetched records as a list before filtering them
    structure_lines = list(pdb_fetch.fetch_structure(pdb_id))
    # Keep chain "A" only, then pass the result through remove_hetatm
    protein_lines = pdb_delhetatm.remove_hetatm(
        pdb_selchain.select_chain(structure_lines, "A")
    )

    with open(PROTEIN_PDB_FOLDER_PATH / f"{pdb_id}_protein.pdb", "w") as pdb_file:
        # str() is kept from the original in case a record is not already a str
        pdb_file.writelines(str(line) for line in protein_lines)

In [None]:
# Print the signature and docstring of the prepare_protein function exposed by the client
help(client.prepare_protein)

Help on function prepare_protein in module tengu.provider:

async prepare_protein(*args: [<class 'pathlib.Path'>], target: tengu.graphql_client.enums.ModuleInstanceTarget | None = <ModuleInstanceTarget.NIX_SSH_2: 'NIX_SSH_2'>, resources: tengu.graphql_client.input_types.ModuleInstanceResourcesInput | None = ModuleInstanceResourcesInput(gpus=1, gpu_mem=None, gpu_mem_units=None, cpus=None, nodes=None, mem=None, mem_units=None, storage=128, storage_units=<MemUnits.MB: 'MB'>, walltime=None, storage_mounts=None), tags: list[str] | None = None, restore: bool | None = None) -> [<class 'pathlib.Path'>, <class 'pathlib.Path'>]
    Prepare a PDB for downstream tasks: protonate, fill missing atoms, etc.
    
    Module version: github:talo/pdb2pqr/ff5abe87af13f31478ede490d37468a536621e9c#prepare_protein_tengu
    
    QDX Type Description:
    
        input_pdb: @bytes 
    
    ->
    
        output_qdxf: @[Conformer];
    
        output_pdb: @bytes
    
    
    
    :param input_pdb: An inp

## For each protein, start a prepare_protein run

In [None]:
proteins = map(lambda x: Path(x), glob(str(PROTEIN_PDB_FOLDER_PATH / "*.pdb")))

protein_outputs = []

for protein_path in proteins:
    print(protein_path)
    name = protein_path.stem
    (prepped_protein_qdxf, prepped_pdb) = await client.prepare_protein(
        protein_path,
        target=TARGET,
        tags=[name],
    )
    protein_outputs.append((name, prepped_protein_qdxf))

protein_outputs

/home/machineer/qdx/tengu-py-batch-prep/proteins/1B39_protein.pdb
/home/machineer/qdx/tengu-py-batch-prep/proteins/8FSU_protein.pdb
/home/machineer/qdx/tengu-py-batch-prep/proteins/4QXI_protein.pdb


[('1B39_protein', Arg(id=ffa4d091-ecb7-4fa7-95ef-5a1da8ccf12f, value=None)),
 ('8FSU_protein', Arg(id=7eb1c698-1b82-40ca-af51-58ea107c51fb, value=None)),
 ('4QXI_protein', Arg(id=5c235e54-3227-4b63-85bc-09edb68bd961, value=None))]

## Report progress
This will show the status of all of your runs

In [None]:
status = await client.status(group_by="path")
print(f"{'Module':<20} | {'Status':<20} | Count")
print("-" * 50)
for module, (status, path, count) in status.items():
    print(f"{path:<20} | {status:<20} | {count}")

Module               | Status               | Count
--------------------------------------------------
prepare_protein      | ModuleInstanceStatus.RESOLVING | 2
prepare_protein      | ModuleInstanceStatus.ADMITTED | 1


## Download Results
This will retrieve results for your completed module_instances

In [None]:
await asyncio.gather(
    *[output[1].download(filename=f"protein_{output[0]}.qdxf.json") for output in protein_outputs]
)

2024-01-23 12:51:58,004 - tengu - INFO - Argument ffa4d091-ecb7-4fa7-95ef-5a1da8ccf12f is now ModuleInstanceStatus.ADMITTED
2024-01-23 12:51:58,039 - tengu - INFO - Argument 5c235e54-3227-4b63-85bc-09edb68bd961 is now ModuleInstanceStatus.RESOLVING
2024-01-23 12:51:58,081 - tengu - INFO - Argument 7eb1c698-1b82-40ca-af51-58ea107c51fb is now ModuleInstanceStatus.RESOLVING
2024-01-23 12:52:09,119 - tengu - INFO - Argument ffa4d091-ecb7-4fa7-95ef-5a1da8ccf12f is now ModuleInstanceStatus.DISPATCHED
2024-01-23 12:52:15,777 - tengu - INFO - Argument ffa4d091-ecb7-4fa7-95ef-5a1da8ccf12f is now ModuleInstanceStatus.QUEUED
2024-01-23 12:52:20,274 - tengu - INFO - Argument 7eb1c698-1b82-40ca-af51-58ea107c51fb is now ModuleInstanceStatus.ADMITTED
2024-01-23 12:52:20,354 - tengu - INFO - Argument 5c235e54-3227-4b63-85bc-09edb68bd961 is now ModuleInstanceStatus.ADMITTED


Exception: (<ModuleFailureReason.RUN: 'RUN'>, ModuleInstanceCommonFailureContext(stdout='', stderr='thread \'main\' panicked at /nix/store/eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-vendor-cargo-deps/40f807129291673b808a9355e94137a848422f905b66c6b8a5e5c8205bce5c84/qdx-common-0.6.0/src/module.rs:386:33:\nincompatible input arguments ["null"]\nnote: run with `RUST_BACKTRACE=1` environment variable to display a backtrace', syserr='/home/ryan/.cache/tengu_store/nq_queues/nqdir_0/,18d34a948cd.1708094'))

2024-01-23 13:01:12,808 - tengu - INFO - Argument 7eb1c698-1b82-40ca-af51-58ea107c51fb is now ModuleInstanceStatus.DISPATCHED
2024-01-23 13:01:18,302 - tengu - INFO - Argument 7eb1c698-1b82-40ca-af51-58ea107c51fb is now ModuleInstanceStatus.AWAITING_UPLOAD
2024-01-23 13:01:35,826 - tengu - INFO - Argument 5c235e54-3227-4b63-85bc-09edb68bd961 is now ModuleInstanceStatus.DISPATCHED
2024-01-23 13:01:44,757 - tengu - INFO - Argument 5c235e54-3227-4b63-85bc-09edb68bd961 is now ModuleInstanceStatus.AWAITING_UPLOAD


## Check failures
This will retrieve failed runs in your workspace history

In [None]:
for instance_id, (status, name, count) in (await client.status()).items():
    if status.value == "FAILED":
        async for log_page in client.logs(instance_id, "stderr"):
            for log in log_page:
                print(log)