# Prepare Protein Module

In this example, the workflow is augmented with the `prepare_protein` module, whose primary job is to complete partial protein structures by inserting missing residues and hydrogen atoms. It performs two steps:

1. **Fills in Missing Residues**  
   Using advanced modeling techniques, the module reconstructs missing segments of the protein backbone and sidechains, ensuring structural completeness.
   
2. **Adds Missing Hydrogen Atoms**  
   Protonation states are assigned based on physiological conditions or user-defined pH values, ensuring a chemically valid structure.

Under the hood, `prepare_protein` relies on two widely used tools:

1. **[PDBFixer](https://github.com/openmm/pdbfixer)**  
   PDBFixer is a tool that identifies and corrects issues in PDB files, such as missing atoms and residues, while preserving the overall structure.

2. **[PDB2PQR](https://pdb2pqr.readthedocs.io/en/latest/)**  
   PDB2PQR is used for assigning protonation states and optimizing hydrogen bonding networks based on the specified pH conditions.


## Function usage
```haskell
let
    prepare_protein_options = {
        truncation_threshold = some 2,
        capping_style = some 'Truncated',
        naming_scheme = some 'Amber',
        ph = some 7.4
    },

    prepare_protein = \protein_conformer_trc -> 
        map to_data (get 0 ( prepare_protein_rex_s default_runspec prepare_protein_options [protein_conformer_trc] ))


in
\input ->
    let
        prepare_protein_result =  prepare_protein unprepped_trc ,
        
        topology = get 0 (prepare_protein_result),
        residues = get 1 (prepare_protein_result),
        chains = get 2 (prepare_protein_result),
        prepared_trc = [ topology, residues, chains ],


```


In [1]:
from rush import build_blocking_provider

ImportError: cannot import name 'build_blocking_provider' from 'rush' (/Users/quekweiliang/miniconda3/lib/python3.11/site-packages/rush/__init__.py)

In [2]:
# |hide
# hidden setup for the notebook
import os
import pathlib
import shutil

WORK_DIR = pathlib.Path("~/qdx/benchmark_notebook").expanduser()
# Start every run from an empty working directory.
# This uses the standard library instead of the IPython-only `!rm -r $WORK_DIR`,
# which is not valid Python outside a notebook, depends on a POSIX shell, and
# word-splits paths that contain spaces.
if WORK_DIR.is_symlink() or WORK_DIR.is_file():
    # A stale file or (possibly dangling) symlink at this path would make
    # os.makedirs fail below, so remove just that entry.
    WORK_DIR.unlink()
elif WORK_DIR.exists():
    shutil.rmtree(WORK_DIR)
os.makedirs(WORK_DIR, exist_ok=True)
# swap into clean workdir so that our tests are deterministic
os.chdir(WORK_DIR)
# Credentials and endpoint come from the environment; a missing variable raises
# KeyError here rather than failing later when the client is built.
PUT_YOUR_TOKEN_HERE = os.environ["RUSH_TOKEN"]
PUT_YOUR_PREFERRED_WORKING_DIRECTORY_HERE = WORK_DIR
RUSH_URL = os.environ["RUSH_URL"]
# The string "False" is what the rush log line "Not restoring by default via env"
# appears to react to -- NOTE(review): confirm against the rush client docs.
os.environ["RUSH_RESTORE_BY_DEFAULT"] = "False"

In [3]:
# Create the client with build_blocking_provider, using the token and URL
# defined in the setup cell.
# access_token takes the token as a plain string: for example, if your token is
# 00000000-dddd-cccc-0000-11111111, then you should put
# access_token="00000000-dddd-cccc-0000-11111111" (including the double quotes).
client = build_blocking_provider(url=RUSH_URL, access_token=PUT_YOUR_TOKEN_HERE)

2025-02-05 18:24:27,664 - rush - INFO - Not restoring by default via env


GraphQLClientGraphQLMultiError: Data `tengu_dal::user::User` does not exist.

In [6]:
# Select the benchmark by name; `benchmark.id` is what gets passed to
# `client.run_benchmark` when the workflow is submitted.
benchmark = client.benchmark(name="OpenFF CDK2 RMSD17 Benchmark")

2025-02-05 17:38:01,482 - rush - INFO - Not restoring by default via env


In [13]:
# |hide
from IPython.display import Markdown as md
# Rex source for the benchmark workflow. This is a regular (non-raw) Python
# string, so every Rex lambda backslash is written as `\\` here and reaches the
# service as a single `\`. For each benchmark input the workflow:
#   1. loads the protein conformer (input 0) and the SMILES of the small
#      molecule (input 1),
#   2. runs prepare_protein on the structure's [topology, residues, chains],
#   3. takes the bounding box from the p2rank result on the prepared protein,
#   4. generates the ligand structure with auto3d and docks it with gnina,
#   5. saves a BindingAffinity holding the minimum affinity among the scores.
rex_code_above = """
let

    auto3d = \\smi ->  map to_data (get 0 (auto3d_rex_s default_runspec_gpu { k = 1 } [smi])),

    p2rank = \\prot_conf ->  p2rank_rex_s default_runspec {} prot_conf,

    gnina = \\prot_conf -> \\bounding_box -> \\smol_conf ->
        get 0 (get 0 (gnina_rex_s default_runspec_gpu {} [prot_conf] [bounding_box] smol_conf [])),

    prepare_protein_options = {
        truncation_threshold = some 2,
        capping_style = some 'Truncated',
        naming_scheme = some 'Amber',
        ph = some 7.4
    },

    prepare_protein = \\protein_conformer_trc -> 
        map to_data (get 0 ( prepare_protein_rex_s default_runspec prepare_protein_options [protein_conformer_trc] ))

in
\\input ->
    let
        protein = load (id (get 0 input)) 'ProteinConformer',
        smol_id = id (get 1 input),
        smiles = smi (load smol_id 'Smol'),

        structure = load (structure_id protein) 'Structure',
        trc = [
            topology structure,
            residues structure,
            chains structure
        ],

        prepare_protein_result =  prepare_protein trc ,
        
        topology = get 0 (prepare_protein_result),
        residues = get 1 (prepare_protein_result),
        chains = get 2 (prepare_protein_result),
        prepared_trc = [ topology, residues, chains ],

        bounding_box = get 0 (get 0 (p2rank prepared_trc)),

        smol_structure = auto3d smiles,

        docked_structure = gnina prepared_trc bounding_box [smol_structure],

        min_affinity = list_min (map (get "affinity") (get "scores" docked_structure)),

        binding_affinity = BindingAffinity {
            affinity = min_affinity,
            affinity_metric = 'kcal/mol',
            protein_id = protein_id protein,
            smol_id = smol_id,
            metadata = Metadata {
                name = "blah",
                description = none,
                tags = []
            }
        }
    in
        [BenchmarkArg {
            entity = "BindingAffinity",
            id = save binding_affinity
        }]
"""

In [6]:
#| echo:false
# Render the Rex source as a fenced `haskell` code block; rex_code_above starts
# and ends with a newline, so the fence markers land on their own lines.
md(f"```haskell{rex_code_above}```")

```haskell
let
    runspec = RunSpec {
        target = 'Bullet',
        resources = Resources {
            storage = some 10,
            storage_units = some "MB",
            gpus = some 1
        }
    },

    runspec_nogpu = RunSpec {
        target = 'Bullet',
        resources = Resources {
            storage = some 10,
            storage_units = some "MB",
            gpus = none
        }
    },

    auto3d = \smi ->
        let
            result = get 0 (auto3d_rex_s runspec { k = 1 } [smi]),
            make_virtual_object = \index ->
                VirtualObject {
                    path = get "path" (get index result),
                    size = get "size" (get index result),
                    format = "json"
                }
        in
            (make_virtual_object 0, make_virtual_object 1),

    p2rank = \prot_conf ->  p2rank_rex_s runspec_nogpu {} prot_conf,

    gnina = \prot_conf -> \bounding_box -> \smol_conf ->
        get 0 (get 0 (gnina_rex_s runspec {} [prot_conf] [bounding_box] smol_conf [])),


    prepare_protein_options = {
        truncation_threshold = some 2,
        capping_style = some 'Truncated',
        naming_scheme = some 'Amber',
        ph = some 7.4
    },

    prepare_protein = \protein_conformer_trc -> 
        prepare_protein_rex_s runspec prepare_protein_options [protein_conformer_trc],

in
\input ->
    let
        protein = load (id (get 0 input)) 'ProteinConformer',
        smol_id = id (get 1 input),
        smiles = smi (load smol_id 'Smol'),

        structure = load (structure_id protein) 'Structure',
        trc = [
            topology structure,
            residues structure,
            chains structure
        ],

        prepare_protein_result = get 0 ( prepare_protein trc ),
        make_virtual_object = \index ->
            VirtualObject {
                path = get "path" (get index prepare_protein_result),
                size = get "size" (get index prepare_protein_result),
                format = "json"
            },
        topology = make_virtual_object 0,
        residues = make_virtual_object 1,
        chains = make_virtual_object 2,
        prepared_trc = [ topology, residues, chains ],

        bounding_box = get 0 (get 0 (p2rank prepared_trc)),

        smol_structure = auto3d smiles,

        docked_structure = gnina prepared_trc bounding_box [smol_structure],

        min_affinity = list_min (map (get "affinity") (get "scores" docked_structure)),

        binding_affinity = BindingAffinity {
            affinity = min_affinity,
            affinity_metric = 'kcal/mol',
            protein_id = protein_id protein,
            smol_id = smol_id,
            metadata = Metadata {
                name = "blah",
                description = none,
                tags = []
            }
        }
    in
        [BenchmarkArg {
            entity = "BindingAffinity",
            id = save binding_affinity
        }]
```

In [14]:
# Submit the Rex workflow to the selected benchmark.
# NOTE(review): the third positional argument looks like a label for the run and
# `sample` presumably selects a fraction of the benchmark -- confirm both against
# the rush client documentation.
submission = client.run_benchmark(
    benchmark.id,
    rex_code_above,
    "with prepare protein",
    sample=0.2,
)

View your submission at https://rush-qdx-2-staging.web.app/project/07c7d14a-3a55-491c-ab50-65cca07ec7a0/runs?selectedRunId=e09375f5-1f2c-467a-8bd7-615eb5c1cfb4
