# MPNN (序列設計)
從backbone/全原子 -> 推估序列

* model_type 可選值:
    * protein_mpnn: Original ProteinMPNN for protein-only design
    * ligand_mpnn: Extended model supporting ligand-aware design


* 常用的 per-input 選項:
    * batch_size: Number of sequences to generate per structure
    * remove_waters: Whether to exclude water molecules from context


# 最簡單的template

In [5]:
# Generate a random backbone first (unconditional RFD3 run); a later cell
# feeds the resulting atom_array to MPNN.
from lightning.fabric import seed_everything
from rfd3.engine import RFD3InferenceConfig, RFD3InferenceEngine
from atomworks.io.utils.visualize import view
# Set seed for reproducibility (must happen before the engine is run)
seed_everything(42)
config = RFD3InferenceConfig(
    specification={
        'length': 100,  # Generate 100-residue proteins
        'extra': {
            # Example of an extra specification, left disabled:
            #"contigs": ["30-30", "A50-65", "30-30"]
        },  # We are not using any extra specifications here.
    },
    diffusion_batch_size=2,  # Generate 2 structures per batch
)
# Initialize engine and run generation
# NOTE: `model` is rebound to the MPNN engine in later cells.
model = RFD3InferenceEngine(**config)
outputs = model.run(
    inputs=None,      # None for unconditional generation
    out_dir=None,     # None to return in memory (no file output)
    n_batches=1,      # Generate 1 batch
)

Seed set to 42
Using bfloat16 Automatic Mixed Precision (AMP)
13:01:06 INFO rfd3.engine: [rank: 0] Finished inference batch in 9.18 seconds.


In [6]:
# Take the first key of the in-memory RFD3 results and the first item stored
# under it; its atom_array is the backbone reused by the MPNN cell below.
first_key = next(iter(outputs.keys()))
atom_array = outputs[first_key][0].atom_array
# Render the structure inline (the cell output is a py3Dmol view).
view(atom_array)

<py3Dmol.view at 0x77ea2fba93d0>

In [7]:
from mpnn.inference_engines.mpnn import MPNNInferenceEngine

# Configure MPNN inference engine (settings shared by every input).
# See mpnn.utils.inference.MPNN_GLOBAL_INFERENCE_DEFAULTS for all options
engine_config = {
    "model_type": "ligand_mpnn",  # or "protein_mpnn" for vanilla ProteinMPNN
    #"checkpoint_path": None,     # path to a specific checkpoint
    #"config_json": None,         # path to a specific JSON config
    "is_legacy_weights": True,    # Required for now for ligand_mpnn and protein_mpnn

    "out_directory": None,        # Return results in memory
    "write_structures": False,    # do not write structure files
    "write_fasta": False,         # do not write FASTA files

}
# Configure per-input inference options (one dict per input structure).
# See mpnn.utils.inference.MPNN_PER_INPUT_INFERENCE_DEFAULTS for all options
input_configs = [
    {
        "batch_size": 10,         # Generate 10 sequences per structure
        "remove_waters": True,    # exclude water molecules from the context
    }

]
# Run sequence design on the RFD3-generated backbone
# (passed in memory through atom_arrays; one atom array per input dict).
model = MPNNInferenceEngine(**engine_config)
mpnn_outputs = model.run(input_dicts=input_configs, atom_arrays=[atom_array])

In [8]:
from biotite.structure import get_residue_starts
from biotite.sequence import ProteinSequence

# Summarize the MPNN results: a count line, then one line per design.
print(f"Generated {len(mpnn_outputs)} designed sequences:\n")

for i, item in enumerate(mpnn_outputs):
    designed = item.atom_array
    # Index of the first atom of every residue, so each residue appears once.
    res_starts = get_residue_starts(designed)
    residue_names = designed.res_name[res_starts]
    # Translate three-letter residue names into one-letter codes with Biotite.
    seq_1letter = ''.join(map(ProteinSequence.convert_letter_3to1, residue_names))
    print(f"Sequence {i+1}: {seq_1letter}")

Generated 10 designed sequences:

Sequence 1: MKYTALLFSAPWCGPCARVRAALEAIDDPAVTLKEISISDTAALAKYNAPAQAPTIVLLDEAGNVLARHVGAPAAATLAAALAAAAAGAPPEALPAGWVA
Sequence 2: MRLTALLFTHPWCGPCAAVRAALRAIDLPSVELREISIDDTAALAKYGAPAQAPTIVLLDEEGRVLARHVGAPAAATLAAALAAAAAGAAPEDLPAGWVR
Sequence 3: MKYTALLFTAPWCGPCRRVAAALDAIDDPRVTVRRVSIYDQAALAKYGVPAQYPTIVILDENGNVLARHVGAPPAETLAAALRAAAAGAPPSSLPAGWVP
Sequence 4: MKYTVLLFAHPWCGPCRRVQAALEAIKNPSVELKVISISDKAALAKYGAPAQAPTLVILDENGNVLARHVGAPAAETLAAALAKAAAGAAPSELPAGWVA
Sequence 5: MRYTALLFTHPWCGPCARVAAALRAIANPAVTVREVSIDDTAALARYGAPAQAPTVVLLDADGRVLARHVGAPAAATLAAALAAAAAGAPPSALPAGWVP
Sequence 6: MRLTALLFTAPWCGPCHRVAAALRAIDNPAVTVKEVSISDTAALARYGVPAQAPTIVLLDAAGRVLARHVGAPPAATLAAALAAAAAGAPPASLPAGWVP
Sequence 7: MKITALLFDAPWCGPCARVRAALEAIDNPAVSLKVVSISDAAALAKYGAPAQAPTIVILDEQGRVLARHVGAPPAETLRAALEKAAAGAPPASLPAGWVA
Sequence 8: MKYTVLLFGHPWCGPCARVAAALRAIDEPAVTVKEVSIDDAAALARYGVPAVAPTLVILDEAGRVLAVHAGAPAAETLRAAVARAAAGAAPEDLPAGWVR
Sequence 9: MKITVLLFTHPWCGPCARVRAALEAIDLPDVELREVSIDDAAALAKYGAP

### 輸入結構
* structure_path: 指定PDB路徑（如果不是要用RFD產生的atom_array的話）
* name: 這個設計 case 的名字(fasta/PDB的檔名）

### Sampling/Batch
* seed: 可重現性而已
* batch_size: 一個batch一次生出多少序列
* number_of_batches: 幾個batch

### 清理PDB + 前處理
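* remove_ccds: 要移除的 CCD code（如 PEG、SO4）
* remove_waters: 是否移除水分子（HOH/WAT）
* occupancy_threshold_*: occupancy 閾值相關選項（原筆記的名稱不完整；完整選項名稱請見 mpnn.utils.inference.MPNN_PER_INPUT_INFERENCE_DEFAULTS）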
* undesired_res_names : e.g.["MSE"]

In [42]:
# Per-input options for designing on a structure read from disk.
input_configs = [
    {
        "seed":42,  # fixed seed, for reproducibility only

        "structure_path": "8JZ1.pdb",   # input structure file
        "name": "RFD3_design_001",      # name of this design case

        # 3 sequences per batch x 2 batches = 6 sequences in total
        "batch_size": 3,
        "number_of_batches": 2,
        "remove_waters": True,
        "remove_ccds": ["SO4", "PEG"],
        "designed_residues": ["A1","A2","A3"],
        # Only the first three residues are allowed to change here, so the
        # generated sequences stay as close as possible to the original 8JZ1.
    }
]

# Run sequence design on the structure given by "structure_path"
# (no atom_arrays are passed in this call).
model = MPNNInferenceEngine(**engine_config)
mpnn_outputs = model.run(input_dicts=input_configs)
# Summarize the MPNN results: a count line, then one line per design.
print(f"Generated {len(mpnn_outputs)} designed sequences:\n")

for i, item in enumerate(mpnn_outputs):
    designed = item.atom_array
    # Index of the first atom of every residue, so each residue appears once.
    res_starts = get_residue_starts(designed)
    residue_names = designed.res_name[res_starts]
    # Translate three-letter residue names into one-letter codes with Biotite.
    seq_1letter = ''.join(map(ProteinSequence.convert_letter_3to1, residue_names))
    print(f"Sequence {i+1}: {seq_1letter}")



Generated 6 designed sequences:

Sequence 1: GSSDWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVMKKTIYENEREIKGYEYQLYVYASDKLFRADISEDYKTRGRKLLRFNGPVPPPSGEWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVMKKTIYENEREIKGYEYQLYVYASDKLFRADISEDYKTRGRKLLRFNGPVPPP
Sequence 2: GSSDWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVMKKTIYENEREIKGYEYQLYVYASDKLFRADISEDYKTRGRKLLRFNGPVPPPSGEWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVMKKTIYENEREIKGYEYQLYVYASDKLFRADISEDYKTRGRKLLRFNGPVPPP
Sequence 3: GSSEWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVMKKTIYENEREIKGYEYQLYVYASDKLFRADISEDYKTRGRKLLRFNGPVPPPSGEWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVMKKTIYENEREIKGYEYQLYVYASDKLFRADISEDYKTRGRKLLRFNGPVPPP
Sequence 4: GSSDWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVMKKTIYENEREIKGYEYQLYVYASDKLFRADISEDYKTRGRKLLRFNGPVPPPSGEWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVMKKTIYENEREIKGYEYQLYVYASDKLFRADISEDYKTRGRKLLRFNGPVPPP
Sequence 5: GSSEWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVMKKTIYENEREIKGYEYQLYVYASDKLFRADISEDYKTRGRKLLRFNGPVPPPSGEWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVM

# 採樣
* structure_noise: 1.0~0.0
    * 對 backbone 加 Gaussian noise(data augmentation)
    * 提高多樣性 or 穩健性
* decode_type / causality_pattern: 大致上都只需要用auto_regressive
    * decode_type: auto_regressive/teacher_forcing
        * auto_regressive
            * 一個一個胺基酸「依序生成」
            * 可以生成很多不同 sequence
            * 用途:
                * de novo design
                * backbone design
                * binder design
        * teacher_forcing
            * 假設sequence是正確的/計算「模型覺得這個位置對不對」  
            * 不生成新序列
            * backbone合理性評估
            * ProteinMPNN --score_only
    * causality_pattern:
        * 控制生成的時候residue要看哪些context
        * auto_regressive
            * residue i 只能看 i 之前的 residue 
        * unconditional
            * 每個位置「獨立」決定
        * conditional
            * residue i 可以看 所有 residue（包含自己）
        * conditional_minus_self
            * residue i 看 除了自己以外的所有 residue
    * 預設&通常是auto_regressive
* initialize_sequence_embedding_with_ground_truth: bool
    * 是否用原本序列作為初始 embedding -> output更接近原序列
* features_to_return:
    *  回傳中間 feature（logits, embeddings): 拿來做分析
* atomize_side_chains: bool(把 side chain 當成 atom-level graph, LigandMPNN專用)

In [53]:
# Per-input options demonstrating the sampling-related settings.
input_configs = [
    {
        "seed":42,  # fixed seed, for reproducibility only

        "structure_path": "8JZ1.pdb",   # input structure file
        "name": "RFD3_design_001",      # name of this design case

        "batch_size": 3,
        "remove_waters": True,
        "remove_ccds": ["SO4", "PEG"],

        "structure_noise" : 0.1,
        "decode_type": "auto_regressive",    # step-by-step autoregressive decoding
        "causality_pattern": "auto_regressive",
        "initialize_sequence_embedding_with_ground_truth": True,
        # Not usable for now:
        #"features_to_return": {
            #"input_features": ["mask_for_loss"],
            #"decoder_features": ["logits"]
        #}
        #"designed_residues": ["A1","A2","A3"],
        # (disabled) would let only the first three residues change, keeping
        # the generated sequences as close as possible to the original 8JZ1.
    }
]

# Run sequence design on the structure given by "structure_path"
# (no atom_arrays are passed in this call).
model = MPNNInferenceEngine(**engine_config)
mpnn_outputs = model.run(input_dicts=input_configs)
# Summarize the MPNN results: a count line, then one line per design.
print(f"Generated {len(mpnn_outputs)} designed sequences:\n")

for i, item in enumerate(mpnn_outputs):
    designed = item.atom_array
    # Index of the first atom of every residue, so each residue appears once.
    res_starts = get_residue_starts(designed)
    residue_names = designed.res_name[res_starts]
    # Translate three-letter residue names into one-letter codes with Biotite.
    seq_1letter = ''.join(map(ProteinSequence.convert_letter_3to1, residue_names))
    print(f"Sequence {i+1}: {seq_1letter}")



Generated 3 designed sequences:

Sequence 1: MWGPWELIPIGEETQELGRFAVEAANEDGRFGELTFERVIAPVLKRTLWADETNIAGYEYRLYVYASGKLFEARLSEDAATGERTLLEFRGPVPEPSRTFELIPLGPETDELGRFAVEAQNKVGTYGTLTFDRVLEPCLKQTILAGDNELQGYVYTLYVVASGKLFRAEISEDAATGERRLLRFEGPVAAP
Sequence 2: MKGPWELIPINEETQELGKFAVDAANKIGKFGKLTFEKVIEPVLKRTIWADETNIAGYEYEIYVYASGKLYRARLSEDYATGKRTLLRFEGPVPEPSKTYELIPLGPETEELGRFAVEAQNKVGTYGTLTFDEVLEPCLKQTILAGDNELQGYVYTLYVRASGRLFRAEISQDYATGERRLLRFEGPVAAP
Sequence 3: MWGPWELIPIDEQTQELGRFAVAAANERGQFGELTFERVIAPVLKRTHYADETNIAGYEYRLYVYASGKLFEAELSEDAATGERRLLSFRGPVPEPSRTWELIPLGPETDELGRFAVEEQNKVGTYGKLTFDRVLEPNLLQTILAGDNQLVGKEYVLYVVASGKLFRAVISEDAATGERRLLRFEGPVEAP


In [None]:
GSGEWEIIDIGPFTQNLGKFAVDEENKIGQYGRLTFNKVIRPVMKKTIYENEREIKGYEYQLYVYASDKLFRADISEDYKTRGRKLLRFNGPVPPP
MEIHVVLVGAPWCGYCHRAEAALRAAGIENLTFVDIRSMSPEELARYGITSIPTVLFFDEDGRLLARIAGGDAAIRNAPTVAAALRAGASLEELAGLAEA
MWGPWELIPIGEETQELGRFAVEAANEDGRFGELTFERVIAPVLKRTLWADETNIAGYEYRLYVYASGKLFEARLSEDAATGERTLLEFRGPVPEPSRTFELIPLGPETDELGRFAVEAQNKVGTYGTLTFDRVLEPCLKQTILAGDNELQGYVYTLYVVASGKLFRAEISEDAATGERRLLRFEGPVAAP

# 設計範圍
* fixed_residues/designed_residues
    * 指定哪些部份不要/要設計
    * 輸入格式: e.g. ["A1","A2","A3"]
* fixed_chains / designed_chains
    * 指定哪些Chain不要/要設計
    * Multiple Chain 用的到
    * 輸入格式["A"]
    * fixed_chains 與 designed_chains 不能同時使用
# 功能性設計
* omit/omit_AA/omit_per_residue(hard_mask)
    * 整個/某段範圍內禁止某些AA
* pair_bias/bias(看大小 >0鼓勵 <0不建議但還是可能抽到）
    * "bias": {
            "MET": 5.0,
            "PHE": 1.0
        }
* temperature/temperature_bias
    * 越低 → 越保守/越高 → 多樣性越高

In [27]:
# Per-input options demonstrating chain selection, omit, bias and temperature.
input_configs = [
    {
        "seed":42,  # fixed seed, for reproducibility only

        "structure_path": "1A2Y.pdb",   # input structure file
        "name": "RFD3_design_001",      # name of this design case

        "batch_size": 3,
        "remove_waters": True,
        "remove_ccds": ["PO4", "SO4", "PEG"],

        "designed_chains": ["A"], # design chain A; the other chains serve as context
        "omit_AA": ["GLY"], # forbid glycine
        # NOTE(review): GLY is listed under both "omit_AA" and "omit" --
        # confirm which key the engine actually reads.
        "omit": ["UNK","GLY"],
        # Per-residue-type bias (per the notes in this notebook: >0 encourages,
        # <0 discourages but the residue can still be sampled).
        "bias": {
            "MET": 1.5,
            "PHE": 0.2
        },
        "temperature":0.8

        # Reference sequences kept for comparison (presumably from an earlier
        # run); chains B and C are identical before and after design.
        #Origin(A):  DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQLLVYYTTTLADGVPSRFSGSGSGTQYSLKINSLQPEDFGSYYCQHFWSTPRTFGGGTKLEIK
        #Designed(A):DITLTQSPASIKASVGETVTITCTASADINGLLSWYQQLPGQSPRLLIYNTTTLAPGVPSRFSGSGSGTSYSLTITNIQPEDFGDYYCQHFYGTPRAFGQGTTLQPK
        #Origin(B):  QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLEWLGMIWGDGNTDYNSALKSRLSISKDNSKSQVFLKMNSLHTDDTARYYCARERDYRLDYWGQGTTLTVSS
        #Designed(B):QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLEWLGMIWGDGNTDYNSALKSRLSISKDNSKSQVFLKMNSLHTDDTARYYCARERDYRLDYWGQGTTLTVSS
        #Origin(C):  KVFGRCELAAAMKRHGLANYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL
        #Designed(C):KVFGRCELAAAMKRHGLANYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL
    }
]

# Run sequence design on the structure given by "structure_path"
# (no atom_arrays are passed in this call).
model = MPNNInferenceEngine(**engine_config)
mpnn_outputs = model.run(input_dicts=input_configs)
# Summarize the MPNN results: a count line, then one line per design.
print(f"Generated {len(mpnn_outputs)} designed sequences:\n")

for i, item in enumerate(mpnn_outputs):
    designed = item.atom_array
    # Index of the first atom of every residue, so each residue appears once.
    res_starts = get_residue_starts(designed)
    residue_names = designed.res_name[res_starts]
    # Translate three-letter residue names into one-letter codes with Biotite.
    seq_1letter = ''.join(map(ProteinSequence.convert_letter_3to1, residue_names))
    print(f"Sequence {i+1}: {seq_1letter}")



Generated 3 designed sequences:

Sequence 1: DIPLNVTPKTMRVMEWETVTMNCKSSAVVIHMVSWYMQRPFQSPRLLIYDTTTLSAMVPSRYTMSDSMTSYSLTIQAIQDEDFKDYYCQQFYSTPKAFFQETTLQKKQVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLEWLGMIWGDGNTDYNSALKSRLSISKDNSKSQVFLKMNSLHTDDTARYYCARERDYRLDYWGQGTTLTVSSKVFGRCELAAAMKRHGLANYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL
Sequence 2: DITLTQSPASMMASIFDNVTITCFATMPIMKRLNWYQQKPPEMPQLILFNTSQREPFVMDRFHIAPADTNFSLTITNMLSDDFMDYECQHYYSVPQAFQNYTTLRKMQVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLEWLGMIWGDGNTDYNSALKSRLSISKDNSKSQVFLKMNSLHTDDTARYYCARERDYRLDYWGQGTTLTVSSKVFGRCELAAAMKRHGLANYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL
Sequence 3: MIQLVMEPESMMAAEIETIKITCTSSADINHLLSWYKQRPHQAMQLLIYDTTDLETMVPSRFSMSCSYTNFSLTISHIKPEDYVRYFCQHHEWFPWAFFNVTEVRRKQVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLEWLGMIWGDGNTDYNSALKSRLSISKDNSKSQVFLKMNSLHTDDTARYYCARERDYRLDYWGQGTTLTVSSKV

In [None]:
DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQLLVYYTTTLADGVPSRFSGSGSGTQYSLKINSLQPEDFGSYYCQHFWSTPRTFGGGTKLEIK

* symmetry_residues
    * residue-level 的對稱關係
    * "symmetry_residues": [
        [("A", 10), ("B", 10)],
        [("A", 11), ("B", 11)],
    ]
* symmetry_residues_weights
    * 1.0: 完全對稱（硬 constraint）
    * 0.5: 偏好一樣，但允許不同
    * 0.0: 完全不管
* homo_oligomer_chains
    *  "homo_oligomer_chains": [["A", "B"]]
    *  Chain A、B 序列必須完全一樣(MPNN 只會設計一條序列，然後 copy)