In [28]:
import esm
import torch
from esm.sdk.api import ESMProtein, GenerationConfig
from esm.utils.structure.protein_chain import ProteinChain
import py3Dmol

You need an account on: https://forge.evolutionaryscale.ai/

After you have made an account, log in, click on "API Keys" and create a new key.

In [None]:
from getpass import getpass

# Enter your API key here. Remember you need to keep it confidential.
# Never add your key to a GitHub repo or post it publicly in any way.
# The key is read interactively through getpass rather than being written
# into the notebook source.
api_key = getpass("Your EvolutionaryScale API key: ")


In [42]:
# Create the ESM SDK client for the "esm3-medium-2024-08" model, authenticating
# with the API key stored in `api_key`. Every later cell that calls
# model.generate / model.encode / model.decode depends on this object.
model = esm.sdk.client("esm3-medium-2024-08", token=api_key)

In [17]:
# Define a prompt with masked tokens (underscores) for the part you want to generate.
# The letters in the middle are kept fixed; both underscore runs are filled in by the model.
prompt_sequence = "____________________________________DQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPP___________________________________________________________"
protein = ESMProtein(sequence=prompt_sequence)

# Generate the protein sequence
protein = model.generate(
    protein,
    GenerationConfig(
        track="sequence",  # specify that we want to generate a sequence
        num_steps=8,      # number of steps for iterative decoding
        temperature=0.7   # controls diversity vs. perplexity
    )
)

# The SDK can hand back an error object instead of raising when a request
# fails (bad token, quota, server error). Fail loudly here with the error
# details rather than with an unrelated AttributeError on `.sequence` below.
if not isinstance(protein, ESMProtein):
    raise RuntimeError(f"ESM3 sequence generation failed: {protein}")

# The `protein` object now contains the generated sequence.
# You can access it via `protein.sequence`
print(protein.sequence)


MKDYPACSGKRQSPIDIVTSKVTKVSLPPLEFTGYDDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPCSESVTWIVFKEPIEISKEQLQKFRTSLFFEEEGKDEKLLVNNFRPVQPLNGRTVKESS


To fold the protein you have created, go to: https://alphafoldserver.com/

You need to set up an account there as well.

In [1]:
def mask_sequence(protein, retain=None, replace=None):
    """
    Mask parts of a protein sequence with underscores.

    Parameters:
    -----------
    protein : str
        The amino acid sequence
    retain : str, optional
        Positions to retain (1-indexed), e.g., "1,2,6-10,22-45".
        Every other position is replaced by '_'.
    replace : str, optional
        Positions to replace (1-indexed), e.g., "1-4,15,20-25".
        Every other position is kept.

    Returns:
    --------
    str
        The masked sequence (same length as `protein`). If neither `retain`
        nor `replace` is given, the sequence is returned unchanged.

    Raises:
    -------
    ValueError
        If a position specifier is not an integer or an "a-b" integer range.

    Notes:
    ------
    - If both retain and replace are specified, retain takes precedence.
    - Whitespace around entries and empty entries (e.g. a trailing comma)
      are ignored.
    - A reversed range such as "10-6" is treated as "6-10".
    - Positions outside 1..len(protein) are silently ignored.
    """
    def parse_positions(pos_string):
        """Expand a spec such as "1,2,6-10" into a set of 1-indexed ints."""
        positions = set()
        for part in pos_string.split(','):
            part = part.strip()
            if not part:
                # Tolerate stray or trailing commas ("1-4," or "1,,5").
                continue
            try:
                if '-' in part:
                    first, last = (int(bound) for bound in part.split('-'))
                    # Accept the bounds in either order instead of silently
                    # producing an empty range for "10-6".
                    low, high = min(first, last), max(first, last)
                    positions.update(range(low, high + 1))
                else:
                    positions.add(int(part))
            except ValueError:
                raise ValueError(
                    f"Invalid position specifier: {part!r}"
                ) from None
        return positions

    masked = list(protein)

    if retain:
        keep = parse_positions(retain)
        # enumerate is 0-based while the user-facing positions are 1-based.
        masked = [ch if i + 1 in keep else '_' for i, ch in enumerate(masked)]
    elif replace:
        rep = parse_positions(replace)
        masked = ['_' if i + 1 in rep else ch for i, ch in enumerate(masked)]

    return ''.join(masked)


# Demonstration of mask_sequence on a short test sequence.
protein = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQC"

# Build the four example maskings up front:
#   1. retain a mix of single positions and ranges
#   2. replace only the first four residues
#   3. retain a single contiguous window
#   4. replace several separate ranges
result1 = mask_sequence(protein, retain="1,2,6-10,22-45")
result2 = mask_sequence(protein, replace="1-4")
result3 = mask_sequence(protein, retain="5-15")
result4 = mask_sequence(protein, replace="1-10,20-30,50-60")

# Print each heading followed by its masked sequence; every heading after the
# first starts with a newline so the examples are separated by a blank line.
for heading, masked_example in (
    ("Retain 1,2,6-10,22-45:", result1),
    ("\nReplace 1-4:", result2),
    ("\nRetain 5-15:", result3),
    ("\nReplace 1-10,20-30,50-60:", result4),
):
    print(heading)
    print(masked_example)

Retain 1,2,6-10,22-45:
MS___ELFTG___________VNGHKFSVSGEGEGDATYGKLTLK_________________________

Replace 1-4:
____EELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQC

Retain 5-15:
____EELFTGVVPIL_______________________________________________________

Replace 1-10,20-30,50-60:
__________VVPILVELD___________GEGEGDATYGKLTLKFICT___________VTTFSYGVQC


In [45]:
# GFP
# Sequence: https://www.uniprot.org/uniprotkb/P42212/
# Structure: https://www.rcsb.org/3d-view/1B9C/1
# Region of interest: residues 56-72 (1-indexed), as used by the alternative
# `retain` call that is commented out below.
# NOTE(review): presumably the stretch around the chromophore - confirm.
protein = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
# Alternative prompt: keep only residues 56-72 and 153-225 and mask the rest.
#masked = mask_sequence(protein, retain="56-72,153-225")
# Active prompt: mask residues 129-147 and 188-198 so the model rebuilds just
# those two segments; everything else is kept as-is.
masked = mask_sequence(protein, replace="129-147,188-198")
print(masked)

MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGI___________________HNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTP___________HYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK


In [47]:
# Wrap the masked GFP sequence (underscores mark the positions to design).
protein = ESMProtein(sequence=masked)

# Generate the protein sequence
protein = model.generate(
    protein,
    GenerationConfig(
        track="sequence",  # specify that we want to generate a sequence
        num_steps=8,      # number of steps for iterative decoding
        temperature=0.7   # controls diversity vs. perplexity
    )
)

# The SDK can hand back an error object instead of raising when a request
# fails (bad token, quota, server error). Fail loudly here with the error
# details rather than with an unrelated AttributeError on `.sequence` below.
if not isinstance(protein, ESMProtein):
    raise RuntimeError(f"ESM3 sequence generation failed: {protein}")

# The `protein` object now contains the generated sequence.
# You can access it via `protein.sequence`
print(protein.sequence)

MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIGFKKGGKILDGKLIKDIEPHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPLGNGKLPVLPPHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK


From: https://colab.research.google.com/github/evolutionaryscale/esm/blob/main/cookbook/tutorials/3_gfp_design.ipynb#scrollTo=cDWcXKmlbC1z

In [16]:
# Download chain A of PDB entry 1qy3 and wrap it as an ESMProtein template.
template_chain = ProteinChain.from_rcsb("1qy3", chain_id="A")
template_gfp = ESMProtein.from_protein_chain(template_chain)

# Encode the template into token tracks; the sequence and structure tracks
# are printed below as comma-separated integer ids.
template_gfp_tokens = model.encode(template_gfp)

print("Sequence tokens:")
sequence_token_text = ", ".join(map(str, template_gfp_tokens.sequence.tolist()))
print("    ", sequence_token_text)

print("Structure tokens:")
structure_token_text = ", ".join(map(str, template_gfp_tokens.structure.tolist()))
print("    ", structure_token_text)

Sequence tokens:
     0, 15, 6, 9, 9, 4, 18, 11, 6, 7, 7, 14, 12, 4, 7, 9, 4, 13, 6, 13, 7, 17, 6, 21, 15, 18, 8, 7, 8, 6, 9, 6, 9, 6, 13, 5, 11, 19, 6, 15, 4, 11, 4, 15, 18, 12, 23, 11, 11, 6, 15, 4, 14, 7, 14, 22, 14, 11, 4, 7, 11, 11, 4, 11, 19, 6, 7, 16, 23, 18, 8, 10, 19, 14, 13, 21, 20, 15, 16, 21, 13, 18, 18, 15, 8, 5, 20, 14, 9, 6, 19, 7, 16, 9, 5, 11, 12, 8, 18, 15, 13, 13, 6, 17, 19, 15, 11, 10, 5, 9, 7, 15, 18, 9, 6, 13, 11, 4, 7, 17, 10, 12, 9, 4, 15, 6, 12, 13, 18, 15, 9, 13, 6, 17, 12, 4, 6, 21, 15, 4, 9, 19, 17, 19, 17, 8, 21, 17, 7, 19, 12, 11, 5, 13, 15, 16, 15, 17, 6, 12, 15, 5, 17, 18, 15, 12, 10, 21, 17, 12, 9, 13, 6, 8, 7, 16, 4, 5, 13, 21, 19, 16, 16, 17, 11, 14, 12, 6, 13, 6, 14, 7, 4, 4, 14, 13, 17, 21, 19, 4, 8, 11, 16, 8, 5, 4, 8, 15, 13, 14, 17, 9, 15, 10, 13, 21, 20, 7, 4, 4, 9, 18, 7, 11, 5, 5, 6, 12, 2
Structure tokens:
     4098, 1025, 3124, 1129, 3227, 722, 1645, 2037, 2490, 60, 2567, 1779, 457, 2708, 383, 2219, 653, 4084, 2984, 3370, 66, 608, 2504, 103,

In [19]:
# Start from a fully masked sequence as long as the template, then pin the
# handful of residues we want to keep (0-based indices into the string).
residues = ["_"] * len(template_gfp.sequence)
for residue_index, amino_acid in (
    (59, "T"),
    (62, "T"),
    (63, "Y"),
    (64, "G"),
    (93, "R"),
    (219, "E"),
):
    residues[residue_index] = amino_acid
prompt_sequence = "".join(residues)

# Show the template and the prompt one above the other for visual comparison.
print(template_gfp.sequence)
print(prompt_sequence)

prompt = model.encode(ESMProtein(sequence=prompt_sequence))

# Build an empty structure track shaped like |<bos> <mask> ... <mask> <eos>|:
# 4096 is written everywhere as the mask, then 4098 (<bos>) at the first
# position and 4097 (<eos>) at the last.
prompt.structure = torch.full_like(prompt.sequence, 4096)
prompt.structure[0] = 4098
prompt.structure[-1] = 4097

# Copy template structure tokens at key residues near the alpha helix kink
# and at the stabilizing R and E positions on the beta barrel.
# NOTE(review): the token tracks carry <bos> at index 0, so token index i
# appears to line up with residue index i-1 of `prompt_sequence`. The indices
# below (including the 55:70 <- 56:71 shift) are kept exactly as in the
# upstream tutorial - confirm they address the intended residues.
prompt.structure[55:70] = template_gfp_tokens.structure[56:71]
for token_index in (93, 219):
    prompt.structure[token_index] = template_gfp_tokens.structure[token_index]

# One character per token: a check mark where a real structure token
# (id below 4096) is set, an underscore for mask / <bos> / <eos>.
print("".join("✔" if st < 4096 else "_" for st in prompt.structure))

KGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQEATISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGI
___________________________________________________________T__TYG____________________________R_____________________________________________________________________________________________________________________________E_______
_______________________________________________________✔✔✔✔✔✔✔✔✔✔✔✔✔✔✔_______________________✔_____________________________________________________________________________________________________________________________✔_________


In [None]:
# `prompt` is an already-encoded (tokenized) protein carrying both the partial
# sequence and the partial structure track built above. Generating on the
# "sequence" track fills in the masked sequence positions.
protein = model.generate(
    prompt,
    GenerationConfig(
        # Generate a sequence.
        track="sequence",
        num_steps=50,      # number of steps for iterative decoding
        # Sampling temperature trades perplexity with diversity.
        temperature=1.0,
    ),
)

# Because the prompt was tokenized, `protein.sequence` here is a tensor of
# sequence token ids, not an amino-acid string. Pass `protein` through
# `model.decode(...)` to obtain the letters.
print(protein.sequence)

tensor([ 0, 20,  9, 15, 23, 18, 13, 11, 15, 11,  4,  4,  9, 17,  4,  9,  9,  4,
        15,  9, 15,  4, 15,  9,  9, 12, 15,  9, 13,  4, 15, 15,  4,  4,  9, 15,
         4, 15,  9,  4, 15,  6,  9,  9, 15,  9,  9,  4, 15, 15, 15,  4, 16,  9,
        12, 15, 13, 15,  4,  4, 11,  7,  4, 11, 19,  6,  4, 13, 15,  4, 23, 12,
        15, 13, 12, 14,  9, 17, 15,  5, 15,  9, 12, 12, 15, 15, 12,  4, 13, 15,
         4, 17, 12, 13, 10, 15, 11, 13, 18,  9, 15, 12, 15,  9,  9, 18,  9, 15,
        15,  4,  9,  9,  9, 15, 12, 15,  9, 12, 15,  9, 15, 19,  9,  9,  5, 15,
        15,  9,  4, 10,  9,  9,  4, 15, 17, 15,  4, 11,  9, 15, 12, 15,  9,  7,
         4,  8, 15,  9,  9, 12,  9, 11, 12, 23, 15,  9,  4,  9,  9, 11,  7, 19,
        15,  9, 22, 13,  9, 15, 15, 15, 13, 18, 12, 15, 17,  4, 11, 12, 15,  4,
         4, 13,  4,  7, 15,  9,  9,  4,  9, 15,  4, 15, 15,  9,  4, 15,  9,  4,
        15, 15,  7, 15, 12,  9, 12,  9, 15, 13,  4, 15,  9, 15, 19, 11, 12,  9,
        12,  9, 15, 12,  9, 12, 15,  9, 

In [25]:
# Decode the generated tokens and display the resulting amino-acid sequence
# string as the cell output.
model.decode(protein).sequence

'MEKCFDTKTLLENLEELKEKLKEEIKEDLKKLLEKLKELKGEEKEELKKKLQEIKDKLLTVLTYGLDKLCIKDIPENKAKEIIKKILDKLNIDRKTDFEKIKEEFEKKLEEEKIKEIKEKYEEAKKELREELKNKLTEKIKEVLSKEEIETICKELEETVYKEWDEKKKDFIKNLTIKLLDLVKEELEKLKKELKELKKVKIEIEKDLKEKYTIEIEKIEIKEEKKE'

In [26]:
# Count the structure positions that are still masked (token id 4096) and use
# that as the number of decoding steps, capped at 20. With more masked
# positions than steps, several tokens are decoded per step.
masked_structure_count = (prompt.structure == 4096).sum().item()
num_tokens_to_decode = min(masked_structure_count, 20)

structure_config = GenerationConfig(
    # Generate a structure.
    track="structure",
    # Number of iterative decoding steps (see the cap above).
    num_steps=num_tokens_to_decode,
    # Sampling temperature trades perplexity with diversity.
    temperature=1.0,
)
structure_generation = model.generate(prompt, structure_config)

print("These are the structure tokens corresponding to our new design:")
structure_token_text = ", ".join(map(str, structure_generation.structure.tolist()))
print("    ", structure_token_text)

# Decodes structure tokens to backbone coordinates.
structure_generation_protein = model.decode(structure_generation)

These are the structure tokens corresponding to our new design:
     4098, 2918, 2690, 836, 303, 2730, 2294, 486, 290, 66, 4070, 3550, 3829, 1294, 1691, 863, 701, 3692, 2752, 2339, 470, 452, 2543, 3084, 3189, 1770, 3843, 724, 4083, 3117, 3813, 657, 274, 193, 987, 3979, 2416, 1472, 2874, 2660, 3391, 264, 3056, 2798, 1195, 1265, 1097, 3675, 1701, 2874, 588, 1667, 3019, 247, 2521, 1774, 732, 1797, 3372, 3403, 2370, 2582, 3704, 2737, 3007, 1660, 499, 484, 2202, 2786, 2404, 2422, 1638, 3913, 3334, 1973, 2676, 4084, 1572, 1973, 3673, 3101, 802, 1185, 3056, 2874, 1480, 3882, 611, 3080, 2479, 684, 3362, 1066, 2557, 2871, 3731, 4005, 1177, 1607, 3254, 252, 239, 309, 3054, 1568, 3384, 1885, 3721, 1568, 1283, 3481, 1237, 461, 703, 377, 3484, 3798, 3630, 2732, 559, 3116, 2484, 1202, 2637, 4, 2874, 3735, 2147, 2955, 2605, 658, 1387, 3101, 2169, 3287, 722, 3056, 2650, 2347, 3366, 2298, 2056, 2827, 407, 2103, 818, 1534, 2703, 3005, 2721, 3552, 2821, 463, 2568, 1312, 1759, 2588, 1628, 1517, 1588, 3126

In [39]:
# Render the generated structure as a light-green cartoon in a 600x400 viewer.
view = py3Dmol.view(width=600, height=400)

# Convert the decoded protein to PDB text: go through a protein chain, infer
# the oxygen positions, then serialize.
design_chain = structure_generation_protein.to_protein_chain()
design_pdb = design_chain.infer_oxygen().to_pdb_string()
view.addModel(design_pdb, "pdb")

cartoon_style = {"cartoon": {"color": "lightgreen"}}
view.setStyle(cartoon_style)
view.zoomTo()
view.show()

In [None]:
# Create a viewer object and load the PDB structure using a query
# Replace '1B9C' with your desired PDB ID
view = py3Dmol.view(query='pdb:1B9C', width=600, height=400)

view.setStyle({}, {})  # hide everything
# Only chain B is drawn; the lines for chains A, C and D are left commented
# out and can be re-enabled to show those chains as well.
#view.setStyle({'chain': 'A'}, {'cartoon': {'color': 'lightgreen'}}) # color chain A
view.setStyle({'chain': 'B'}, {'cartoon': {'color': 'lightblue'}}) # color chain B
#view.setStyle({'chain': 'C'}, {'cartoon': {'color': 'khaki'}}) # color chain C
#view.setStyle({'chain': 'D'}, {'cartoon': {'color': 'tomato'}}) # color chain D
# Highlight residues 129-147 and 188-198 of chain B in tomato - the same two
# ranges that were masked with mask_sequence(..., replace="129-147,188-198").
view.setStyle({'chain': 'B' ,'resi': ['129-147', '188-198']}, {'cartoon': {'color': 'tomato'}})


# zoom to the structure and display it
view.zoomTo()
view.show()

In [None]:
import py3Dmol
import biotite.structure.io.pdbx as pdbx
from biotite.structure.io.pdb import PDBFile
from biotite.structure import superimpose, rmsd
from io import StringIO
import warnings

# Silence warnings whose message matches 'Attribute .* not found within'.
# NOTE(review): presumably emitted by biotite while reading the CIF files
# below when optional annotations are absent - confirm the source.
warnings.filterwarnings('ignore', message='Attribute .* not found within')

def _structure_to_pdb_string(structure):
    """Serialize a biotite structure to PDB-format text held in memory."""
    buffer = StringIO()
    pdb_file = PDBFile()
    pdb_file.set_structure(structure)
    pdb_file.write(buffer)
    return buffer.getvalue()


def align_and_visualize_cif(cif_file1, cif_file2, chain1='A', chain2='A'):
    """
    Align two CIF files and visualize the overlay.

    The second structure is superimposed onto the first using the CA atoms
    of the selected chains, the CA RMSD is printed, and both structures are
    loaded into a py3Dmol viewer (reference in light blue, aligned structure
    in light green).

    CA atoms are paired purely by their order in each chain and the longer
    selection is truncated to the length of the shorter one; no sequence
    alignment is performed. The reported RMSD is therefore only meaningful
    when the two chains correspond residue-by-residue from the start.

    Parameters:
    -----------
    cif_file1 : str
        Reference CIF file
    cif_file2 : str
        CIF file to align
    chain1, chain2 : str
        Chain IDs to use for alignment

    Returns:
    --------
    py3Dmol view
        Viewer containing both models; call ``.show()`` to display it.

    Raises:
    -------
    ValueError
        If no CA atoms are found for the requested chain in either file.
    """
    # Read CIF files (first model of each)
    structure1 = pdbx.get_structure(pdbx.CIFFile.read(cif_file1), model=1)
    structure2 = pdbx.get_structure(pdbx.CIFFile.read(cif_file2), model=1)

    # Get CA atoms for alignment
    ca1 = structure1[(structure1.chain_id == chain1) & (structure1.atom_name == 'CA')]
    ca2 = structure2[(structure2.chain_id == chain2) & (structure2.atom_name == 'CA')]

    # A wrong chain ID yields an empty selection; report that directly
    # instead of failing later inside the superposition.
    for cif_path, chain_id, ca_atoms in (
        (cif_file1, chain1, ca1),
        (cif_file2, chain2, ca2),
    ):
        if len(ca_atoms) == 0:
            raise ValueError(
                f"No CA atoms found for chain {chain_id!r} in {cif_path!r}"
            )

    # Match lengths (index-based pairing; see docstring caveat)
    n_atoms = min(len(ca1), len(ca2))
    ca1 = ca1[:n_atoms]
    ca2 = ca2[:n_atoms]

    # Superimpose CA atoms to get transformation
    ca2_aligned, transformation = superimpose(ca1, ca2)

    # Apply the CA-derived transformation to the full second structure
    structure2_aligned = structure2.copy()
    structure2_aligned.coord = transformation.apply(structure2.coord)

    # Print alignment info
    print(f"RMSD: {rmsd(ca1, ca2_aligned):.2f} Å ({n_atoms} CA atoms)")

    # Visualize: model 0 is the reference, model 1 the aligned structure
    view = py3Dmol.view(width=600, height=400)
    view.addModel(_structure_to_pdb_string(structure1), 'pdb')
    view.addModel(_structure_to_pdb_string(structure2_aligned), 'pdb')
    view.setStyle({'model': 0}, {'cartoon': {'color': 'lightblue'}})
    view.setStyle({'model': 1}, {'cartoon': {'color': 'lightgreen'}})
    view.zoomTo()

    return view

# Usage: overlay the two GFP CIF files, using 'GFP_AF3.cif' as the reference
# and aligning 'GFP_ESM3_loop_rebuild.cif' onto it (chain 'A' in both by
# default). Both paths are relative, so the files must be in the working
# directory.
view = align_and_visualize_cif('GFP_AF3.cif', 'GFP_ESM3_loop_rebuild.cif')
view.show()

RMSD: 1.21 Å (238 CA atoms)
