In [None]:
# Setup and Dependencies
# Input Protein Sequence
# Protein Structural Modeling with ESM3
# B-cell Epitope Prediction
# Linear Epitope Prediction
# Conformational Epitope Prediction
# T-cell Epitope Prediction
# Population Coverage Analysis
# Molecular Docking Simulation
# Visualization and Analysis

In [2]:
# import os
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
from Bio import SeqIO
# from Bio.SeqUtils.ProtParam import ProteinAnalysis
# from Bio.PDB import PDBParser, Selection
# import torch
# import esm  # For ESM-3 and ESMFold
# Additional imports for visualization and docking

In [3]:
def load_fasta(file_path):
    """
    Read a FASTA file and return its sequences as plain strings.

    Args:
        file_path: Location of the FASTA file; passed unchanged to
            ``SeqIO.parse`` with the ``"fasta"`` format.

    Returns:
        A list with one ``str`` per record, in the order ``SeqIO.parse``
        yields them. The list is empty when the parser yields no records.
    """
    # Only the sequence text is kept; record ids/descriptions are discarded.
    return [str(entry.seq) for entry in SeqIO.parse(file_path, "fasta")]


# Example usage: read the FASTA file and keep its first sequence.
FASTA_PATH = "../allergen-analysis/ara-h-2/AAN77576.fasta"

sequences = load_fasta(FASTA_PATH)
if not sequences:
    # Fail with a readable message instead of a bare IndexError on sequences[0].
    raise ValueError(f"No sequences found in FASTA file: {FASTA_PATH}")
protein_sequence = sequences[0]  # Use the first sequence for this example

In [4]:
import os

import torch
from dotenv import load_dotenv
from esm.models.esm3 import ESM3
from esm.sdk.api import ESM3InferenceClient, ESMProtein, GenerationConfig
from huggingface_hub import login

# Load the environment variables from the .env file
load_dotenv()

# Retrieve the Hugging Face token from the environment
hf_token = os.getenv("HUGGINGFACE_TOKEN")

# Authenticate with the Hugging Face Hub; the token needs "Read" permission.
# NOTE(review): hf_token is None when HUGGINGFACE_TOKEN is unset — confirm that
# login()'s behaviour in that case is acceptable for unattended runs.
login(token=hf_token)

# Pick the GPU when one is usable and fall back to the CPU otherwise, so the
# cell no longer needs a manual edit on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# This will download the model weights and instantiate the model on your machine.
model: ESM3InferenceClient = ESM3.from_pretrained("esm3_sm_open_v1").to(DEVICE)

# Generate a completion for a partial Carbonic Anhydrase (2vvb).
# Underscores presumably mark the masked positions to be filled in — TODO confirm.
# NOTE(review): this is a fixed demo prompt; the `protein_sequence` loaded from
# the FASTA file earlier in the notebook is not used here.
prompt = "___________________________________________________DQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPP___________________________________________________________"
protein = ESMProtein(sequence=prompt)

# Step 1: generate the sequence track. This will iteratively unmask the sequence.
protein = model.generate(
    protein, GenerationConfig(track="sequence", num_steps=8, temperature=0.7)
)

# Step 2: generate the structure track for the completed sequence and save it.
protein = model.generate(protein, GenerationConfig(track="structure", num_steps=8))
protein.to_pdb("./generation.pdb")

# Step 3: round-trip design. Drop the sequence and regenerate it from the
# structure (inverse folding) ...
protein.sequence = None
protein = model.generate(protein, GenerationConfig(track="sequence", num_steps=8))

# ... then drop the coordinates, regenerate the structure for the new sequence,
# and save the result.
protein.coordinates = None
protein = model.generate(protein, GenerationConfig(track="structure", num_steps=8))
protein.to_pdb("./round_tripped.pdb")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/atom/.cache/huggingface/token
Login successful




Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

data/entry_list_safety_29026.list:   0%|          | 0.00/1.60M [00:00<?, ?B/s]

hyperplanes_8bit_58641.npz:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

data/esm3_entry.list:   0%|          | 0.00/1.93M [00:00<?, ?B/s]

data/ParentChildTreeFile.txt:   0%|          | 0.00/595k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

data/1utn.pdb:   0%|          | 0.00/569k [00:00<?, ?B/s]

hyperplanes_8bit_68103.npz:   0%|          | 0.00/34.9M [00:00<?, ?B/s]

data/interpro2keywords.csv:   0%|          | 0.00/7.32M [00:00<?, ?B/s]

(…)ata/interpro_29026_to_keywords_58641.csv:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

data/keywords.txt:   0%|          | 0.00/788k [00:00<?, ?B/s]

keyword_idf_safety_filtered_58641.npy:   0%|          | 0.00/469k [00:00<?, ?B/s]

data/tag_dict_4.json:   0%|          | 0.00/691k [00:00<?, ?B/s]

(…)ord_vocabulary_safety_filtered_58641.txt:   0%|          | 0.00/788k [00:00<?, ?B/s]

data/tag_dict_4_safety_filtered.json:   0%|          | 0.00/569k [00:00<?, ?B/s]

(…)0_residue_annotations_gt_1k_proteins.csv:   0%|          | 0.00/109k [00:00<?, ?B/s]

tfidf_safety_filtered_58641.pkl:   0%|          | 0.00/2.02M [00:00<?, ?B/s]

esm3_function_decoder_v0.pth:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

esm3_sm_open_v1.pth:   0%|          | 0.00/2.80G [00:00<?, ?B/s]

esm3_structure_decoder_v0.pth:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

esm3_structure_encoder_v0.pth:   0%|          | 0.00/62.3M [00:00<?, ?B/s]

  state_dict = torch.load(
  0%|          | 0/8 [04:06<?, ?it/s]


KeyboardInterrupt: 