In [1]:
%set_env TOKENIZERS_PARALLELISM=false
!pip install esm
import numpy as np
import torch

!pip install py3Dmol
import py3Dmol

from esm.utils.structure.protein_chain import ProteinChain
from esm.sdk import client
from esm.sdk.api import (
    ESMProtein,
    GenerationConfig,
)

env: TOKENIZERS_PARALLELISM=false
Collecting esm
  Using cached esm-3.0.5-py3-none-any.whl.metadata (9.4 kB)
Collecting torchtext (from esm)
  Using cached torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting einops (from esm)
  Using cached einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting biotite==0.41.2 (from esm)
  Using cached biotite-0.41.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.1 kB)
Collecting msgpack-numpy (from esm)
  Using cached msgpack_numpy-0.4.8-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting biopython (from esm)
  Using cached biopython-1.84-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Using cached esm-3.0.5-py3-none-any.whl (148 kB)
Using cached biotite-0.41.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.0 MB)
Using cached biopython-1.84-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
Using cached einops-0.8.0-py3-none-any.whl (43 kB

In [3]:
from getpass import getpass

# Ask for the Forge API token at run time so it is never stored in the notebook source.
token = getpass("Token from Forge console: ")
# Create the client for the "esm3-small-2024-08" model at the Forge URL, authenticated
# with the token entered above. The result is kept in the notebook-global `model`.
model = client(
    model="esm3-small-2024-08",
    url="https://forge.evolutionaryscale.ai",
    token=token,
)

Token from Forge console:  ········


In [2]:
from Bio.PDB import PDBParser

# Single source of truth for the local PDB file (it must already exist on disk).
pdb_path = "./8db4.pdb"

# Parse the structure from the local file; QUIET=True suppresses parser warnings.
parser = PDBParser(QUIET=True)
structure = parser.get_structure("structure", pdb_path)

# Report every chain of every model together with its residue count.
# The loop variables are intentionally NOT named `model`/`chain`: `model` is the
# notebook-global that holds the Forge client, and a loop variable of the same
# name would silently replace it with a Bio.PDB object.
for pdb_model in structure:
    for pdb_chain in pdb_model:
        print(f"Chain ID: {pdb_chain.id}, Number of residues: {len(pdb_chain)}")

# Echo the COMPND header records of the same file (they name the molecule for each chain).
with open(pdb_path, 'r') as pdb_handle:
    for line in pdb_handle:
        if line.startswith('COMPND'):
            print(line.strip())

Chain ID: A, Number of residues: 292
Chain ID: B, Number of residues: 250
Chain ID: C, Number of residues: 249
Chain ID: D, Number of residues: 236
Chain ID: E, Number of residues: 108
Chain ID: F, Number of residues: 104
Chain ID: G, Number of residues: 235
Chain ID: H, Number of residues: 218
Chain ID: I, Number of residues: 258
Chain ID: J, Number of residues: 270
COMPND    MOL_ID: 1;
COMPND   2 MOLECULE: 13T1 HEAVY CHAIN;
COMPND   3 CHAIN: A, G;
COMPND   4 ENGINEERED: YES;
COMPND   5 MOL_ID: 2;
COMPND   6 MOLECULE: 13T1 LIGHT CHAIN;
COMPND   7 CHAIN: B, H;
COMPND   8 ENGINEERED: YES;
COMPND   9 MOL_ID: 3;
COMPND  10 MOLECULE: 22S1 HEAVY CHAIN;
COMPND  11 CHAIN: C, I;
COMPND  12 ENGINEERED: YES;
COMPND  13 MOL_ID: 4;
COMPND  14 MOLECULE: 22S1 LIGHT CHAIN;
COMPND  15 CHAIN: D, J;
COMPND  16 ENGINEERED: YES;
COMPND  17 MOL_ID: 5;
COMPND  18 MOLECULE: ARA H 2 ALLERGEN;
COMPND  19 CHAIN: E, F;
COMPND  20 ENGINEERED: YES


In [3]:
pdb_id = "8DB4"  # PDB ID corresponding to Ara h 2 bound by two neutralizing antibodies
pdb_file = "./8db4.pdb"
chain_id = "E"  # Chain ID corresponding to Ara h 2 in the PDB structure
# Load only the selected chain from the local PDB file.
arah2_chain = ProteinChain.from_pdb(pdb_file, chain_id)
# NOTE(review): `pdb_id` is not used by the call above. Presumably the chain could instead be
# fetched by ID with ProteinChain.from_rcsb(pdb_id, chain_id) — confirm against the esm docs.

In [1]:
# Print the sequence of the loaded chain. `arah2_chain` must already exist in the kernel
# (it is created by the cell that calls ProteinChain.from_pdb); otherwise this raises NameError.
print(arah2_chain.sequence)

NameError: name 'arah2_chain' is not defined

In [None]:
# Show the shape of the chain's `atom37_positions` array, then its first three entries
# along the leading axis.
print("atom37_positions shape: ", arah2_chain.atom37_positions.shape)
print(arah2_chain.atom37_positions[:3])

In [None]:
# Render the loaded chain as an interactive 3D cartoon with py3Dmol.
# Create a 500x500 `py3Dmol` view object
view = py3Dmol.view(width=500, height=500)
# py3Dmol is given the coordinates as PDB-format text, so convert the `ProteinChain` object to a PDB string
pdb_str = arah2_chain.to_pdb_string()
# Load the PDB string into the `py3Dmol` view object, declaring its format as "pdb"
view.addModel(pdb_str, "pdb")
# Draw the chain as a cartoon using the "spectrum" colour setting
view.setStyle({"cartoon": {"color": "spectrum"}})
# Fit the camera to the loaded model
view.zoomTo()
# Display the viewer in the cell output
view.show()

In [5]:
!pip install freesasa

Collecting freesasa
  Using cached freesasa-2.2.1-cp311-cp311-linux_x86_64.whl
Installing collected packages: freesasa
Successfully installed freesasa-2.2.1


In [6]:
from Bio import PDB
from Bio.SeqUtils import seq1
import freesasa
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster, linkage
import csv

# Load the structure using BioPython's PDBParser
pdb_parser = PDB.PDBParser(QUIET=True)  # QUIET mode to suppress warnings
structure = pdb_parser.get_structure("structure", "./8db4.pdb")

# Specify the chain of interest (e.g., Chain E)
chain_id = "E"

# Extract the chain of interest from the FIRST model only.
# This deliberately avoids a `for model in structure:` loop: `model` is the
# notebook-global holding the Forge client, and looping with that name would
# overwrite the client with a Bio.PDB model object.
first_pdb_model = next(iter(structure), None)
if first_pdb_model is None:
    # Fail here with a clear message instead of passing None on to PDBIO below.
    raise ValueError("The parsed structure contains no models; cannot extract a chain.")
chain_structure = first_pdb_model[chain_id]  # KeyError if the chain ID is absent

# Write a temporary PDB file containing only the selected chain
with open("temp_chain.pdb", "w") as temp_pdb:
    io = PDB.PDBIO()
    io.set_structure(chain_structure)
    io.save(temp_pdb)

# Initialize FreeSASA structure for the specific chain (read back from the temp file)
freesasa_structure = freesasa.Structure("temp_chain.pdb")

# Run FreeSASA to calculate ASA for each atom; `result` is indexed per atom later on
result = freesasa.calc(freesasa_structure)

# Max ASA values (in Å^2, keyed by 1-letter residue code) used as the denominator for RSA.
# NOTE(review): the numbers look like the Miller et al. (1987) reference set — confirm the source.
max_asa = {
    'A': 113, 'R': 241, 'N': 158, 'D': 151, 'C': 140, 'Q': 189, 'E': 183,
    'G': 85,  'H': 194, 'I': 182, 'L': 180, 'K': 211, 'M': 204, 'F': 218,
    'P': 143, 'S': 122, 'T': 146, 'W': 259, 'Y': 229, 'V': 160
}

def is_nglycosylated(seq, pos):
    """Return 1 if an N-glycosylation sequon (N-X-S/T, X != P) starts at `pos`, else 0.

    Args:
        seq: Indexable sequence of 1-letter residue codes (string or list).
        pos: Index of the candidate asparagine within `seq`.

    Returns:
        1 when `seq[pos]` is 'N', `seq[pos + 1]` is not 'P' and `seq[pos + 2]`
        is 'S' or 'T'; 0 otherwise, including when fewer than two residues
        follow `pos`.
    """
    # Guard clauses: the motif needs two residues after `pos` and must start with N.
    if pos + 2 >= len(seq):
        return 0
    if seq[pos] != 'N':
        return 0
    # Proline in the middle position rules the sequon out.
    if seq[pos + 1] == 'P':
        return 0
    return 1 if seq[pos + 2] in ('S', 'T') else 0

# Write one CSV row per standard residue: "position:amino", ASA, RSA, N-glycosylation flag.
# Also collects C-alpha coordinates and RSA values (index-aligned) for the clustering step.
with open('asa_rsa_output.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Residue", "ASA", "RSA", "N-Glycosylation"])

    # FreeSASA reports areas per atom index, so a running index is advanced in
    # step with BioPython's atom iteration.
    # NOTE(review): this assumes FreeSASA kept exactly the atoms BioPython yields
    # for standard residues, in the same order — the check after the loop reports
    # when the two counts disagree.
    atom_idx = 0

    # 1-letter sequence over ALL residues of the chain (hetero residues included),
    # so that index `i` from enumerate() below addresses the same residue.
    chain_sequence = [seq1(res.resname) for res in chain_structure.get_residues()]
    print(f"Chain {chain_id} Sequence:", "".join(chain_sequence))  # Print sequence for debugging

    residue_coords = []  # (res_id, 1-letter code, C-alpha coordinate) per residue with a CA
    rsa_values = []      # RSA of the same residues, index-aligned with residue_coords
    for i, residue in enumerate(chain_structure.get_residues()):
        try:
            # Filter out non-protein residues (e.g., metals, water)
            if residue.id[0] != ' ':
                continue

            # Residue number only; NOTE(review): the insertion code (residue.id[2]) is
            # ignored, so residues differing only by insertion code share a label.
            res_id = residue.id[1]
            amino = residue.resname  # 3-letter amino acid code
            amino_one_letter = seq1(amino)  # Convert to 1-letter code

            # Sum the per-atom areas of this residue, advancing the shared atom index.
            residue_asa = 0.0
            for atom in residue:
                residue_asa += result.atomArea(atom_idx)
                atom_idx += 1

            # RSA = ASA / reference maximum; unknown residue types fall back to a
            # denominator of 1, i.e. the raw ASA is reported.
            rsa = residue_asa / max_asa.get(amino_one_letter, 1)

            # is_nglycosylated() already returns 0 when fewer than two residues follow.
            n_glycosylation = is_nglycosylated(chain_sequence, i)

            # Create the "Residue" field as position:amino
            residue_field = f"{res_id}:{amino_one_letter}"

            # Write to CSV (position:amino, ASA, RSA, N-Glycosylation)
            writer.writerow([residue_field, residue_asa, rsa, n_glycosylation])

            # Store clustering inputs. The RSA is appended here, together with the
            # coordinate, so rsa_values[k] always belongs to residue_coords[k] even
            # when some residue has no C-alpha atom.
            if residue.has_id("CA"):
                ca_atom = residue["CA"]
                coord = ca_atom.get_coord()
                residue_coords.append((res_id, amino_one_letter, coord))
                rsa_values.append(rsa)

        except Exception as e:
            # Best-effort: report the residue and continue with the next one.
            print(f"Error processing residue {residue}: {e}")

    # If the counts differ, per-atom areas were attributed to the wrong residues.
    if atom_idx != freesasa_structure.nAtoms():
        print(
            f"Warning: consumed {atom_idx} atom areas but FreeSASA holds "
            f"{freesasa_structure.nAtoms()} atoms; per-residue ASA values may be misaligned."
        )

# --- Spatial clustering of residues by C-alpha proximity ---
# Stack the stored C-alpha coordinates (third field of each tuple) into one array.
positions = np.array([entry[2] for entry in residue_coords])

# Condensed pairwise-distance vector; linkage() accepts this form directly.
distance_matrix = pdist(positions)

# Complete-linkage hierarchical clustering, cut at a distance of 8 Å.
distance_threshold = 8.0  # Threshold in Å for clustering
linkage_matrix = linkage(distance_matrix, method='complete')
cluster_labels = fcluster(linkage_matrix, t=distance_threshold, criterion='distance')

# Group residue labels ("position:amino") and their RSA values by cluster label.
clusters = {}
for idx, cluster_id in enumerate(cluster_labels):
    members = clusters.setdefault(cluster_id, {"residues": [], "rsa_values": []})
    res_id, amino_one_letter = residue_coords[idx][:2]
    members["residues"].append(f"{res_id}:{amino_one_letter}")
    members["rsa_values"].append(rsa_values[idx])

# One CSV row per cluster: ';'-joined member labels and the mean RSA of the members.
with open('residue_clusters.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Residue_Cluster", "Average_RSA"])

    for cluster in clusters.values():
        cluster_rsa = cluster["rsa_values"]
        if cluster_rsa:
            avg_rsa = sum(cluster_rsa) / len(cluster_rsa)
        else:
            avg_rsa = 0
        writer.writerow([";".join(cluster["residues"]), avg_rsa])


Chain E Sequence: AARRCQSQLERANLRPCEQHLMQKIQRSQHQERCCNELNEFENNQRCMCEALQQIMENQSDRLQGRQQEQQFKRELRNLPQQCGLRAPQRCDLDVXXXXXXXXXXXX


In [7]:
# Hydrophilicity scale (Hopp-Woods scale), keyed by 1-letter residue code
hydrophilicity_scale = {
    'A': -0.5, 'R': 3.0, 'N': 0.2, 'D': 3.0, 'C': -1.0,
    'Q': 0.2,  'E': 3.0, 'G': 0.0, 'H': -0.5, 'I': -1.8,
    'L': -1.8, 'K': 3.0, 'M': -1.3, 'F': -2.5, 'P': 0.0,
    'S': 0.3,  'T': -0.4, 'W': -3.4, 'Y': -2.3, 'V': -1.5
}

# Step 1: Add (or refresh) the "Hydrophilicity" column of 'asa_rsa_output.csv'.
# The cell is safe to re-run: an existing column is overwritten, never duplicated.
with open('asa_rsa_output.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    header = next(reader)  # Keep the header row; it is extended and written back below
    rows = list(reader)

if "Hydrophilicity" not in header:
    header.append("Hydrophilicity")
hydrophilicity_col = header.index("Hydrophilicity")

for row in rows:
    try:
        residue_info = row[0]  # Format is "position:amino"
        position, amino_one_letter = residue_info.split(':')
        # Residue codes missing from the scale get 0.
        hydrophilicity = hydrophilicity_scale.get(amino_one_letter, 0)
        # Pad the row to the header width, then write by column index so the
        # value lands under "Hydrophilicity" regardless of how short the row was.
        row.extend([""] * (len(header) - len(row)))
        row[hydrophilicity_col] = hydrophilicity
    except (IndexError, ValueError) as e:
        # Empty row or a first field that is not "position:amino".
        print(f"Error processing row {row}: {e}")

# Write the updated data back to 'asa_rsa_output.csv'
with open('asa_rsa_output.csv', mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)
    writer.writerows(rows)

# Step 2: Load per-position hydrophilicity from the file just written.
residue_hydrophilicity = {}

with open('asa_rsa_output.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    header = next(reader)
    hydrophilicity_col = header.index("Hydrophilicity")
    for row in reader:
        residue_info = row[0]  # Format is "position:amino"
        position, amino = residue_info.split(':')
        position = int(position)  # Integer key for lookup from the cluster labels
        residue_hydrophilicity[position] = float(row[hydrophilicity_col])

# Step 3: Add (or refresh) "Average_Hydrophilicity" in 'residue_clusters.csv'.
clusters_with_hydrophilicity = []

with open('residue_clusters.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    header = next(reader)

    if "Average_Hydrophilicity" not in header:
        header.append("Average_Hydrophilicity")
    avg_hydrophilicity_col = header.index("Average_Hydrophilicity")

    for row in reader:
        residues = row[0].split(';')  # Members are in the format "position:amino"

        # Average over the members whose position is present in the residue table.
        total_hydrophilicity = 0
        residue_count = 0
        for residue in residues:
            position, amino = residue.split(':')
            position = int(position)
            if position in residue_hydrophilicity:
                total_hydrophilicity += residue_hydrophilicity[position]
                residue_count += 1

        avg_hydrophilicity = total_hydrophilicity / residue_count if residue_count > 0 else 0

        # Same pad-then-assign scheme as in Step 1.
        row.extend([""] * (len(header) - len(row)))
        row[avg_hydrophilicity_col] = avg_hydrophilicity
        clusters_with_hydrophilicity.append(row)

# Write the updated cluster data back to 'residue_clusters.csv'
with open('residue_clusters.csv', mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)
    writer.writerows(clusters_with_hydrophilicity)

In [8]:
# Step 1: Average B-factor of each standard residue, in chain order.
b_factors = []
for residue in chain_structure.get_residues():
    if residue.id[0] == ' ':
        # Mean of the atom B-factors of this residue
        avg_b_factor = sum(atom.get_bfactor() for atom in residue) / len(residue)
        b_factors.append(avg_b_factor)

# Step 2: Add (or refresh) the "B-Factor" column of 'asa_rsa_output.csv'.
with open('asa_rsa_output.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    header = next(reader)
    rows = list(reader)

if "B-Factor" not in header:
    header.append("B-Factor")
b_factor_col = header.index("B-Factor")

# Rows are matched to b_factors purely by order, so a count mismatch means at
# least some rows would receive another residue's value — say so up front.
if len(rows) != len(b_factors):
    print(
        f"Warning: {len(rows)} CSV rows but {len(b_factors)} residue B-factors; "
        "values are matched by order and may be misaligned."
    )

for i, row in enumerate(rows):
    try:
        b_factor = b_factors[i]
        # Pad to the header width, then assign by column index so the value
        # always lands under "B-Factor".
        row.extend([""] * (len(header) - len(row)))
        row[b_factor_col] = b_factor
    except IndexError as e:
        # More CSV rows than residues with a B-factor.
        print(f"Error processing row {row}: {e}")

# Write the updated data back to 'asa_rsa_output.csv'
with open('asa_rsa_output.csv', mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)
    writer.writerows(rows)

# Step 3: Load per-position B-factor and hydrophilicity from 'asa_rsa_output.csv'.
residue_data = {}

with open('asa_rsa_output.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    header = next(reader)
    # This cell needs the "Hydrophilicity" column; fail with an actionable
    # message (same exception type as the bare header.index() lookup would raise).
    if "Hydrophilicity" not in header:
        raise ValueError(
            "asa_rsa_output.csv has no 'Hydrophilicity' column; "
            "run the hydrophilicity cell before this one."
        )
    b_factor_col = header.index("B-Factor")
    hydrophilicity_col = header.index("Hydrophilicity")
    for row in reader:
        residue_info = row[0]  # Format is "position:amino"
        position, amino = residue_info.split(':')
        position = int(position)  # Integer key for lookup from the cluster labels
        b_factor = float(row[b_factor_col])
        hydrophilicity = float(row[hydrophilicity_col])
        residue_data[position] = {"b_factor": b_factor, "hydrophilicity": hydrophilicity}

# Step 4: Add (or refresh) the average hydrophilicity and B-factor columns
# of 'residue_clusters.csv'.
clusters_with_data = []

with open('residue_clusters.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    header = next(reader)

    if "Average_Hydrophilicity" not in header:
        header.append("Average_Hydrophilicity")
    if "Average_B-Factor" not in header:
        header.append("Average_B-Factor")
    avg_hydrophilicity_col = header.index("Average_Hydrophilicity")
    avg_b_factor_col = header.index("Average_B-Factor")

    for row in reader:
        residues = row[0].split(';')  # Members are in the format "position:amino"

        # Average over the members whose position is present in the residue table.
        total_hydrophilicity = 0
        total_b_factor = 0
        residue_count = 0

        for residue in residues:
            position, amino = residue.split(':')
            position = int(position)
            if position in residue_data:
                total_hydrophilicity += residue_data[position]["hydrophilicity"]
                total_b_factor += residue_data[position]["b_factor"]
                residue_count += 1

        avg_hydrophilicity = total_hydrophilicity / residue_count if residue_count > 0 else 0
        avg_b_factor = total_b_factor / residue_count if residue_count > 0 else 0

        # Assign by column name instead of rebuilding the row positionally: every
        # existing column is kept and each value lands under its own header.
        row.extend([""] * (len(header) - len(row)))
        row[avg_hydrophilicity_col] = avg_hydrophilicity
        row[avg_b_factor_col] = avg_b_factor
        clusters_with_data.append(row)

# Write the updated cluster data back to 'residue_clusters.csv'
with open('residue_clusters.csv', mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)
    writer.writerows(clusters_with_data)

In [9]:
# Formal charge per residue type (+1 basic, -1 acidic, 0 neutral), keyed by 1-letter code.
# NOTE(review): histidine is counted as +1 here; confirm this is the intended
# protonation assumption for the analysis.
charge_scale = {
    'A': 0, 'R': 1, 'N': 0, 'D': -1, 'C': 0,
    'Q': 0, 'E': -1, 'G': 0, 'H': 1, 'I': 0,
    'L': 0, 'K': 1, 'M': 0, 'F': 0, 'P': 0,
    'S': 0, 'T': 0, 'W': 0, 'Y': 0, 'V': 0
}

# Add (or refresh) the "Charge" column of 'asa_rsa_output.csv'.
# The cell is safe to re-run: an existing column is overwritten, never duplicated.
with open('asa_rsa_output.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    header = next(reader)
    rows = list(reader)

if "Charge" not in header:
    header.append("Charge")
charge_col = header.index("Charge")

for row in rows:
    try:
        residue_info = row[0]  # Format is "position:amino"
        _, amino = residue_info.split(':')
        # Residue codes missing from the scale are treated as neutral.
        charge = charge_scale.get(amino, 0)
        # Pad to the header width, then assign by column index so the value
        # always lands under "Charge" regardless of how short the row was.
        row.extend([""] * (len(header) - len(row)))
        row[charge_col] = charge
    except (IndexError, ValueError) as e:
        # Empty row or a first field that is not "position:amino".
        print(f"Error processing row {row}: {e}")

# Write the updated data back to 'asa_rsa_output.csv'
with open('asa_rsa_output.csv', mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)
    writer.writerows(rows)