# Generating ESM embeddings from PDB files
  This is a colab for [generating ESM2 embeddings](https://github.com/facebookresearch/esm/tree/main) from sequences in PDB files.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

pip install biopython torch_geometric torch fair-esm

Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geo

In [4]:
import os
import os.path as osp
import warnings
from math import pi as PI
from typing import Callable, Dict, Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Embedding, Linear, ModuleList, Sequential

from torch_geometric.data import Dataset, download_url, extract_zip
from torch_geometric.data.makedirs import makedirs
from torch_geometric.nn import MessagePassing, SumAggregation, radius_graph
from torch_geometric.nn.resolver import aggregation_resolver as aggr_resolver
from torch_geometric.typing import OptTensor

from Bio.PDB import PDBParser
from Bio import SeqIO

import esm

In [5]:
#%%bash
#rm -R embeddings
#rm -R fasta

### Setting data directories
We load PDB files from Google Drive and specify the output FASTA and embedding directories. Since the FASTA files aren't utilized downstream, we only save the embedding files to Drive.

In [6]:
label_file = '/content/drive/My Drive/RBG coding assignment/labels.txt'
data_dir = '/content/drive/My Drive/RBG coding assignment/data'

embeddings_dir = '/content/drive/MyDrive/esm_embed'
os.makedirs(embeddings_dir, exist_ok=True)
fasta_dir = 'fasta'
os.makedirs(fasta_dir, exist_ok=True)

In [7]:
files = os.listdir(data_dir)
files

['1a0g_1_protein.pdb',
 '1a05_1_protein.pdb',
 '1a0f_1_protein.pdb',
 '1a0j_1_protein.pdb',
 '1a0q_1_protein.pdb',
 '1a0t_1_protein.pdb',
 '1a1b_1_protein.pdb',
 '1a1m_1_protein.pdb',
 '1a1e_1_protein.pdb',
 '1a1o_1_protein.pdb',
 '1a1c_1_protein.pdb',
 '1a1a_1_protein.pdb',
 '1a1n_1_protein.pdb',
 '1a2b_1_protein.pdb',
 '1a2c_1_protein.pdb',
 '1a3u_1_protein.pdb',
 '1a2u_1_protein.pdb',
 '1a3k_1_protein.pdb',
 '1a2t_1_protein.pdb',
 '1a3t_1_protein.pdb',
 '1a3v_1_protein.pdb',
 '1a4g_1_protein.pdb',
 '1a4k_1_protein.pdb',
 '1a4h_1_protein.pdb',
 '1a4i_1_protein.pdb',
 '1a4m_2_protein.pdb',
 '1a4q_1_protein.pdb',
 '1a4m_4_protein.pdb',
 '1a4m_1_protein.pdb',
 '1a4m_3_protein.pdb',
 '1a4r_1_protein.pdb',
 '1a4k_2_protein.pdb',
 '1a4w_1_protein.pdb',
 '1a5b_1_protein.pdb',
 '1a5s_1_protein.pdb',
 '1a5u_1_protein.pdb',
 '1a5u_2_protein.pdb',
 '1a5z_1_protein.pdb',
 '1a5w_1_protein.pdb',
 '1a5x_1_protein.pdb',
 '1a5v_1_protein.pdb',
 '1a6v_1_protein.pdb']

### Creating FASTA files from PDB
We first convert the sequence information in PDB files to FASTA files, which the ESM library handles. The FASTA files should contain per-chain information for each protein.

In [203]:
from Bio import SeqIO
import time

for file in files:
  filepath = os.path.join(data_dir, file)
  for record in SeqIO.parse(filepath, "pdb-atom"):
    chain = record.annotations['chain']
    id = os.path.basename(file).split('.')[0]
    record.id = id + ':' + chain
    record.description = ''

    sequence_lengths[record.id] = len(record.seq)
    fasta_filename = os.path.join('fasta', id + '_' + chain + '.fasta')

    print(fasta_filename)
    with open(fasta_filename, "w") as output_handle:
      print('>' + record.id, file=output_handle)
      print(record.seq, file=output_handle)
      output_handle.flush()





fasta/1a0g_1_protein_A.fasta
fasta/1a0g_1_protein_B.fasta
fasta/1a05_1_protein_A.fasta
fasta/1a05_1_protein_B.fasta
fasta/1a0f_1_protein_A.fasta
fasta/1a0f_1_protein_B.fasta




fasta/1a0j_1_protein_A.fasta
fasta/1a0j_1_protein_B.fasta
fasta/1a0j_1_protein_C.fasta
fasta/1a0j_1_protein_D.fasta
fasta/1a0q_1_protein_H.fasta
fasta/1a0q_1_protein_L.fasta
fasta/1a0t_1_protein_P.fasta
fasta/1a0t_1_protein_Q.fasta
fasta/1a0t_1_protein_R.fasta
fasta/1a1b_1_protein_A.fasta
fasta/1a1b_1_protein_B.fasta




fasta/1a1m_1_protein_A.fasta
fasta/1a1m_1_protein_B.fasta
fasta/1a1e_1_protein_A.fasta
fasta/1a1e_1_protein_B.fasta
fasta/1a1o_1_protein_A.fasta
fasta/1a1o_1_protein_B.fasta
fasta/1a1c_1_protein_A.fasta
fasta/1a1c_1_protein_B.fasta
fasta/1a1a_1_protein_A.fasta
fasta/1a1a_1_protein_B.fasta
fasta/1a1n_1_protein_A.fasta
fasta/1a1n_1_protein_B.fasta




fasta/1a2b_1_protein_A.fasta
fasta/1a2c_1_protein_H.fasta
fasta/1a2c_1_protein_I.fasta
fasta/1a2c_1_protein_L.fasta
fasta/1a3u_1_protein_A.fasta
fasta/1a2u_1_protein_A.fasta
fasta/1a3k_1_protein_A.fasta




fasta/1a2t_1_protein_A.fasta
fasta/1a3t_1_protein_A.fasta
fasta/1a3v_1_protein_A.fasta


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missin

fasta/1a4g_1_protein_A.fasta
fasta/1a4g_1_protein_B.fasta




fasta/1a4k_1_protein_H.fasta
fasta/1a4k_1_protein_L.fasta


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missin

fasta/1a4h_1_protein_A.fasta
fasta/1a4i_1_protein_A.fasta
fasta/1a4i_1_protein_B.fasta
fasta/1a4m_2_protein_B.fasta


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missin

fasta/1a4q_1_protein_A.fasta
fasta/1a4q_1_protein_B.fasta
fasta/1a4m_4_protein_D.fasta




fasta/1a4m_1_protein_A.fasta
fasta/1a4m_3_protein_C.fasta




fasta/1a4r_1_protein_A.fasta
fasta/1a4r_1_protein_B.fasta
fasta/1a4k_2_protein_A.fasta
fasta/1a4k_2_protein_B.fasta


Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residue

fasta/1a4w_1_protein_H.fasta
fasta/1a4w_1_protein_I.fasta
fasta/1a4w_1_protein_L.fasta


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missin

fasta/1a5b_1_protein_A.fasta
fasta/1a5b_1_protein_B.fasta


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missin

fasta/1a5u_1_protein_A.fasta
fasta/1a5u_1_protein_B.fasta
fasta/1a5u_1_protein_C.fasta
fasta/1a5u_1_protein_D.fasta




fasta/1a5u_2_protein_E.fasta
fasta/1a5u_2_protein_F.fasta
fasta/1a5u_2_protein_G.fasta
fasta/1a5u_2_protein_H.fasta


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missin

fasta/1a5z_1_protein_A.fasta


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missin

fasta/1a5w_1_protein_A.fasta


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



fasta/1a5v_1_protein_A.fasta
fasta/1a6v_1_protein_H.fasta
fasta/1a6v_1_protein_L.fasta


Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residue

### Generating ESM embeddings from FASTA
We use the FASTA files to generate ESM embeddings according to the ESM repo, which contains a python script to run the model and extract embeddings at certain layers.

In [None]:
%%bash
export PATH="$HOME/.local/bin:$PATH"
git clone https://github.com/facebookresearch/esm.git


In [207]:
%%bash

for filename in fasta/*; do
  python esm/scripts/extract.py esm2_t33_650M_UR50D $filename \
    embeddings --repr_layers 0 32 33 --include mean per_tok
done


Read fasta/1a05_1_protein_A.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)
Read fasta/1a05_1_protein_B.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)
Read fasta/1a0f_1_protein_A.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)
Read fasta/1a0f_1_protein_B.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)
Read fasta/1a0g_1_protein_A.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)
Read fasta/1a0g_1_protein_B.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)
Read fasta/1a0j_1_protein_A.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)
Read fasta/1a0j_1_protein_B.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)
Read fasta/1a0j_1_protein_C.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)
Read fasta/1a0j_1_protein_D.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)
Read fasta/1a0q_1_protein_H.fasta with 1 sequences
Processing 1 of 1 batches (1 sequences)