# ESM2 GitHub Embedding Consistency

In [1]:
import warnings
from pathlib import Path
import os
import numpy as np
import torch
import esm

# Suppress every Python warning for the rest of the notebook (presumably to keep
# the cell outputs readable). Both calls install a blanket "ignore" filter.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Show ESM2 version

In [3]:
!pip show fair-esm

Name: fair-esm
Version: 2.0.0
Summary: Evolutionary Scale Modeling (esm): Pretrained language models for proteins. From Facebook AI Research.
Home-page: https://github.com/facebookresearch/esm
Author: Facebook AI Research
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: 


# Load model 

In [2]:
# Fetch the ESM-2 650M model through torch.hub.
# NOTE(review): `model` and `alphabet` are reassigned by the esm.pretrained call in
# the following cell, so the objects returned here are not the ones used in the tests.
# torch is already imported in the first cell; the import is repeated here.
import torch
model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")

Using cache found in /root/.cache/torch/hub/facebookresearch_esm_main
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt


Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt


In [3]:
# Load ESM-2 model (this replaces any `model` / `alphabet` defined earlier)
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
# Callable that turns a list of (sequence_id, sequence) tuples into labels, strings and tokens
batch_converter = alphabet.get_batch_converter()
# eval() disables dropout for deterministic results; cuda() moves the model to the GPU.
# As the last expression of the cell, the returned model is echoed as the cell output.
model.eval().cuda()

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0-32): 33 x TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): ESM1bLayerNorm(torch.Size([1280]), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): ESM1bLayerNorm(torch.Size([1280]), eps=1e-05, elementwise_affine=True)
    )
  )
  (contact_head): ContactPredictionHead(
    (regression): Linear(in_features=660, out_features=1, bias=True)
    (activation): Sigmoid()
  )
  (emb_layer_norm_

# Test embedding consistency

## Input data

In [4]:
short_seq = ('short_seq', 'MQLYLVLLLISYLLTPIGASILGRCTVAKMLYDGGLNYFEGYSLENWVCLAYFESKFNPSAVYEDPQDGSTGFGLFQIRDNEWCGHGKNLCSVSCTALLNPNLKDTIQCAKKIVKGKHGMGAWPIWSKNCQLSDVLDRWLDGCDL')
long_seq = ('long_seq', 'MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA')

# Three input lists that place short_seq in different batch contexts:
# on its own, ...
input1 = [short_seq]
# ... twice in the same list, ...
input2 = [short_seq, short_seq]
# ... and next to a longer sequence.
input3 = [short_seq, long_seq]

# Number of residues in short_seq (element 1 of the tuple is the sequence string)
print(len(short_seq[1]))

145


## Run tests

### Batch Effects

In [5]:
def predict(data):
    """
    Generate per-residue representations for a batch of sequences.

    Relies on the notebook-level ``model`` and ``batch_converter`` objects.

    :param data: list of tuples (sequence_id, sequence)
    :return: layer-33 token representations, indexed first by position in
        ``data``. The tensor stays on the model's device; callers move it to
        the CPU themselves (e.g. with ``.cpu()``).
    """
    # The converter also returns the labels and raw strings; only the tokens are used.
    _, _, batch_tokens = batch_converter(data)
    # Send the tokens to whichever device the model lives on instead of assuming 'cuda'.
    device = next(model.parameters()).device
    batch_tokens = batch_tokens.to(device)
    # Inference only: no gradients are tracked.
    with torch.no_grad():
        # NOTE(review): return_contacts=True is kept so the forward call matches the
        # recorded runs exactly, although the contact output is never read here.
        results = model(batch_tokens, repr_layers=[33], return_contacts=True)
    return results["representations"][33]

In [6]:
# Embed short_seq in each batch context. [i] selects the i-th sequence of the batch
# and .cpu() moves the embedding off the GPU for the NumPy comparisons.
result1 = predict(input1)[0].cpu()
# NOTE(review): input2 is run through the model twice, so result2a and result2b come
# from two separate forward passes rather than from a single batch.
result2a = predict(input2)[0].cpu() # same seq, at index 0 in the array
result2b  = predict(input2)[1].cpu() # same seq, at index 1 in the array
# short_seq batched together with long_seq
result3 = predict(input3)[0].cpu()

In [7]:
# result1 and result2a are NOT exactly the same
assert not np.array_equal(result1, result2a)

# they are only slightly different: every element agrees within an absolute tolerance of 1e-2
assert np.allclose(result1, result2a, atol=1e-2, rtol=0)

In [8]:
# result2a and result2b are the same: short_seq gets bit-identical embeddings
# at index 0 and at index 1 of input2
assert np.array_equal(result2a, result2b)

Note that `result3` has a different shape from `result1`, because the short sequence was padded to the length of the longest sequence in the list.

In [9]:
# Derive the slice bounds from the sequence itself instead of hard-coding 147 / 146.
seq_len = len(short_seq[1])   # 145 residues
n_tokens = seq_len + 2        # residues plus the two special tokens wrapped around them

# Drop the rows that exist only because short_seq was padded to long_seq's length
result3_paddings_removed = result3[:n_tokens, :]
# Shapes must match, otherwise array_equal below would be False for a trivial reason
assert result3_paddings_removed.shape == result1.shape

# the actual embeddings between result1 and result3 are NOT exactly the same
assert not np.array_equal(result1, result3_paddings_removed)

# they are slightly different: the residue rows (special tokens excluded) agree
# within an absolute tolerance of 1e-2
residue_rows = slice(1, seq_len + 1)
assert np.allclose(result1[residue_rows, :], result3_paddings_removed[residue_rows, :], atol=1e-2, rtol=0)

### Run N=1 Batch

We will run a batch of size 1 (a single sequence) 10 times on the same protein.

In [10]:
N = 10
# run inference N (= 10) times
for n in range(N):
    # each time, only a single protein sequence (input1) is embedded
    result = predict(input1)[0].cpu()
    # every round must reproduce result1 exactly
    assert np.array_equal(result1, result)

For the same protein, if we run it as a batch of size 1, the result is **deterministic**.

**Conclusion**: As with ESM2nv, to ensure that the embeddings are exactly the same in every round, submit one protein sequence per query instead of putting several sequences into one list.