# ESM2 GitHub, ESM2nv (BioNeMo Framework) and ESM2nv (BioNeMo legacy service) embedding comparison

Goal: Check whether the embedding matrices generated by ESM2 GitHub (650M), ESM2nv-650M (BioNeMo FW) and ESM2nv-650M (BioNeMo legacy service) are the same for the same protein

In [1]:
import warnings
from pathlib import Path
import os
import numpy as np
import pickle
import os

# Suppress all Python warnings for the rest of the notebook.
# NOTE(review): both calls install a blanket "ignore" filter, so one of them is likely redundant — confirm.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Input proteins

In [2]:
prot1 = ('prot1', 'MQLYLVLLLISYLLTPIGASILGRCTVAKMLYDGGLNYFEGYSLENWVCLAYFESKFNPSAVYEDPQDGSTGFGLFQIRDNEWCGHGKNLCSVSCTALLNPNLKDTIQCAKKIVKGKHGMGAWPIWSKNCQLSDVLDRWLDGCDL')
prot2 = ('prot2', 'MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA')
prot3 = ('prot3', 'MARGGDTGCTGPSETSASGAAAIALPGLEGPATDAQCQTLPLTVLKSRSPSPRSLPPALSCPPPQPAMLEHLSSLPTQMDYKGQKLAEQMFQGIILFSAIVGFIYGYVAEQFGWTVYIVMAGFAFSCLLTLPPWPIYRRHPLKWLPVQESSTDDKKPGERKIKRHAKNN')

# Each entry is a (sequence_id, sequence) tuple; the prediction cells embed them one at a time.
inputs = [prot1, prot2, prot3]

# ESM2 Github

## Setup model

In [3]:
# Show the installed ESM2 (fair-esm) package version
!pip show fair-esm

Name: fair-esm
Version: 2.0.0
Summary: Evolutionary Scale Modeling (esm): Pretrained language models for proteins. From Facebook AI Research.
Home-page: https://github.com/facebookresearch/esm
Author: Facebook AI Research
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: 


In [3]:
import torch
import esm
# Fetch the esm2_t33_650M_UR50D model and its alphabet through torch.hub from the facebookresearch/esm repo.
# NOTE(review): the following cell assigns `model, alphabet` again via `esm.pretrained`, so this load looks redundant — confirm before removing.
model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")

Using cache found in /root/.cache/torch/hub/facebookresearch_esm_main


In [4]:
# Load ESM-2 model (esm2_t33_650M_UR50D) together with its alphabet
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
# The batch converter turns a list of (sequence_id, sequence) tuples into labels, strings and token tensors
batch_converter = alphabet.get_batch_converter()
# eval() disables dropout for deterministic results; cuda() moves the model to the GPU.
# Kept as the last expression so the notebook displays the model summary.
model.eval().cuda()  # disables dropout for deterministic results

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0-32): 33 x TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): ESM1bLayerNorm(torch.Size([1280]), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): ESM1bLayerNorm(torch.Size([1280]), eps=1e-05, elementwise_affine=True)
    )
  )
  (contact_head): ContactPredictionHead(
    (regression): Linear(in_features=660, out_features=1, bias=True)
    (activation): Sigmoid()
  )
  (emb_layer_norm_

## Run prediction

In [5]:
def predict(data, repr_layer=33):
    """
    Generate per-residue representations for a batch of sequences.

    :param data: list of tuples (sequence_id, sequence)
    :param repr_layer: index of the transformer layer whose output is returned;
        defaults to 33, the layer this notebook compares
    :return: the ``results["representations"][repr_layer]`` tensor, left on the
        model's device (callers move it to the CPU themselves)
    """
    # Only the token tensor is needed; the labels and raw strings are discarded.
    _, _, batch_tokens = batch_converter(data)
    # Put the tokens on whatever device the model lives on instead of hard-coding 'cuda'.
    device = next(model.parameters()).device
    batch_tokens = batch_tokens.to(device)
    # Inference only, so no gradients. Contact prediction is not requested
    # because its output was never read by this notebook.
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[repr_layer], return_contacts=False)
    return results["representations"][repr_layer]

In [13]:
# Embed the proteins one at a time (N=1 batch prediction), keep the single
# batch entry of each call and move it to the CPU.
results = [predict([entry])[0].cpu() for entry in inputs]

In [14]:
fp = 'output/esm2_t33_650M_UR50D.pkl'
# Create the output directory first: open() raises FileNotFoundError when it is missing.
Path(fp).parent.mkdir(parents=True, exist_ok=True)
# Persist the list of per-protein embeddings for the comparison section.
with open(fp, 'wb') as f:
    pickle.dump(results, f)

# ESM2nv-650M Legacy BioNeMo service

## Setup model

In [26]:
from bionemo.api import BionemoClient
from dotenv import load_dotenv

# Pull the variables defined in the local .env file into the process environment
load_dotenv()

# Read NGC_API_KEY from the environment (None when it is not set)
ngc_api_key = os.environ.get("NGC_API_KEY")

# Client handle used by the prediction cell
api = BionemoClient(ngc_api_key)

## Run prediction

In [47]:
results = []
# One request per protein; each input is a (sequence_id, sequence) tuple
for _seq_id, sequence in inputs:
    response = api.esm2_sync(sequences=[sequence], model="650m")
    # keep the representations of the single submitted sequence
    results.append(response[0]['representations'])



In [53]:
# dump the results to a pickle file
fp = 'output/esm2nv-650-bionemo-service.pkl'
with open(fp, 'wb') as f:
    pickle.dump(results, f)

# ESM2nv-650M BioNeMo Framework

## Setup model

In [3]:
# Look up BIONEMO_HOME; print a hint and re-raise the KeyError when it is not set.
try:
    _bionemo_home_raw = os.environ['BIONEMO_HOME']
except KeyError:
    print("Must have BIONEMO_HOME set in the environment! See docs for instructions.")
    raise

BIONEMO_HOME: Path = Path(_bionemo_home_raw).absolute()

# Directory holding the ESM2nv example configuration files
config_path = BIONEMO_HOME.joinpath("examples", "protein", "esm2nv", "conf")
print(f"Using model configuration at: {config_path}")
assert config_path.is_dir()

Using model configuration at: /workspace/bionemo/examples/protein/esm2nv/conf


In [5]:
from bionemo.utils.hydra import load_model_config

# infer.yaml under /workspace/bionemo/examples/protein/esm2nv/conf selects the 650M model through this line:
#   restore_from_path: "${oc.env:BIONEMO_HOME}/models/protein/esm2nv/esm2nv_650M_converted.nemo"
cfg = load_model_config(config_path=config_path, config_name="infer.yaml")

In [7]:
from bionemo.triton.utils import load_model_for_inference
from bionemo.model.protein.esm1nv.infer import ESM1nvInference

# Build the inference wrapper described by `cfg`.
# NOTE(review): interactive=True presumably selects a notebook-friendly setup (the recorded log prints
# "Interactive mode selected, using strategy='auto'") — confirm against the BioNeMo docs.
inferer = load_model_for_inference(cfg, interactive=True)

print(f"Loaded a {type(inferer)}")
# The ESM2nv checkpoint is expected to come back wrapped in the ESM1nvInference class.
assert isinstance(inferer, ESM1nvInference)

[NeMo I 2024-05-31 16:25:41 utils:487] pytorch DDP is not initialized. Initializing with pytorch-lightening...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


[NeMo I 2024-05-31 16:25:41 utils:333] Restoring model from /workspace/bionemo/models/protein/esm2nv/esm2nv_650M_converted.nemo
[NeMo I 2024-05-31 16:25:41 utils:337] Loading model class: bionemo.model.protein.esm1nv.esm1nv_model.ESM2nvModel


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Interactive mode selected, using strategy='auto'
[NeMo I 2024-05-31 16:25:41 exp_manager:394] Experiments will be logged at /workspace/bionemo/fw-dev/esm2nv/embedding/nemo_experiments/ESM2nv_Inference/2024-05-31_16-25-41
[NeMo I 2024-05-31 16:25:41 exp_manager:835] TensorboardLogger has been set up
[NeMo I 2024-05-31 16:25:42 utils:306] 
    
    ************** Trainer configuration ***********
[NeMo I 2024-05-31 16:25:42 utils:307] 
    name: ESM2nv_Inference
    desc: Minimum configuration for initializing a ESM2nv model for inference.
    trainer:
      precision: 16-mixed
      devices: 1
      num_nodes: 1
      accelerator: gpu
      logger: false
      accumulate_grad_batches: 1
    exp_manager:
      explicit_log_dir: null
      exp_dir: null
      name: ${name}
      create_checkpoint_callback: false
    model:
      micro_batch_size: ${model.data.batch_size}
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      seq_length: 1024
      max_position_emb

[NeMo W 2024-05-31 16:25:43 megatron_base_model:821] The model: ESM2nvModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-31 16:25:43 megatron_base_model:821] The model: ESM2nvModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-31 16:25:43 megatron_base_model:821] The model: ESM2nvModel() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-31 16:25:43 megatron_base_model:821] The model: ESM2nvModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-31 16:25:43 megatron_base_model:821] The model: ESM2nvModel() does not have field.name: gradient_accumulation_fusion in its cfg. Add

[NeMo I 2024-05-31 16:25:43 megatron_init:234] Rank 0 has data parallel group: [0]
[NeMo I 2024-05-31 16:25:43 megatron_init:237] All data parallel group ranks: [[0]]
[NeMo I 2024-05-31 16:25:43 megatron_init:238] Ranks 0 has data parallel rank: 0
[NeMo I 2024-05-31 16:25:43 megatron_init:246] Rank 0 has model parallel group: [0]
[NeMo I 2024-05-31 16:25:43 megatron_init:247] All model parallel group ranks: [[0]]
[NeMo I 2024-05-31 16:25:43 megatron_init:257] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-05-31 16:25:43 megatron_init:261] All tensor model parallel group ranks: [[0]]
[NeMo I 2024-05-31 16:25:43 megatron_init:262] Rank 0 has tensor model parallel rank: 0
[NeMo I 2024-05-31 16:25:43 megatron_init:276] Rank 0 has pipeline model parallel group: [0]
[NeMo I 2024-05-31 16:25:43 megatron_init:288] Rank 0 has embedding group: [0]
[NeMo I 2024-05-31 16:25:43 megatron_init:294] All pipeline model parallel group ranks: [[0]]
[NeMo I 2024-05-31 16:25:43 megatron_init:295]

[NeMo W 2024-05-31 16:25:43 megatron_base_model:821] The model: ESM2nvModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-31 16:25:43 megatron_base_model:821] The model: ESM2nvModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-31 16:25:43 megatron_base_model:821] The model: ESM2nvModel() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-31 16:25:43 megatron_base_model:821] The model: ESM2nvModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-31 16:25:43 megatron_base_model:821] The model: ESM2nvModel() does not have field.name: gradient_accumulation_fusion in its cfg. Add

[NeMo I 2024-05-31 16:25:43 tokenizer_utils:182] Getting HuggingFace AutoTokenizer with pretrained_model_name: facebook/esm2_t33_650M_UR50D


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[NeMo I 2024-05-31 16:25:44 megatron_base_model:315] Padded vocab_size: 128, original vocab_size: 33, dummy tokens: 95.


[NeMo W 2024-05-31 16:25:44 base:108] Using custom ESM2 Embeddings instead of the default NeMo version
[NeMo W 2024-05-31 16:25:44 attention:87] Using custom ESM2 attention instead of the default NeMo version
[NeMo W 2024-05-31 16:25:44 mlp:192] Using custom ESM2 GELU function instead of the default NeMo version
[NeMo W 2024-05-31 16:25:44 attention:87] Using custom ESM2 attention instead of the default NeMo version
[NeMo W 2024-05-31 16:25:44 mlp:192] Using custom ESM2 GELU function instead of the default NeMo version
[NeMo W 2024-05-31 16:25:44 attention:87] Using custom ESM2 attention instead of the default NeMo version
[NeMo W 2024-05-31 16:25:44 mlp:192] Using custom ESM2 GELU function instead of the default NeMo version
[NeMo W 2024-05-31 16:25:44 attention:87] Using custom ESM2 attention instead of the default NeMo version
[NeMo W 2024-05-31 16:25:44 mlp:192] Using custom ESM2 GELU function instead of the default NeMo version
[NeMo W 2024-05-31 16:25:44 attention:87] Using custo

[NeMo I 2024-05-31 16:25:51 nlp_overrides:752] Model ESM2nvModel was successfully restored from /workspace/bionemo/models/protein/esm2nv/esm2nv_650M_converted.nemo.
Loaded a <class 'bionemo.model.protein.esm1nv.infer.ESM1nvInference'>


## Run prediction

In [11]:
results = []
# Embed only one protein per call (N=1 batch)
for _seq_id, sequence in inputs:
    # first element of the returned value holds the hidden states
    hiddens = inferer.seq_to_hiddens([sequence])[0]
    # keep the only batch entry and move it to the CPU
    results.append(hiddens[0, :].cpu())

In [15]:
# dump the results to a pickle file
fp = 'output/esm2nv-650-bionemo-fw.pkl'
with open(fp, 'wb') as f:
    pickle.dump(results, f)

# Compare results

In [16]:
def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    # NOTE: pickle must only be used on trusted files; these are the result
    # files written by the prediction sections of this notebook.
    with open(path, 'rb') as f:
        return pickle.load(f)


# One list of per-protein embeddings for each of the three methods
github = _load_pickle('output/esm2_t33_650M_UR50D.pkl')
bionemo_service = _load_pickle('output/esm2nv-650-bionemo-service.pkl')
bionemo_fw = _load_pickle('output/esm2nv-650-bionemo-fw.pkl')

## shape

In [22]:
# Check every stored protein instead of a hard-coded count of three.
assert len(github) == len(bionemo_fw) == len(bionemo_service)
for gh, fw, svc in zip(github, bionemo_fw, bionemo_service):
    # GitHub and BioNeMo FW outputs must have identical shapes
    assert gh.shape == fw.shape
    # the service output is two rows shorter
    assert gh.shape[0] == svc.shape[0] + 2  # 2 extra tokens: start, end

- The hidden representations produced by Github and BioNeMo FW contain 2 extra tokens (start, end). 
- The hidden representations produced by BioNeMo legacy service do not have these extra tokens.

## values

In [74]:
def _assert_close_not_identical(a, b, atol=0.1):
    """Assert that *a* and *b* differ somewhere yet agree within absolute tolerance *atol*."""
    # the two results are NOT exactly the same ...
    assert not np.array_equal(a, b)
    # ... but every element differs by at most `atol` (no relative tolerance)
    assert np.allclose(a, b, atol=atol, rtol=0)


# Compare every stored protein instead of a hard-coded count of three.
for gh, fw, svc in zip(github, bionemo_fw, bionemo_service):
    # github vs bionemo framework
    _assert_close_not_identical(gh, fw)

    # bionemo FW vs bionemo service
    # [1:-1] drops the start/end tokens that the service output does not contain
    _assert_close_not_identical(fw[1:-1], svc)

    # github vs bionemo service
    _assert_close_not_identical(gh[1:-1], svc)


**Conclusions**: 
- The hidden representations produced by ESM2 GitHub, ESM2nv-650M (BioNeMo FW), and ESM2nv-650M (BioNeMo legacy service) are not exactly the same; they agree only within an absolute tolerance of 0.1
- Choose one of the 3 methods and use an `N=1` inference batch to get exactly the same result each time