In [14]:
import numpy as np
import pandas as pd
import os
import torch

# Switch the working directory to the "ZooTransform" directory that sits next
# to the current working directory (i.e. inside its parent).
# NOTE(review): the `src.` import that follows presumably relies on this
# directory being the working directory -- confirm.
parent_dir = os.path.split(os.getcwd())[0]  # head of the cwd path == its parent
data_root = os.path.join(parent_dir, "ZooTransform")
os.chdir(data_root)

from src.model.species_model import SpeciesAwareESM2

# Build the species-aware wrapper around the small ESM-2 checkpoint, registering
# the three species used in the example below.
model = SpeciesAwareESM2(
    model_name="facebook/esm2_t6_8M_UR50D",
    species_list=["human", "mouse", "ecoli"],
)

# Placeholder input: one (species, sequence) pair per row.
# TODO: replace with our actual data.
data = pd.DataFrame(
    [
        ("human", "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"),
        ("mouse", "MKVSAIAKQRQISFVKSHFSRQLRERLGLIEVQ"),
        ("ecoli", "MKTVYIAKQRQISFVKSHFSRQLEERLGLIEVQ"),
    ],
    columns=["species", "sequence"],
)

# Embed every sequence on its own and reduce each result to a single vector.
# The two columns are walked in lockstep with zip() instead of
# DataFrame.iterrows(): iterrows() builds a Series for every row and, in an
# interactive session, rebinds `_`; only the two string fields are needed here.
embeddings = []

for species, sequence in zip(data["species"], data["sequence"]):
    emb = model.embed(species, sequence)
    # Mean pooling over dim 1 (the sequence-length axis), then drop singleton
    # dimensions and move the result to a NumPy array on the host.
    # A different pooling strategy can be substituted here.
    emb_mean = emb.mean(dim=1).squeeze().cpu().numpy()
    embeddings.append(emb_mean)

# Stack the per-sequence vectors row-wise into one 2-D array.
embeddings = np.vstack(embeddings)
print("Embeddings array shape:", embeddings.shape)

# Same embedding computed in one batched forward pass instead of row by row.
species_batch = data["species"].to_list()
sequence_batch = data["sequence"].to_list()

# Gradients are not needed for feature extraction, so run under no_grad.
with torch.no_grad():
    outputs = model.forward(species_batch, sequence_batch)
    hidden_states = outputs.last_hidden_state
    # Unmasked mean over dim 1, converted to a host-side NumPy array.
    # NOTE(review): if forward() pads sequences of different lengths to a
    # common length, this average would include the padding positions --
    # confirm, and pool with the attention mask if so.
    batch_embeddings = torch.mean(hidden_states, dim=1).cpu().numpy()

print(batch_embeddings.shape)





✓ Using device: cuda
  GPU: NVIDIA A100-SXM4-40GB
  Memory: 42.29 GB
📥 Loading model: facebook/esm2_t6_8M_UR50D


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Adding species tokens: ['<sp_human>', '<sp_mouse>', '<sp_ecoli>']
Added 3 new special tokens
Resized model embeddings to 36 tokens
✓ Model and tokenizer ready!
  Hidden size: 320
  Number of layers: 6
Embeddings array shape: (3, 320)
(3, 320)
