## Load packages and Set Working Directory

In [120]:
import numpy as np
import pandas as pd
import os
import torch

parent_dir = os.path.dirname(os.getcwd())
data_root = os.path.join(parent_dir, "ZooTransform")
os.chdir(data_root)

from src.model.species_model import SpeciesAwareESM2

## Load and Use SpeciesAwareESM2 Model

In [121]:
# Load the pre-trained SpeciesAwareESM2 model
model = SpeciesAwareESM2(model_name="facebook/esm2_t6_8M_UR50D", species_list=["human", "mouse", "ecoli"]) #TODO - define species list

‚úì Using device: cuda
  GPU: NVIDIA A100-SXM4-40GB
  Memory: 42.29 GB
üì• Loading model: facebook/esm2_t6_8M_UR50D


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /facebook/esm2_t6_8M_UR50D/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f199ae6f4a0>: Failed to resolve \'huggingface.co\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: b4a5f838-2e5a-4bbb-820f-b54bd23bb8ad)')' thrown while requesting HEAD https://huggingface.co/facebook/esm2_t6_8M_UR50D/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /api/resolve-cache/models/facebook/esm2_t6_8M_UR50D/c731040fcd8d73dceaa04b0a8e6329b345b0f5df/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f193eb7e7b0>: Failed to resolve \'huggingface.co\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: d79f7f61-92d1-4dcb-9ce9-c8163792db86)')' thrown while reque

Adding species tokens: ['<sp_human>', '<sp_mouse>', '<sp_ecoli>']
Added 3 new special tokens
Resized model embeddings to 36 tokens
‚úì Model and tokenizer ready!
  Hidden size: 320
  Number of layers: 6


## Load Data

In [122]:
# Example data for embedding - need to replace with our actual data
data = pd.DataFrame({
    "species": ["human", "mouse", "ecoli"],
    "sequence": [
        "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",
        "MKVSAIAKQRQISFVKSHFSRQLRERLGLIEVQ",
        "MKTVYIAKQRQISFVKSHFSRQLEERLGLIEVQ"
    ]
})

## Generate Embeddings for Sequences with Species Information, with mean pooling
slower , not recommended for a large dataset

In [123]:
# Generate embeddings for each sequence manually
embeddings = []

for _, row in data.iterrows():
    emb = model.embed(row['species'], row['sequence'])
    emb_mean = emb.mean(dim=1).squeeze().cpu().numpy() # Mean pooling over sequence length, can decide to use different pooling
    embeddings.append(emb_mean)

embeddings = np.vstack(embeddings)
print("Embeddings array shape:", embeddings.shape)

Embeddings array shape: (3, 320)


## Generate Embeddings for *a Batch of Sequences* with Species Information, with mean pooling

In [124]:
species_batch = data['species'].tolist()
sequence_batch = data['sequence'].tolist()

with torch.no_grad():
    outputs = model.forward(species_batch, sequence_batch)

batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
print(batch_embeddings.shape)

(3, 320)


In [125]:
for name, module in list(model.model.named_modules())[:60]:
    print(name)



embeddings
embeddings.word_embeddings
embeddings.dropout
embeddings.position_embeddings
encoder
encoder.layer
encoder.layer.0
encoder.layer.0.attention
encoder.layer.0.attention.self
encoder.layer.0.attention.self.query
encoder.layer.0.attention.self.key
encoder.layer.0.attention.self.value
encoder.layer.0.attention.self.dropout
encoder.layer.0.attention.self.rotary_embeddings
encoder.layer.0.attention.output
encoder.layer.0.attention.output.dense
encoder.layer.0.attention.output.dropout
encoder.layer.0.attention.LayerNorm
encoder.layer.0.intermediate
encoder.layer.0.intermediate.dense
encoder.layer.0.output
encoder.layer.0.output.dense
encoder.layer.0.output.dropout
encoder.layer.0.LayerNorm
encoder.layer.1
encoder.layer.1.attention
encoder.layer.1.attention.self
encoder.layer.1.attention.self.query
encoder.layer.1.attention.self.key
encoder.layer.1.attention.self.value
encoder.layer.1.attention.self.dropout
encoder.layer.1.attention.self.rotary_embeddings
encoder.layer.1.attention.o

## Fine-tune the Model using LoRA

In [126]:
from src.fine_tuning.fine_tuning import LoraFinetuner

# ---- Instantiate base model ----
species_list = ["human", "mouse", "ecoli"] #TODO - define species list
model = SpeciesAwareESM2(species_list=species_list)

# ---- Prepare your data ----
species_batch = data["species"].tolist()
sequence_batch = data["sequence"].tolist()

# # ---- Create and train LoRA finetuner ----
# finetuner = LoraESMFinetuner(
#     base_model=model,
#     r=8,
#     alpha=16,
#     dropout=0.05,
#     lr=1e-4,
#     batch_size=4,
#     mlm_probability=0.15,
# )
# 
# finetuner.train(species_batch, sequence_batch, epochs=10)
with torch.no_grad():
    old_outputs = model.model(
        model.tokenizer(sequence_batch, return_tensors="pt",
                             padding=True, truncation=True,
                             max_length=model.max_length).to(model.device)
    )
teacher_embeddings = old_outputs.last_hidden_state.cpu().numpy()

# 3Ô∏è‚É£ Initialize LoRA finetuner
finetuner = LoraFinetuner(model, r=8, alpha=16, dropout=0.05, lr=1e-4, batch_size=4)

# 4Ô∏è‚É£ Train LoRA to add species-awareness
finetuner.train(species_batch, sequence_batch, teacher_embeddings, epochs=5)

# 5Ô∏è‚É£ Get species-aware embeddings
species_aware_embeddings = finetuner.embed(species_batch, sequence_batch)
print(species_aware_embeddings.shape)

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úì Using device: cuda
  GPU: NVIDIA A100-SXM4-40GB
  Memory: 42.29 GB
üì• Loading model: facebook/esm2_t6_8M_UR50D
Adding species tokens: ['<sp_human>', '<sp_mouse>', '<sp_ecoli>']
Added 3 new special tokens
Resized model embeddings to 36 tokens
‚úì Model and tokenizer ready!
  Hidden size: 320
  Number of layers: 6


KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'

In [ ]:
with torch.no_grad():
    tuned_embeddings = finetuner.embed(species_batch, sequence_batch)

print("Tuned embedding shape:", tuned_embeddings.shape)


In [None]:
finetuner.model.print_trainable_parameters()

In [127]:
from src.fine_tuning.fine_tuning import LoraFinetuner
from transformers import AutoModel, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
old_model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D").to(device)
old_tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
tokens_old = old_tokenizer(sequence_batch, return_tensors="pt", padding=True, truncation=True, max_length=1024)
tokens_old = {k: v.to(device) for k,v in tokens_old.items()}
with torch.no_grad():
    old_embeddings = old_model(**tokens_old).last_hidden_state.mean(dim=1)

# 2Ô∏è‚É£ Species-aware model
species_model = SpeciesAwareESM2(species_list=["human","mouse","ecoli"])

# 3Ô∏è‚É£ LoRA finetuner
finetuner = LoraFinetuner(base_model=species_model, batch_size=4)

# 4Ô∏è‚É£ Train to align species embeddings to frozen embeddings
finetuner.train(species_batch, sequence_batch, frozen_embeddings=old_embeddings, epochs=5)

# 5Ô∏è‚É£ Extract tuned embeddings
tuned_embeddings = finetuner.embed(species_batch, sequence_batch)
print("Tuned embeddings shape:", tuned_embeddings.shape)

ImportError: cannot import name 'LoraFinetuner' from 'src.fine_tuning.fine_tuning' (/workspace/ZooTransform/src/fine_tuning/fine_tuning.py)