## Load packages and Set Working Directory

In [19]:
import numpy as np
import pandas as pd
import os
import torch

#parent_dir = os.path.dirname(os.getcwd())
#print(parent_dir)
data_root = os.path.join(os.getcwd())
os.chdir(data_root)

# from src.model.species_model import SpeciesAwareESM2
# from src.zootransform.dataset.load_uniprot import load_uniprot

In [20]:
!ls

Dockerfile		  config.mk		    models.md
Makefile		  configure		    plain_mlp_best.pt
ProteinGym_DMS_data	  data			    pyproject.toml
README.md		  docker-examples	    uniprot_data
ZooTransform		  entrypoint.sh		    uv.lock
ai-for-science-tutorials  jupyter_server_config.py
best_model.pt		  license.txt


## Load Data

In [21]:
# Example data for embedding - need to replace with our actual data
data = load_uniprot()
species_names = sorted(set(data['species'].unique().tolist()))
species_names

['Arabidopsis thaliana',
 'Bos taurus',
 'Escherichia coli',
 'Homo sapiens',
 'Mus musculus',
 'Oryza sativa',
 'Rattus norvegicus',
 'Rhodotorula toruloides',
 'Saccharolobus solfataricus',
 'Saccharomyces cerevisiae',
 'Schizosaccharomyces pombe',
 'Staphylococcus aureus']

## Load and Use SpeciesAwareESM2 Model

In [17]:
# Load the pre-trained SpeciesAwareESM2 model
species_model = SpeciesAwareESM2(model_name="facebook/esm2_t6_8M_UR50D", species_list=species_names) #TODO - define species list

Using device: cuda
  GPU: NVIDIA A100-SXM4-40GB
  Memory: 42.29 GB
Loading model: facebook/esm2_t6_8M_UR50D


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Adding species tokens: ['<sp_Arabidopsis thaliana>', '<sp_Bos taurus>', '<sp_Escherichia coli>', '<sp_Homo sapiens>', '<sp_Mus musculus>', '<sp_Oryza sativa>', '<sp_Rattus norvegicus>', '<sp_Rhodotorula toruloides>', '<sp_Saccharolobus solfataricus>', '<sp_Saccharomyces cerevisiae>', '<sp_Schizosaccharomyces pombe>', '<sp_Staphylococcus aureus>']
Added 12 new special tokens
Resized model embeddings to 45 tokens
✓ Model and tokenizer ready!
  Hidden size: 320
  Number of layers: 6


## Generate Embeddings for Sequences with Species Information, with mean pooling
slower , not recommended for a large dataset

In [None]:
# Generate embeddings for each sequence manually
embeddings = []

for _, row in data.iterrows():
    emb = species_model.embed(row['species'], row['sequence'])
    emb_mean = emb.mean(dim=1).squeeze().cpu().numpy() # Mean pooling over sequence length, can decide to use different pooling
    embeddings.append(emb_mean)

embeddings = np.vstack(embeddings)
print("Embeddings array shape:", embeddings.shape)

## Generate Embeddings for *a Batch of Sequences* with Species Information, with mean pooling

In [None]:
# Generate embeddings for a batch of sequences (faster); here batch is the entire dataset
species_batch = data["species"].tolist()
sequence_batch = data["sequence"].tolist()

with torch.no_grad():
    outputs = species_model.forward(species_batch, sequence_batch)

batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
print(batch_embeddings.shape)

(9, 320)


In [None]:
# for name, module in list(species_model.model.named_modules())[:60]:
#     print(name)

## Split Data into Train, Validation, and Test Sets (not used in our case)

In [None]:
from sklearn.model_selection import train_test_split #TODO - do we want to have splits? generally we just want to fine-tune on all data

# First, split train vs temp (validation + test)
species_train, species_temp, seq_train, seq_temp = train_test_split(
    species_batch, sequence_batch, test_size=0.3, random_state=42
)

# Then, split temp into validation and test (50% of temp each = 15% total)
species_val, species_test, seq_val, seq_test = train_test_split(
    species_temp, seq_temp, test_size=0.5, random_state=42
)

print(f"Train: {len(species_train)}, Val: {len(species_val)}, Test: {len(species_test)}")

Train: 6, Val: 1, Test: 2


## Fine-tune the Model using LoRA (on all data)

In [None]:
from src.fine_tuning.fine_tuning import LoraFinetuner
from transformers import AutoModel, AutoTokenizer
import torch

species_batch = data["species"].tolist()
sequence_batch = data["sequence"].tolist()

device = "cuda" if torch.cuda.is_available() else "cpu"
old_model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D").to(device)
old_tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
tokens_old = old_tokenizer(sequence_batch, return_tensors="pt", padding=True, truncation=True, max_length=1024)
tokens_old = {k: v.to(device) for k,v in tokens_old.items()}
with torch.no_grad():
    old_embeddings = old_model(**tokens_old).last_hidden_state.mean(dim=1)

# Species-aware model
species_model = SpeciesAwareESM2(species_list=["human","mouse","ecoli"])

# LoRA finetuner
finetuner = LoraFinetuner(
    base_model=species_model, 
    r=8, 
    alpha=16, 
    dropout=0.05, 
    target_modules=None, 
    lr=1e-4, 
    batch_size=4)  #TODO - optionally optimize parameters for LoRA 

# Train to align species embeddings to frozen embeddings
finetuner.train(species_train, seq_train, frozen_embeddings=None, epochs=5) #TODO - set frozen embeddings

# Extract tuned embeddings
tuned_embeddings = finetuner.embed(species_batch, sequence_batch)
print("Tuned embeddings shape:", tuned_embeddings.shape)

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
  GPU: NVIDIA A100-SXM4-40GB
  Memory: 42.29 GB
Loading model: facebook/esm2_t6_8M_UR50D
Adding species tokens: ['<sp_human>', '<sp_mouse>', '<sp_ecoli>']
Added 3 new special tokens
Resized model embeddings to 36 tokens
✓ Model and tokenizer ready!
  Hidden size: 320
  Number of layers: 6


Epoch 1/5: 100%|██████████| 2/2 [00:00<00:00, 26.49it/s, loss=0.1123]


Epoch 1 — avg loss: 0.1122


Epoch 2/5: 100%|██████████| 2/2 [00:00<00:00, 32.40it/s, loss=0.1127]


Epoch 2 — avg loss: 0.1118


Epoch 3/5: 100%|██████████| 2/2 [00:00<00:00, 34.82it/s, loss=0.1109]


Epoch 3 — avg loss: 0.1108


Epoch 4/5: 100%|██████████| 2/2 [00:00<00:00, 34.53it/s, loss=0.1101]


Epoch 4 — avg loss: 0.1101


Epoch 5/5: 100%|██████████| 2/2 [00:00<00:00, 34.88it/s, loss=0.1093]


Epoch 5 — avg loss: 0.1093
Returning final average loss across epochs: 0.1108
Tuned embeddings shape: torch.Size([9, 320])


## Fine-tune the Model using LoRA for Masked Language Modeling (MLM)

In [None]:
from src.fine_tuning.fine_tuning import LoraFinetunerMLM  # new MLM version
from src.model.species_model import SpeciesAwareESM2
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Species-aware model
species_model = SpeciesAwareESM2(species_list=["human","mouse","ecoli"])
species_model.model.to(device)

species_batch = data["species"].tolist()
sequence_batch = data["sequence"].tolist()

finetuner = LoraFinetunerMLM(
    base_model=species_model,
    r=8,
    alpha=16,
    dropout=0.05,
    target_modules=["attention.self.key", "attention.self.value"],  # LoRA targets
    lr=1e-4,
    batch_size=4,
    mlm_probability=0.15  # fraction of tokens to mask
)

finetuner.train(
    species_batch=species_batch,
    sequence_batch=sequence_batch,
    epochs=5
)

tuned_embeddings = finetuner.embed(species_batch, sequence_batch)
print("Tuned embeddings shape:", tuned_embeddings.shape)

Using device: cuda
  GPU: NVIDIA A100-SXM4-40GB
  Memory: 42.29 GB
Loading model: facebook/esm2_t6_8M_UR50D


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Adding species tokens: ['<sp_human>', '<sp_mouse>', '<sp_ecoli>']
Added 3 new special tokens
Resized model embeddings to 36 tokens
✓ Model and tokenizer ready!
  Hidden size: 320
  Number of layers: 6


Epoch 1/5: 100%|██████████| 3/3 [00:00<00:00, 19.14it/s, loss=5.7250]


Epoch 1 — avg loss: 5.7860


Epoch 2/5: 100%|██████████| 3/3 [00:00<00:00, 20.16it/s, loss=5.7591]


Epoch 2 — avg loss: 5.7836


Epoch 3/5: 100%|██████████| 3/3 [00:00<00:00, 20.07it/s, loss=5.7840]


Epoch 3 — avg loss: 5.7868


Epoch 4/5: 100%|██████████| 3/3 [00:00<00:00, 19.87it/s, loss=5.8020]


Epoch 4 — avg loss: 5.7881


Epoch 5/5: 100%|██████████| 3/3 [00:00<00:00, 20.10it/s, loss=5.7877]


Epoch 5 — avg loss: 5.7809
Tuned embeddings shape: torch.Size([9, 320])


# Save and Load the Fine-tuned Model

In [None]:
# Directory to save LoRA adapters
save_dir = "lora_finetuned_species_model" #TODO - specify path

# Save only LoRA weights 
finetuner.model.save_pretrained(save_dir)
print(f"LoRA adapters saved to {save_dir}")


LoRA adapters saved to lora_finetuned_species_model
