# Uncertainty Quantification for species model

In [1]:
import numpy as np
import pandas as pd
import os
import sys
import torch

print(os.getcwd())
data_root = os.path.join(os.getcwd(), 'ZooTransform')

sys.path.append(data_root)

from src.zootransform.fine_tuning.fine_tuning import LoraFinetunerMLM  # new MLM version
from src.zootransform.model.species_model import SpeciesAwareESM2
from src.zootransform.dataset.load_uniprot import load_uniprot

/home/hslab/Olive/Kode/ZooTransform


  from .autonotebook import tqdm as notebook_tqdm


✓ All libraries imported successfully!


# Load data & model

In [2]:
data = load_uniprot()
species_names = sorted(set(data['species'].unique().tolist()))
species_model = SpeciesAwareESM2(model_name="facebook/esm2_t6_8M_UR50D", species_list=species_names) #TODO - define species list

# Prep dataset
np.random.seed(0)
n_max_dataset = 1000
idxs_rand = np.random.choice(len(data), n_max_dataset, replace=False)
species_batch = data["species"].iloc[idxs_rand].tolist()
sequence_batch = data["sequence"].iloc[idxs_rand].tolist()

Using device: cuda
  GPU: NVIDIA GeForce RTX 4090
  Memory: 25.39 GB
Loading model: facebook/esm2_t6_8M_UR50D


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Adding species tokens: ['<sp_Arabidopsis thaliana>', '<sp_Bos taurus>', '<sp_Escherichia coli>', '<sp_Homo sapiens>', '<sp_Mus musculus>', '<sp_Oryza sativa>', '<sp_Rattus norvegicus>', '<sp_Rhodotorula toruloides>', '<sp_Saccharolobus solfataricus>', '<sp_Saccharomyces cerevisiae>', '<sp_Schizosaccharomyces pombe>', '<sp_Staphylococcus aureus>']
Added 12 new special tokens
Resized model embeddings to 45 tokens
✓ Model and tokenizer ready!
  Hidden size: 320
  Number of layers: 6


# Train model

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Species-aware model
species_model = SpeciesAwareESM2(species_list=species_names)
species_model.model.to(device)

finetuner = LoraFinetunerMLM(
    base_model=species_model,
    r=8,
    alpha=16,
    dropout=0.05,
    target_modules=["attention.self.key", "attention.self.value", "attention.self.query", "embeddings.word_embeddings"],  # LoRA targets
    lr=1e-4,
    batch_size=4,
    mlm_probability=0.15  # fraction of tokens to mask
)

finetuner.train(
    species_batch=species_batch,
    sequence_batch=sequence_batch,
    epochs=5
)

tuned_embeddings = finetuner.embed(species_batch, sequence_batch)
print("Tuned embeddings shape:", tuned_embeddings.shape)

Using device: cuda
  GPU: NVIDIA GeForce RTX 4090
  Memory: 25.39 GB
Loading model: facebook/esm2_t6_8M_UR50D
Adding species tokens: ['<sp_Arabidopsis thaliana>', '<sp_Bos taurus>', '<sp_Escherichia coli>', '<sp_Homo sapiens>', '<sp_Mus musculus>', '<sp_Oryza sativa>', '<sp_Rattus norvegicus>', '<sp_Rhodotorula toruloides>', '<sp_Saccharolobus solfataricus>', '<sp_Saccharomyces cerevisiae>', '<sp_Schizosaccharomyces pombe>', '<sp_Staphylococcus aureus>']
Added 12 new special tokens
Resized model embeddings to 45 tokens
✓ Model and tokenizer ready!
  Hidden size: 320
  Number of layers: 6


Epoch 1/5: 100%|██████████| 250/250 [00:24<00:00, 10.27it/s, loss=2.4151]


Epoch 1 — avg loss: 2.4091


Epoch 2/5: 100%|██████████| 250/250 [00:24<00:00, 10.39it/s, loss=2.2072]


Epoch 2 — avg loss: 2.3938


Epoch 3/5: 100%|██████████| 250/250 [00:24<00:00, 10.39it/s, loss=2.3366]


Epoch 3 — avg loss: 2.3951


Epoch 4/5: 100%|██████████| 250/250 [00:24<00:00, 10.39it/s, loss=2.4724]


Epoch 4 — avg loss: 2.3924


Epoch 5/5: 100%|██████████| 250/250 [00:24<00:00, 10.40it/s, loss=2.3325]


Epoch 5 — avg loss: 2.3771
Tuned embeddings shape: torch.Size([1000, 320])


In [6]:
# Directory to save LoRA adapters
save_dir = f"lora_finetuned_species_model_{int(n_max_dataset)}" #TODO - specify path

# Save only LoRA weights 
finetuner.model.save_pretrained(save_dir)
np.save(os.path.join(save_dir, f"tuned_embeddings_{int(n_max_dataset)}.npy"), tuned_embeddings)
print(f"LoRA adapters saved to {save_dir}")

finetuner.tokenizer.save_pretrained(save_dir)


LoRA adapters saved to lora_finetuned_species_model_1000


('lora_finetuned_species_model_1000/tokenizer_config.json',
 'lora_finetuned_species_model_1000/special_tokens_map.json',
 'lora_finetuned_species_model_1000/vocab.txt',
 'lora_finetuned_species_model_1000/added_tokens.json')

# Run uncertainty

In [None]:
from src.zootransform.fine_tuning.fine_tuning import ProteinDataset, DataLoader

@torch.no_grad()
def embed_with_uncertainty(finetuner, species, sequences, n_mc_draws=20):
    """
    Monte Carlo dropout uncertainty estimation.
    Returns mean embedding and uncertainty estimate.

    Inputs:
    - species: species names
    - sequences: protein sequences
    """
    finetuner.model.train()  # keep dropout active!
    embeddings = []


    dataset = ProteinDataset(species_batch, sequence_batch, tokenizer=finetuner.tokenizer,
                                max_length=finetuner.max_length)
    loader = DataLoader(dataset, batch_size=finetuner.batch_size, shuffle=True,
                        collate_fn=finetuner.data_collator)

    for _ in range(n_mc_draws):
        # emb = model.embed(species, sequences)
        
        with torch.no_grad():
            # outputs = model(**inputs, output_hidden_states=True)
            outputs = finetuner.model(input_ids=input_ids, output_hidden_states=True)

        token_embeddings = outputs.hidden_states[-1].squeeze(0)      # (seq_len, hidden_dim)
        # mean pool excluding special tokens ([CLS] and [EOS])
        emb = token_embeddings[1:-1].mean(dim=0)              # (hidden_dim,)

        emb_mean = emb.mean(dim=1).cpu().numpy()
        embeddings.append(emb_mean)

    embeddings = np.stack(embeddings, axis=0)
    mean_embedding = embeddings.mean(axis=0)
    uncertainty = embeddings.std(axis=0).mean()

    return mean_embedding, uncertainty, embeddings

In [21]:
np.random.seed(1)
n_max_dataset_uq = 2000
idxs_rand = np.random.choice(len(data), n_max_dataset_uq, replace=False)
species_uq = data["species"].iloc[idxs_rand].tolist()
sequence_uq = data["sequence"].iloc[idxs_rand].tolist()

mean_embedding, uncertainty, embeddings = embed_with_uncertainty(
    finetuner=finetuner,
    species=species_uq,
    sequences=sequence_uq,
    n_mc_draws=5)

AttributeError: 'dict' object has no attribute 'ne'