# Fine Tune ESM2 for a classification task

In [None]:
# istall peft
!pip install peft

In [None]:
# imports
import torch
import random
import numpy as np


# tokenize the data and create the dataloader
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# on the train loop
from torch.optim import Adam


# get the model
from transformers import AutoTokenizer, EsmForSequenceClassification, EsmForMaskedLM

# apply lora layer to the model
from peft import get_peft_model, LoraConfig

# Pick the compute device: the GPU when torch reports one, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Bare expression so the notebook displays which device was selected.
device

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


'cpu'

In [None]:
from transformers import AutoTokenizer, EsmForMaskedLM

# Load the `esm2_t6_8M` masked-LM checkpoint and display the module so its
# layer layout can be read from the cell output.
# NOTE: the name `model` is rebound in Step 1 by the sequence-classification load.
mlm_checkpoint = "facebook/esm2_t6_8M_UR50D"
model = EsmForMaskedLM.from_pretrained(mlm_checkpoint)
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

EsmForMaskedLM(
  (esm): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(33, 320, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1026, 320, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-5): 6 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=320, out_features=320, bias=True)
              (key): Linear(in_features=320, out_features=320, bias=True)
              (value): Linear(in_features=320, out_features=320, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
              (rotary_embeddings): RotaryEmbedding()
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=320, out_features=320, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((320,), eps=1e-05, elementwise_a

### Step 1 - Load the model and the tokenizer
[EsmForSequenceClassification](https://huggingface.co/docs/transformers/en/model_doc/esm#transformers.EsmForSequenceClassification)

In [None]:
# The tokenizer and the classifier come from the same checkpoint, so name it once.
checkpoint = "facebook/esm2_t30_150M_UR50D"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# num_labels=2 requests a two-class sequence-classification head.
model = EsmForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# The model is not moved to `device` here; the LoRA-wrapped model is moved
# to the device after wrapping.
model

### Step 2 - Apply LoRA layers to the model

[PEFT GitHub](https://github.com/huggingface/peft)

In [None]:
# LoRA configuration that get_peft_model applies to the classifier.
peft_config = LoraConfig(
    # Declare the task so PEFT builds a sequence-classification wrapper and keeps
    # the classification head trainable. Without a task type only the LoRA
    # matrices receive gradients and the freshly initialised head stays frozen.
    task_type="SEQ_CLS",
    r=4,               # rank of the low-rank update matrices
    lora_alpha=8,      # scaling factor applied to the LoRA update
    lora_dropout=0.2,  # dropout applied on the LoRA branch
    # Attention projections that receive a LoRA adapter (module names match the
    # `query` / `key` / `value` Linear layers of EsmSelfAttention).
    target_modules=[
        'query',
        'key',
        'value',
    ],
)

In [None]:
# Wrap the classifier with the LoRA configuration and place it on the compute device.
lora_model = get_peft_model(model, peft_config).to(device)
# Print how many parameters are trainable after wrapping.
lora_model.print_trainable_parameters()
# Display the wrapped module structure.
lora_model

## Fine-Tuning

### Step 4 - Load the data

In [None]:
def generate_protein_sequence(length, number_of_seqs=1, seed=None):
    """Generate random protein sequences with random binary labels.

    Args:
        length: Number of residues in every generated sequence.
        number_of_seqs: How many (sequence, label) pairs to create.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the result is reproducible and the global ``random`` state
            is left untouched. When ``None`` (default) the module-level
            generator is used, exactly as before.

    Returns:
        A tuple ``(seqs, labels)``: a list of ``number_of_seqs`` strings over
        the 20 standard amino-acid letters, and a list of 0/1 integer labels of
        the same length.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    # Fall back to the module-level generator so unseeded calls behave as before.
    rng = random if seed is None else random.Random(seed)
    seqs, labels = [], []
    for _ in range(number_of_seqs):
        # Residues are drawn first and the label second, preserving the draw
        # order of the original implementation.
        seqs.append(''.join(rng.choice(amino_acids) for _ in range(length)))
        labels.append(rng.choice([0, 1]))
    return seqs, labels

# Build the toy dataset: 1000 random sequences of 75 residues, each with a random 0/1 label.
sequences, labels = generate_protein_sequence(length=75, number_of_seqs=1000)
# Show the first five sequences next to their labels.
sequences[:5], labels[:5]

(['LPFHCKCKMAFMSLMHTLRWNTPFIHFQDENRLMLETCPHQGCYGGSWGNQWFHACAARCITDKRRFQVMCTPKP',
  'NENMQFGTLHWSATAEGLGPMNWMCQKCDITEQDLLQCMHWLKFNAWNYFEYWMPPMFMKWPKSHPPEIGHGQLR',
  'TEYMLAIPNPFKQIEWFCANWGMGAWVYNTRDQMKMWGEMCNWNTNYNQFWNHWKYALEKAQDPNQHCPHYGYNY',
  'FYDDHTGNDVECCKEKELTHNLRNWRKSSGSKCCIDMFCEDPRRDKNYIKCLWFKHFMQRTEECPSNSAWSFFRE',
  'FWMLSSIHATWEVVQMNMCSDAWCCCSITERCLRCYQNCHDLPWCTFATNFCGSNCDFSDIMSGPQGDTYWCTVD'],
 [1, 0, 1, 1, 1])

### Step 5 -  Tokenize the data

Most of this section is adapted from the PyTorch [data loading documentation](https://pytorch.org/docs/stable/data.html).

In [None]:
# Preview what the tokenizer returns for the whole dataset: a mapping with
# `input_ids` and `attention_mask` tensors.
tokenizer(
    sequences,
    max_length=75,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

{'input_ids': tensor([[ 0,  6, 20,  ...,  8, 22,  2],
        [ 0, 21,  7,  ..., 16, 12,  2],
        [ 0,  6,  9,  ...,  9, 23,  2],
        ...,
        [ 0,  8, 16,  ...,  6,  7,  2],
        [ 0, 12, 23,  ..., 21, 10,  2],
        [ 0, 15, 20,  ..., 18, 15,  2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [None]:
# the DataLoader requires a Dataset class
class EsmTokenizer(Dataset):
    """Dataset that tokenizes all sequences up front and serves (tokens, target) pairs."""

    def __init__(self, sequences, targets, tokenizer, max_length=55):
        """Tokenize ``sequences`` once and store them alongside ``targets``.

        Args:
            sequences: List of protein sequences (strings).
            targets: One label per sequence; converted with ``torch.tensor``.
            tokenizer: Callable that returns a mapping containing ``'input_ids'``.
            max_length: Token length every sequence is padded or truncated to.
                The observed token rows start with id 0 and end with id 2, so
                the special tokens count towards this limit. The default of 55
                keeps the previous hard-coded behaviour, which truncates the
                75-residue sequences used in this notebook; pass a larger value
                to keep whole sequences.

        Raises:
            ValueError: If ``sequences`` and ``targets`` differ in length.
        """
        if len(sequences) != len(targets):
            raise ValueError(
                f"sequences and targets must have the same length, "
                f"got {len(sequences)} and {len(targets)}"
            )
        self.sequences = sequences
        self.targets = torch.tensor(targets)
        encoded = tokenizer(
            sequences,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Only the token ids are kept; __getitem__ yields (tokens, target).
        self.batch_tokens = encoded['input_ids']

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(tokens, target)`` pair stored at position ``idx``."""
        tokens = self.batch_tokens[idx]
        target = self.targets[idx]
        return tokens, target

In [None]:
# we need data loaders that serve the data in batches
def data_loader(sequences, targets, tokenizer, batch_size=64, shuffle=True, num_workers=0,
                test_size=0.2, random_state=None):
    """Split the data into train/test sets and return a DataLoader for each.

    Args:
        sequences: List of protein sequences (strings).
        targets: One label per sequence.
        tokenizer: Tokenizer handed to ``EsmTokenizer``.
        batch_size: Batch size of both loaders. The default is 64, the value the
            previous implementation always used regardless of this argument.
        shuffle: Whether the *training* loader reshuffles every epoch. The test
            loader is never shuffled.
        num_workers: Worker processes used by both loaders.
        test_size: Fraction of the data held out for the test loader.
        random_state: Optional seed forwarded to ``train_test_split`` for a
            reproducible split.

    Returns:
        ``(train_loader, test_loader)``.
    """
    # Split on the `targets` argument (not a notebook-level global) so the
    # function works for any data it is handed.
    X_train, X_test, y_train, y_test = train_test_split(
        sequences, targets, test_size=test_size, random_state=random_state
    )
    # Each split is tokenized by its own EsmTokenizer dataset.
    train_loader = DataLoader(
        EsmTokenizer(X_train, y_train, tokenizer),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )
    test_loader = DataLoader(
        EsmTokenizer(X_test, y_test, tokenizer),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )

    return train_loader, test_loader


# Build the train / test loaders from the generated data.
train_loader, test_loader = data_loader(sequences, labels, tokenizer)

# Sanity check: look at the first training batch only. Each batch unpacks into
# (tokens, targets), which is exactly how the training loop consumes it.
for tokens, targets in train_loader:
    print(tokens, targets)
    break

tensor([[ 0,  6,  9, 18,  8, 21, 22, 20, 17, 20,  5,  6, 21, 17, 20, 10, 15,  9,
         17,  9, 17,  7, 17,  9, 17, 16, 18,  9, 19, 17, 16, 20, 17, 18, 22,  7,
          5, 14, 21,  9, 23,  4, 13,  4, 17, 20,  7, 14, 21,  6,  9, 16, 17,  4,
          2],
        [ 0, 12, 12, 18, 21, 10, 15, 12,  9, 19,  4,  7, 12, 14, 12, 22, 18, 17,
         12, 22, 23,  6, 19, 12,  4, 10, 10, 13, 12, 15, 13, 14, 18, 22, 11, 17,
         13, 11, 22, 11, 15,  7,  9, 18, 19, 16,  6, 17, 19,  6, 22, 11,  9, 12,
          2],
        [ 0, 21, 18, 11, 17, 15, 19, 15, 20,  5, 10, 18, 16, 15, 20, 15, 12, 14,
         10,  7, 23, 21, 22, 18, 23, 19, 19, 23, 22, 15,  5, 23, 22, 16, 17,  6,
         23, 22, 22, 23, 19,  7, 14, 22, 16, 12, 15, 21, 16, 14, 22,  8,  6,  7,
          2],
        [ 0, 12, 19, 18, 23, 12, 21,  5, 16, 17, 10, 15, 23, 23,  9,  5,  4, 11,
          4, 20, 19, 18,  8,  6, 20, 11, 13, 23, 23,  5, 23,  7,  6, 20, 10, 22,
         12, 13, 13, 13, 17, 11,  6, 19, 17, 18,  7, 11, 21, 16, 16

### Step 6 - Define the optimizer

[torch Adam](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html)

In [None]:
# Adam over the parameters of the LoRA-wrapped model, with a learning rate of 1e-5.
optimizer = Adam(params=lora_model.parameters(), lr=1e-5)

## Step 7 - Train loop

In [None]:
# Training loop (LoRA-wrapped model)
epochs = 10
optimizer = Adam(lora_model.parameters(), lr=1e-5)
for epoch in range(epochs):
    # ---- Training pass -------------------------------------------------
    lora_model.train()
    train_loss_sum, train_count = 0.0, 0
    for tokens, targets in train_loader:
        tokens, targets = tokens.to(device), targets.to(device)
        optimizer.zero_grad()
        # The dataset yields input ids only, so no attention mask is passed.
        outputs = lora_model(input_ids=tokens, labels=targets)

        ###### Here is where the magic happens. ###########################
        # The model outputs a loss value, which we use to compute gradients
        # and update the model weights on the backward pass.
        # New_weight = old_weight - learning_rate * gradient

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        ####################################################################

        # Weight each batch loss by its size so the epoch figure is a
        # per-sample average even when the last batch is smaller.
        train_loss_sum += loss.item() * tokens.size(0)
        train_count += tokens.size(0)

    # ---- Evaluation ----------------------------------------------------
    lora_model.eval()
    test_loss_sum, test_count = 0.0, 0
    with torch.no_grad():
        for tokens, targets in test_loader:
            tokens, targets = tokens.to(device), targets.to(device)
            outputs = lora_model(input_ids=tokens, labels=targets)
            test_loss_sum += outputs.loss.item() * tokens.size(0)
            test_count += tokens.size(0)

    # Report the two losses separately (previously both fields showed the loss
    # of the last test batch). max(..., 1) guards against an empty loader.
    train_loss = train_loss_sum / max(train_count, 1)
    test_loss = test_loss_sum / max(test_count, 1)
    print(f"Epoch {epoch}\n Train loss: {train_loss}\tTest loss: {test_loss}")

Epoch 0
 Train loss: 0.7002440690994263	Test loss: 0.7002440690994263
Epoch 1
 Train loss: 0.7001743912696838	Test loss: 0.7001743912696838
Epoch 2
 Train loss: 0.7001006603240967	Test loss: 0.7001006603240967
Epoch 3
 Train loss: 0.6999905109405518	Test loss: 0.6999905109405518
Epoch 4
 Train loss: 0.6998594999313354	Test loss: 0.6998594999313354
Epoch 5
 Train loss: 0.6996915340423584	Test loss: 0.6996915340423584
Epoch 6
 Train loss: 0.699545681476593	Test loss: 0.699545681476593
Epoch 7
 Train loss: 0.6993405818939209	Test loss: 0.6993405818939209


KeyboardInterrupt: 

In [None]:
# NOTE(review): `stop` is not defined in any visible cell, so this cell raises
# NameError. Presumably a deliberate guard that halts "Run All" before the
# non-LoRA section below; confirm before removing.
stop

NameError: name 'stop' is not defined

# Without LoRA
3.3G


In [None]:
# get_peft_model() injects the LoRA adapters into the model object it is given
# (and freezes its original weights), so the `model` used for the LoRA run is no
# longer a plain classifier. Reload a clean checkpoint so that this section
# really fine-tunes every weight without LoRA.
model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t30_150M_UR50D", num_labels=2)
model = model.to(device)
optimizer = Adam(model.parameters(), lr=1e-5)

# Training loop (full fine-tuning baseline)
epochs = 10

for epoch in range(epochs):
    # ---- Training pass -------------------------------------------------
    model.train()
    train_loss_sum, train_count = 0.0, 0
    for tokens, targets in train_loader:
        tokens, targets = tokens.to(device), targets.to(device)
        optimizer.zero_grad()
        # The dataset yields input ids only, so no attention mask is passed.
        outputs = model(input_ids=tokens, labels=targets)

        ###### Here is where the magic happens. ###########################
        # The model outputs a loss value, which we use to compute gradients
        # and update the model weights on the backward pass.
        # New_weight = old_weight - learning_rate * gradient

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        ####################################################################

        # Weight each batch loss by its size so the epoch figure is a
        # per-sample average even when the last batch is smaller.
        train_loss_sum += loss.item() * tokens.size(0)
        train_count += tokens.size(0)

    # ---- Evaluation ----------------------------------------------------
    model.eval()
    test_loss_sum, test_count = 0.0, 0
    with torch.no_grad():
        for tokens, targets in test_loader:
            tokens, targets = tokens.to(device), targets.to(device)
            outputs = model(input_ids=tokens, labels=targets)
            test_loss_sum += outputs.loss.item() * tokens.size(0)
            test_count += tokens.size(0)

    # Report the two losses separately (previously both fields showed the loss
    # of the last test batch). max(..., 1) guards against an empty loader.
    train_loss = train_loss_sum / max(train_count, 1)
    test_loss = test_loss_sum / max(test_count, 1)
    print(f"Epoch {epoch}\n Train loss: {train_loss}\tTest loss: {test_loss}")