<a href="https://colab.research.google.com/github/vvithurshan/Antibody_Efficiency_Prediction/blob/main/prot_bert_bfd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**References**

https://github.com/agemagician/ProtTrans/blob/master/Fine-Tuning/ProtBert_BFD_FineTuning_MS.ipynb

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 24.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 70.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
from torch.utils.data import Dataset
import os
import pandas as pd
import requests
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re

In [3]:
# Show the GPU visible to this runtime (fp16 training is enabled further down).
!nvidia-smi

Sat Dec 31 18:05:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P0    28W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Fetch the project repository; the dataset class below reads its CSV files from
# ./Antibody_Efficiency_Prediction/Data_train_test/.
# NOTE(review): the clone is not pinned to a commit, so the data may change between runs.
!git clone https://github.com/vvithurshan/Antibody_Efficiency_Prediction.git

Cloning into 'Antibody_Efficiency_Prediction'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 20 (delta 8), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (20/20), done.


In [5]:
# Hugging Face Hub checkpoint id shared by the tokenizer and the classifier.
model_name = "Rostlab/prot_bert_bfd"

In [6]:
class DeepLocDataset(Dataset):
    """Tokenized sequence dataset with binary 'Eff' / 'Ineff' labels.

    Sequences and labels are read from ``train_df.csv`` or ``test_df.csv``
    inside ``data_dir``. Each item is tokenized on access and returned as a
    dict of tensors (the tokenizer's fields plus ``'labels'``).

    Args:
        split: ``"train"`` loads ``train_df.csv``; any other value loads
            ``test_df.csv``.
        tokenizer_name: Checkpoint name passed to
            ``AutoTokenizer.from_pretrained``.
        max_length: Length every sequence is truncated/padded to.
        data_dir: Folder containing the two CSV files.
    """

    # Class-level so the id -> name mapping is available on every instance
    # without depending on load_dataset() having run first.
    labels_dic = {0: 'Ineff', 1: 'Eff'}

    def __init__(self, split="train", tokenizer_name='Rostlab/prot_bert_bfd', max_length=1024,
                 data_dir='./Antibody_Efficiency_Prediction/Data_train_test/'):
        self.datasetFolderPath = data_dir
        self.trainFilePath = os.path.join(self.datasetFolderPath, 'train_df.csv')
        self.testFilePath = os.path.join(self.datasetFolderPath, 'test_df.csv')
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.max_length = max_length

        # Anything other than "train" falls back to the test file.
        if split == "train":
            self.seqs, self.labels = self.load_dataset(self.trainFilePath)
        else:
            self.seqs, self.labels = self.load_dataset(self.testFilePath)

    def load_dataset(self, path):
        """Read a CSV and return ``(sequences, labels)`` as two parallel lists.

        The first line of the file is skipped and the columns are named
        ``FASTA_com`` and ``IC50``. A row is labelled 1 when ``IC50 == 1`` and
        0 otherwise; labels are plain Python ints.
        """
        df = pd.read_csv(path, names=['FASTA_com', 'IC50'], skiprows=1)
        seq = df['FASTA_com'].tolist()
        label = (df['IC50'] == 1).astype(int).tolist()

        assert len(seq) == len(label)
        return seq, label

    def __len__(self):
        """Number of examples in the loaded split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it as a dict of tensors."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Drop all whitespace, then put one space between characters so the
        # tokenizer receives each residue as its own word.
        seq = " ".join("".join(self.seqs[idx].split()))
        # Replace the letters U, Z, O and B with the placeholder X.
        seq = re.sub(r"[UZOB]", "X", seq)

        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_length)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])

        return sample

In [7]:
# Sequences are capped at 256 tokens only to keep this example fast.
train_dataset = DeepLocDataset(
    split="train",
    tokenizer_name=model_name,
    max_length=256,
)
test_dataset = DeepLocDataset(
    split="test",
    tokenizer_name=model_name,
    max_length=256,
)

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/361 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
def compute_metrics(pred):
    """Compute binary classification metrics for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 reports 0 for an undefined precision/recall (e.g. when one
    # class is never predicted) without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [10]:
def model_init():
    """Return a new sequence-classification model loaded from ``model_name``."""
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
    return classifier

In [11]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=1,   # micro-batch size per device during training
    per_device_eval_batch_size=10,   # batch size for evaluation
    # NOTE(review): the recorded run ends at global_step=220, so a 1000-step
    # warmup never completes and the learning rate never reaches its peak.
    # Left unchanged to keep the recorded results reproducible; consider a
    # smaller value (or warmup_ratio) for new runs.
    warmup_steps=1000,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    # With ~220 optimizer steps in total, the training loss is logged only once
    # (the results table shows "No log" for every epoch but the last).
    logging_steps=200,               # how often to print logs
    do_train=True,                   # perform training
    do_eval=True,                    # perform evaluation
    evaluation_strategy="epoch",     # evaluate after each epoch
    gradient_accumulation_steps=64,  # micro-batches accumulated per optimizer step (effective batch 1 x 64)
    fp16=True,                       # use mixed precision
    fp16_opt_level="O2",             # Apex AMP optimization level: letter "O", not the digit zero
    run_name="ProBert-BFD",          # experiment name
    seed=3                           # seed for experiment reproducibility
)

trainer = Trainer(
    model_init=model_init,                # factory called to build a fresh model to train
    args=training_args,                   # training arguments, defined above
    train_dataset=train_dataset,          # training dataset
    eval_dataset=test_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,      # evaluation metrics
)

trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Rostlab--prot_bert_bfd/snapshots/6c5c8a55a52ff08a664dfd584aa1773f125a0487/config.json
Model config BertConfig {
  "_name_or_path": "Rostlab/prot_bert_bfd",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 40000,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 30,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30
}



Downloading:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--Rostlab--prot_bert_bfd/snapshots/6c5c8a55a52ff08a664dfd584aa1773f125a0487/pytorch_model.bin
Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expec

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.662296,0.644809,0.784053,0.644809,1.0
1,No log,0.644637,0.644809,0.784053,0.644809,1.0
2,No log,0.586867,0.644809,0.784053,0.644809,1.0
3,No log,0.535749,0.84153,0.888889,0.811189,0.983051
4,No log,0.481742,0.844262,0.891013,0.811847,0.987288
5,No log,0.460149,0.844262,0.891013,0.811847,0.987288
6,No log,0.427883,0.860656,0.900585,0.833935,0.978814
7,No log,0.395684,0.849727,0.894027,0.819788,0.983051
8,No log,0.371929,0.863388,0.903101,0.832143,0.987288
9,0.556100,0.423364,0.844262,0.885312,0.842912,0.932203


***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=220, training_loss=0.5460008187727494, metrics={'train_runtime': 1712.6736, 'train_samples_per_second': 8.554, 'train_steps_per_second': 0.128, 'total_flos': 8493891870526464.0, 'train_loss': 0.5460008187727494, 'epoch': 9.96})

In [12]:
# Run the fine-tuned model over the test split and unpack the result triple.
predictions, label_ids, metrics = trainer.predict(test_dataset=test_dataset)


***** Running Prediction *****
  Num examples = 366
  Batch size = 10


In [13]:
idx = 0
# Index the dataset once: every __getitem__ call re-tokenizes the sequence.
sample = test_dataset[idx]
sample_ground_truth = test_dataset.labels_dic[int(sample['labels'])]
# Cast the argmax to a plain int so the dict lookup does not rely on a numpy scalar.
sample_predictions = test_dataset.labels_dic[int(np.argmax(predictions[idx], axis=0))]
sample_sequence = test_dataset.tokenizer.decode(sample['input_ids'], skip_special_tokens=True)

In [14]:
# Show the decoded sequence next to its true and predicted labels.
print(
    "Sequence: {} \nGround Truth is: {}\nprediction is: {}".format(
        sample_sequence, sample_ground_truth, sample_predictions
    )
)

Sequence: A L A L H F Y P G V Y D D Y G P P I A R G V N A L D K W N 
Ground Truth is: Ineff
prediction is: Ineff


In [15]:
# Optional: uncomment to save the fine-tuned model to the 'models/' directory.
# trainer.save_model('models/')