In [None]:
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
from torch.utils.data import Dataset
import os
import pandas as pd
import requests
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re

In [2]:
# Hugging Face Hub identifier of the pretrained checkpoint; used below for both
# the tokenizer (DeepLocDataset) and the classifier (model_init).
model_name = 'Rostlab/prot_bert_bfd'

In [55]:
import pandas as pd

# Raw table; only the sequence and IC50 columns are carried forward.
dataset = pd.read_csv("../Fhalab/virus.csv")

# Drop incomplete rows, then binarise IC50 in one vectorised step:
# values <= 10 become 1, values > 10 become 0 (column dtype is kept as-is).
df = (
    dataset[['FASTA_com', 'IC50']]
    .dropna()
    .assign(IC50=lambda frame: (frame['IC50'] <= 10).astype(frame['IC50'].dtype))
)

In [3]:
# Reproducible 80/20 split: sample the training rows with a fixed seed, then
# keep every row whose index label was not sampled as the held-out test set.
train_df = df.sample(frac=0.8, random_state=0)
in_train = df.index.isin(train_df.index)
test_df = df[~in_train]


In [7]:
# Persist both splits inside 'Data_train_test/', which is the folder every
# reader in this notebook loads them from. Writing to the working directory
# (as before) left those readers pointing at files this cell never produced.
split_dir = 'Data_train_test'
os.makedirs(split_dir, exist_ok=True)

# index=False: the readers declare exactly two column names
# (FASTA_com, IC50), so the file should hold exactly those two columns.
train_df.to_csv(os.path.join(split_dir, 'train_df.csv'), index=False)
test_df.to_csv(os.path.join(split_dir, 'test_df.csv'), index=False)

In [18]:
# Re-read the saved test split (header row skipped, columns named explicitly)
# and pull sequences / labels out as plain lists for a quick look.
data = pd.read_csv(
    "./Data_train_test/test_df.csv",
    names=['FASTA_com', 'IC50'],
    skiprows=1,
)
x, y = (list(data[column]) for column in ('FASTA_com', 'IC50'))

In [24]:
# Spot-check the first label read back from the saved test split.
y[0]

0.0

In [67]:
class DeepLocDataset(Dataset):
    """Protein sequences paired with binary IC50 labels, tokenised on access.

    The train/test CSV files are read from ``Data_train_test/``. Each item is
    a dict of tensors produced by the tokenizer plus a ``labels`` entry, which
    is the shape of sample the Hugging Face ``Trainer`` is given in this
    notebook.

    Args:
        split: ``"train"`` loads ``train_df.csv``; any other value loads
            ``test_df.csv``.
        tokenizer_name: Checkpoint name passed to
            ``AutoTokenizer.from_pretrained``.
        max_length: Every sequence is truncated / padded to this many tokens.

    Attributes:
        seqs: Raw sequence strings, one per example.
        labels: Integer class labels (0 or 1), aligned with ``seqs``.
        labels_dic: Mapping from integer label to a readable class name.
    """

    # Defined on the class so it is available on every instance, not only
    # after load_dataset() has run.
    labels_dic = {0: 'Ineff', 1: 'Eff'}

    def __init__(self, split="train", tokenizer_name='Rostlab/prot_bert_bfd', max_length=1024):
        self.datasetFolderPath = 'Data_train_test/'
        self.trainFilePath = os.path.join(self.datasetFolderPath, 'train_df.csv')
        self.testFilePath = os.path.join(self.datasetFolderPath, 'test_df.csv')
        # do_lower_case=False: amino-acid letters are upper-case tokens.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)

        if split == "train":
            self.seqs, self.labels = self.load_dataset(self.trainFilePath)
        else:
            self.seqs, self.labels = self.load_dataset(self.testFilePath)

        self.max_length = max_length

    def load_dataset(self, path):
        """Read one split from ``path``.

        The file's own header row is skipped and the columns are named
        ``FASTA_com`` / ``IC50``. A row is labelled 1 when its ``IC50`` value
        equals 1 and 0 otherwise.

        Returns:
            ``(sequences, labels)`` as two equally long lists; the labels are
            plain Python ints.
        """
        df = pd.read_csv(path, names=['FASTA_com', 'IC50'], skiprows=1)
        seq = list(df['FASTA_com'])
        df['labels'] = np.where(df['IC50'] == 1, 1, 0)
        # .tolist() yields Python ints instead of NumPy scalars, so the label
        # type does not depend on NumPy's platform-default integer width.
        label = df['labels'].tolist()

        assert len(seq) == len(label)
        return seq, label

    def __len__(self):
        """Number of examples in the loaded split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenise example ``idx`` and return it as a dict of tensors."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Strip all whitespace, then put one space between residues so that
        # each amino acid is passed to the tokenizer as a separate word.
        seq = " ".join("".join(self.seqs[idx].split()))
        # Replace the residue codes U, Z, O and B with X.
        seq = re.sub(r"[UZOB]", "X", seq)

        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_length)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        # Pin the dtype to long (int64) explicitly rather than inheriting
        # whatever integer type the stored label happens to have.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return sample

In [71]:
# Build the train and test datasets with the same tokenizer checkpoint.
# max_length is only capped (256) to speed up the example.
train_dataset, test_dataset = (
    DeepLocDataset(split=split_name, tokenizer_name=model_name, max_length=256)
    for split_name in ("train", "test")
)

loading configuration file config.json from cache at /home/nmrbox/0014/vvarenthirarajah/.cache/huggingface/hub/models--Rostlab--prot_bert_bfd/snapshots/6c5c8a55a52ff08a664dfd584aa1773f125a0487/config.json
Model config BertConfig {
  "_name_or_path": "Rostlab/prot_bert_bfd",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 40000,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 30,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30
}

loading file vocab.txt from cache at /home/nmrbox/0014/vvarenthirarajah/.cache/huggingface/hub/models--Rostlab--prot_bert_bfd/snapshots/6c5c8a55a52ff08a664dfd584aa17

In [72]:
def compute_metrics(pred):
    """Turn one evaluation pass into a dict of binary-classification scores.

    ``pred`` must expose ``label_ids`` (the true classes) and ``predictions``
    (per-class scores); the predicted class is the arg-max over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # Tuple layout is (precision, recall, f1, support); support is not used.
    prf = precision_recall_fscore_support(y_true, y_hat, average='binary')

    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [61]:
def model_init():
    """Return a freshly loaded sequence-classification model for ``model_name``.

    Passed to ``Trainer`` as a factory, so the trainer creates the model itself.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
    return classifier

In [76]:
# Fine-tuning configuration for the binary IC50 classifier.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training
    per_device_eval_batch_size=10,   # batch size for evaluation
    # NOTE(review): the recorded run below ended at global_step=220, so a
    # 1000-step warmup never completes and the learning rate never reaches its
    # configured peak. Left unchanged because altering it changes the training
    # results; consider a value well below the total number of optimizer steps.
    warmup_steps=1000,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    # NOTE(review): with ~22 optimizer steps per epoch in the recorded run,
    # logging every 200 steps is why the training-loss column shows "No log"
    # until the last epoch.
    logging_steps=200,               # how often to print logs
    do_train=True,                   # perform training
    do_eval=True,                    # perform evaluation
    evaluation_strategy="epoch",     # evaluate after each epoch
    gradient_accumulation_steps=64,  # batches accumulated per optimizer step (effective train batch = 1 x 64 per device)
    fp16=True,                       # use mixed precision
    fp16_opt_level="O2",             # Apex AMP level: letter "O" + digit (was "02", which is not a valid level)
    run_name="ProBert-BFD-MS",       # experiment name
    seed=3                           # seed for experiment reproducibility 3x3
)

trainer = Trainer(
    model_init=model_init,                # factory that returns a fresh model to be trained
    args=training_args,                   # training arguments, defined above
    train_dataset=train_dataset,          # training dataset
    eval_dataset=test_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,      # evaluation metrics
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at /home/nmrbox/0014/vvarenthirarajah/.cache/huggingface/hub/models--Rostlab--prot_bert_bfd/snapshots/6c5c8a55a52ff08a664dfd584aa1773f125a0487/config.json
Model config BertConfig {
  "_name_or_path": "Rostlab/prot_bert_bfd",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 40000,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 30,
  "pad_token_id": 0,
  "position_e

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.662302,0.644809,0.784053,0.644809,1.0
1,No log,0.6483,0.644809,0.784053,0.644809,1.0
2,No log,0.607038,0.644809,0.784053,0.644809,1.0
3,No log,0.556467,0.778689,0.852459,0.747604,0.991525
4,No log,0.525659,0.825137,0.879699,0.790541,0.991525
5,No log,0.480787,0.838798,0.887619,0.806228,0.987288
6,No log,0.408345,0.86612,0.904854,0.835125,0.987288
7,No log,0.400374,0.863388,0.903101,0.832143,0.987288
8,No log,0.419991,0.836066,0.886364,0.80137,0.991525
9,0.567000,0.388932,0.857923,0.898438,0.833333,0.974576


***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10
  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 366
  Batch size = 10


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=220, training_loss=0.5547770370136608, metrics={'train_runtime': 2382.3872, 'train_samples_per_second': 6.149, 'train_steps_per_second': 0.092, 'total_flos': 8493891870526464.0, 'train_loss': 0.5547770370136608, 'epoch': 9.96})

In [77]:
# Shell escape: print the NVIDIA driver's current GPU status table.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Thu Dec 29 10:42:49 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 495.29.05    CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:03:00.0 Off |                  N/A |
| N/A   81C    P0    35W /  70W |   9593MiB / 15109MiB |      0%      Default |
|                               |            

In [63]:
# Load the TensorBoard notebook extension and open it on `logs`, the same
# directory that TrainingArguments was given as logging_dir ('./logs').
%load_ext tensorboard
%tensorboard --logdir logs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [78]:
# Run inference on the test split and pull the three result fields out by name.
prediction_output = trainer.predict(test_dataset)
predictions = prediction_output.predictions
label_ids = prediction_output.label_ids
metrics = prediction_output.metrics


***** Running Prediction *****
  Num examples = 366
  Batch size = 10


In [79]:
# Inspect one test example: readable true label, readable predicted label,
# and the sequence decoded back from its token ids.
idx = 0
sample = test_dataset[idx]
label_names = test_dataset.labels_dic

sample_ground_truth = label_names[int(sample['labels'])]
sample_predictions = label_names[np.argmax(predictions[idx], axis=0)]
sample_sequence = test_dataset.tokenizer.decode(sample['input_ids'], skip_special_tokens=True)

In [80]:
# Report the inspected example: sequence, true class, predicted class.
report_template = "Sequence: {} \nGround Truth is: {}\nprediction is: {}"
report_fields = (sample_sequence, sample_ground_truth, sample_predictions)
print(report_template.format(*report_fields))

Sequence: A L A L H F Y P G V Y D D Y G P P I A R G V N A L D K W N 
Ground Truth is: Ineff
prediction is: Ineff


In [81]:
# Save the fine-tuned model (configuration and weights) into `models/`.
trainer.save_model('models/')

Saving model checkpoint to models/
Configuration saved in models/config.json
Model weights saved in models/pytorch_model.bin
