In [48]:
import sys

from datasets import Dataset, DatasetDict
import evaluate
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, EsmForSequenceClassification, logging, \
    Trainer, TrainingArguments
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import sklearn

In [49]:
def precision_recall_curve(y_true, pred_scores, thresholds):
    precisions = []
    recalls = []

    for threshold in thresholds:
        y_pred = [1 if score >= threshold else 0 for score in pred_scores]
        # print(y_pred)
        precision = sklearn.metrics.precision_score(y_true=y_true, y_pred=y_pred)
        recall = sklearn.metrics.recall_score(y_true=y_true, y_pred=y_pred)

        precisions.append(precision)
        recalls.append(recall)

    return precisions, recalls

In [50]:
def configure_tester(tokenizer, tokenized_dataset, model_name, sep, checkpoint=r'tmp\stapler_esm2_t6_8M_UR50D_0_epv1_aabb\checkpoint-18834'):    
    model = EsmForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    model.resize_token_embeddings(len(tokenizer))

    # Initialize wandb
    # wandb.init(project='stapler_esm', name=f'{model_name}_{sep}_ep_pc_v1')

    # Configure training arguments
    training_args = TrainingArguments(output_dir=f'tmp/stapler_{model_name}_{sep}_epv1',
                                      evaluation_strategy='epoch',
                                      per_device_train_batch_size=64,
                                      per_device_eval_batch_size=64,
                                      num_train_epochs=100,
                                      logging_strategy='epoch',
                                      learning_rate=0.000001,
                                      save_total_limit=1,
                                      
                                      
                                      load_best_model_at_end=True,
                                      metric_for_best_model="accuracy",
                                      save_strategy='epoch',)

    # Configure metrics
    metric = evaluate.load('accuracy')
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    # Instantiate Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=tokenized_dataset['train'],
                      eval_dataset=tokenized_dataset['test'],
                      compute_metrics=compute_metrics)
    return trainer, model


# def main(model_name, sep):
#     """
#     Entry point of the program.

#     """
model_name, sep='esm2_t6_8M_UR50D', '0'
# Load data
print('No. of cuda devices:', torch.cuda.device_count())
df = pd.read_csv('vdjdb_external_negatives_data.csv')
# test_df = pd.read_csv('train-set_full-seq.csv')[0.7*23544:]
df['label_true_pair']=df['label_true_pair'].astype('int')

def insert_1_after_characters(s):
    return '1'.join(s) + '1'
# train_df['seq_2'] = train_df['seq_2'].apply(insert_1_after_characters)
# test_df['seq_2'] = test_df['seq_2'].apply(insert_1_after_characters)

### comment for 1 vocab
# df['epitope_aa'] = df['epitope_aa'].apply(insert_1_after_characters)


train_df, test_df = train_test_split(df, test_size=0.3)
# print((train_df).head())
# Format data
test_df=df
train_df = pd.DataFrame({'seq': train_df['cdr3_alpha_aa'] + sep + train_df['epitope_aa']+ sep +train_df['cdr3_beta_aa'],
                            'label': train_df['label_true_pair']})
test_df = pd.DataFrame({'seq': test_df['cdr3_alpha_aa'] + sep + test_df['epitope_aa']+ sep +test_df['cdr3_beta_aa'],
                        'label': test_df['label_true_pair']})

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(test_df)
})
print('loading model')
# Load tokenizer and add custom tokens
tokenizer = AutoTokenizer.from_pretrained(f'facebook/{model_name}')
tokenizer.add_tokens([sep])
epitope_vocab = ["A1", "C1", "D1", "E1", "F1", "G1", "H1", "I1", "K1", "L1", "M1", "N1", "P1", "Q1", "R1", "S1", "T1", "V1", "W1", "Y1"]

###########  comment for 1 vocab
# tokenizer.add_tokens(epitope_vocab)

# Tokenize sequences
def tokenize_function(dataset):
    return tokenizer(dataset['seq'], return_tensors='pt', max_length=len(tokenizer), padding='max_length', truncation=True)
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)


print('loading trainer')
# Configure Trainer
trainer, model = configure_tester(tokenizer, tokenized_dataset, model_name, sep)
iid=torch.tensor(tokenized_dataset['test']['input_ids']).to('cuda')
atm=torch.tensor(tokenized_dataset['test']['attention_mask']).to('cuda')
with torch.no_grad():
    out=model(iid,atm)

preds = torch.nn.functional.sigmoid(out.logits)
predictions=(torch.argmax(preds, dim=1))
print(classification_report(y_pred=predictions.tolist(), y_true=(tokenized_dataset['test']['label'])))

thresholds=np.arange(start=0.2, stop=0.7, step=0.05)


# pred_scores = predictions['scores']
precisions, recalls = precision_recall_curve(y_true=(tokenized_dataset['test']['label']),
                                            pred_scores=preds[:,0].tolist(),
                                            thresholds=thresholds)
precisions.append(1)
recalls.append(0)

precisions = np.array(precisions)
recalls = np.array(recalls)

AP = np.sum((recalls[:-1] - recalls[1:]) * precisions[:-1])
print('Avg mean precision: ', AP)

No. of cuda devices: 1
loading model


Map: 100%|██████████| 2406/2406 [00:00<00:00, 5882.66 examples/s]
Map: 100%|██████████| 3438/3438 [00:00<00:00, 6390.25 examples/s]
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 34. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


loading trainer
              precision    recall  f1-score   support

           0       0.85      0.99      0.91      2865
           1       0.66      0.13      0.22       573

    accuracy                           0.84      3438
   macro avg       0.75      0.56      0.57      3438
weighted avg       0.82      0.84      0.80      3438

Avg mean precision:  0.14135146273177523
