# Fine-tuning the pRoBERTa model for PPI prediction


## 1. Process Input Data
The input data are two datasets in csv format containing experimentally confirmed protein-protein interactions which will be called the
positive dataset and random protein sequences pairs that will be called the negative datasets. These datasets should have
the following columns ```HUMAN_SEQ, VIRUS_SEQ``` designating the columns containing the sequences from the humans and
viruses respectively.

The datasets are labeled positive and negative then concatenated to one dataset.

### 1.1 Alternative
The input dataset can be an excel file containing a positive and negative sheet.

In [None]:
import pandas as pd
def from_csv(positive_path, negative_path):
    positive_dataframe = pd.read_csv(positive_path, usecols=['HUMAN_SEQ', 'VIRUS_SEQ'])
    positive_dataframe['label'] = 'positive'
    negative_dataframe = pd.read_csv(negative_path, usecols=['HUMAN_SEQ', 'VIRUS_SEQ'])
    negative_dataframe['label'] = 'negative'
    output_dataframe = pd.concat([positive_dataframe, negative_dataframe])
    output_dataframe = output_dataframe.rename(columns={'HUMAN_SEQ':'from', 'VIRUS_SEQ':'to'})
    return output_dataframe

def from_excel(filepath, positive_sheet, negative_sheet):
    positive_dataframe = pd.read_excel(filepath, sheet_name=positive_sheet)
    negative_dataframe = pd.read_excel(filepath, sheet_name=negative_sheet)
    positive_dataframe['label'] = 'positive'
    negative_dataframe['label'] = 'negative'
    output_dataframe = pd.concat([positive_dataframe, negative_dataframe])
    output_dataframe = output_dataframe.rename(columns={'HUMAN_SEQ': 'from', 'VIRUS_SEQ': 'to'})
    return output_dataframe

In [None]:
positive = 'Data/h1n1_data/training_set/H1N1_human_pos_training.csv'
negative = 'Data/h1n1_data/training_set/H1N1_human_neg_training.csv'

df = from_csv(positive, negative)

In [None]:
df.head()

## 2. Tokenize the protein sequences using a pretrained BPE model
The model was trained previously on the uniprot database of protein sequences based on the BPE algorithm using SentencePiece
The HUMAN_SEQ and VIRUS_SEQ columns was changed to ```from``` and ```to```.

The tokenized sequences were saved in a csv file for future use or as a variable for continued processing.

In [None]:
import sentencepiece as spm
model_path = 'BPE_model/m_reviewed.model'
model = spm.SentencePieceProcessor()
model.load(model_path)

In [None]:
def tokenize(dataframe, model):
    dfv = dataframe[['from', 'to', 'label']].values
    out = []
    for row in dfv:
        out.append([" ".join(model.encode_as_pieces(row[0])), " ".join(model.encode_as_pieces(row[1])),row[2]])
    print(out[101])
    out_df = pd.DataFrame(out, columns=['from', 'to', 'label'])
    print(out_df.head())
    out_df.to_csv('Data/H1N1_interact_tokenized_full.csv', index=False)
    return out_df

tokenized_df = tokenize(df, model)

In [None]:
tokenized_df.head()

## 3. Shuffling and Dataset Split
The tokenized dataframe is shuffled using ```sample(frac=1)``` which samples the dataset and returns 100% of the dataset
shuffled.

The dataframe is then sliced into 80%, 10% and 10% slices, for train, validation and test sets.


In [None]:
import math

def shuffle_dataframe(out_put_df):
    out_put_df_shuffled = out_put_df.sample(frac=1)
    train = math.ceil(len(out_put_df_shuffled) * 0.8)
    test = math.ceil(len(out_put_df_shuffled) * 0.8) + math.ceil(len(out_put_df_shuffled) * 0.1)

    shuffled_80_train = out_put_df_shuffled[0:train]
    shuffled_10_valid = out_put_df_shuffled[train:test]
    shuffled_10_test = out_put_df_shuffled[test:]

    return shuffled_80_train, shuffled_10_valid, shuffled_10_test

In [None]:
tokenized_shuffled_80_train, tokenized_shuffled_10_valid, tokenized_shuffled_10_test = shuffle_dataframe(tokenized_df)


## 4. Export the split datasets
The each column from the split datasets is extracted and converted to lists. These are then written to the disk.

In [1]:
# from sequences

def splits_to_list():
    from_to_label_list = ['from', 'to', 'label']
    for item in from_to_label_list:
        if item == 'from':
            from_trained = tokenized_shuffled_80_train[item].tolist()
            from_valid = tokenized_shuffled_10_valid[item].tolist()
            from_test = tokenized_shuffled_10_test[item].tolist()
            from_dict = {'from_train':from_trained, 'from_valid':from_valid, 'from_test':from_test}
            for k,v in from_dict.items():
                with open(f'split_tokenized/from/{k}.txt', 'w') as f:
                    for it in v:
                        f.write('%s\n' % it)
        if item == 'to':
            to_trained = tokenized_shuffled_80_train['to'].tolist()
            to_valid  = tokenized_shuffled_10_valid['to'].tolist()
            to_test = tokenized_shuffled_10_test['to'].tolist()
            to_dict = {'to_train':to_trained, 'to_valid':to_valid,'to_test':to_test}
            for k,v in to_dict.items():
                with open(f'split_tokenized/to/{k}.txt', 'w') as f:
                    for it in v:
                        f.write('%s\n' % it)
        if item == 'label':
            label_trained =tokenized_shuffled_80_train['label'].tolist()
            label_valid = tokenized_shuffled_10_valid['label'].tolist()
            label_test = tokenized_shuffled_10_test['label'].tolist()
            label_dict = {'label_train':label_trained, 'label_valid':label_valid,'label_test':label_test}
            for k,v in label_dict.items():
                with open(f'split_tokenized/label/{k}.txt', 'w') as f:
                    for it in v:
                        f.write('%s\n' % it)

SyntaxError: invalid syntax (<ipython-input-1-3e30db170a2b>, line 24)

## 6. Fine-tune the ProBERTa model to predict PPIs
As a test I used the pretrained model to fine tune on human-human PPI data. That model I fine-tuned I used to further fine-tune
a human-H1N1 ppi predictor.

Arguments:
ppi - model file prefix --PREFIX

int (1) - number of gpus --NUM_GPUS

ppi_finetune - destination directory --OUTPUT_DIR

split_binarized - input directory --DATA_DIR

768 - Dimension of embedding generated by the encoders --ENCODER_EMBED_DIM

5 - Number of encoder layers in the model --ENCODER_LAYERS

125000 - Maximum number of updates during training --TOTAL_UPDATES
3125 - Total number of Learning Rate warm-up updates during training --WARMUP_UPDATES
0.0025 - Peak learning rate for training --PEAK_LEARNING_RATE
32 - Maximum number of sequences in each batch --MAX_SENTENCES
64 - Updates the model every UPDATE_FREQ batches --UPDATE_FREQ
3  - Early stop training if valid performance does not improve for PATIENCS consecutive validation runs --PATIENCE
checkpoint.pt - Path to pretrained model checkpoint --PRETRAIN_CHECKPOINT
no- Whether to resume training from previous finetuned model checkpoints -- RESUME_TRAINING
True - To use [cls] token

These arguments are used to initiate ```fairseq-train``` with the following arguments:

```
if [ "$RESUME" = "no" ]; then
    fairseq-train --fp16 --fp16-no-flatten-grads $DATA_DIR \
        --max-positions $MAX_POSITIONS --max-sentences $MAX_SENTENCES \
        --arch roberta_base --task sentence_prediction \
        --truncate-sequence --use-cls-token $USE_CLS \
	      --bpe sentencepiece \
        --classification-head-name protein_interaction_prediction \
        --restore-file "$ROBERTA_PATH" --reset-optimizer --reset-dataloader --reset-meters \
        --init-token 0 --separator-token 2 \
        --criterion sentence_prediction --num-classes $NUM_CLASSES \
        --optimizer lamb \
        --lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \
        --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
        --update-freq $UPDATE_FREQ \
        --max-update $TOTAL_UPDATES \
        --encoder-embed-dim $ENCODER_EMBED_DIM --encoder-layers $ENCODER_LAYERS \
        --save-dir "$CHECKPOINT_DIR" --save-interval 1 --save-interval-updates 100 --keep-interval-updates 5 \
        --distributed-world-size $NUM_GPUS --ddp-backend=no_c10d \
        --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
        --patience $PATIENCE \
        --log-format simple --log-interval 1000 2>&1 | tee -a "$LOG_FILE"
else
    fairseq-train --fp16 --fp16-no-flatten-grads $DATA_DIR \
        --max-positions $MAX_POSITIONS --max-sentences $MAX_SENTENCES \
        --arch roberta_base --task sentence_prediction \
        --truncate-sequence --use-cls-token $USE_CLS \
        --bpe sentencepiece \
        --classification-head-name protein_interaction_prediction \
        --init-token 0 --separator-token 2 \
        --criterion sentence_prediction --num-classes $NUM_CLASSES \
        --optimizer lamb \
        --lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \
        --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
        --update-freq $UPDATE_FREQ \
        --max-update $TOTAL_UPDATES \
        --encoder-embed-dim $ENCODER_EMBED_DIM --encoder-layers $ENCODER_LAYERS \
        --save-dir "$CHECKPOINT_DIR" --save-interval 1 --save-interval-updates 100 --keep-interval-updates 5 \
        --distributed-world-size $NUM_GPUS --ddp-backend=no_c10d \
        --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
        --patience $PATIENCE \
        --log-format simple --log-interval 1000 2>&1 | tee -a "$LOG_FILE"

```

In [None]:
%%bash
pRoBERTa_finetune_ppi.sh ppi 1 ppi_finetune split_binarized/ 768 5 12500 312 0.0025 32 64 2 3 ppi_prediction/ppi.DIM_768.LAYERS_5.UPDATES_12500.WARMUP_312.LR_0.0025.BATCH_2048.PATIENCE_3/checkpoints/checkpoint_best.pt no True

## 7. Evaluate the model
We use the test data to evaluate the effectiveness of the model.

In [None]:
from fairseq.models.roberta import RobertaModel
from fairseq.data.data_utils import collate_tokens
from scipy.special import softmax
import sys
import numpy as np
import torch

In [None]:
# Path to input tokenized data: from, to, label


data_path = ''
binarized_path = ''
output_path = ''
model_folder = ''
classificatin_head = ''
batch_size = int('')

has_cuda = torch.cuda.device_count() > 0

In [None]:
from_col = 0
to_col = 1
label_col = 2
tuple_col = 3
softmax_col = 4
pred_col = 5

In [None]:
data = pd.read_csv(data_path, header=None)
data[label_col] = data[label_col].str.replace(" ", "_")

In [None]:
model = RobertaModel.from_pretrained(model_folder, "checkpoint_best.pt", binarized_path, bpe=None)
model.eval()

In [None]:
if (has_cuda):
    model.cuda()

In [None]:
split_num = int(len(data)/batch_size)
batch_data = np.array_split(data, split_num)
print(f"Total batches:{len(batch_data)}")
