In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Loading the Dataset

In [2]:
# Residue-level IAV dataset whose sequences will be embedded; it ships as an Excel workbook.
# For a CSV export of the same table, load it with
# pd.read_csv('IAV_Interaction_Region_Dataset.csv') instead.
dataset_path = 'IAV_Interaction_Region_Dataset.xlsx'
df = pd.read_excel(dataset_path)

In [3]:
# Discard columns in which every value is missing (a no-op when the sheet has none).
df = df.dropna(axis='columns', how='all')

In [4]:
# Preview the first five rows to confirm the expected columns were loaded.
df.head(n=5)

Unnamed: 0,Residue_Index,PDBID,Amino_Acid,Interaction_Region
0,1,PB2,M,0
1,2,PB2,E,0
2,3,PB2,R,0
3,4,PB2,I,0
4,5,PB2,K,0


# Load ProtBERT tokenizer and model

---



In [5]:
import pandas as pd
import torch
import re
from transformers import BertTokenizer, BertModel

# Hugging Face Hub identifier shared by the tokenizer and the encoder.
protbert_checkpoint = "Rostlab/prot_bert"

# Keep the residue letters' case as given (do_lower_case=False).
tokenizer = BertTokenizer.from_pretrained(protbert_checkpoint, do_lower_case=False)
model = BertModel.from_pretrained(protbert_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

In [6]:
def extract_and_save_embeddings(protein_name, sequence, filename):
    """Embed one protein with ProtBERT and append its per-residue vectors to a CSV.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    protein_name : str
        Label written to the ``Protein`` column of every row.
    sequence : str
        Amino-acid sequence in one-letter code. Whitespace is ignored, letters
        are upper-cased, and the rare residues U, Z, O and B are replaced by X.
    filename : str
        Output prefix; rows are appended to ``"<filename>_protein_embeddings.csv"``.

    Returns
    -------
    torch.Tensor
        One row per residue (the special first and last tokens are stripped),
        one column per hidden dimension, on the CPU.

    Raises
    ------
    ValueError
        If the number of embedding rows differs from the number of residues.

    Notes
    -----
    The CSV is opened in append mode, so calling this twice for the same
    protein and prefix writes that protein's rows twice.
    """
    output_file = f"{filename}_protein_embeddings.csv"

    # Normalise the input: strip every kind of whitespace (a stray newline would
    # otherwise desynchronise residues and tokens) and upper-case it, since the
    # tokenizer is loaded with do_lower_case=False.
    cleaned = re.sub(r"\s+", "", sequence).upper()
    cleaned = re.sub(r"[UZOB]", "X", cleaned)  # Replace rare amino acids
    residues = list(cleaned)

    # ProtBERT expects space-separated amino acids.
    spaced = " ".join(residues)

    # Tokenize and move the tensors to wherever the model lives (CPU or GPU).
    device = next(model.parameters()).device
    encoded_input = tokenizer(spaced, return_tensors='pt')
    encoded_input = {key: value.to(device) for key, value in encoded_input.items()}

    with torch.no_grad():
        output = model(**encoded_input)

    # Last hidden states without the leading CLS and trailing SEP rows,
    # brought back to the CPU so they can be converted to NumPy.
    embeddings = output.last_hidden_state.squeeze(0)[1:-1].cpu()

    if embeddings.shape[0] != len(residues):
        raise ValueError(
            f"ProtBERT returned {embeddings.shape[0]} residue embeddings for "
            f"{protein_name}, but the sequence has {len(residues)} residues"
        )

    embedding_table = pd.DataFrame(
        embeddings.numpy(),
        columns=[f"dim_{i}" for i in range(embeddings.shape[1])],
    )
    embedding_table.insert(0, "Residue", residues)
    embedding_table.insert(0, "Protein", protein_name)

    # Write the header only when starting a new (or still empty) file.
    needs_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    embedding_table.to_csv(output_file, index=False, mode='a', header=needs_header)
    print(f"Embeddings for {protein_name} saved to {output_file}")

    return embeddings

# Group amino acids into full protein sequences.

In [7]:
# Concatenate each protein's one-letter residues into a single string, keyed by PDBID.
residues_by_protein = df.groupby('PDBID')['Amino_Acid']
protein_sequences = residues_by_protein.apply(''.join).to_dict()

In [8]:
# Display the PDBID -> sequence mapping built by the groupby above.
protein_sequences

{'M1': 'MSLLTEVETYVLSIIPSGPLKAEIAQRLESVFAGKNTDLEALMEWLKTRPILSPLTKGILGFVFTLTVPSERGLQRRRFIQNALNGNGDPNNMDRAVKLYKKLKREITFHGAKEVSLSYSTGALASCMGLIYNRMGTVTTEAAFGLVCATCEQIADSQHRSHRQMATTTNPLIRHENRMVLASTTAKAMEQVAGSSEQAAEAMEVANKTRQMVHAMRTIGTHPSSSAGLRDDLLENLQAYQKRMGVQMQRFK',
 'NEP': 'MDSNTMSSFQDILMRMSKQLGSSSEDLNGMVTRFESLKIYRDSLGEAVMRMGDLHYLQSRNEKWREQLGQKFEEIRWLIEEMRHRLKATENSFEQITFMQALQLLLEVEQEIRAFSFQLI',
 'NP': 'MASQGTKRSYEQMETGGERQDTTEIRASVGRMIGGIGRFYIQMCTELKLSDYDGRLIQNSITIERMVLSAFDERRNKYLEEHPSAGKDPKKTGGPIYRRIDGKWTRELILYDKEEIRRVWRQANNGEDATAGLTHIMIWHSNLNDATYQRTRALVRTGMDPRMCSLMQGSTLPRRSGAAGAAVKGVGTIAMELIRMIKRGINDRNFWRGENGRRTRVAYERMCNILKGKFQTAAQRAMMDQVRESRNPGNAEIEDLIFLARSALILRGSVAHKSCLPACVYGLAVASGHDFEREGYSLVGIDPFKLLQNSQVVSLMRPNENPAHKSQLVWMACHSAAFEDLRVSSFIRGKKVIPRGKLSTRGVQIASNENVETMDSNTLELRSRYWAIRTRSGGNTNQQKASAGQISVQPTFSVQRNLPFERATVMAAFSGNNEGRTSDMRTEVIRMMESAKPEDLSFQGRGVFELSDEKATNPIVPSFDMSNEGSYFFGDNAEEYDN',
 'NS1': 'MDSNTMSSFQVDCFLWHIRKRFADNGLGDAPFLDRLRRDQKSLKGRGNTLGLDIETATLVGKQIVEWILKEESSETLRMTIASVPTS

# Saving the protein embeddings in the order the proteins appear in the Excel sheet

In [9]:
# Protein identifiers in order of first appearance; the retained index labels
# are the row positions at which each protein starts in df.
ordered_proteins = df['PDBID'].drop_duplicates(keep='first')
ordered_proteins

Unnamed: 0,PDBID
0,PB2
759,PB1
1516,PA
2232,NP
2730,M1
2982,NS1
3202,NEP


### Choose the output file prefix (embeddings are written to `<prefix>_protein_embeddings.csv`)

In [10]:
# Prefix of the output CSV: extract_and_save_embeddings appends to
# "<filename>_protein_embeddings.csv".
filename = "IAV"

# Every call appends to the same file, so output left over from an earlier run
# would make this cell duplicate each protein's rows. Start from a clean file.
stale_output = f"{filename}_protein_embeddings.csv"
if os.path.exists(stale_output):
    os.remove(stale_output)

# Embed the proteins one at a time, in the order they appear in the sheet.
for protein in ordered_proteins:
    residues = df.loc[df['PDBID'] == protein, 'Amino_Acid']
    sequence = ''.join(residues.tolist())
    extract_and_save_embeddings(protein, sequence, filename)

Embeddings for PB2 saved to IAV_protein_embeddings.csv
Embeddings for PB1 saved to IAV_protein_embeddings.csv
Embeddings for PA saved to IAV_protein_embeddings.csv
Embeddings for NP saved to IAV_protein_embeddings.csv
Embeddings for M1 saved to IAV_protein_embeddings.csv
Embeddings for NS1 saved to IAV_protein_embeddings.csv
Embeddings for NEP saved to IAV_protein_embeddings.csv
