In [1]:
import os
import time
import os.path
import argparse
import numpy as np
from tqdm import tqdm

In [2]:
from lxml import etree
from pyfaidx import Faidx
import pandas as pd
from IPython.display import display

In [3]:
##  @brief  :   Keras & TF Libraries
from tensorflow import keras
import tensorflow as tf
from tensorflow import keras
from keras import backend as K

2023-11-30 22:33:25.511312: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
def read_list(file_name):
    """
    Load a text file and return its lines as a list.
    :param file_name: complete path to a file (string)
    :return: list of the lines in the file, line terminators removed
    """
    with open(file_name, 'r') as handle:
        contents = handle.read()
    return contents.splitlines()


def read_fasta_file(fname):
    """
    Read the sequence stored in the file ``fname + '.fasta'``.
    The first line of the file is skipped and every remaining line is
    concatenated into a single string.
    :param fname: file path without the '.fasta' extension (string)
    :return: protein sequence  (string)
    """
    fasta_path = fname + '.fasta'
    with open(fasta_path, 'r') as handle:
        all_lines = handle.read().splitlines()
    # Drop the first line, keep the rest as the sequence body
    sequence_lines = all_lines[1:]
    return ''.join(sequence_lines)

In [5]:
##  @brief  :   Local Modules
from proteinbert.tokenization import ADDED_TOKENS_PER_SEQ, index_to_token, token_to_index
from proteinbert.model_generation import ModelGenerator, PretrainingModelGenerator, FinetuningModelGenerator, InputEncoder, load_pretrained_model_from_dump, tokenize_seqs
from proteinbert.existing_model_loading import load_pretrained_model
from proteinbert.finetuning import OutputType, OutputSpec, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

In [6]:
## Maximum sequence input length for the model.
## Original note: check sequence lengths whilst generating embeddings; 1100 should be large enough.
MAX_SEQ_LEN = 1100

## Size of the embedding dimension.
## Original note: saved NP embeddings are size = (max_seq_len, embedding_dim).
## NOTE(review): not referenced by any active code visible in this section.
EMBEDDING_DIM = 1562

## Text file listing the dataset entries to process
file_list = "/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/spot_1d_lm/lists/casp12.txt"

## NOTE(review): only commented-out lines in this section mention a device; confirm whether this is still needed.
device = "cuda:1"


In [7]:
##  @brief  :   Load Model and Tokenizer
## load_pretrained_model() hands back the model generator together with the input encoder.
pretrained_model_generator, input_encoder = load_pretrained_model()

## Build a model for inputs of length MAX_SEQ_LEN and wrap it so that its hidden
## layers are returned as outputs (local & global representations).
model = get_model_with_hidden_layers_as_outputs(
    pretrained_model_generator.create_model(MAX_SEQ_LEN)
)

## Entries to process, one per line of the list file
prot_list = read_list(file_list)

2023-11-30 22:33:29.125112: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [8]:
# Disabled alternative kept for reference: the same loader call, but given an
# explicit path to a local .pkl dump instead of being called with no arguments.
# pretrained_model_generator, input_encoder = load_pretrained_model('/mnt/nvme/home/bbabatun/proteinbert_models/epoch_92400_sample_23500000.pkl')
# Print the entries currently stored in prot_list as a quick sanity check.
print(prot_list)

['T0859-D1', 'T0862-D1', 'T0863-D1', 'T0863-D2', 'T0864-D1', 'T0866-D1', 'T0869-D1', 'T0870-D1', 'T0886-D1', 'T0886-D2', 'T0892-D2', 'T0896-D3', 'T0897-D1', 'T0897-D2', 'T0898-D1', 'T0900-D1', 'T0904-D1', 'T0912-D3', 'T0918-D1', 'T0918-D2', 'T0918-D3', 'T0941-D1']


In [13]:
##  @brief  :   Iterate through Files in Dataset & Generate Embeddings
##
##  For every entry in prot_list:
##    1. derive the entry name and the output path "<name>_pb.npy";
##    2. skip the entry when that output file already exists;
##    3. rebuild the sequence from the entry's label array, encode it, run the
##       model and save the per-residue (local) representations.
##  Failures on one entry are reported and the loop moves on to the next entry.

## Loop-invariant locations, defined once instead of on every iteration
LABELS_DIR = "/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/spot_1d_lm/labels"
EMBEDDINGS_PREFIX = "/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/"

for prot_path in tqdm(prot_list):

    ## Entry name = last path component, cut at its first '.'
    prot_name = prot_path.split('/')[-1].split('.')[0]
    save_path = EMBEDDINGS_PREFIX + prot_name + "_pb.npy"
    print(save_path)

    ## Check no embedding exists; existing files are never overwritten
    if os.path.isfile(save_path):
        continue

    try:
        ## Extract Protein Sequence as a String
        ## NOTE(review): allow_pickle=True will unpickle arbitrary objects;
        ## only load label files that come from a trusted source.
        labels = np.load(os.path.join(LABELS_DIR, prot_name + ".npy"), allow_pickle=True)
        ## assumes column 3 holds one residue letter per row -- TODO confirm
        seq = ''.join(labels[:, 3])

        ## Get raw sequence length
        seq_len = len(seq)

        ## The encoder adds ADDED_TOKENS_PER_SEQ extra tokens (start/end) to
        ## the residues; a sequence that does not fit cannot be embedded in
        ## full, so report it instead of saving a truncated array.
        if seq_len + ADDED_TOKENS_PER_SEQ > MAX_SEQ_LEN:
            print("Sequence too long for MAX_SEQ_LEN: ", prot_name, seq_len)
            continue

        ## Replace Us with Xs to normalise encoding over models
        seq = seq.replace("U", "X")

        ## Encode Input sequence
        encoded_x = input_encoder.encode_X([seq], MAX_SEQ_LEN)

        ## Obtain local & global embeddings (only the local ones are saved)
        local_representations, _ = model.predict(encoded_x)

        ## Remove padding, end and start tokens.
        ## Index 0 is the start token, so the residues sit at 1..seq_len
        ## inclusive; the upper bound must be seq_len + 1, otherwise the last
        ## residue is dropped.
        save_arr = local_representations[0, 1:seq_len + 1, :]

        ## Save np file
        np.save(save_path, save_arr)
        print(save_path)
    except FileNotFoundError:
        ## The label file for this entry is missing
        print("No file available for: ",  prot_name, LABELS_DIR)
    except Exception as err:
        ## Best-effort: keep going, but report what actually went wrong
        ## instead of mislabelling every failure as a missing file.
        ## KeyboardInterrupt / SystemExit are deliberately not caught.
        print("Failed to generate embedding for: ", prot_name, repr(err))

print(" ProteinBERT embeddings generation completed ... ")

100%|██████████| 22/22 [00:00<00:00, 27818.72it/s]

/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0859-D1_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0862-D1_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0863-D1_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0863-D2_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0864-D1_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0866-D1_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0869-D1_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0870-D1_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0886-D1_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0886-D2_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0892-D2_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0896-D3_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0897-D1_pb.npy
/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM/inputs/T0897-D2_pb.npy
/mnt/nvme/home/bbaba


