In [1]:
import logging
import os
import pickle
import sys

# TensorFlow's native (C++) log filter must be configured before TensorFlow is
# first imported, otherwise the setting is ignored. '2' hides INFO and WARNING
# messages. It therefore has to precede the keras import (and tool.att, which
# presumably pulls in keras/TensorFlow as well — TODO confirm).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tool.att import Attention
from keras.models import load_model
import esm
import torch
from tqdm import tqdm
import requests

2024-02-23 15:36:45.674988: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-23 15:36:45.675059: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-23 15:36:45.675103: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-23 15:36:45.683492: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


# Load data

In [3]:
# Sets the HTTP_PROXY environment variable for this process to a proxy on localhost:7890.
# NOTE(review): this address is machine-specific, and only HTTP_PROXY is set while the
# UniProt URL fetched in this notebook is https — confirm whether HTTPS_PROXY is needed too.
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'

In [4]:
# Download a single FASTA record from UniProt and turn it into a one-row DataFrame
# with columns 'uniprot_id' and 'seq'.
url = "https://rest.uniprot.org/uniprotkb/A0A2A9IR05.fasta"

# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

# Fail fast on a bad response instead of trying to parse an error message as FASTA
# (which previously surfaced as an unrelated IndexError further down).
if response.status_code != 200:
    raise RuntimeError("Failed to download data. Status code: " + str(response.status_code))
fasta_data = response.text

fasta_lines = fasta_data.strip().splitlines()
if not fasta_lines:
    raise ValueError(f"Empty FASTA response from {url}")

# First line is the header, the remaining lines hold the wrapped sequence.
header, *sequence_lines = fasta_lines
header_fields = header.split('|')
if not header.startswith('>') or len(header_fields) < 2:
    raise ValueError(f"Unexpected FASTA header: {header!r}")

# The identifier is the second '|'-separated field of the header.
uniprot_id = header_fields[1]
sequence = ''.join(line.strip() for line in sequence_lines)
if not sequence:
    raise ValueError(f"FASTA record {uniprot_id!r} contains no sequence")

dataset = pd.DataFrame({'uniprot_id': [uniprot_id], 'seq': [sequence]})

# Embedding

In [5]:
# Load the pretrained ESM-2 model together with its alphabet, build the batch
# converter from the alphabet, then switch the model to eval mode and move it
# onto the selected device.
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model = model.eval().to(device)

In [6]:
# Esm2 embedding
def get_rep_seq(sequences):
    """Embed a batch of sequences with the global ESM-2 model.

    Relies on the notebook-level `batch_converter`, `alphabet`, `model` and
    `device` objects.

    Args:
        sequences: batch passed straight to `batch_converter`; the caller in
            this notebook supplies a list of (label, sequence) tuples.

    Returns:
        A DataFrame with one row per input sequence and columns 'f0', 'f1', ...
        holding the layer-33 token representations averaged over positions
        1 .. n-2 of each sequence's n non-padding tokens (i.e. dropping the
        first and last token, presumably the BOS/EOS markers).
    """
    _, _, tokens = batch_converter(sequences)
    tokens = tokens.to(device)
    # Number of non-padding tokens in each row of the batch.
    lengths = (tokens != alphabet.padding_idx).sum(1)

    # Inference only: no gradients required.
    with torch.no_grad():
        output = model(tokens, repr_layers=[33], return_contacts=False)
    per_token = output["representations"][33]

    # Mean-pool each sequence over its inner tokens and move the result to NumPy.
    pooled = [
        per_token[row, 1 : n_tok - 1].mean(0).cpu().detach().numpy()
        for row, n_tok in enumerate(lengths)
    ]

    frame = pd.DataFrame(pooled)
    frame.columns = [f"f{col}" for col in range(frame.shape[1])]
    return frame

In [7]:
# Pair each row's index label with its sequence; these tuples are what
# get_rep_seq hands to the batch converter.
df_data = list(zip(dataset.uniprot_id.index, dataset.seq))

# Run in batches
stride = 2
# Ceiling division: a final, shorter batch is processed when the data does not
# divide evenly by the stride.
num_iterations = (len(df_data) + stride - 1) // stride

# Embedding
# Collect the per-batch frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
batch_frames = []

for i in tqdm(range(num_iterations)):
    start = i * stride
    end = start + stride

    rep33 = get_rep_seq(sequences=df_data[start:end])
    # Put the identifier in the first column, ahead of the f0..fN features.
    rep33.insert(0, 'uniprot_id', dataset.iloc[start:end].uniprot_id.tolist())
    batch_frames.append(rep33)

# pd.concat raises on an empty list, so keep the empty-frame result for no data.
all_results = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

100%|██████████| 1/1 [00:00<00:00,  1.23it/s]


# Prediction

In [8]:
# Deepsub
# The classifier gets its own name: rebinding `model` here would shadow the
# ESM-2 model that get_rep_seq reads as a global, so re-running the embedding
# cells after this one would fail.
deepsub_model = load_model(
    "./model/deepsub_new.h5",
    custom_objects={"Attention": Attention},
    compile=False,
)

# Drop the leading 'uniprot_id' column and reshape the feature matrix to
# (n_samples, 1, n_features) before prediction.
features = np.array(all_results.iloc[:, 1:]).reshape(all_results.shape[0], 1, -1)
predicted = deepsub_model.predict(features)

# Index of the highest-scoring output per sample.
predicted_labels = np.argmax(predicted, axis=1)

# Translate output indices to the reported label values (the mapping is not
# contiguous: indices 8 and 9 map to 10 and 12).
label_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 10, 9: 12}
y_test_transformed = [label_map[x] for x in predicted_labels]
print("These are the predicted labels:")
print(y_test_transformed)

2024-02-23 15:37:08.728133: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6297 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6
2024-02-23 15:37:08.730124: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 9559 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:25:00.0, compute capability: 8.6
2024-02-23 15:37:08.731704: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 9559 MB memory:  -> device: 2, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:41:00.0, compute capability: 8.6
2024-02-23 15:37:08.733334: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 9559 MB memory:  -> device: 3, name: NVIDIA GeForce RTX

These are the predicted labels:
[4]


2024-02-23 15:37:09.929091: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
