In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import sys,os
from tool.att import Attention
from keras.models import load_model
import esm
import torch
from tqdm import tqdm
import requests
import logging
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
data = pd.read_csv("./DATA/complexPortal_num(1).tsv",sep='\t')
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'

# data load

In [7]:
def get_fasta_data(ids):
    data_list = []
    
    for ID in ids:
        url = f"https://rest.uniprot.org/uniprotkb/{ID}.fasta"
        response = requests.get(url)
        
        if response.status_code == 200:
            fasta_data = response.text
        else:
            fasta_data = f"Failed to download data for ID {ID}. Status code: {response.status_code}"
        
        header, *sequence_lines = fasta_data.split('\n')
        uniprot_id = header.split('|')[1]
        sequence = ''.join(sequence_lines)
        
        data_list.append({'uniprot_id': uniprot_id, 'seq': sequence})
    
    dataset = pd.DataFrame(data_list)
    return dataset

# Example usage:
ID_list = ["A0A2A9IR05", "P12345", "Q9Y617"]
result_dataset = get_fasta_data(ID_list)
result_dataset.to_csv("output/download_seq.csv",index=False)

# embedding

In [5]:
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()
model = model.to(device)



In [6]:
# Esm2 embedding
def get_rep_seq(sequences):

    batch_labels, batch_strs, batch_tokens = batch_converter(sequences)
    batch_tokens = batch_tokens.to(device)
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[33], return_contacts=False)
    token_representations = results["representations"][33]
    sequence_representations = []
    for i, tokens_len in enumerate(batch_lens):
        sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))
       
    np_list = []

    for i, ten in enumerate(sequence_representations):
        ten=ten.cpu().detach().numpy()
        np_list.append(ten)
    res = pd.DataFrame(np_list)
    res.columns = ['f'+str(i) for i in range (0,res.shape[1])]
    return res

In [7]:
df_data = list(zip(dataset.uniprot_id.index,dataset.seq))

# Run in batches
stride =2
num_iterations = len(df_data) // stride
if len(df_data) % stride != 0:
    num_iterations += 1
    
# Embedding
all_results = pd.DataFrame()

for i in tqdm(range(num_iterations)):
    
    start = i * stride
    end = start + stride

    current_data = df_data[start:end]

    rep33 = get_rep_seq(sequences=current_data)
    rep33['uniprot_id'] = dataset[start:end].uniprot_id.tolist()
    cols = list(rep33.columns)
    cols = [cols[-1]] + cols[:-1]
    rep33 = rep33[cols]
    all_results = pd.concat([all_results, rep33], ignore_index=True)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:02<00:00,  2.26s/it]


# predict

In [8]:
# Deepsub
model = load_model("./model/deepsub_new.h5",custom_objects={"Attention": Attention},compile=False)
predicted = model.predict(np.array(all_results.iloc[:,1:]).reshape(all_results.shape[0],1,-1))
predicted_labels = np.argmax(predicted, axis=1)
label_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 10, 9: 12}
y_test_transformed = [label_map[x] for x in predicted_labels]
print("These are the predicted labels:")
print(y_test_transformed)

2024-03-04 08:08:26.655808: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15770 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:73:00.0, compute capability: 8.6
2024-03-04 08:08:27.490935: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-03-04 08:08:29.511625: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700


These are the predicted labels:
[4]


In [9]:
model

<keras.src.engine.functional.Functional at 0x7f21db1109d0>

In [10]:
model.summary()


Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1, 1280)]         0         
                                                                 
 bi-gru (Bidirectional)      (None, 1, 256)            1082880   
                                                                 
 attention (Attention)       (None, 256)               8225      
                                                                 
 attention_dropout (Dropout  (None, 256)               0         
 )                                                               
                                                                 
 dense (Dense)               (None, 10)                2570      
                                                                 
Total params: 1093675 (4.17 MB)
Trainable params: 1093675 (4.17 MB)
Non-trainable params: 0 (0.00 Byte)
_____________________