In [10]:
'''
Author: DengRui
Date: 2023-09-11 02:10:21
LastEditors: DengRui
LastEditTime: 2023-09-11 02:20:39
FilePath: /DeepSub/dataprocess/get_train_test.ipynb
Description:  get train and test data
Copyright (c) 2023 by DengRui, All Rights Reserved. 
'''
# Standard library
import logging
import os
import pickle
import sys

# TF_CPP_MIN_LOG_LEVEL only takes effect if it is set before TensorFlow is
# loaded, so it has to come ahead of the Keras import below (assuming Keras is
# running on the TensorFlow backend). Setting it afterwards does not silence
# the native start-up logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Third-party
import esm
import numpy as np
import pandas as pd
import requests
import torch
from keras.models import load_model
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Project-local
from tool.att import Attention

In [2]:
# Choose the compute device: CUDA when torch reports it as available, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


# data load

In [3]:
# Download one UniProt entry in FASTA format and turn it into a one-row DataFrame
url = "https://rest.uniprot.org/uniprotkb/P08499.fasta"

# A timeout stops the cell from hanging indefinitely on a stalled connection
response = requests.get(url, timeout=30)

# Stop right here on a failed download. Previously the error message itself was
# stored in fasta_data and parsed as FASTA, which surfaced as an unrelated
# IndexError a few lines further down.
if response.status_code != 200:
    raise RuntimeError("Failed to download data. Status code: " + str(response.status_code))
fasta_data = response.text

# First line is the '>' header, every following line is a chunk of the sequence
fasta_lines = fasta_data.strip().splitlines()
if not fasta_lines or not fasta_lines[0].startswith('>'):
    raise ValueError(f"Response from {url} is not a FASTA record")
header, *sequence_lines = fasta_lines

# The accession is taken from the second '|'-separated field of the header
header_fields = header.split('|')
if len(header_fields) < 2:
    raise ValueError(f"Unexpected FASTA header: {header!r}")
uniprot_id = header_fields[1]
sequence = ''.join(line.strip() for line in sequence_lines)

dataset = pd.DataFrame({'uniprot_id': [uniprot_id], 'seq': [sequence]})

# embedding

In [4]:
# Load the pretrained ESM-2 network together with its alphabet
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
# Switch to evaluation mode and move the weights to the selected device
model = model.eval().to(device)
# Converter that turns a list of sequences into a token batch
batch_converter = alphabet.get_batch_converter()

In [5]:
# Esm2 embedding
def get_rep_seq(sequences):
    """Embed a batch of sequences and return one mean-pooled vector per sequence.

    Relies on the notebook-level ``batch_converter``, ``alphabet``, ``model``
    and ``device`` objects.

    Args:
        sequences: batch passed straight to ``batch_converter``; the caller in
            this notebook supplies a list of ``(label, sequence)`` tuples.

    Returns:
        pandas.DataFrame with one row per input sequence and columns
        ``f0 .. f{d-1}``, holding the layer-33 token representations averaged
        over each sequence's tokens.
    """
    _, _, tokens = batch_converter(sequences)
    tokens = tokens.to(device)
    # Count of non-padding tokens in every row of the batch
    seq_lengths = (tokens != alphabet.padding_idx).sum(1)

    with torch.no_grad():
        outputs = model(tokens, repr_layers=[33], return_contacts=False)
    per_token = outputs["representations"][33]

    # Average over positions 1 .. length-2, i.e. skip the first and last
    # non-padding token (presumably the BOS/EOS markers -- confirm against ESM docs)
    pooled = [
        per_token[row, 1 : n_tok - 1].mean(0).cpu().detach().numpy()
        for row, n_tok in enumerate(seq_lengths)
    ]

    frame = pd.DataFrame(pooled)
    frame.columns = [f"f{col}" for col in range(frame.shape[1])]
    return frame

In [6]:
# Pair every row's index label with its sequence; these tuples are the input to get_rep_seq
df_data = list(zip(dataset.uniprot_id.index,dataset.seq))

# Run in batches
stride = 2
# Ceiling division: a final partial batch still gets its own iteration
num_iterations = -(-len(df_data) // stride)

# Embedding
# Collect the per-batch frames and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on every
# iteration and warns about concatenating onto an empty frame.
batch_frames = []

for i in tqdm(range(num_iterations)):

    start = i * stride
    end = start + stride

    current_data = df_data[start:end]

    rep33 = get_rep_seq(sequences=current_data)
    # Put the identifier in the first column, ahead of the f0..fN feature columns
    rep33.insert(0, 'uniprot_id', dataset.uniprot_id.iloc[start:end].tolist())
    batch_frames.append(rep33)

# pd.concat raises on an empty list, so keep the original empty-frame result in that case
all_results = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

100%|██████████| 1/1 [00:01<00:00,  1.20s/it]


# predict

In [11]:
# Deepsub
# The classifier gets its own name: `model` is the ESM network that
# get_rep_seq reads as a global, and rebinding it here would break any re-run
# of the embedding cell after this one.
deepsub_model = load_model("./model/deepsub_20240120.h5",custom_objects={"Attention": Attention},compile=False)

# Drop the leading uniprot_id column and reshape the features to
# (n_samples, 1, n_features) before handing them to the network
features = np.array(all_results.iloc[:,1:]).reshape(all_results.shape[0],1,-1)
predicted = deepsub_model.predict(features)

# Index of the highest-scoring output for every sample
predicted_labels = np.argmax(predicted, axis=1)

# Translate output index -> reported label. The mapping is not contiguous:
# indices 8 and 9 stand for labels 10 and 12.
# NOTE(review): labels are presumably the classes used at training time -- confirm.
label_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 10, 9: 12}
y_test_transformed = [label_map[x] for x in predicted_labels]
print("These are the predicted labels:")
print(y_test_transformed)

These are the predicted labels:
[2]
