In [None]:
import numpy as np
from tqdm import tqdm
import ankh
import torch
from transformers import T5EncoderModel, AutoTokenizer, AutoModel, BertTokenizer, T5Tokenizer
# Use GPU index 5 when CUDA is available, otherwise run on the CPU.
device = torch.device('cuda:5' if torch.cuda.is_available() else 'cpu')
# Both empty strings are placeholders for the local checkpoint location.
tokenizer = AutoTokenizer.from_pretrained("")  # replace this to your file path
model = T5EncoderModel.from_pretrained("")  # replace this to your file path
model = model.to(device)

def ESM2(file):
    """Embed every sequence listed in ``file`` and save one array per protein.

    The file is consumed two lines at a time and only the first line of each
    pair is used. That line is split on single spaces: field 0 is the protein
    ID and field 1 is the sequence handed to the module-level ``tokenizer``.

    For each record, element ``[0][0]`` of the model output is sliced with
    ``[1:-1]`` (first and last positions removed), converted to a NumPy array
    and written with ``torch.save`` to ``<protein_id>.tensor``.

    Args:
        file: Path of the text file to read.

    Raises:
        IndexError: If the first line of a record contains no space.

    NOTE(review): if the sequence is the last field on its line it still
    carries the trailing newline when tokenized -- confirm this is harmless
    for the tokenizer in use.
    NOTE(review): the function name says ESM2 but the module-level model is a
    ``T5EncoderModel``; the name is kept because the driver loop calls it.
    """
    with open(file, 'r') as handle:
        records = handle.readlines()

    for start in tqdm(range(0, len(records), 2)):
        fields = records[start].split(" ")
        protein_id = fields[0]
        sequence = fields[1]

        encoded = tokenizer(sequence, return_tensors="pt", truncation=True)
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)
        with torch.no_grad():
            model_out = model(input_ids=input_ids, attention_mask=attention_mask)

        # First output element, first batch item, without the first and last positions.
        trimmed = model_out[0][0][1:-1].detach().cpu().numpy()
        torch.save(trimmed, "" + protein_id + ".tensor")  # replace this to your file path


# Base names (without the ".txt" suffix) of the input files to embed.
data_paths = ["your file name"]
for data_path in data_paths:
    print(data_path)
    # The leading empty string is a placeholder for the input directory.
    ESM2("" + data_path + ".txt")  # replace this to your file path

In [None]:
import numpy as np
import os
import torch,esm
import re,sys
from tqdm import tqdm
from transformers import T5Tokenizer, T5EncoderModel, BertGenerationEncoder, BertTokenizer
from sklearn.preprocessing import MinMaxScaler
transformer_link = ""  # your file path
# Use GPU index 7 when CUDA is available, otherwise run on the CPU.
device = torch.device('cuda:7' if torch.cuda.is_available() else 'cpu')
tokenizer = T5Tokenizer.from_pretrained(transformer_link, do_lower_case=False)
# Load the encoder, move it to the chosen device and switch to inference mode.
model = T5EncoderModel.from_pretrained(transformer_link).to(device).eval()
# NOTE(review): `device` is created as 'cuda:7' (with an index) while the
# right-hand side has no index, so this comparison looks like it is never
# true and the model stays in full precision. Left unchanged on purpose:
# enabling half precision would change the dtype of the saved embeddings.
if device == torch.device("cuda"):
    model = model.half()


def read_sequence(file):
    """Read protein sequences from ``file`` and return them space-separated.

    The file is consumed two lines at a time and only the first line of each
    pair is used. That line is split on single spaces: field 0 is the protein
    ID and field 1 is the sequence.

    Args:
        file: Path of the text file to read.

    Returns:
        dict: Maps each protein ID to its sequence with one space appended
        after every character. Field 1 is taken verbatim, so a trailing
        newline, when present, is kept and also followed by a space.

    Raises:
        IndexError: If the first line of a record contains no space.
    """
    with open(file, 'r') as handle:
        records = handle.readlines()

    spaced_by_id = {}
    for header in tqdm(records[::2]):
        fields = header.split(" ")
        protein_id = fields[0]
        sequence = fields[1]
        # Same text as appending `char + " "` one character at a time.
        spaced_by_id[protein_id] = "".join(char + " " for char in sequence)
    return spaced_by_id



def embed_dataset(seq, shift_left = 0, shirf_right = -1):
    """Encode one pre-split sequence and return a slice of the model output.

    Args:
        seq: The sequence as a list of words; it is wrapped in a one-element
            batch and tokenized with ``is_split_into_words=True``.
        shift_left: Start index of the slice taken from the single batch item.
        shirf_right: End index (exclusive) of that slice; the default ``-1``
            leaves out the last position. The misspelled name is kept because
            the caller passes it by keyword.

    Returns:
        numpy.ndarray: ``model(...)[0][0][shift_left:shirf_right]`` copied to
        the CPU.
    """
    batch = tokenizer.batch_encode_plus(
        [seq],
        add_special_tokens=True,
        padding=True,
        is_split_into_words=True,
        return_tensors="pt",
    )
    token_ids = batch['input_ids'].to(device)
    with torch.no_grad():
        hidden = model(input_ids=token_ids)[0]
    window = hidden[0][shift_left:shirf_right]
    return window.detach().cpu().numpy()

# Base names (without the ".txt" suffix) of the input files to embed.
data_paths = ["your file name"]
for data_path in data_paths:
    # Fixed: the message previously had no space before "is processing".
    print(data_path + " is processing")
    read_path = "" + data_path + ".txt"  # leading "" is a placeholder for the input directory
    feature_dir = ""  # directory that receives the <protein_id>.tensor files
    sequence_dict = read_sequence(read_path)
    for protein_id, seq in tqdm(sequence_dict.items()):
        # Split the string into single characters (this includes the spaces
        # that read_sequence inserted between them).
        sample = list(seq)
        embedding = embed_dataset(sample, shift_left=0, shirf_right=-1)
        # os.path.join works whether or not feature_dir ends with a separator;
        # an empty feature_dir still yields "<protein_id>.tensor".
        save_path_new = os.path.join(feature_dir, protein_id + ".tensor")
        torch.save(embedding, save_path_new)
        # Release cached GPU memory between proteins.
        torch.cuda.empty_cache()


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from model import *
import torch.utils.data as data
from torch.nn.utils.rnn import pad_sequence
class Getdata(data.Dataset):
    """Dataset that loads pre-computed per-protein files from disk by ID.

    For the ID at position ``idx`` three files are read, all named
    ``<ID>.tensor``: one from ``../ankh_embedding/``, one from
    ``../prostt5_embedding/`` and one from ``../label_embedding/``.

    Args:
        ID_list: Sequence of protein IDs; its length is the dataset length.
    """

    def __init__(self, ID_list):
        super().__init__()
        self.IDs = ID_list

    def __len__(self):
        return len(self.IDs)

    def __getitem__(self, idx):
        return self._feature_(idx)

    @staticmethod
    def _load(path):
        """Load one object written by ``torch.save``, NumPy arrays included.

        The embedding cells of this notebook store NumPy arrays, which the
        restricted unpickler (``weights_only=True``, the default in recent
        PyTorch releases) refuses to load, so full unpickling is requested
        explicitly. Full unpickling can execute arbitrary code: only use it
        on files you produced yourself.
        """
        try:
            return torch.load(path, weights_only=False)
        except TypeError:
            # Older PyTorch releases do not accept `weights_only`.
            return torch.load(path)

    def _feature_(self, idx):
        """Return ``(ankh_embedding, prostt5_embedding, labels)`` for ``idx``."""
        name = self.IDs[idx]
        embedding1 = self._load("../prostt5_embedding/" + name + ".tensor")
        embedding2 = self._load("../ankh_embedding/" + name + ".tensor")
        labels = self._load("../label_embedding/" + name + ".tensor")
        # The Ankh embedding is returned first, then ProstT5, then the labels.
        return embedding2, embedding1, labels


class BatchCollate:
    """Collate ``(feature1, feature2, label)`` samples into padded batch tensors."""

    def __call__(self, batch):
        """Pad every field of ``batch`` along its first axis and stack it.

        Args:
            batch: Iterable of ``(feature1, feature2, label)`` triples.

        Returns:
            tuple: ``(feature1, feature2, label)`` tensors with the batch
            dimension first. Features are padded with 0 and labels with -1.
        """
        first_feats, second_feats, label_seqs = zip(*batch)

        # Pad the feature data
        feature1 = pad_sequence(
            [torch.tensor(item) for item in first_feats], batch_first=True
        )  # ([bz, length, dim])
        feature2 = pad_sequence(
            [torch.tensor(item) for item in second_feats], batch_first=True
        )  # ([bz, length, dim])

        # Pad the label data; -1 marks padded positions
        label = pad_sequence(
            [torch.tensor(item) for item in label_seqs],
            batch_first=True,
            padding_value=-1,
        )  # [bz, length]
        return feature1, feature2, label

# Select the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the protein IDs of the set to evaluate. The file is read two lines at
# a time; the ID is the first space-separated field of the first line of each pair.
train_text = "" # your file path
with open(train_text, 'r') as file:
    lines = file.readlines()
protein_ids = [line.split(" ")[0] for line in lines[::2]]

# Build the dataset and loader. The prediction loop below pairs protein_ids
# with batches one-to-one, so each batch must hold exactly one protein and
# the order must not be shuffled.
test_dataset = Getdata(ID_list=protein_ids)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    collate_fn=BatchCollate(),
    shuffle=False,  # keep the order aligned with protein_ids
    num_workers=8,
)

# Load the model
best_model = Plan_Ankh_ProstT5()
if torch.cuda.device_count() > 1:
    best_model = nn.DataParallel(best_model)
best_model.eval()
best_model.to(device)
# map_location lets a checkpoint saved on a GPU be loaded on whichever device
# was selected above (including CPU-only machines).
# NOTE(review): when the model is wrapped in DataParallel the checkpoint keys
# must carry the matching "module." prefix -- confirm how it was saved.
state_dict = torch.load("/549_Ankh_ProstT5.pt", map_location=device)
best_model.load_state_dict(state_dict)

# Collected per-residue rows
results = []

# Run the model and record its predictions
with torch.no_grad():
    for protein_id, (data1, data2, label) in zip(protein_ids, test_loader):
        data1 = data1.to(device)
        data2 = data2.to(device)
        label = label.to(device)

        # Model output; only the first returned value is used
        score, _, _, _, _, _ = best_model(data1, data2)
        label_list = [out.cpu().numpy() for out in label]
        score_list = [out.detach().cpu().numpy() for out in score]
        score = np.concatenate(score_list)
        label = np.concatenate(label_list)

        # Convert to the reported quantities
        probabilities = score[:, 1]  # column 1 is reported as the positive-class probability
        predicted_classes = np.argmax(score, axis=1)  # predicted class
        true_labels = label  # ground-truth labels

        # Store one row per position (1-based index)
        for position in range(len(probabilities)):
            results.append({
                "Protein_ID": protein_id,
                "Residue_Index": position + 1,
                "Predicted_Probability": probabilities[position],
                "Predicted_Class": predicted_classes[position],
                "True_Label": true_labels[position]
            })

# Save the results to CSV
output_df = pd.DataFrame(results)
output_df.to_csv("result.csv", index=False)
print("Results saved to result.csv.")
