In [1]:
from utils import *
import torch
import pandas as pd
import yaml
import warnings
from Structe_DPP_HyperGraph import HyGraph_Matrix_DPP_Structure
import torch.nn.functional as F
import wandb
import os
import torch
import warnings
import numpy as np
from model import *
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
from hypergraph_utils import generate_G_from_H
from hypergraph_utils import construct_H_with_KNN
from sklearn.metrics import roc_auc_score, f1_score
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity as cos



# Silence library warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Read the experiment settings from the YAML file in the working directory.
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Run on the GPU when one is visible, otherwise on the CPU.
# (Assign "cpu" unconditionally here to force CPU execution.)
if torch.cuda.is_available():
    config['device'] = "cuda"
else:
    config['device'] = "cpu"

# `setup` is a star-imported project helper; presumably it seeds the RNGs
# with the configured seed -- TODO confirm against its definition.
setup(config['seed'])

# Module-level constants; not referenced by any of the cells visible here.
reg_loss_co = 0.0002
fold = 0

# Make float32 the default dtype for newly created floating-point tensors.
torch.set_default_dtype(torch.float32)







In [2]:
# Evaluate the saved per-fold checkpoints of every dataset.
for name in ['Luo']:
    # Load node counts, interaction matrices, DTI triples, node features and hypergraphs.
    node_num, drug_protein, protein_drug, dtidata, features_d, features_p, HyGraph_Drug, HyGraph_protein = load_feature(name, config['feature_list'])
    
    print(node_num, drug_protein.shape, protein_drug.shape, dtidata.shape, 
          features_d.shape, features_p.shape, HyGraph_Drug.shape, HyGraph_protein.shape)

    # Third column of dtidata is used as the label; keep it as a column tensor on the device.
    dti_label = torch.tensor(dtidata[:, 2:3]).to(config['device'])
    drug_protein = drug_protein.to(config['device'])
    protein_drug = protein_drug.to(config['device'])

    # Build the hypergraph structure matrix.
    HyGraph_Structure_DPP = HyGraph_Matrix_DPP_Structure(dtidata, node_num[0], node_num[1], name)
    HyGraph_Structure_DPP = HyGraph_Structure_DPP.to(config['device'])

    # Data and labels (aliases read by model_eval below).
    data = dtidata
    label = dti_label

    def model_eval(tr, te, seed):
        """Run every fold's saved checkpoint on that fold's test split.

        For each fold the train/test indices are written to text files under
        ``config['results_dir']``, the fold checkpoint is loaded, the model is
        run on the test indices and the false positives are printed, sorted by
        descending predicted probability.

        Args:
            tr: indexable collection; ``tr[i]`` holds the train indices of fold ``i``.
            te: indexable collection; ``te[i]`` holds the test indices of fold ``i``.
            seed: unused; kept so existing calls keep working.

        Returns:
            List with the raw model output of every fold whose checkpoint was
            found. Folds with a missing checkpoint are skipped, so the list can
            be shorter than ``len(tr)``.
        """
        predicted_results = []  # one model output per evaluated fold

        # The index dumps below fail with FileNotFoundError when the results
        # directory has not been created yet, so create it up front.
        os.makedirs(config['results_dir'], exist_ok=True)
        # Checkpoint directory; overridable through config['model_dir'] so the
        # notebook is not tied to one machine's absolute path.
        model_dir = config.get('model_dir', "/data/zyf/HyperGCN-DTI/savedmodel")

        # Iterate over the cross-validation folds.
        for i in range(len(tr)):
            # Save the train indices of this fold.
            train_index = tr[i]
            with open(os.path.join(config['results_dir'], f"{name}_{config['feature_list']}_{i}foldtrain.txt"), "w", encoding="utf-8") as f:
                for idx in train_index:
                    f.write(f"{idx}\n")

            # Save the test indices of this fold.
            test_index = te[i]
            with open(os.path.join(config['results_dir'], f"{name}_{config['feature_list']}_{i}foldtest.txt"), "w", encoding="utf-8") as f:
                for idx in test_index:
                    f.write(f"{idx}\n")

            # Instantiate the model.
            model = HyperGCNDTI(
                num_protein=node_num[1],
                num_drug=node_num[0],
                num_hidden1=config['in_size'],
                num_hidden2=config['hidden_size'],
                num_out=config['out_size'],
                feature_list=config['feature_list']
            ).to(config['device'])
            # Build the checkpoint path of this fold.
            model_path = f"{model_dir}/{config['feature_list']}_dataset_{name}_best_model_fold_{i}_roc.pth"

            # Load the weights, or skip the fold when no checkpoint exists.
            if os.path.exists(model_path):
                model.load_state_dict(torch.load(model_path, map_location=config['device']))
                print(f"Successfully loaded model from {model_path}")
            else:
                print(f"Model not found at {model_path}")
                continue

            # Evaluate without tracking gradients.
            model.eval()
            with torch.no_grad():
                out = model(
                    node_num, features_d, features_p, protein_drug, 
                    drug_protein, HyGraph_Drug, HyGraph_protein, 
                    test_index, data, HyGraph_Structure_DPP, 
                    iftrain=False, d=features_d , p=features_p
                )
                print(f"Fold {i} evaluation completed.")
                #print(out)
                predicted_results.append(out)
                predictions = out.argmax(dim=1)
                # Assumes `out` has one row per entry of test_index, in order -- TODO confirm.
                true_labels = label[test_index].reshape(-1).long()
                false_positive_mask = (predictions == 1) & (true_labels == 0)
                false_positive_indices = torch.nonzero(false_positive_mask, as_tuple=True)[0]
                false_positive_probs = torch.softmax(out, dim=1)[false_positive_indices]
                max_probs = false_positive_probs.max(dim=1).values
                sorted_indices = max_probs.argsort(descending=True)
                print("按最大概率排序的假阳性样本位置:")
                for idx in sorted_indices:
                    # Position inside this fold's test split (it indexes `predictions`
                    # and `true_labels`), NOT a row of dtidata; the matching dtidata
                    # row index is test_index[sample_index].
                    sample_index = false_positive_indices[idx].item()
                    print(f"样本索引: {sample_index}, 预测类别: {predictions[sample_index].item()}, "
                          f"真实类别: {true_labels[sample_index].item()}, 最大概率: {max_probs[idx].item()}")

        return predicted_results
    train_indeces, test_indeces = get_cross(dtidata)
    predicted_results = model_eval(train_indeces, test_indeces, config['seed'])

load LLM features
[708, 1512] torch.Size([708, 1512]) torch.Size([1512, 708]) (3846, 3) torch.Size([708, 384]) torch.Size([1512, 320]) torch.Size([708, 708]) torch.Size([1512, 1512])
Successfully loaded model from /data/zyf/HyperGCN-DTI/savedmodel/[1, 2]_dataset_Luo_best_model_fold_0_roc.pth
Fold 0 evaluation completed.
tensor([[-3.1180e-04, -8.0734e+00],
        [-1.3351e-04, -8.9216e+00],
        [-5.2832e-04, -7.5462e+00],
        ...,
        [-1.9474e-02, -3.9484e+00],
        [-8.5053e-03, -4.7713e+00],
        [-1.5746e-01, -1.9263e+00]], device='cuda:0')
按最大概率排序的假阳性样本位置:
样本索引: 489, 预测类别: 1, 真实类别: 0, 最大概率: 0.6923367381095886
样本索引: 471, 预测类别: 1, 真实类别: 0, 最大概率: 0.6437224745750427
Successfully loaded model from /data/zyf/HyperGCN-DTI/savedmodel/[1, 2]_dataset_Luo_best_model_fold_1_roc.pth
Fold 1 evaluation completed.
tensor([[-2.7807e-02, -3.5963e+00],
        [-9.4324e-03, -4.6683e+00],
        [-3.8908e-03, -5.5511e+00],
        ...,
        [-8.8408e-03, -4.7328e+00],
        [-

In [3]:
# Evaluate the saved per-fold checkpoints of every dataset, printing the raw outputs.
# NOTE(review): this cell redefines `model_eval`; the definition here shadows any earlier one.
for name in ['Luo']:
    # Load node counts, interaction matrices, DTI triples, node features and hypergraphs.
    node_num, drug_protein, protein_drug, dtidata, features_d, features_p, HyGraph_Drug, HyGraph_protein = load_feature(name, config['feature_list'])
    
    print(node_num, drug_protein.shape, protein_drug.shape, dtidata.shape, 
          features_d.shape, features_p.shape, HyGraph_Drug.shape, HyGraph_protein.shape)

    # Third column of dtidata is used as the label; keep it as a column tensor on the device.
    dti_label = torch.tensor(dtidata[:, 2:3]).to(config['device'])
    drug_protein = drug_protein.to(config['device'])
    protein_drug = protein_drug.to(config['device'])

    # Build the hypergraph structure matrix.
    HyGraph_Structure_DPP = HyGraph_Matrix_DPP_Structure(dtidata, node_num[0], node_num[1], name)
    HyGraph_Structure_DPP = HyGraph_Structure_DPP.to(config['device'])

    # Data and labels (aliases read by model_eval below).
    data = dtidata
    label = dti_label

    def model_eval(tr, te, seed):
        """Run every fold's saved checkpoint on that fold's test split.

        For each fold the train/test indices are written to text files under
        ``config['results_dir']``, the fold checkpoint is loaded, the model is
        run on the test indices, its raw output is printed and the false
        positives are listed, sorted by descending predicted probability.

        Args:
            tr: indexable collection; ``tr[i]`` holds the train indices of fold ``i``.
            te: indexable collection; ``te[i]`` holds the test indices of fold ``i``.
            seed: unused; kept so existing calls keep working.

        Returns:
            List with the raw model output of every fold whose checkpoint was
            found. Folds with a missing checkpoint are skipped, so the list can
            be shorter than ``len(tr)``.
        """
        predicted_results = []  # one model output per evaluated fold

        # The index dumps below fail with FileNotFoundError when the results
        # directory has not been created yet, so create it up front.
        os.makedirs(config['results_dir'], exist_ok=True)
        # Checkpoint directory; overridable through config['model_dir'] so the
        # notebook is not tied to one machine's absolute path.
        model_dir = config.get('model_dir', "/data/zyf/HyperGCN-DTI/savedmodel")

        # Iterate over the cross-validation folds.
        for i in range(len(tr)):
            # Save the train indices of this fold.
            train_index = tr[i]
            with open(os.path.join(config['results_dir'], f"{name}_{config['feature_list']}_{i}foldtrain.txt"), "w", encoding="utf-8") as f:
                for idx in train_index:
                    f.write(f"{idx}\n")

            # Save the test indices of this fold.
            test_index = te[i]
            with open(os.path.join(config['results_dir'], f"{name}_{config['feature_list']}_{i}foldtest.txt"), "w", encoding="utf-8") as f:
                for idx in test_index:
                    f.write(f"{idx}\n")

            # Instantiate the model.
            model = HyperGCNDTI(
                num_protein=node_num[1],
                num_drug=node_num[0],
                num_hidden1=config['in_size'],
                num_hidden2=config['hidden_size'],
                num_out=config['out_size'],
                feature_list=config['feature_list']
            ).to(config['device'])
            # Build the checkpoint path of this fold.
            model_path = f"{model_dir}/{config['feature_list']}_dataset_{name}_best_model_fold_{i}_roc.pth"

            # Load the weights, or skip the fold when no checkpoint exists.
            if os.path.exists(model_path):
                model.load_state_dict(torch.load(model_path, map_location=config['device']))
                print(f"Successfully loaded model from {model_path}")
            else:
                print(f"Model not found at {model_path}")
                continue

            # Evaluate without tracking gradients.
            model.eval()
            with torch.no_grad():
                out = model(
                    node_num, features_d, features_p, protein_drug, 
                    drug_protein, HyGraph_Drug, HyGraph_protein, 
                    test_index, data, HyGraph_Structure_DPP, 
                    iftrain=False, d=features_d , p=features_p
                )
                print(f"Fold {i} evaluation completed.")
                print(out)
                predicted_results.append(out)
                predictions = out.argmax(dim=1)
                # Assumes `out` has one row per entry of test_index, in order -- TODO confirm.
                true_labels = label[test_index].reshape(-1).long()
                false_positive_mask = (predictions == 1) & (true_labels == 0)
                false_positive_indices = torch.nonzero(false_positive_mask, as_tuple=True)[0]
                false_positive_probs = torch.softmax(out, dim=1)[false_positive_indices]
                max_probs = false_positive_probs.max(dim=1).values
                sorted_indices = max_probs.argsort(descending=True)
                print("按最大概率排序的假阳性样本位置:")
                for idx in sorted_indices:
                    # Position inside this fold's test split (it indexes `predictions`
                    # and `true_labels`), NOT a row of dtidata; the matching dtidata
                    # row index is test_index[sample_index].
                    sample_index = false_positive_indices[idx].item()
                    print(f"样本索引: {sample_index}, 预测类别: {predictions[sample_index].item()}, "
                          f"真实类别: {true_labels[sample_index].item()}, 最大概率: {max_probs[idx].item()}")

        return predicted_results
    train_indeces, test_indeces = get_cross(dtidata)
    predicted_results = model_eval(train_indeces, test_indeces, config['seed'])

load LLM features
[708, 1512] torch.Size([708, 1512]) torch.Size([1512, 708]) (3846, 3) torch.Size([708, 384]) torch.Size([1512, 320]) torch.Size([708, 708]) torch.Size([1512, 1512])
Successfully loaded model from /data/zyf/HyperGCN-DTI/savedmodel/[1, 2]_dataset_Luo_best_model_fold_0_roc.pth
Fold 0 evaluation completed.
tensor([[-5.8920e-04, -7.4370e+00],
        [-2.9563e-05, -1.0428e+01],
        [-1.4787e-02, -4.2214e+00],
        ...,
        [-2.5293e-03, -5.9811e+00],
        [-2.3842e-07, -1.5268e+01],
        [-1.5561e-01, -1.9372e+00]], device='cuda:0')
按最大概率排序的假阳性样本位置:
Successfully loaded model from /data/zyf/HyperGCN-DTI/savedmodel/[1, 2]_dataset_Luo_best_model_fold_1_roc.pth
Fold 1 evaluation completed.
tensor([[-0.0068, -5.0014],
        [-0.0216, -3.8458],
        [-0.0194, -3.9526],
        ...,
        [-0.3134, -1.3130],
        [-0.0545, -2.9365],
        [-0.0141, -4.2681]], device='cuda:0')
按最大概率排序的假阳性样本位置:
Successfully loaded model from /data/zyf/HyperGCN-DTI/saved

In [5]:
# Print three DTI rows selected by their row index in dtidata.
print(dtidata[3568,:],dtidata[2759,:],dtidata[2217,:])

# NOTE(review): the identifiers below look like accession pairs noted by the
# author; how they relate to the rows printed above is not shown in this notebook.
#-P35354,-P08311,
#DB00188-Q16665,DB00339-Q15075,DB01410-Q5T6L4,DB00776-Q8IU85,DB01141-Q9ULK0

[563 961   0] [290  52   0] [  93 1450    0]


In [None]:
# Define, per dataset, a routine that runs the saved fold checkpoints on the test folds.
for name in ['Luo','Zheng']:
    node_num, drug_protein, protein_drug, dtidata,features_d,features_p,HyGraph_Drug,HyGraph_protein = load_feature(name,config['feature_list'])
    # Shapes recorded by the author for one run:
    # [708, 1512] torch.Size([708, 1512]) torch.Size([1512, 708]) (3846, 3)
    print(node_num,drug_protein.shape,protein_drug.shape,dtidata.shape,features_d.shape,features_p.shape,HyGraph_Drug.shape,HyGraph_protein.shape)
    # Third column of dtidata is used as the label.
    dti_label = torch.tensor(dtidata[:, 2:3]).to(config['device'])
    drug_protein = drug_protein.to(config['device'])
    protein_drug = protein_drug.to(config['device'])
    HyGraph_Structure_DPP = HyGraph_Matrix_DPP_Structure(dtidata, node_num[0], node_num[1],name )
    HyGraph_Structure_DPP = HyGraph_Structure_DPP.to(config['device'])

    data = dtidata
    label = dti_label

    # NOTE(review): `main` is only defined here, never called, exactly as in the
    # original cell. Call it with fold indices to actually evaluate.
    def main(tr, te, seed):
        """Run each fold's saved checkpoint on that fold's test indices.

        Writes the train/test indices of every fold to text files under
        ``config['results_dir']``, loads the fold checkpoint and runs the model
        in evaluation mode on the test indices.

        Args:
            tr: indexable collection; ``tr[i]`` holds the train indices of fold ``i``.
            te: indexable collection; ``te[i]`` holds the test indices of fold ``i``.
            seed: unused; kept for signature compatibility.

        Returns:
            List with one raw model output per fold.
        """
        results = []
        # Make sure the output directory exists before writing the index dumps.
        os.makedirs(config['results_dir'], exist_ok=True)
        for i in range(len(tr)):
            # Context managers close the files; the original left both handles open.
            train_index = tr[i]
            with open( os.path.join(config['results_dir'],f"{name}_{config['feature_list']}_{i}foldtrain.txt"), "w", encoding="utf-8") as f:
                for train_index_one in train_index:
                    f.write(f"{train_index_one}\n")
            test_index = te[i]
            with open( os.path.join(config['results_dir'],f"{name}_{config['feature_list']}_{i}foldtest.txt"), "w", encoding="utf-8") as f:
                for test_index_one in test_index:
                    f.write(f"{test_index_one}\n")

            model = HyperGCNDTI(
                num_protein=node_num[1],
                num_drug=node_num[0],
                num_hidden1=config['in_size'],
                num_hidden2=config['hidden_size'],
                num_out=config['out_size'],
                feature_list= config['feature_list']
            ).to(config['device'])

            # The original line was a SyntaxError (empty `{}` inside an f-string) and
            # passed `name` as torch.load's second positional argument (map_location).
            # NOTE(review): the path uses the per-fold naming that the other evaluation
            # cells of this notebook load successfully; the original text named a
            # "dataset_<name>_best_model_fold_0_roc.pth.pth" file -- confirm which is meant.
            model_path = f"/data/zyf/HyperGCN-DTI/savedmodel/{config['feature_list']}_dataset_{name}_best_model_fold_{i}_roc.pth"
            model.load_state_dict(torch.load(model_path, map_location=config['device']))
            model.eval()
            # Inference only: no gradients needed. `d`/`p` were undefined names in the
            # original; the feature matrices are passed instead.
            with torch.no_grad():
                out = model(node_num, features_d, features_p, protein_drug, drug_protein, HyGraph_Drug, HyGraph_protein, test_index, data, HyGraph_Structure_DPP, iftrain=False, d=features_d, p=features_p)
            results.append(out)
        return results
   


In [6]:
# Read the protein table and draw a reproducible random sample of rows
# (fixed random_state), then print the sampled UniProt IDs.
protein_list = pd.read_csv('/data/zyf/HyperGCN-DTI/data/Luo/protein_seq.csv')
protein_sample = protein_list.sample(n=5, random_state=90)
print(protein_sample['UniProt ID'])
# Accessions noted by the author:
#P14555 P10912 P16444

814    P14555
41     P10912
703    P16444
Name: UniProt ID, dtype: object


In [7]:
# Reload the 'Luo' dataset into the notebook-level variables used by later cells.
node_num, drug_protein, protein_drug, dtidata,features_d,features_p,HyGraph_Drug,HyGraph_protein = load_feature('Luo',config['feature_list'])

load LLM features


In [9]:
# Sanity check: shape of the drug-protein matrix.
print( drug_protein.shape)

torch.Size([708, 1512])


In [10]:
# Preview the first rows of the protein table.
protein_list.head()

Unnamed: 0,UniProt ID,protein_sequence,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_311,feature_312,feature_313,feature_314,feature_315,feature_316,feature_317,feature_318,feature_319,feature_320
0,Q9UI32,MRSMKALQKALSRAGSHCGRGGWGHPSRSPLLGGGVRHHLSEAAAQ...,-0.0457,-0.046508,0.060963,0.1241,0.15471,-0.12274,0.101871,-0.064752,...,0.112338,0.025133,0.034338,0.135733,-0.070924,-0.058296,-0.18491,0.293403,0.029837,0.020645
1,P00488,MSETSRTAFGGRRAVPPNNSNAAEDDLPTVELQGVVPRGVNLQEFL...,-0.111468,0.078015,0.121058,0.088189,0.323042,-0.141872,0.086735,-0.12421,...,0.018668,-0.035532,-0.106184,-0.015077,0.092423,-0.110347,-0.000927,0.13133,0.079626,-0.059638
2,P35228,MACPWKFLFKTKFHQYAMNGEKDINNNVEKAPCATSSPVTQDDLQY...,-0.081716,-0.111615,0.060804,0.135152,0.018172,-0.115424,0.098843,-0.070594,...,0.079049,-0.133051,-0.063088,0.156411,0.017324,-0.063999,-0.051186,0.099956,0.109377,0.031345
3,P06737,MAKPLTDQEKRRQISIRGIVGVENVAELKKSFNRHLHFTLVKDRNV...,0.033512,-0.051554,0.099956,0.191913,0.169532,-0.036704,-0.030264,-0.132078,...,0.016164,0.067447,-0.050351,-0.009934,0.19664,-0.103204,-0.098538,0.095371,-0.006853,0.019574
4,P11766,MANEVIKCKAAVAWEAGKPLSIEEIEVAPPKAHEVRIKIIATAVCH...,-0.082313,-0.015708,0.209881,0.011249,0.261356,-0.062054,0.046553,-0.040649,...,0.043615,-0.044199,-0.156546,0.267802,-0.080462,0.021552,-0.018043,0.168098,0.093072,0.157004


In [25]:
def construct_dtidate(num_drug, target_index, dti=None):
    """Build a drug/target/label table covering every drug for one target.

    Rows of the interaction table whose second column equals ``target_index``
    and whose third column equals 1 are kept as positives. Every drug index in
    ``range(num_drug)`` that is not among those positives is appended as a row
    ``[drug, target_index, 0]``.

    Args:
        num_drug: number of drugs; candidate drug indices are ``0..num_drug-1``.
        target_index: value matched against the second column.
        dti: 2-D array with columns (drug, target, label). Defaults to the
            notebook-level ``dtidata`` so existing two-argument calls keep working.

    Returns:
        2-D array: the positive rows first (in their original order), followed
        by the generated negative rows in ascending drug order.
    """
    source = dtidata if dti is None else dti
    data = source[(source[:, 1] == target_index) & (source[:, 2] == 1)]
    drug_index = data[:, 0]
    # Vectorised membership test. It replaces a Python loop that rebuilt
    # list(drug_index) on every iteration, and it keeps an integer dtype even
    # when no negative remains (np.array([]) would be float64 and silently
    # upcast the whole result).
    candidates = np.arange(num_drug)
    first_column = candidates[~np.isin(candidates, drug_index)]
    second_column = np.full_like(first_column, target_index)
    third_column = np.zeros_like(first_column)
    matrix = np.column_stack((first_column, second_column, third_column))
    data = np.vstack((data, matrix))
    return data

In [8]:
# For each selected protein position, rebuild the inputs so that every protein
# slot carries that protein's features and interaction column.
# NOTE(review): each iteration overwrites the same notebook-level names and nothing
# consumes them inside the loop, so only the values for the last `loc` survive.
for loc in [814,41,703]:
    # Reload pristine matrices first: drug_protein is overwritten further down, and
    # loading before use removes the dependency on whatever an earlier cell left in scope.
    node_num, drug_protein, protein_drug, dtidata = load_dataset_from_name('Luo')
    n_proteins = drug_protein.shape[1]
    features_sample = protein_list.iloc[loc,2:]
    # np.tile stacks whole copies of the feature vector as rows. The previous
    # np.repeat(...).reshape(n, -1) repeated each element n times in a row, so the
    # reshaped rows were runs of a single value rather than copies of the vector.
    features_p = pd.DataFrame(np.tile(features_sample.values, (n_proteins, 1)))
    drug_protein_sample = drug_protein[:,loc]
    # Broadcast the selected protein's interaction column to every protein column.
    drug_protein = drug_protein_sample.unsqueeze(1).repeat(1, n_proteins)
    protein_drug = drug_protein.T
    data= construct_dtidate(drug_protein.shape[0],loc)
    

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [15]:
# Sanity check: shape of the DTI table.
print(dtidata.shape)

(3846, 3)


In [18]:
# Show every DTI row whose second column equals 814, whatever its label.
print(dtidata[dtidata[:, 1] == 814])

[[ 97 814   1]
 [251 814   1]
 [653 814   0]
 [379 814   0]]


In [20]:
# Keep only the rows with second column 814 and label 1, then take their
# first column (the same selection construct_dtidate performs for its drug_index).
data = dtidata[(dtidata[:, 1] == 814) & (dtidata[:, 2] == 1)]
drug_index = data[:,0]

In [21]:
# Display the first-column values selected in the previous cell.
drug_index

array([ 97, 251])