In [5]:
import torch
save_path = '/data/zhaojingtong/pharmrgdata/hetero_graph.pt'
# 假设 hetero_data 是你构建好的异构图对象
hetero_data = torch.load(save_path)
data = hetero_data
data

HeteroData(
  [1mProtein[0m={
    num_nodes=62295,
    x=[62295, 128]
  },
  [1mDisease[0m={
    num_nodes=5819,
    x=[5819, 128]
  },
  [1mPathway[0m={
    num_nodes=15771,
    x=[15771, 128]
  },
  [1mDrug[0m={
    num_nodes=9279,
    x=[9279, 128]
  },
  [1m(Drug, Drug-Protein, Protein)[0m={
    edge_index=[2, 25994],
    edge_attr=[25994, 128],
    edge_label=[25994]
  },
  [1m(Drug, DDI, Drug)[0m={
    edge_index=[2, 1419538],
    edge_attr=[1419538, 128],
    edge_label=[1419538]
  },
  [1m(Protein, Protein-Pathway, Pathway)[0m={
    edge_index=[2, 281950],
    edge_attr=[281950, 128],
    edge_label=[281950]
  },
  [1m(Drug, Drug-Pathway, Pathway)[0m={
    edge_index=[2, 5114],
    edge_attr=[5114, 128],
    edge_label=[5114]
  },
  [1m(Protein, Protein-Disease, Disease)[0m={
    edge_index=[2, 116699],
    edge_attr=[116699, 128],
    edge_label=[116699]
  },
  [1m(Drug, Drug-Disease, Disease)[0m={
    edge_index=[2, 61726],
    edge_attr=[61726, 128],
    

In [2]:
import torch
import random

def aug_random_edge_edge_index(edge_index, drop_percent=0.2):
    edge_num = edge_index.shape[1]
    percent = drop_percent / 2
    add_drop_num = int(edge_num * percent)
    
    src_nodes = edge_index[0].unique().tolist()  # 获取唯一的源节点列表
    dst_nodes = edge_index[1].unique().tolist()  # 获取唯一的目标节点列表
    edge_list = edge_index.t().tolist()  # 转换为二维列表

    drop_idx = random.sample(range(edge_num), add_drop_num)  # 随机选择要删除的边的索引
    drop_idx = sorted(drop_idx,reverse=True)

    # 删除选中的边
    # edge_list = [edge_list[i] for i in range(edge_num) if i not in drop_idx]
    for i in drop_idx:
        edge_list.pop(i)

    # 转换现有的边为集合，便于快速查找
    existing_edges = set(map(tuple, edge_list))

    # 优化：从源节点和目标节点中随机采样未存在的边
    add_list = []
    attempts = 0
    max_attempts = 10 * add_drop_num  # 为防止死循环，设置最大尝试次数

    while len(add_list) < add_drop_num and attempts < max_attempts:
        src = random.choice(src_nodes)  # 随机选择一个源节点
        dst = random.choice(dst_nodes)  # 随机选择一个目标节点
        new_edge = (src, dst)
        if new_edge not in existing_edges:
            add_list.append(new_edge)
            existing_edges.add(new_edge)  # 更新现有边集合
        attempts += 1


    
    # 如果采样边数不足，可以通过再次采样或终止采样
    if len(add_list) < add_drop_num:
        print(f"Warning: Only {len(add_list)} new edges were added out of {add_drop_num}.")

    # 增加新边
    edge_list.extend(add_list)
    augmented_edge_index = torch.tensor(edge_list).t()

    return augmented_edge_index

def aug_heterodata_random_edge_edge_index(hetero_data, drop_percent=0.2):
    for key in hetero_data.edge_types:
        print(key)
        edge_index = hetero_data[key]['edge_index']
        augmented_edge_index = aug_random_edge_edge_index(edge_index, drop_percent)
        hetero_data[key]['edge_index'] = augmented_edge_index  # 更新增强后的边索引
        

    return hetero_data

# 执行数据增强
augmented_data = aug_heterodata_random_edge_edge_index(data, drop_percent=0.2)
augmented_data

('Drug', 'Drug-Protein', 'Protein')
('Drug', 'DDI', 'Drug')
('Protein', 'Protein-Pathway', 'Pathway')
('Drug', 'Drug-Pathway', 'Pathway')
('Protein', 'Protein-Disease', 'Disease')
('Drug', 'Drug-Disease', 'Disease')
('Protein', 'PPI', 'Protein')


HeteroData(
  [1mDrug[0m={
    num_nodes=9279,
    x=[9279, 128]
  },
  [1mPathway[0m={
    num_nodes=15771,
    x=[15771, 128]
  },
  [1mDisease[0m={
    num_nodes=5819,
    x=[5819, 128]
  },
  [1mProtein[0m={
    num_nodes=62295,
    x=[62295, 128]
  },
  [1m(Drug, Drug-Protein, Protein)[0m={
    edge_index=[2, 25994],
    edge_attr=[25994, 128],
    edge_label=[25994]
  },
  [1m(Drug, DDI, Drug)[0m={
    edge_index=[2, 1419538],
    edge_attr=[1419538, 128],
    edge_label=[1419538]
  },
  [1m(Protein, Protein-Pathway, Pathway)[0m={
    edge_index=[2, 281950],
    edge_attr=[281950, 128],
    edge_label=[281950]
  },
  [1m(Drug, Drug-Pathway, Pathway)[0m={
    edge_index=[2, 5114],
    edge_attr=[5114, 128],
    edge_label=[5114]
  },
  [1m(Protein, Protein-Disease, Disease)[0m={
    edge_index=[2, 116699],
    edge_attr=[116699, 128],
    edge_label=[116699]
  },
  [1m(Drug, Drug-Disease, Disease)[0m={
    edge_index=[2, 61726],
    edge_attr=[61726, 128],
    

In [2]:
import copy
import numpy as np
import scipy.sparse as sp
def heterodata_preprocess_features(data):
    hererodata_feature_dict = {}
    input_feature_dict = data.x_dict
    new_data = copy.deepcopy(data)
    for node_type, input_feature in input_feature_dict.items():
        rowsum = np.array(input_feature.sum(1))
        r_inv = np.power(rowsum, -1).flatten()
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv)
        input_feature = r_mat_inv.dot(input_feature)
        
        # 将掩码后的特征矩阵存入结果字典
        hererodata_feature_dict[node_type] = torch.FloatTensor(input_feature)
    new_data.x_dict = hererodata_feature_dict
    return new_data


process_feature_data = heterodata_preprocess_features(data)
process_feature_data.x_dict

{'Drug': tensor([[ 0.1504,  0.0219,  0.0597,  ...,  0.0032,  0.0891,  0.0063],
         [ 0.0464, -0.0057,  0.1756,  ...,  0.0716,  0.0532, -0.0034],
         [-0.0058, -0.0561,  0.1004,  ..., -0.0296,  0.0905,  0.0509],
         ...,
         [ 0.0770,  0.4022,  0.0315,  ...,  0.1845,  0.1773,  0.1919],
         [ 0.0167, -0.0062,  0.0023,  ..., -0.0013,  0.0711,  0.0111],
         [-0.0199,  0.0525, -0.0748,  ..., -0.0165, -0.0857,  0.0126]]),
 'Pathway': tensor([[ 0.1529,  0.0295,  0.0199,  ...,  0.0375, -0.1480, -0.0692],
         [ 0.8021,  0.1237,  0.3116,  ..., -0.0532, -0.4990,  0.6141],
         [-0.1014,  0.2611, -0.0782,  ...,  0.0792,  0.1860, -0.1026],
         ...,
         [ 0.0705, -0.0023,  0.0123,  ...,  0.0750,  0.1089,  0.0450],
         [-0.0696,  0.1560,  0.0980,  ...,  0.1372, -0.2728,  0.2591],
         [ 0.0072, -0.0878, -0.0324,  ...,  0.0475, -0.0331,  0.0123]]),
 'Disease': tensor([[ 0.0646,  0.0476,  0.0480,  ..., -0.0353,  0.0163,  0.0477],
         [-0.00

In [26]:
import torch
import copy
import random

def aug_heterodata_random_mask(data, drop_percent=0.2):
    aug_feature_dict = {}
    input_feature_dict = data.x_dict
    new_data = copy.deepcopy(data)
    for node_type, input_feature in input_feature_dict.items():
        node_num = input_feature.shape[1]  # 获取当前节点类型的节点数
        mask_num = int(node_num * drop_percent)
        node_idx = [i for i in range(node_num)]
        mask_idx = random.sample(node_idx, mask_num)  # 随机选择掩码节点
        input_feature = input_feature.unsqueeze(0)
        # 深拷贝当前节点类型的特征
        aug_feature = copy.deepcopy(input_feature)
        
        # 生成全零向量，与节点特征的维度一致
        zeros = torch.zeros_like(aug_feature[0][0])
        
        # 进行掩码操作
        for j in mask_idx:
            aug_feature[0][j] = zeros
        
        # 将掩码后的特征矩阵存入结果字典
        aug_feature_dict[node_type] = aug_feature
    new_data.x_dict = aug_feature_dict
    return new_data


In [27]:
data.x_dict

{'Drug': tensor([[-1.5799, -0.2302, -0.6271,  ..., -0.0334, -0.9360, -0.0659],
         [ 0.3712, -0.0454,  1.4050,  ...,  0.5732,  0.4253, -0.0274],
         [-0.0781, -0.7583,  1.3575,  ..., -0.3998,  1.2230,  0.6880],
         ...,
         [-0.5286, -2.7615, -0.2162,  ..., -1.2668, -1.2173, -1.3172],
         [ 0.3995, -0.1491,  0.0561,  ..., -0.0322,  1.7017,  0.2655],
         [ 0.5760, -1.5194,  2.1641,  ...,  0.4785,  2.4810, -0.3658]]),
 'Pathway': tensor([[ 2.1880,  0.4213,  0.2848,  ...,  0.5361, -2.1169, -0.9894],
         [-1.4259, -0.2198, -0.5540,  ...,  0.0946,  0.8871, -1.0916],
         [-0.7687,  1.9800, -0.5931,  ...,  0.6007,  1.4102, -0.7782],
         ...,
         [-1.6508,  0.0529, -0.2870,  ..., -1.7571, -2.5521, -1.0535],
         [ 0.3769, -0.8440, -0.5302,  ..., -0.7426,  1.4763, -1.4021],
         [ 0.0892, -1.0831, -0.3995,  ...,  0.5859, -0.4080,  0.1513]]),
 'Disease': tensor([[-1.3973, -1.0292, -1.0393,  ...,  0.7628, -0.3516, -1.0327],
         [ 0.02

In [29]:
aug_feature1 = aug_heterodata_random_mask(data,drop_percent=0.9)
aug_feature1.x_dict

{'Drug': tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          ...,
          [-0.5286, -2.7615, -0.2162,  ..., -1.2668, -1.2173, -1.3172],
          [ 0.3995, -0.1491,  0.0561,  ..., -0.0322,  1.7017,  0.2655],
          [ 0.5760, -1.5194,  2.1641,  ...,  0.4785,  2.4810, -0.3658]]]),
 'Pathway': tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          ...,
          [-1.6508,  0.0529, -0.2870,  ..., -1.7571, -2.5521, -1.0535],
          [ 0.3769, -0.8440, -0.5302,  ..., -0.7426,  1.4763, -1.4021],
          [ 0.0892, -1.0831, -0.3995,  ...,  0.5859, -0.4080,  0.1513]]]),
 'Disease': tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]

In [6]:
import torch
import numpy as np
import random
from torch_geometric.nn import HeteroConv, GCNConv, SAGEConv, GATConv, RGCNConv
from torch_geometric.data import HeteroData
from layers import GCN, AvgReadout 
# 设置随机种子以保证可复现性
def set_seed(seed):
    torch.manual_seed(seed)  # 设置PyTorch的随机种子
    np.random.seed(seed)     # 设置NumPy的随机种子
    random.seed(seed)        # 设置Python的随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 设置CUDA的随机种子
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True  # 保证每次卷积计算结果一致
    torch.backends.cudnn.benchmark = False     # 禁用CUDNN自动优化，保证结果一致

# 设置种子
set_seed(42)
hetero_conv = HeteroConv({
    ('Drug', 'Drug-Protein', 'Protein'): GATConv((-1,-1), 8,add_self_loops=False),
    ('Drug', 'DDI', 'Drug'): SAGEConv((-1, -1), 8,add_self_loops=False),
    ('Protein', 'Protein-Pathway', 'Pathway'): GATConv((-1, -1), 8,add_self_loops=False),
    ('Drug', 'Drug-Pathway', 'Pathway'):GATConv((-1, -1), 8,add_self_loops=False),
    ('Protein', 'Protein-Disease', 'Disease'):GATConv((-1, -1), 8,add_self_loops=False),
    ('Protein', 'PPI', 'Protein'):GATConv((-1, -1), 8,add_self_loops=False),
    ('Drug', 'Drug-Disease', 'Disease'):GATConv((-1, -1), 8,add_self_loops=False)
}, aggr='sum')
# Ensure that data.x_dict is available and contains the node features

out = hetero_conv(data.x_dict,data.edge_index_dict)
print(out)

defaultdict(<class 'list'>, {'Protein': tensor([[ 3.0563, -0.6334, -0.4554,  ..., -0.6886,  0.6381,  1.3322],
        [-0.9938,  1.3988, -0.8841,  ...,  0.2894,  0.6098,  0.6078],
        [ 0.2261, -0.0468,  0.6282,  ..., -1.2509,  0.2560,  0.5871],
        ...,
        [ 0.0083,  0.1923, -0.1373,  ..., -0.8041,  0.1730,  1.2379],
        [ 2.0125,  0.7516, -1.5695,  ..., -1.7745,  1.0601, -0.5281],
        [ 0.1774, -1.0672,  0.4343,  ...,  1.7720,  0.4198, -1.6798]],
       grad_fn=<SumBackward1>), 'Drug': tensor([[-0.2535,  0.1135,  1.1522,  ...,  0.3727,  0.3809,  0.0145],
        [-0.1394,  0.9026, -0.4468,  ...,  0.5326, -0.2357, -0.8338],
        [ 0.2158, -0.4886,  0.3527,  ..., -1.1602, -0.6388, -0.5690],
        ...,
        [-0.7983, -0.5847,  0.7673,  ..., -0.3104,  0.3760,  0.4161],
        [ 0.1296,  0.3447, -0.9729,  ...,  0.7852,  0.1588,  0.2169],
        [-0.8563, -0.0421,  0.6167,  ...,  0.1104, -0.5018,  0.3748]],
       grad_fn=<AddBackward0>), 'Pathway': tensor([[

In [5]:
data.edge_index_dict

{('Drug',
  'Drug-Protein',
  'Protein'): tensor([[   0,    1,    2,  ..., 3332, 1964, 1876],
         [   0,    1,    2,  ..., 1486,   68,  325]]),
 ('Drug',
  'DDI',
  'Drug'): tensor([[1743, 4885, 6658,  ...,   96, 2308,  463],
         [7046, 6883, 4212,  ..., 5425, 6809, 4086]]),
 ('Protein',
  'Protein-Pathway',
  'Pathway'): tensor([[ 3548,  3549,  3550,  ...,  2159, 11722, 19631],
         [    0,     1,     2,  ...,   574,  2338,  1915]]),
 ('Drug',
  'Drug-Pathway',
  'Pathway'): tensor([[ 1156,     8,  7025,  ...,  5093,   215,  3135],
         [ 6474, 15639, 15640,  ...,  7153, 10742, 15686]]),
 ('Protein',
  'Protein-Disease',
  'Disease'): tensor([[18425,  8491, 17289,  ...,  2746,   449, 12560],
         [    0,     1,     2,  ...,  2601,   226,    12]]),
 ('Drug',
  'Drug-Disease',
  'Disease'): tensor([[2520, 6597, 4078,  ..., 3054, 4462, 3508],
         [ 302,  109, 4577,  ...,  175, 4916,  869]]),
 ('Protein',
  'PPI',
  'Protein'): tensor([[19685,  9222, 54090,  ...