In [1]:
import torch
import pandas as pd
import gzip
from torch_geometric.data import HeteroData
from torch_geometric.utils import add_self_loops

In [4]:
import gzip
import pandas as pd
import json

# Decompress and read the data
def read_data(file_path):
    """Read a gzip-compressed, tab-separated file into a DataFrame.

    The file is opened in text mode and parsed without a header row; the
    resulting columns are named ``source``, ``relation`` and ``target``.

    Args:
        file_path: Path of the gzip file to read.

    Returns:
        pandas.DataFrame with the columns ``source``, ``relation``, ``target``.
    """
    column_names = ['source', 'relation', 'target']
    with gzip.open(file_path, mode='rt') as handle:
        return pd.read_csv(handle, sep='\t', header=None, names=column_names)

# Build the global node ID mapping
def generate_global_id_mapping(dir_path,file_paths, output_file):
    """Give every distinct node a global integer ID and save the mapping as JSON.

    Each file listed in ``file_paths`` is read with ``read_data``. Walking the
    rows in file order, the ``source`` value and then the ``target`` value of
    each row receive the next unused ID (0, 1, 2, ...) the first time they are
    seen. The resulting ``{node: id}`` dict is written to
    ``dir_path + output_file`` and a short summary is printed.

    Args:
        dir_path: Prefix prepended to every file name by plain string
            concatenation, so it must end with a path separator.
        file_paths: Mapping whose values are the file names to read; the
            files are processed in the mapping's iteration order.
        output_file: Name of the JSON file to write below ``dir_path``.

    Returns:
        None.
    """
    node_id = {}

    # Read every file and assign globally unique IDs to its nodes
    for file_name in file_paths.values():
        df = read_data(dir_path + file_name)
        # Iterate the two columns directly instead of DataFrame.iterrows():
        # iterrows builds a Series per row (very slow on large edge lists) and
        # coerces each row to one dtype, which can turn integer IDs into
        # floats or into NumPy scalars that json.dump rejects as keys.
        for src, tgt in zip(df['source'].tolist(), df['target'].tolist()):
            # len(node_id) is evaluated before the insert, so an unseen node
            # gets the next unused ID and a known node keeps its ID.
            node_id.setdefault(src, len(node_id))
            node_id.setdefault(tgt, len(node_id))

    # Persist the node -> ID mapping as JSON
    with open(dir_path + output_file, 'w') as f:
        json.dump(node_id, f)

    print(f"Global node ID mapping saved to {output_file}")
    print(f"Total nodes: {len(node_id)}")

# Prefix of the input files; also used as prefix for the output file
dir_path = '/data/zhaojingtong/pharmrgdata/'
# Name of the JSON file that receives the global ID mapping
output_file = 'global_node_id_mapping.json'
# File paths: relation name -> gzip file (kept in this order on purpose,
# because the files are read in the dict's iteration order)
file_paths = dict([
    ('Drug-Protein', 'drug_protein.txt.gz'),
    ('DDI', 'ddi.txt.gz'),
    ('Protein-Pathway', 'protein_pathway.txt.gz'),
    ('Drug-Pathway', 'drug_pathway.txt.gz'),
    ('Protein-Disease', 'protein_disease.txt.gz'),
    ('Drug-Disease', 'drug_disease.txt.gz'),
    ('PPI', 'ppi.txt.gz'),
])
# Build and save the global ID mapping
generate_global_id_mapping(
    dir_path=dir_path,
    file_paths=file_paths,
    output_file=output_file,
)


Global node ID mapping saved to global_node_id_mapping.json
Total nodes: 93164


In [1]:
import gzip
import pandas as pd
import torch
from torch_geometric.data import HeteroData

# Decompress and read the data
def read_data(file_path):
    """Read a gzip-compressed, tab-separated file into a DataFrame.

    The file is opened in text mode and parsed without a header row; the
    resulting columns are named ``source``, ``relation`` and ``target``.

    Args:
        file_path: Path of the gzip file to read.

    Returns:
        pandas.DataFrame with the columns ``source``, ``relation``, ``target``.
    """
    column_names = ['source', 'relation', 'target']
    with gzip.open(file_path, mode='rt') as handle:
        return pd.read_csv(handle, sep='\t', header=None, names=column_names)

# Build the heterogeneous graph and initialise features and relation labels
def create_hetero_graph(dir_path='/data/zhaojingtong/pharmrgdata/', feature_dim=128):
    """Build a ``HeteroData`` graph from the seven relation files.

    Each file is read with ``read_data`` and contributes one edge type. Node
    names are mapped to consecutive integer indices *per node type*, in order
    of first appearance, and ``edge_index`` is expressed in those per-type
    indices. The earlier implementation stored global node IDs in
    ``edge_index`` while sizing ``x`` per node type, so edge indices pointed
    past the end of the node-feature matrices.

    Args:
        dir_path: Prefix prepended to each file name by plain string
            concatenation, so it must end with a path separator.
        feature_dim: Width of the random node and edge feature vectors.

    Returns:
        The populated ``HeteroData`` object. Every edge type carries
        ``edge_index`` (2 x num_edges, long), ``edge_attr``
        (num_edges x feature_dim, drawn with ``torch.randn``) and
        ``edge_label`` (num_edges, long, one constant value per edge type).
        Every node type carries ``num_nodes`` and ``x``
        (num_nodes x feature_dim, drawn with ``torch.randn``).
    """
    data = HeteroData()

    # Relation name -> gzip file holding that relation's rows
    file_paths = {
        'Drug-Protein': 'drug_protein.txt.gz',
        'DDI': 'ddi.txt.gz',
        'Protein-Pathway': 'protein_pathway.txt.gz',
        'Drug-Pathway': 'drug_pathway.txt.gz',
        'Protein-Disease': 'protein_disease.txt.gz',
        'Drug-Disease': 'drug_disease.txt.gz',
        'PPI': 'ppi.txt.gz'
    }

    # Node types and edge types. A tuple (not a set) keeps the order in which
    # the node stores are created identical from run to run.
    node_types = ('Protein', 'Drug', 'Disease', 'Pathway')
    edge_types = [
        ('Drug', 'Drug-Protein', 'Protein'),
        ('Drug', 'DDI', 'Drug'),
        ('Protein', 'Protein-Pathway', 'Pathway'),
        ('Drug', 'Drug-Pathway', 'Pathway'),
        ('Protein', 'Protein-Disease', 'Disease'),
        ('Drug', 'Drug-Disease', 'Disease'),
        ('Protein', 'PPI', 'Protein')
    ]

    # One unique label (0 to 6) per edge type
    relation_labels = {edge_type: label for label, edge_type in enumerate(edge_types)}

    # Per node type: node name -> local index within that type
    node_id_map = {ntype: {} for ntype in node_types}

    # Read the files and fill in the graph
    for edge_type in edge_types:
        src_type, relation, tgt_type = edge_type
        file_name = file_paths[relation]
        print(f"Processing edge type: {edge_type}, file: {file_name}")
        df = read_data(dir_path + file_name)

        src_index = node_id_map[src_type]
        tgt_index = node_id_map[tgt_type]
        src_ids = []
        tgt_ids = []

        # Column-wise iteration avoids the per-row Series that iterrows()
        # builds, which is prohibitively slow on the larger files.
        for src, tgt in zip(df['source'].tolist(), df['target'].tolist()):
            # setdefault evaluates len(...) before inserting, so an unseen
            # node receives the next free index of its type. For same-type
            # relations both names share one dict, which is intended.
            src_ids.append(src_index.setdefault(src, len(src_index)))
            tgt_ids.append(tgt_index.setdefault(tgt, len(tgt_index)))

        # Convert to the PyG layout: row 0 = source indices, row 1 = target indices
        edge_index = torch.tensor([src_ids, tgt_ids], dtype=torch.long)
        data[edge_type].edge_index = edge_index

        # Random edge features, one feature_dim-wide vector per edge
        num_edges = edge_index.size(1)
        data[edge_type].edge_attr = torch.randn(num_edges, feature_dim)

        # Every edge of this type gets the type's label from relation_labels
        data[edge_type].edge_label = torch.full((num_edges,), relation_labels[edge_type], dtype=torch.long)

    # Random node features per node type, drawn in one call so that a node
    # type without any nodes yields an empty (0 x feature_dim) matrix instead
    # of failing in torch.stack.
    for ntype in node_types:
        num_nodes = len(node_id_map[ntype])
        data[ntype].num_nodes = num_nodes
        data[ntype].x = torch.randn(num_nodes, feature_dim)

    return data

# Build the heterogeneous graph with initialised features and labels
hetero_data = create_hetero_graph()

# Inspect the generated heterogeneous graph
print(hetero_data)


Processing edge type: ('Drug', 'Drug-Protein', 'Protein'), file: drug_protein.txt.gz
Processing edge type: ('Drug', 'DDI', 'Drug'), file: ddi.txt.gz


KeyboardInterrupt: 

In [2]:
# Decompress and read the data
def read_data(file_path):
    """Read a gzip-compressed, tab-separated file into a DataFrame.

    The file is opened in text mode and parsed without a header row; the
    resulting columns are named ``source``, ``relation`` and ``target``.

    Args:
        file_path: Path of the gzip file to read.

    Returns:
        pandas.DataFrame with the columns ``source``, ``relation``, ``target``.
    """
    column_names = ['source', 'relation', 'target']
    with gzip.open(file_path, mode='rt') as handle:
        return pd.read_csv(handle, sep='\t', header=None, names=column_names)

# Build the heterogeneous graph and initialise features and relation labels
def create_hetero_graph(dir_path='/data/zhaojingtong/pharmrgdata/', feature_dim=128):
    """Build a ``HeteroData`` graph with per-type node indices from the relation files.

    Each of the seven files is read with ``read_data`` and contributes one
    edge type. Within each node type, node names receive consecutive integer
    indices in order of first appearance; ``edge_index`` uses those indices.

    Args:
        dir_path: Prefix prepended to each file name by plain string
            concatenation, so it must end with a path separator.
        feature_dim: Width of the random node and edge feature vectors.

    Returns:
        The populated ``HeteroData`` object. Every edge type carries
        ``edge_index`` (2 x num_edges, long), ``edge_attr``
        (num_edges x feature_dim, drawn with ``torch.randn``) and
        ``edge_label`` (num_edges, long, one constant value per edge type).
        Every node type carries ``num_nodes`` and ``x``
        (num_nodes x feature_dim, drawn with ``torch.randn``).
    """
    data = HeteroData()

    # Relation name -> gzip file holding that relation's rows
    file_paths = {
        'Drug-Protein': 'drug_protein.txt.gz',
        'DDI': 'ddi.txt.gz',
        'Protein-Pathway': 'protein_pathway.txt.gz',
        'Drug-Pathway': 'drug_pathway.txt.gz',
        'Protein-Disease': 'protein_disease.txt.gz',
        'Drug-Disease': 'drug_disease.txt.gz',
        'PPI': 'ppi.txt.gz'
    }

    # All node types and edge types of the graph. A tuple (not a set) keeps
    # the order in which node stores are created identical from run to run.
    node_types = ('Protein', 'Drug', 'Disease', 'Pathway')
    edge_types = [
        ('Drug', 'Drug-Protein', 'Protein'),
        ('Drug', 'DDI', 'Drug'),
        ('Protein', 'Protein-Pathway', 'Pathway'),
        ('Drug', 'Drug-Pathway', 'Pathway'),
        ('Protein', 'Protein-Disease', 'Disease'),
        ('Drug', 'Drug-Disease', 'Disease'),
        ('Protein', 'PPI', 'Protein')
    ]

    # Per node type: node name -> local index. Every type used in edge_types
    # is listed in node_types, so no per-edge-type existence check is needed.
    node_id = {ntype: {} for ntype in node_types}

    # One unique label (0 to 6) per edge type
    relation_labels = {edge_type: label for label, edge_type in enumerate(edge_types)}

    # Read the files and fill in the graph
    for edge_type in edge_types:
        src_type, relation, tgt_type = edge_type
        file_name = file_paths[relation]
        print(f"Processing edge type: {edge_type}, file: {file_name}")
        df = read_data(dir_path + file_name)

        src_lookup = node_id[src_type]
        tgt_lookup = node_id[tgt_type]
        src_ids = []
        tgt_ids = []

        # Column-wise iteration replaces DataFrame.iterrows(), which builds a
        # Series per row and is very slow on the million-edge files.
        for src, tgt in zip(df['source'].tolist(), df['target'].tolist()):
            # setdefault evaluates len(...) before inserting, so an unseen
            # node receives the next free index of its type.
            src_ids.append(src_lookup.setdefault(src, len(src_lookup)))
            tgt_ids.append(tgt_lookup.setdefault(tgt, len(tgt_lookup)))

        # Convert to the PyG layout: row 0 = source indices, row 1 = target indices
        edge_index = torch.tensor([src_ids, tgt_ids], dtype=torch.long)
        data[edge_type].edge_index = edge_index

        # Random edge features, one feature_dim-wide vector per edge
        num_edges = edge_index.size(1)
        data[edge_type].edge_attr = torch.randn(num_edges, feature_dim)

        # Every edge of this type gets the type's label from relation_labels
        data[edge_type].edge_label = torch.full((num_edges,), relation_labels[edge_type], dtype=torch.long)

    # Random node features, one feature_dim-wide vector per node
    for ntype, id_map in node_id.items():
        num_nodes = len(id_map)
        data[ntype].num_nodes = num_nodes
        data[ntype].x = torch.randn(num_nodes, feature_dim)

    return data

# Build the heterogeneous graph with initialised features and labels
hetero_data = create_hetero_graph()

# Inspect the generated heterogeneous graph
print(hetero_data)


Processing edge type: ('Drug', 'Drug-Protein', 'Protein'), file: drug_protein.txt.gz
Processing edge type: ('Drug', 'DDI', 'Drug'), file: ddi.txt.gz
Processing edge type: ('Protein', 'Protein-Pathway', 'Pathway'), file: protein_pathway.txt.gz
Processing edge type: ('Drug', 'Drug-Pathway', 'Pathway'), file: drug_pathway.txt.gz
Processing edge type: ('Protein', 'Protein-Disease', 'Disease'), file: protein_disease.txt.gz
Processing edge type: ('Drug', 'Drug-Disease', 'Disease'), file: drug_disease.txt.gz
Processing edge type: ('Protein', 'PPI', 'Protein'), file: ppi.txt.gz
HeteroData(
  [1mProtein[0m={
    num_nodes=62295,
    x=[62295, 128]
  },
  [1mDisease[0m={
    num_nodes=5819,
    x=[5819, 128]
  },
  [1mPathway[0m={
    num_nodes=15771,
    x=[15771, 128]
  },
  [1mDrug[0m={
    num_nodes=9279,
    x=[9279, 128]
  },
  [1m(Drug, Drug-Protein, Protein)[0m={
    edge_index=[2, 25994],
    edge_attr=[25994, 128],
    edge_label=[25994]
  },
  [1m(Drug, DDI, Drug)[0m={
   

In [3]:
import torch
save_path = '/data/zhaojingtong/pharmrgdata/hetero_graph.pt'
# Assumes hetero_data is the heterogeneous graph object built in an earlier cell
torch.save(hetero_data, save_path)

In [6]:
# Reload the object stored at save_path and print its summary.
# NOTE(review): torch.load unpickles the file, so only load files you trust.
# Recent PyTorch releases presumably default to weights_only=True, which may
# refuse a pickled graph object — confirm against the installed version.
hetero_data = torch.load(save_path)
print(hetero_data)

HeteroData(
  [1mDrug[0m={
    num_nodes=9279,
    x=[9279, 128]
  },
  [1mPathway[0m={
    num_nodes=15771,
    x=[15771, 128]
  },
  [1mDisease[0m={
    num_nodes=5819,
    x=[5819, 128]
  },
  [1mProtein[0m={
    num_nodes=62295,
    x=[62295, 128]
  },
  [1m(Drug, Drug-Protein, Protein)[0m={
    edge_index=[2, 25994],
    edge_attr=[25994, 128],
    edge_label=[25994]
  },
  [1m(Drug, DDI, Drug)[0m={
    edge_index=[2, 1419538],
    edge_attr=[1419538, 128],
    edge_label=[1419538]
  },
  [1m(Protein, Protein-Pathway, Pathway)[0m={
    edge_index=[2, 281950],
    edge_attr=[281950, 128],
    edge_label=[281950]
  },
  [1m(Drug, Drug-Pathway, Pathway)[0m={
    edge_index=[2, 5114],
    edge_attr=[5114, 128],
    edge_label=[5114]
  },
  [1m(Protein, Protein-Disease, Disease)[0m={
    edge_index=[2, 116699],
    edge_attr=[116699, 128],
    edge_label=[116699]
  },
  [1m(Drug, Drug-Disease, Disease)[0m={
    edge_index=[2, 61726],
    edge_attr=[61726, 128],
    

In [7]:
# Display the node-feature tensors, keyed by node type
hetero_data.x_dict

{'Drug': tensor([[-1.5799, -0.2302, -0.6271,  ..., -0.0334, -0.9360, -0.0659],
         [ 0.3712, -0.0454,  1.4050,  ...,  0.5732,  0.4253, -0.0274],
         [-0.0781, -0.7583,  1.3575,  ..., -0.3998,  1.2230,  0.6880],
         ...,
         [-0.5286, -2.7615, -0.2162,  ..., -1.2668, -1.2173, -1.3172],
         [ 0.3995, -0.1491,  0.0561,  ..., -0.0322,  1.7017,  0.2655],
         [ 0.5760, -1.5194,  2.1641,  ...,  0.4785,  2.4810, -0.3658]]),
 'Pathway': tensor([[ 2.1880,  0.4213,  0.2848,  ...,  0.5361, -2.1169, -0.9894],
         [-1.4259, -0.2198, -0.5540,  ...,  0.0946,  0.8871, -1.0916],
         [-0.7687,  1.9800, -0.5931,  ...,  0.6007,  1.4102, -0.7782],
         ...,
         [-1.6508,  0.0529, -0.2870,  ..., -1.7571, -2.5521, -1.0535],
         [ 0.3769, -0.8440, -0.5302,  ..., -0.7426,  1.4763, -1.4021],
         [ 0.0892, -1.0831, -0.3995,  ...,  0.5859, -0.4080,  0.1513]]),
 'Disease': tensor([[-1.3973, -1.0292, -1.0393,  ...,  0.7628, -0.3516, -1.0327],
         [ 0.02

In [10]:
import torch

# Initialise the node and edge features of every type as random vectors
def initialize_node_edge_features(hetero_data, feature_dim=1024):
    """Overwrite all node and edge features with ``torch.randn`` tensors.

    The graph is modified in place: ``x`` is replaced for every node type and
    ``edge_attr`` for every edge type.

    Args:
        hetero_data: Graph object exposing ``node_types``, ``edge_types`` and
            item access to the per-type stores. Node stores must provide
            ``num_nodes`` and edge stores must provide ``edge_index``.
        feature_dim: Width of the generated feature vectors. Defaults to 1024,
            the value that was previously hard-coded (the old comments said
            128, which did not match the code).

    Returns:
        The same ``hetero_data`` object that was passed in.
    """
    # Node features: one [num_nodes, feature_dim] tensor per node type
    for node_type in hetero_data.node_types:
        num_nodes = hetero_data[node_type].num_nodes
        hetero_data[node_type].x = torch.randn(num_nodes, feature_dim)

    # Edge features: one [num_edges, feature_dim] tensor per edge type
    for edge_type in hetero_data.edge_types:
        # The second dimension of edge_index is the number of edges of this type
        num_edges = hetero_data[edge_type].edge_index.size(1)
        hetero_data[edge_type].edge_attr = torch.randn(num_edges, feature_dim)
    return hetero_data

# Assumes hetero_data is the heterogeneous graph built earlier
hetero_data = initialize_node_edge_features(hetero_data)

# Check that every node type's features have the expected dimension
print(hetero_data)

HeteroData(
  Drug-Disease=[2, 61726],
  Protein-Pathway=[2, 281950],
  Protein-Disease=[2, 116699],
  DDI=[2, 1419538],
  Drug-Pathway=[2, 5114],
  PPI=[2, 131406],
  Drug-Protein=[2, 25994],
  [1mDrug[0m={
    num_nodes=9279,
    x=[9279, 1024]
  },
  [1mDisease[0m={
    num_nodes=5819,
    x=[5819, 1024]
  },
  [1mPathway[0m={
    num_nodes=15771,
    x=[15771, 1024]
  },
  [1mProtein[0m={
    num_nodes=62295,
    x=[62295, 1024]
  }
)


In [11]:
# Save the current hetero_data to save_path again; presumably this replaces
# the file written by the earlier save cell — confirm save_path is unchanged.
torch.save(hetero_data, save_path)

In [9]:
# Display the edge_index tensors, keyed by edge type.
# NOTE(review): the captured output shows indices larger than the per-type node
# counts printed earlier, which suggests this graph was built with global
# node IDs — confirm which create_hetero_graph version produced it.
hetero_data.edge_index_dict


{('Drug',
  'Drug-Protein',
  'Protein'): tensor([[   0,    2,    4,  ..., 5367, 3299, 3153],
         [   1,    3,    5,  ..., 3699,  136,  691]]),
 ('Drug',
  'DDI',
  'Drug'): tensor([[ 2957,  7584,  9954,  ...,   192,  3854,   870],
         [10594, 10257,  6654,  ...,  8319, 10158,  6461]]),
 ('Protein',
  'Protein-Pathway',
  'Pathway'): tensor([[12143, 12145, 12147,  ...,  5750, 26021, 37079],
         [12144, 12146, 12148,  ..., 13232, 16957, 16013]]),
 ('Drug',
  'Drug-Pathway',
  'Pathway'): tensor([[ 2034,    16, 10556,  ...,  7868,   414,  5081],
         [28434, 72596, 72597,  ..., 30672, 44861, 72652]]),
 ('Protein',
  'Protein-Disease',
  'Disease'): tensor([[35445, 21094, 33886,  ...,  7764,   980, 27251],
         [72777, 72778, 72779,  ..., 76667, 73043, 72791]]),
 ('Drug',
  'Drug-Disease',
  'Disease'): tensor([[ 4182,  9877,  6446,  ...,  4965,  7009,  5642],
         [73137, 72905, 83081,  ..., 72980, 83608, 73867]]),
 ('Protein',
  'PPI',
  'Protein'): tensor([[3