In [1]:
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.nn import EdgeConv, NNConv
from torch_geometric.data import Data, DataLoader

In [2]:
import pandas as pd
import os

### Load training samples

In [3]:
# os.listdir returns entries in arbitrary order, so sort the names to make the
# sample order (and therefore the order of anything built from `files`)
# reproducible across runs and machines. Sub-directories are skipped.
files = sorted(
    f for f in os.listdir('training-samples')
    if os.path.isfile(os.path.join('training-samples', f))
)
len(files)

820

### Data Preparation for GNN

- `edge_attr`: edge features that a GNN layer can use
    - Edge feature matrix with shape `[num_edges, num_edge_features]`
- `edge_index`: Graph connectivity with shape `[2, num_edges]`

References: https://pytorch-geometric.readthedocs.io/en/2.6.0/get_started/introduction.html#data-handling-of-graphs

**Create Node Label to Integer Index Mapping**
- Extract all unique node labels from the dataset (these are available from `union_ppi`)
- Create a dictionary that maps each label to a unique integer index

In [4]:
# Tab-separated file read without a header row, so columns are addressed by
# position; columns 0 and 1 are treated as the node labels of each row.
union_ppi = pd.read_csv('processed-data/union_ppi.txt', sep='\t', header=None)

# Every distinct label that appears in either of the two columns.
unique_nodes = set(union_ppi[0].tolist()).union(union_ppi[1].tolist())

In [5]:
# Map each node label to an integer ID. Labels are sorted first so that a given
# label always receives the same ID, independent of set iteration order.
ordered_labels = sorted(unique_nodes)
label_id_map = dict(zip(ordered_labels, range(len(ordered_labels))))
num_nodes = len(label_id_map)
print(f"Total unique nodes: {num_nodes}")

Total unique nodes: 17407


**Build the `Data` objects - one for each training sample**

In [6]:
data_folder = 'training-samples/'
data_list = []

# Every sample graph is built over the same global node set, so the node count
# is loop-invariant and computed once instead of once per file.
num_nodes = len(label_id_map)

# Build one PyG `Data` object per training-sample file.
for training_sample in files:
    print('Processing: ', training_sample)
    training_df = pd.read_csv(os.path.join(data_folder, training_sample), sep='\t')
    training_data = training_df.values.tolist()

    edge_index_list = [[], []]  # [first-endpoint ids, second-endpoint ids]
    edge_attr_list = []         # each edge attribute: [prize]
    edge_label_list = []        # per-edge label; becomes data.y
    selected_nodes = set()      # endpoints of edges with label 1

    # Each row unpacks as (node1, node2, prize, flag, label); `flag` is not
    # used when building the graph.
    for node1, node2, prize, _flag, label in training_data:
        try:
            idx1 = label_id_map[node1]
            idx2 = label_id_map[node2]
        except KeyError as err:
            # Same exception type as the bare lookup, but the message now
            # identifies which sample file contains the unknown node.
            raise KeyError(
                f"node {err.args[0]!r} in {training_sample!r} is missing from label_id_map"
            ) from err

        edge_index_list[0].append(idx1)
        edge_index_list[1].append(idx2)
        edge_attr_list.append([prize])
        edge_label_list.append(label)

        # if edge label = 1, then record its nodes
        if label == 1:
            selected_nodes.update((idx1, idx2))

    # lists -> torch tensors
    edge_index = torch.tensor(edge_index_list, dtype=torch.long)
    # view(-1, 1) keeps the [num_edges, 1] layout even when a sample has no
    # rows (a bare empty list would otherwise produce a 1-D tensor).
    edge_attr = torch.tensor(edge_attr_list, dtype=torch.float).view(-1, 1)
    edge_labels = torch.tensor(edge_label_list, dtype=torch.float).view(-1, 1)

    # init node features: one feature per node (0 by default)
    x = torch.zeros((num_nodes, 1), dtype=torch.float)

    # Mark nodes as 1 if they are an endpoint of a selected (label 1) edge.
    # NOTE(review): x is derived from the same labels that are stored in y; if
    # a model is trained to predict y from x this leaks the target - confirm
    # that this is intended.
    if selected_nodes:
        x[torch.tensor(sorted(selected_nodes), dtype=torch.long)] = 1

    data_obj = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=edge_labels)
    data_obj.file_name = training_sample
    data_list.append(data_obj)

Processing:  biotin_metabolism_train_1902.txt
Processing:  alanine__aspartate_a_train_2535.txt
Processing:  alanine__aspartate_a_train_3273.txt
Processing:  alanine__aspartate_a_train_3711.txt
Processing:  alanine__aspartate_a_train_3781.txt
Processing:  alanine__aspartate_a_train_6130.txt
Processing:  alanine__aspartate_a_train_6672.txt
Processing:  alanine__aspartate_a_train_6727.txt
Processing:  alanine__aspartate_a_train_8527.txt
Processing:  alanine__aspartate_a_train_9941.txt
Processing:  alpha_linolenic_acid_train_1854.txt
Processing:  alpha_linolenic_acid_train_2371.txt
Processing:  alpha_linolenic_acid_train_4263.txt
Processing:  alpha_linolenic_acid_train_4939.txt
Processing:  alpha_linolenic_acid_train_5635.txt
Processing:  alpha_linolenic_acid_train_5783.txt
Processing:  alpha_linolenic_acid_train_6633.txt
Processing:  alpha_linolenic_acid_train_7454.txt
Processing:  alpha_linolenic_acid_train_9373.txt
Processing:  alpha_linolenic_acid_train_9806.txt
Processing:  aminoacyl_

Processing:  d_glutamine_and_d_gl_train_3853.txt
Processing:  d_glutamine_and_d_gl_train_5918.txt
Processing:  d_glutamine_and_d_gl_train_6415.txt
Processing:  d_glutamine_and_d_gl_train_6453.txt
Processing:  d_glutamine_and_d_gl_train_6575.txt
Processing:  d_glutamine_and_d_gl_train_7147.txt
Processing:  d_glutamine_and_d_gl_train_7653.txt
Processing:  d_glutamine_and_d_gl_train_8627.txt
Processing:  d_glutamine_and_d_gl_train_9192.txt
Processing:  d_glutamine_and_d_gl_train_9381.txt
Processing:  drug_metabolism___cy_train_1576.txt
Processing:  drug_metabolism___cy_train_2581.txt
Processing:  drug_metabolism___cy_train_2665.txt
Processing:  drug_metabolism___cy_train_3545.txt
Processing:  drug_metabolism___cy_train_4503.txt
Processing:  drug_metabolism___cy_train_6262.txt
Processing:  drug_metabolism___cy_train_7207.txt
Processing:  drug_metabolism___cy_train_7291.txt
Processing:  drug_metabolism___cy_train_7643.txt
Processing:  drug_metabolism___cy_train_7684.txt
Processing:  drug_me

Processing:  glycosaminoglycan_bi_train_9258.txt
Processing:  glycosaminoglycan_bi_train_9612.txt
Processing:  glycosaminoglycan_bi_train_9670.txt
Processing:  glycosaminoglycan_de_train_1224.txt
Processing:  glycosaminoglycan_de_train_1711.txt
Processing:  glycosaminoglycan_de_train_2488.txt
Processing:  glycosaminoglycan_de_train_3162.txt
Processing:  glycosaminoglycan_de_train_3395.txt
Processing:  glycosaminoglycan_de_train_4549.txt
Processing:  glycosaminoglycan_de_train_4714.txt
Processing:  glycosaminoglycan_de_train_7274.txt
Processing:  glycosaminoglycan_de_train_8470.txt
Processing:  glycosaminoglycan_de_train_9198.txt
Processing:  glycosphingolipid_bi__1_train_1944.txt
Processing:  glycosphingolipid_bi__1_train_2264.txt
Processing:  glycosphingolipid_bi__1_train_2271.txt
Processing:  glycosphingolipid_bi__1_train_3112.txt
Processing:  glycosphingolipid_bi__1_train_4340.txt
Processing:  glycosphingolipid_bi__1_train_4541.txt
Processing:  glycosphingolipid_bi__1_train_6175.txt

Processing:  nitrogen_metabolism_train_7075.txt
Processing:  nitrogen_metabolism_train_7440.txt
Processing:  nitrogen_metabolism_train_8986.txt
Processing:  nitrogen_metabolism_train_9395.txt
Processing:  nitrogen_metabolism_train_9476.txt
Processing:  nitrogen_metabolism_train_9511.txt
Processing:  one_carbon_pool_by_f_train_1293.txt
Processing:  one_carbon_pool_by_f_train_1466.txt
Processing:  one_carbon_pool_by_f_train_1656.txt
Processing:  one_carbon_pool_by_f_train_1671.txt
Processing:  one_carbon_pool_by_f_train_3035.txt
Processing:  one_carbon_pool_by_f_train_3699.txt
Processing:  one_carbon_pool_by_f_train_4046.txt
Processing:  one_carbon_pool_by_f_train_4656.txt
Processing:  one_carbon_pool_by_f_train_5988.txt
Processing:  one_carbon_pool_by_f_train_9250.txt
Processing:  other_types_of_o_gly_train_2394.txt
Processing:  other_types_of_o_gly_train_2494.txt
Processing:  other_types_of_o_gly_train_2893.txt
Processing:  other_types_of_o_gly_train_5221.txt
Processing:  other_types_o

Processing:  sphingolipid_metabol_train_5576.txt
Processing:  sphingolipid_metabol_train_6402.txt
Processing:  sphingolipid_metabol_train_7075.txt
Processing:  sphingolipid_metabol_train_7417.txt
Processing:  sphingolipid_metabol_train_7532.txt
Processing:  sphingolipid_metabol_train_7670.txt
Processing:  sphingolipid_metabol_train_8777.txt
Processing:  starch_and_sucrose_m_train_1473.txt
Processing:  starch_and_sucrose_m_train_2356.txt
Processing:  starch_and_sucrose_m_train_3953.txt
Processing:  starch_and_sucrose_m_train_4604.txt
Processing:  starch_and_sucrose_m_train_5326.txt
Processing:  starch_and_sucrose_m_train_5329.txt
Processing:  starch_and_sucrose_m_train_5671.txt
Processing:  starch_and_sucrose_m_train_5869.txt
Processing:  starch_and_sucrose_m_train_5925.txt
Processing:  starch_and_sucrose_m_train_8147.txt
Processing:  steroid_biosynthesis_train_1126.txt
Processing:  steroid_biosynthesis_train_3268.txt
Processing:  steroid_biosynthesis_train_4516.txt
Processing:  steroid

In [7]:
# Serialize the full list of Data objects to a single file on disk.
# NOTE(review): the file holds pickled Python objects rather than plain
# tensors, so reloading it with torch.load may require weights_only=False on
# recent PyTorch versions - confirm against the installed version.
torch.save(obj=data_list, f='dataset.pt')