In [4]:
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Read both source tables: protein/gene links and gene/disease associations
protein_to_gene_file = "../preprocessing/protein_to_gene_filtered.csv"
psychiatric_disorders_file = "../psychiatric_disorders_full_GDA.csv"
df_protein_to_gene = pd.read_csv(protein_to_gene_file)
df_psychiatric_disorders = pd.read_csv(psychiatric_disorders_file)

# Columns used as edge features for each table
protein_edge_feature_columns = [
    "neighborhood", "fusion", "cooccurence", "coexpression",
    "experimental", "database", "textmining", "combined_score"
]
disorder_edge_feature_columns = [
    "score", "yearInitial", "yearFinal", "numPMIDs", "evidence_index"
]

# Min-max scale every edge feature column of both tables. A fresh scaler is
# fitted per column, so each column is rescaled independently of the others,
# and the scaled values overwrite the column in its DataFrame.
for frame, columns in (
    (df_protein_to_gene, protein_edge_feature_columns),
    (df_psychiatric_disorders, disorder_edge_feature_columns),
):
    for column in columns:
        frame[column] = MinMaxScaler().fit_transform(frame[[column]])

# Shared node registries used while the graph is assembled
node_dict = {}        # node label -> integer node index
node_type_dict = {}   # integer node index -> node type code
node_index = 0        # next index to hand out

def get_node_index(node, node_type):
    """Return the integer index of ``node``, registering it on first use.

    Args:
        node: Hashable label identifying the node.
        node_type: Type code stored for the node when it is first registered.

    Returns:
        The index assigned to ``node``.

    Note:
        Lookup is by label only. If ``node`` is already registered, its
        existing index is returned and ``node_type`` is ignored, so the type
        recorded at first registration is kept.
    """
    global node_index
    assigned = node_dict.get(node)
    if assigned is None:
        # First time this label is seen: take the next free index.
        assigned = node_index
        node_dict[node] = assigned
        node_type_dict[assigned] = node_type
        node_index = assigned + 1
    return assigned

# Create graph edges and edge features
# edge_index collects [source, target] node-index pairs; edge_features holds
# the feature vector for the pair stored at the same position.
edge_index = []
edge_features = []

# Add protein-to-gene edges
for _, row in df_protein_to_gene.iterrows():
    protein1 = row["protein1"]
    protein2 = row["protein2"]
    gene1 = row["gene1"]
    gene2 = row["gene2"]

    # Registration order (proteins first, then genes) determines the indices
    # assigned to nodes seen for the first time in this row.
    p1_idx = get_node_index(protein1, 0)  # Protein node
    p2_idx = get_node_index(protein2, 0)  # Protein node
    g1_idx = get_node_index(gene1, 1)     # Gene node
    g2_idx = get_node_index(gene2, 1)     # Gene node

    # The feature vector is the same for every edge built from this row, so
    # extract it once instead of once per appended edge.
    row_features = row[protein_edge_feature_columns].values

    # Three undirected links per row: protein1-protein2, and each protein to
    # the gene in its matching column (protein1-gene1, protein2-gene2).
    # NOTE(review): this assumes gene1/gene2 are the genes of protein1/protein2
    # respectively (inferred from the column names). Previously protein1 was
    # linked to gene2 and protein2 was never linked to gene2.
    for source, target in [(p1_idx, p2_idx), (p1_idx, g1_idx), (p2_idx, g2_idx)]:
        # Store both directions so the edge list is symmetric.
        edge_index.append([source, target])
        edge_index.append([target, source])

        edge_features.append(row_features)
        edge_features.append(row_features)

# Add gene-to-disease edges
for _, record in df_psychiatric_disorders.iterrows():
    gene = record["gene_symbol"]
    disease = record["disease_name"]

    # Rows missing either endpoint cannot form an edge.
    if pd.isna(disease) or pd.isna(gene):
        continue

    # Only attach diseases to labels already registered as nodes; a gene
    # symbol not present in node_dict is skipped rather than added.
    if gene not in node_dict:
        continue

    gene_idx = get_node_index(gene, 1)  # Gene node
    disease_idx = get_node_index(disease, 2)  # Disease node

    # Emit the edge in both directions, each with this row's feature vector.
    for source, target in ((gene_idx, disease_idx), (disease_idx, gene_idx)):
        edge_index.append([source, target])
        edge_features.append(record[disorder_edge_feature_columns].values)


In [5]:
# Pad every edge feature vector with trailing zeros up to the longest vector
# so all rows fit into one rectangular tensor. Because padding is appended at
# the end, a given column can hold different quantities for edges whose
# feature vectors came from different column lists.
max_edge_feature_size = max(len(edge) for edge in edge_features)
padded_edge_features = [
    list(edge) + [0] * (max_edge_feature_size - len(edge)) for edge in edge_features
]

# Convert edge_index and edge_features to tensors
# edge_index is rebound from a list of [source, target] pairs to a tensor
# transposed to shape [2, num_edges]. Convert only while it is still the raw
# list: re-running this cell on the already-converted tensor would transpose
# it a second time and silently produce shape [num_edges, 2].
if not torch.is_tensor(edge_index):
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
edge_attr = torch.tensor(padded_edge_features, dtype=torch.float)

# Create node features
# Each node gets a single feature: its type code as a float.
num_nodes = len(node_dict)
node_features = torch.zeros((num_nodes, 1), dtype=torch.float)
for idx, node_type in node_type_dict.items():
    node_features[idx] = node_type  # 0 for protein, 1 for gene, 2 for disease

# Create graph data object
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr)

In [6]:
# Show the Data object's summary repr as the cell output
data

Data(x=[2981, 1], edge_index=[2, 1843122], edge_attr=[1843122, 8])