In [1]:
import pandas as pd

file_path = "9606.protein.links.detailed.v12.0.txt"

columns = [
    "protein1", "protein2", "neighborhood", "fusion", "cooccurence", 
    "coexpression", "experimental", "database", "textmining", "combined_score"
]

data = pd.read_csv(file_path, sep=" ", names=columns, header=1)

print(data.head())

               protein1              protein2  neighborhood  fusion  \
0  9606.ENSP00000000233  9606.ENSP00000427567             0       0   
1  9606.ENSP00000000233  9606.ENSP00000253413             0       0   
2  9606.ENSP00000000233  9606.ENSP00000493357             0       0   
3  9606.ENSP00000000233  9606.ENSP00000324127             0       0   
4  9606.ENSP00000000233  9606.ENSP00000325266             0       0   

   cooccurence  coexpression  experimental  database  textmining  \
0            0             0           128         0          70   
1            0           118            49         0          69   
2            0            56            53         0         457   
3            0             0            46         0         197   
4            0            94           125         0          50   

   combined_score  
0             154  
1             151  
2             471  
3             201  
4             180  


In [3]:
import json 

with open("idmapping_2024_11_16.json", "r") as f:
    id_mapping = json.load(f)

In [13]:
protein_to_gene = {
    entry["from"]: entry["to"]["genes"][0]["geneName"]["value"] 
    for entry in id_mapping["results"] if "genes" in entry["to"] and "geneName" in entry["to"]["genes"][0]
}

data["gene1"] = data["protein1"].str.split('.').str[-1].map(protein_to_gene)
data["gene2"] = data["protein2"].str.split('.').str[-1].map(protein_to_gene)

protein_to_gene_data_file = "protein_to_gene_data.csv"
data.to_csv(protein_to_gene_data_file, index=False)

In [8]:
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

csv_file = "protein_to_gene_data.csv" 
df = pd.read_csv(csv_file)

edge_feature_columns = [
    "neighborhood", "fusion", "cooccurence", "coexpression",
    "experimental", "database", "textmining", "combined_score"
]
for col in edge_feature_columns:
    scaler = MinMaxScaler()
    df[col] = scaler.fit_transform(df[[col]])

node_dict = {}
node_type_dict = {}
node_index = 0

def get_node_index(node, node_type):
    global node_index
    if node not in node_dict:
        node_dict[node] = node_index
        node_type_dict[node_index] = node_type
        node_index += 1
    return node_dict[node]

edge_index = []
edge_features = []

for _, row in df.iterrows():
    protein1 = row["protein1"]
    protein2 = row["protein2"]
    gene1 = row["gene1"]
    gene2 = row["gene2"]
    
    p1_idx = get_node_index(protein1, 0)  # Protein node
    p2_idx = get_node_index(protein2, 0)  # Protein node
    g1_idx = get_node_index(gene1, 1)     # Gene node
    g2_idx = get_node_index(gene2, 1)     # Gene node
    
    for source, target in [(p1_idx, p2_idx), (p1_idx, g1_idx), (p1_idx, g2_idx)]:
        edge_index.append([source, target])
        edge_index.append([target, source])  
        
        edge_features.append(row[edge_feature_columns].values)
        edge_features.append(row[edge_feature_columns].values)

edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
edge_attr = torch.tensor(edge_features, dtype=torch.float)

num_nodes = len(node_dict)
node_features = torch.zeros((num_nodes, 1), dtype=torch.float)
for idx, node_type in node_type_dict.items():
    node_features[idx] = node_type  # 0 for protein, 1 for gene

data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr)

In [14]:
data

Data(x=[1976, 1], edge_index=[2, 6000], edge_attr=[6000, 8])