In [3]:
import os
import torch
from torch.cuda.amp import autocast, GradScaler
from torch_geometric.data import Data
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
import copy
import networkx as nx
import matplotlib.pyplot as plt

In [4]:
torch.cuda.set_device(1)  # Use GPU 1

# Verify that the correct GPU is being used
if torch.cuda.is_available():
    current_device = torch.cuda.current_device()
    print(f"Using GPU: {torch.cuda.get_device_name(current_device)}")
    print(f"Device ID: {current_device}")

Using GPU: NVIDIA H100 PCIe
Device ID: 1


In [10]:
# Load node features
s_df = pd.read_csv('/data/servilla/DT_HGNN/Nodes/s_emb_183.csv', index_col=0)  # Substrates CSV file
proteins_df = pd.read_csv('/data/servilla/DT_HGNN/Nodes/p_emb_filtered.csv', index_col=0)  # Combined proteins CSV file

# Load edges
tp_s_df = pd.read_csv('/data/servilla/DT_HGNN/Edges/trans_sub_edges_13347.csv', index_col=0)
ppi_df = pd.read_csv('/data/servilla/DT_HGNN/Edges/ppi_edges_6663523.csv', index_col=0)
ssi_df = pd.read_csv('/data/servilla/DT_HGNN/Edges/rhea_edges_2186.csv', index_col=0)

In [11]:
# Create sets of all node labels for proteins and substrates
protein_labels = set(proteins_df.index.tolist())
substrate_labels = set(s_df.index.tolist())

# Identify connected nodes for proteins and substrates
connected_protein_labels = set(tp_s_df['source']).union(set(tp_s_df['target']),
                                                        set(ppi_df['source']), set(ppi_df['target']))

connected_substrate_labels = set(tp_s_df['source']).union(set(tp_s_df['target']),
                                                          set(ssi_df['source']), set(ssi_df['target']))

# Identify isolated nodes for proteins and substrates
isolated_protein_labels = protein_labels - connected_protein_labels
isolated_substrate_labels = substrate_labels - connected_substrate_labels

# Print the counts of connected and isolated nodes
print(f"Total protein nodes: {len(protein_labels)}")
print(f"Connected protein nodes: {len(connected_protein_labels)}")
print(f"Isolated protein nodes: {len(isolated_protein_labels)}")

print(f"Total substrate nodes: {len(substrate_labels)}")
print(f"Connected substrate nodes: {len(connected_substrate_labels)}")
print(f"Isolated substrate nodes: {len(isolated_substrate_labels)}")

# # Convert sets to lists for easier handling
# connected_protein_labels = list(connected_protein_labels)
# isolated_protein_labels = list(isolated_protein_labels)
# connected_substrate_labels = list(connected_substrate_labels)
# isolated_substrate_labels = list(isolated_substrate_labels)

# # Save lists to files
# with open('/data/servilla/DT_HGNN/workbench/connected_isolated_nodes/connected_protein_nodes.txt', 'w') as f:
#     for item in connected_protein_labels:
#         f.write("%s\n" % item)

# with open('/data/servilla/DT_HGNN/workbench/connected_isolated_nodes/isolated_protein_nodes.txt', 'w') as f:
#     for item in isolated_protein_labels:
#         f.write("%s\n" % item)

# with open('/data/servilla/DT_HGNN/workbench/connected_isolated_nodes/connected_substrate_nodes.txt', 'w') as f:
#     for item in connected_substrate_labels:
#         f.write("%s\n" % item)

# with open('/data/servilla/DT_HGNN/workbench/connected_isolated_nodes/isolated_substrate_nodes.txt', 'w') as f:
#     for item in isolated_substrate_labels:
#         f.write("%s\n" % item)

# print("Lists of connected and isolated nodes for proteins and substrates saved.")

Total protein nodes: 237197
Connected protein nodes: 237377
Isolated protein nodes: 0
Total substrate nodes: 183
Connected substrate nodes: 12492
Isolated substrate nodes: 0


In [9]:
import pandas as pd

# Load the isolated protein nodes
isolated_proteins_path = '/data/servilla/DT_HGNN/workbench/connected_isolated_nodes/isolated_protein_nodes.txt'
with open(isolated_proteins_path, 'r') as file:
    isolated_protein_nodes = file.read().splitlines()

# Load the p_emb.csv file
proteins_df = pd.read_csv('/data/servilla/DT_HGNN/Nodes/p_emb.csv', index_col=0)

# Filter out the isolated protein nodes
filtered_proteins_df = proteins_df[~proteins_df.index.isin(isolated_protein_nodes)]

# Save the filtered DataFrame to a new CSV file
filtered_proteins_df.to_csv('/data/servilla/DT_HGNN/Nodes/p_emb_filtered.csv')

print("Filtered protein embeddings saved to p_emb_filtered.csv.")


Filtered protein embeddings saved to p_emb_filtered.csv.
