In [243]:
import os
import torch
from torch.cuda.amp import autocast, GradScaler
from torch_geometric.data import Data
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
import copy
import random
import multiprocessing as mp
from torch_geometric.data import HeteroData
import torch
from torch_geometric.nn import GCNConv
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [244]:
# Enable CUDA debugging, use to block cuda calls until completion
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [245]:
# Enable to use cuda
# torch.cuda.set_device(1)  # Use GPU 1

# # Verify that the correct GPU is being used
# if torch.cuda.is_available():
#     current_device = torch.cuda.current_device()
#     print(f"Using GPU: {torch.cuda.get_device_name(current_device)}")
#     print(f"Device ID: {current_device}")

In [246]:
# Load node datasets
s_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Nodes/s_emb_full_183.csv', index_col=0) # Substrates
p_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Nodes/p_emb_full_237197.csv', index_col=0) # Proteins

In [247]:
# Load edge datasets (combined with negative samples)
ppi_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/combined_ppi_edges_full.csv') # Protein-Protein Interactions
ssi_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/combined_ssi_edges_full.csv') # Substrate-Substrate Interactions
tp_s_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/combined_tp_s_edges_full.csv',) # Transporter Protein-Substrate Interactions

In [248]:
# Inspect and clean the data, node dataframes should be numeric
def inspect_and_clean(df):
    non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
    print(f"Non-numeric columns: {non_numeric_columns}")
    if len(non_numeric_columns) > 0:
        df[non_numeric_columns] = df[non_numeric_columns].apply(pd.to_numeric, errors='coerce')
    df = df.fillna(0)
    return df

In [249]:
s_df = inspect_and_clean(s_df)
p_df = inspect_and_clean(p_df)

Non-numeric columns: Index([], dtype='object')
Non-numeric columns: Index([], dtype='object')


In [250]:
# Convert features to numpy arrays
s_features = s_df.values
p_features = p_df.values

In [251]:
# Check shapes to ensure correct dimensions
print(f"s_features shape: {s_features.shape}")  # Expected (183, 1536)
print(f"p_features shape: {p_features.shape}")  # Expected (237197, 2048)

s_features shape: (183, 1536)
p_features shape: (237197, 2048)


In [252]:
# Normalize node features (normalizes columns to have mean 0 and variance 1)
s_features = (s_features - np.mean(s_features, axis=0)) / np.std(s_features, axis=0)
p_features = (p_features - np.mean(p_features, axis=0)) / np.std(p_features, axis=0)

In [253]:
# Define the transformation layers, changes the number of features 1536 -> 2048
# for substrates and 2048 -> 2048 for proteins. The transform_p layer is useful 
# for transforming the feature representation within the same dimensional space,
#  y = Wx + b.

device = torch.device('cpu')  # Temporarily switch to CPU


transform_s = Linear(1536, 2048).to(device) # Change depending on the number of features
transform_p = Linear(2048, 2048).to(device)

In [254]:
# Apply transformations in batches, this can be useful when dealing with large 
# datasets that may not fit into memory or GPU all at once. 
def transform_in_batches(features, transform_layer, batch_size=10000):
    num_samples = features.shape[0]
    print(f"Number of samples: {num_samples}")
    transformed_features = []
    for i in range(0, num_samples, batch_size):
        batch = features[i:i + batch_size]
        batch_tensor = torch.tensor(batch, dtype=torch.float).to(device)
        transformed_batch = transform_layer(batch_tensor)
        transformed_features.append(transformed_batch.detach().cpu().numpy())  # Use detach() before numpy()
    return np.vstack(transformed_features) # Stack arrays in sequence vertically (row wise)


In [255]:
s_features_transformed = transform_in_batches(s_features, transform_s)
p_features_transformed = transform_in_batches(p_features, transform_p)


Number of samples: 183
Number of samples: 237197


In [256]:
# Convert back to tensors
s_features_tensor = torch.tensor(s_features_transformed, dtype=torch.float).to(device)
p_features_tensor = torch.tensor(p_features_transformed, dtype=torch.float).to(device)


In [257]:
# Combine features, vertically stacks features (dim=0) to create a single tensor
all_features = torch.cat([p_features_tensor, s_features_tensor], dim=0)


In [258]:
protein_ids = set(p_df.index)
substrate_ids = set(s_df.index)

In [259]:
def get_mapping(p_dataframe, s_dataframe):
    protein_mapping = {protein_id: i for i, protein_id in enumerate(p_dataframe.index)}
    substrate_mapping = {substrate_id: i for i, substrate_id in enumerate(s_dataframe.index)}
    return protein_mapping, substrate_mapping

In [260]:
# # Create mappings from the index of the DataFrame since 'Uniprot ID' is the index
# protein_mapping = {protein_id: i for i, protein_id in enumerate(p_df.index)}
# substrate_mapping = {substrate_id: i for i, substrate_id in enumerate(s_df.index)}


In [261]:
def map_edges_debug(edge_df, source_mapping, target_mapping, edge_type):
    edge_df = edge_df.copy()

    # Apply mapping
    edge_df['source'] = edge_df['source'].map(source_mapping)
    edge_df['target'] = edge_df['target'].map(target_mapping)

    # Drop rows with NaNs in mapped columns
    edge_df.dropna(subset=['source', 'target'], inplace=True)
      
    return edge_df




In [262]:
protein_mapping, substrate_mapping = get_mapping(p_df, s_df)

In [263]:
# # Print the entire protein_mapping dictionary
# print("Protein Mapping Dictionary:")
# for protein, index in protein_mapping.items():
#     print(f"{protein}: {index}")

# Alternatively, to print just the first few entries
print(len(protein_mapping))
print("First few entries in the protein_mapping dictionary:")
for i, (protein, index) in enumerate(protein_mapping.items()):
    print(f"{protein}: {index}")
    if i >= 9:  # Limit to first 10 entries
        break


237197
First few entries in the protein_mapping dictionary:
A0A061ACU2: 0
A0A061AE05: 1
A0A061I403: 2
A0A072ULZ1: 3
A0A072VDF2: 4
A0A075F7E9: 5
A0A075QQ08: 6
A0A087WPF7: 7
A0A088MLT8: 8
A0A089QRB9: 9


In [264]:

# Alternatively, to print just the first few entries
print(len(substrate_mapping))
print("First few entries in the protein_mapping dictionary:")
for i, (substrate, index) in enumerate(substrate_mapping.items()):
    print(f"{substrate}: {index}")
    if i >= 9:  # Limit to first 10 entries
        break

183
First few entries in the protein_mapping dictionary:
CHEBI:30616: 0
CHEBI:64837: 1
CHEBI:58245: 2
CHEBI:57673: 3
CHEBI:58115: 4
CHEBI:456215: 5
CHEBI:58437: 6
CHEBI:64716: 7
CHEBI:57643: 8
CHEBI:72999: 9


In [265]:
# Apply the debugging function to map edges
mapped_ppi_edges_df = map_edges_debug(ppi_df, protein_mapping, protein_mapping, 'PPI')
mapped_ssi_edges_df = map_edges_debug(ssi_df, substrate_mapping, substrate_mapping, 'SSI')
mapped_tp_s_df = map_edges_debug(tp_s_df, protein_mapping, substrate_mapping, 'TP-S')

In [266]:
print(mapped_ppi_edges_df)
print(mapped_ssi_edges_df)
print(mapped_tp_s_df)

          source  target  label
0         157025  225772      1
1         187416  234940      1
2         234940  187416      1
3         225772  100854      1
4         225772  157025      1
...          ...     ...    ...
13327041  106801   24539      0
13327042  106199  133771      0
13327043   98851   90276      0
13327044  191132   11958      0
13327045  229284  226859      0

[13327046 rows x 3 columns]
      source  target  label
0         89      17      1
1         17      89      1
2         99      78      1
3         78      99      1
4         19     129      1
...      ...     ...    ...
4349      71      33      0
4350      92      97      0
4351     140     145      0
4352      24     153      0
4353      86      81      0

[4354 rows x 3 columns]
       source  target  label
0      110150       0      1
1      113110       0      1
2       94944      63      1
3       86624      63      1
4      132714       0      1
...       ...     ...    ...
26675  121855      84  

In [267]:
from sklearn.model_selection import train_test_split

# Step 1: Randomly select 50 samples for the test set from the mapped tp_s_df
test_tp_s_df = mapped_tp_s_df.sample(n=50, random_state=42)

# Step 2: Remove the selected test samples from the original mapped_tp_s_df to form the training set
train_tp_s_df = mapped_tp_s_df.drop(test_tp_s_df.index)

# Check the sizes of the resulting DataFrames
print(f"Training set size: {train_tp_s_df.shape[0]}")
print(f"Test set size: {test_tp_s_df.shape[0]}")

# Step 3: Now, you have train_tp_s_df and test_tp_s_df which can be used for training and testing respectively


Training set size: 26630
Test set size: 50


In [268]:
# Create reverse mappings to go from the index (mapped) back to the original IDs
reverse_protein_mapping = {i: protein_id for protein_id, i in protein_mapping.items()}
reverse_substrate_mapping = {i: substrate_id for substrate_id, i in substrate_mapping.items()}

# Convert the mapped indices back to the original Uniprot IDs and ChEBI IDs
test_proteins = test_tp_s_df['source'].map(reverse_protein_mapping).unique()
test_substrates = test_tp_s_df['target'].map(reverse_substrate_mapping).unique()

# Step 2: Split p_df into train and test based on test_proteins
p_test_df = p_df[p_df.index.isin(test_proteins)]  # Test set for proteins
p_train_df = p_df[~p_df.index.isin(test_proteins)]  # Train set for proteins

# Step 3: Split s_df into train and test based on test_substrates
s_test_df = s_df[s_df.index.isin(test_substrates)]  # Test set for substrates
s_train_df = s_df[~s_df.index.isin(test_substrates)]  # Train set for substrates

# Check the sizes of the resulting DataFrames
print(f"Protein training set size: {p_train_df.shape[0]}")
print(f"Protein test set size: {p_test_df.shape[0]}")
print(f"Substrate training set size: {s_train_df.shape[0]}")
print(f"Substrate test set size: {s_test_df.shape[0]}")


Protein training set size: 237147
Protein test set size: 50
Substrate training set size: 147
Substrate test set size: 36


In [269]:
# # Get the unique protein indices from the test_tp_s_df
# test_sources = test_tp_s_df['source'].unique()

# # Get the unique substrate indices from the test_tp_s_df
# test_targets = test_tp_s_df['target'].unique()

# # Remove rows from ppi_df where either source or target is in test_protein_indices
# filtered_ppi_df = mapped_ppi_edges_df[~mapped_ppi_edges_df['source'].isin(test_sources) & 
#                          ~mapped_ppi_edges_df['target'].isin(test_sources)]

# # Remove rows from ssi_df where either source or target is in test_substrate_indices
# filtered_ssi_df = mapped_ssi_edges_df[~mapped_ssi_edges_df['source'].isin(test_targets) & 
#                          ~mapped_ssi_edges_df['target'].isin(test_targets)]

# # Filter the train_tp_s_df to remove rows where source or target is in the test set
# filtered_train_tp_s_df = train_tp_s_df[
#     ~train_tp_s_df['source'].isin(test_sources) & 
#     ~train_tp_s_df['target'].isin(test_targets)
# ]

# # Check the sizes of the resulting DataFrames after filtering
# print(f"Final size of train_tp_s_df after filtering: {filtered_train_tp_s_df.shape}")
# print(f"Final size of PPI after filtering: {filtered_ppi_df.shape}")
# print(f"Final size of SSI after filtering: {filtered_ssi_df.shape}")


In [270]:
protein_train_mapping, substrate_train_mapping = get_mapping(p_train_df, s_train_df)

In [271]:
# Alternatively, to print just the first few entries
print(len(protein_train_mapping))
print("First few entries in the protein_mapping dictionary:")
for i, (protein, index) in enumerate(protein_train_mapping.items()):
    print(f"{protein}: {index}")
    if i >= 9:  # Limit to first 10 entries
        break

237147
First few entries in the protein_mapping dictionary:
A0A061ACU2: 0
A0A061AE05: 1
A0A061I403: 2
A0A072ULZ1: 3
A0A072VDF2: 4
A0A075F7E9: 5
A0A075QQ08: 6
A0A087WPF7: 7
A0A088MLT8: 8
A0A089QRB9: 9


In [272]:

# Alternatively, to print just the first few entries
print(len(substrate_train_mapping))
print("First few entries in the protein_mapping dictionary:")
for i, (substrate, index) in enumerate(substrate_train_mapping.items()):
    print(f"{substrate}: {index}")
    if i >= 9:  # Limit to first 10 entries
        break

147
First few entries in the protein_mapping dictionary:
CHEBI:64837: 0
CHEBI:58245: 1
CHEBI:58115: 2
CHEBI:456215: 3
CHEBI:58437: 4
CHEBI:57643: 5
CHEBI:72999: 6
CHEBI:83228: 7
CHEBI:58950: 8
CHEBI:144581: 9


In [273]:
# Apply the debugging function to map edges
mapped_ppi_edges_df = map_edges_debug(ppi_df, protein_train_mapping, protein_train_mapping, 'PPI')
mapped_ssi_edges_df = map_edges_debug(ssi_df, substrate_train_mapping, substrate_train_mapping, 'SSI')
mapped_tp_s_df = map_edges_debug(tp_s_df, protein_train_mapping, substrate_train_mapping, 'TP-S')

In [274]:
print(mapped_ppi_edges_df)

            source    target  label
0         156986.0  225723.0      1
1         187373.0  234890.0      1
2         234890.0  187373.0      1
3         225723.0  100833.0      1
4         225723.0  156986.0      1
...            ...       ...    ...
13327041  106776.0   24533.0      0
13327042  106176.0  133739.0      0
13327043   98830.0   90256.0      0
13327044  191089.0   11956.0      0
13327045  229235.0  226810.0      0

[13322673 rows x 3 columns]


In [275]:
print(mapped_ppi_edges_df)
print(mapped_ssi_edges_df)
print(mapped_tp_s_df)

            source    target  label
0         156986.0  225723.0      1
1         187373.0  234890.0      1
2         234890.0  187373.0      1
3         225723.0  100833.0      1
4         225723.0  156986.0      1
...            ...       ...    ...
13327041  106776.0   24533.0      0
13327042  106176.0  133739.0      0
13327043   98830.0   90256.0      0
13327044  191089.0   11956.0      0
13327045  229235.0  226810.0      0

[13322673 rows x 3 columns]
      source  target  label
0       72.0    13.0      1
1       13.0    72.0      1
2       79.0    61.0      1
3       61.0    79.0      1
14     121.0   118.0      1
...      ...     ...    ...
4348    45.0    29.0      0
4349    54.0    25.0      0
4351   111.0   115.0      0
4352    17.0   122.0      0
4353    69.0    64.0      0

[2434 rows x 3 columns]
         source  target  label
13     124256.0    50.0      1
26      85773.0    89.0      1
57      93838.0    50.0      1
76      93422.0    50.0      1
79     135469.0     0.0

In [276]:
def split_train_val_data(df, train_size=0.9, val_size=0.1):
    # Split into train and validation (90% train, 10% validation)
    train_df, val_df = train_test_split(df, train_size=train_size, random_state=42)
    
    return train_df, val_df

In [277]:
# Split data for each edge type (without test, as test is predefined)
ppi_train_df, ppi_val_df = split_train_val_data(mapped_ppi_edges_df)
ssi_train_df, ssi_val_df = split_train_val_data(mapped_ssi_edges_df)
tp_s_train_df, tp_s_val_df = split_train_val_data(mapped_tp_s_df)

In [None]:
# # Apply the debugging function to map edges
# mapped_ppi_edges_df = map_edges_debug(ppi_df, protein_mapping, protein_mapping, 'PPI')
# mapped_ssi_edges_df = map_edges_debug(ssi_df, substrate_mapping, substrate_mapping, 'SSI')
# mapped_tp_s_df = map_edges_debug(train_tp_s_df, protein_mapping, substrate_mapping, 'TP-S')

In [278]:
# Create edge index tensors for training, validation, and test
train_edges_tp_s = torch.tensor(tp_s_train_df[['source', 'target']].values.T, dtype=torch.long)
val_edges_tp_s = torch.tensor(tp_s_val_df[['source', 'target']].values.T, dtype=torch.long)
test_edges_tp_s = torch.tensor(test_tp_s_df[['source', 'target']].values.T, dtype=torch.long)

train_edges_ppi = torch.tensor(ppi_train_df[['source', 'target']].values.T, dtype=torch.long)
val_edges_ppi = torch.tensor(ppi_val_df[['source', 'target']].values.T, dtype=torch.long)

train_edges_ssi = torch.tensor(ssi_train_df[['source', 'target']].values.T, dtype=torch.long)
val_edges_ssi = torch.tensor(ssi_val_df[['source', 'target']].values.T, dtype=torch.long)

# Labels remain the same
train_labels_tp_s = torch.tensor(tp_s_train_df['label'].values, dtype=torch.float)
val_labels_tp_s = torch.tensor(tp_s_val_df['label'].values, dtype=torch.float)
test_labels_tp_s = torch.tensor(test_tp_s_df['label'].values, dtype=torch.float)


In [281]:
s_train_df.shape

(147, 1536)

In [279]:
data = HeteroData()

# Assign node features
# During training, only use the node features corresponding to the training set
data['protein'].x = torch.tensor(p_train_df.values, dtype=torch.float).to(device)
data['substrate'].x = torch.tensor(s_train_df.values, dtype=torch.float).to(device)

# Assign training edges
data['protein', 'interacts_with', 'substrate'].edge_index = train_edges_tp_s
data['protein', 'interacts_with', 'protein'].edge_index = train_edges_ppi
data['substrate', 'interacts_with', 'substrate'].edge_index = train_edges_ssi


# Initialize the model
class GCNLinkPredictor(nn.Module):
    def __init__(self, protein_dim, substrate_dim, hidden_channels):
        super(GCNLinkPredictor, self).__init__()
        self.protein_conv1 = GCNConv(protein_dim, hidden_channels)
        self.substrate_conv1 = GCNConv(substrate_dim, hidden_channels)
        self.protein_conv2 = GCNConv(hidden_channels, hidden_channels)
        self.substrate_conv2 = GCNConv(hidden_channels, hidden_channels)
        self.link_predictor = nn.Sequential(
            nn.Linear(hidden_channels * 2, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, 1)
        )

    def encode(self, x_dict, edge_index_dict):
        z_protein = self.protein_conv1(x_dict['protein'], edge_index_dict[('protein', 'interacts_with', 'protein')])
        z_substrate = self.substrate_conv1(x_dict['substrate'], edge_index_dict[('substrate', 'interacts_with', 'substrate')])
        return z_protein, z_substrate

    def forward(self, x_dict, edge_index_dict, edges):
        z_protein, z_substrate = self.encode(x_dict, edge_index_dict)
        z_combined = torch.cat([z_protein[edges[0]], z_substrate[edges[1]]], dim=-1)
        return self.link_predictor(z_combined).squeeze()


# Initialize the model
model = GCNLinkPredictor(protein_dim=2048, substrate_dim=2048, hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.BCEWithLogitsLoss()

# Early stopping parameters
patience = 10  # Number of epochs to wait before stopping if no improvement
best_val_loss = float('inf')
epochs_without_improvement = 0

# Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=4)


# Assuming `data` contains x_dict and edge_index_dict
x_dict = data.x_dict
edge_index_dict = data.edge_index_dict



# Update the train, validate, and test functions as needed
def train(x_dict, edge_index_dict, train_edges_tp_s, train_labels_tp_s): 
    model.train()
    optimizer.zero_grad()
    out = model(x_dict, edge_index_dict, train_edges_tp_s)
    loss = criterion(out, train_labels_tp_s)
    loss.backward()
    optimizer.step()
    return loss.item(), out.detach()

def validate():
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict, val_edges_tp_s)
        loss = criterion(out, val_labels_tp_s)
    return loss.item(), out

def test():
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict, test_edges_tp_s)
        loss = criterion(out, test_labels_tp_s)
    return loss.item(), out

# Calculate additional metrics
def calculate_metrics(labels, preds):
    preds = torch.sigmoid(preds).cpu().numpy()
    preds_binary = (preds > 0.5).astype(int)
    labels = labels.cpu().numpy()

    accuracy = accuracy_score(labels, preds_binary)
    precision = precision_score(labels, preds_binary)
    recall = recall_score(labels, preds_binary)
    f1 = f1_score(labels, preds_binary)
    auc = roc_auc_score(labels, preds)

    return accuracy, precision, recall, f1, auc

# Modify the training loop to include metric calculation and visualization
train_losses = []
val_losses = []
val_accuracies = []

# Training loop
epochs = 800
for epoch in range(epochs):
    # Training step
    train_loss, train_preds = train(x_dict, edge_index_dict, train_edges_tp_s, train_labels_tp_s)
    # Validation step
    val_loss, val_preds = validate()

    # Store losses
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Calculate validation metrics
    accuracy, precision, recall, f1, auc = calculate_metrics(val_labels_tp_s, val_preds)
    val_accuracies.append(accuracy)

    # Print metrics
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, "
          f"Val Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
          f"F1: {f1:.4f}, AUC: {auc:.4f}, LR: {scheduler.get_last_lr()[0]}")

    # Step the LR scheduler
    scheduler.step(val_loss)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), '/data/servilla/DT_HGNN/data/Models_saves/best_model.pth')  # Save the best model
    else:
        epochs_without_improvement += 1
    
    if epochs_without_improvement >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

# Load the best model
model.load_state_dict(torch.load('/data/servilla/DT_HGNN/data/Models_saves/best_model.pth'))



RuntimeError: mat1 and mat2 shapes cannot be multiplied (147x1536 and 2048x64)

In [None]:
# Testing
# During testing, use the full node feature set for both proteins and substrates
data['protein'].x = torch.tensor(p_df.values, dtype=torch.float).to(device)
data['substrate'].x = torch.tensor(s_df.values, dtype=torch.float).to(device)

# Use testing edges
data['protein', 'interacts_with', 'substrate'].edge_index = test_edges_tp_s
# data['protein', 'interacts_with', 'protein'].edge_index = test_edges_ppi  # if needed for testing
# data['substrate', 'interacts_with', 'substrate'].edge_index = test_edges_ssi  # if needed for testing


test_loss, test_preds = test()
test_accuracy, test_precision, test_recall, test_f1, test_auc = calculate_metrics(test_labels_tp_s, test_preds)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}, AUC: {test_auc:.4f}")

# Step 4: Plot loss curves
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss Curves")
plt.legend()
plt.show()

# Step 5: Plot validation accuracy
plt.figure(figsize=(10, 5))
plt.plot(val_accuracies, label="Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Validation Accuracy")
plt.legend()
plt.show()


In [143]:
# Step 1: Verify that node indices are within bounds
# Ensure that edges in train_edges_tp_s, test_edges_tp_s are within the size of the node feature tensors

def verify_edge_indices(edge_tensor, max_index):
    return (edge_tensor >= 0).all() and (edge_tensor < max_index).all()

# Check training and validation edges for protein and substrate sets
max_protein_idx = p_train_df.shape[0]  # Maximum index for proteins
max_substrate_idx = s_train_df.shape[0]  # Maximum index for substrates

assert verify_edge_indices(train_edges_tp_s[0], max_protein_idx), "Protein indices out of bounds in train_edges_tp_s"
assert verify_edge_indices(train_edges_tp_s[1], max_substrate_idx), "Substrate indices out of bounds in train_edges_tp_s"

# Repeat for validation and test sets


AssertionError: Protein indices out of bounds in train_edges_tp_s

In [144]:
# Check for out-of-bound protein indices
invalid_protein_indices = train_edges_tp_s[0][train_edges_tp_s[0] >= max_protein_idx]
print(f"Invalid protein indices: {invalid_protein_indices}")


Invalid protein indices: tensor([237187, 237164])
