In [72]:
import os
import torch
from torch.cuda.amp import autocast, GradScaler
from torch_geometric.data import Data
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
import copy
import random
import multiprocessing as mp
from torch_geometric.data import HeteroData
import torch
from torch_geometric.nn import GCNConv
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [73]:
# Enable CUDA debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# torch.cuda.set_device(1)  # Use GPU 1

# # Verify that the correct GPU is being used
# if torch.cuda.is_available():
#     current_device = torch.cuda.current_device()
#     print(f"Using GPU: {torch.cuda.get_device_name(current_device)}")
#     print(f"Device ID: {current_device}")

In [74]:
# Load node features
s_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Nodes/minus_50_train_substrate_node_dataset.csv', index_col=0)  # Substrates CSV file
proteins_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Nodes/minus_50_train_protein_node_dataset.csv', index_col=0)  # Combined proteins CSV file

In [75]:
# Load edges (USE ONLY IF NEED TO RELOAD EDGES, OTHERWISE LOAD COMBINED EDGES FROM FILES)
# tp_s_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/removed_50_train_tp_s.csv', index_col=0)
ppi_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/Old_Edges/ppi_edges_6663523.csv', index_col=0)
ssi_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/Old_Edges/ssi_edges_2177.csv', index_col=0)

In [76]:
# Inspect and clean the data
def inspect_and_clean(df):
    non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
    print(f"Non-numeric columns: {non_numeric_columns}")
    if len(non_numeric_columns) > 0:
        df[non_numeric_columns] = df[non_numeric_columns].apply(pd.to_numeric, errors='coerce')
    df = df.fillna(0)
    return df

In [77]:
s_df = inspect_and_clean(s_df)
proteins_df = inspect_and_clean(proteins_df)

Non-numeric columns: Index([], dtype='object')
Non-numeric columns: Index([], dtype='object')


In [78]:
# Convert features to numpy arrays
s_features = s_df.values
p_features = proteins_df.values

In [79]:
# Check shapes to ensure correct dimensions
print(f"s_features shape: {s_features.shape}")  # Expected (183, 1536), no KD (212, 768)
print(f"p_features shape: {p_features.shape}")  # Expected (some number, 2048), no KD (571609, 1280)

s_features shape: (153, 1536)
p_features shape: (237147, 2048)


In [None]:
# # Use GPU 1
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(f"Device: {device}")

In [80]:
# Normalize features (normalizes columns to have mean 0 and variance 1)
s_features = (s_features - np.mean(s_features, axis=0)) / np.std(s_features, axis=0)
p_features = (p_features - np.mean(p_features, axis=0)) / np.std(p_features, axis=0)

In [81]:
# Define the transformation layers, changes the number of features 1536 -> 2048
# for substrates and 2048 -> 2048 for proteins. The transform_p layer is useful 
# for transforming the feature representation within the same dimensional space,
#  y = Wx + b.

device = torch.device('cpu')  # Temporarily switch to CPU


transform_s = Linear(1536, 2048).to(device) # Change depending on the number of features
transform_p = Linear(2048, 2048).to(device)

In [82]:
# Apply transformations in batches, this can be useful when dealing with large 
# datasets that may not fit into memory or GPU all at once. 
def transform_in_batches(features, transform_layer, batch_size=10000):
    num_samples = features.shape[0]
    print(f"Number of samples: {num_samples}")
    transformed_features = []
    for i in range(0, num_samples, batch_size):
        batch = features[i:i + batch_size]
        batch_tensor = torch.tensor(batch, dtype=torch.float).to(device)
        transformed_batch = transform_layer(batch_tensor)
        transformed_features.append(transformed_batch.detach().cpu().numpy())  # Use detach() before numpy()
    return np.vstack(transformed_features) # Stack arrays in sequence vertically (row wise)


In [83]:
s_features_transformed = transform_in_batches(s_features, transform_s)
p_features_transformed = transform_in_batches(p_features, transform_p)


Number of samples: 153
Number of samples: 237147


In [84]:
# Convert back to tensors
s_features_tensor = torch.tensor(s_features_transformed, dtype=torch.float).to(device)
p_features_tensor = torch.tensor(p_features_transformed, dtype=torch.float).to(device)


In [85]:
# Combine features, vertically stacks features (dim=0) to create a single tensor
all_features = torch.cat([p_features_tensor, s_features_tensor], dim=0)


In [86]:
protein_ids = set(proteins_df.index)
substrate_ids = set(s_df.index)

In [None]:
# num_cores = 64

In [None]:
# def generate_negative_edges_chunk(chunk, possible_sources, possible_targets, existing_edges):
#     negative_edges = []
#     for _ in chunk:
#         while True:
#             source = random.choice(possible_sources)
#             target = random.choice(possible_targets)
#             if (source, target) not in existing_edges and (target, source) not in existing_edges:
#                 negative_edges.append((source, target))
#                 break
#     return negative_edges

In [None]:
# def generate_negative_edges(df, possible_sources, possible_targets, num_cores, chunk_size=10000):
#     existing_edges = set(zip(df['source'], df['target']))

#     # Create chunks
#     chunks = [range(i, min(i + chunk_size, len(df))) for i in range(0, len(df), chunk_size)]
    
#     with mp.Pool(num_cores) as pool:
#         results = pool.starmap(
#             generate_negative_edges_chunk,
#             [(chunk, possible_sources, possible_targets, existing_edges) for chunk in chunks]
#         )

#     # Flatten the list of lists
#     negative_edges = [edge for sublist in results for edge in sublist]
#     negative_df = pd.DataFrame(negative_edges, columns=['source', 'target'])
#     return negative_df


In [None]:
# # Generate negative edges for PPI
# ppi_neg_df = generate_negative_edges(ppi_df, protein_ids, protein_ids, num_cores)
# ppi_neg_df.to_csv('/data/servilla/DT_HGNN/Edges/negative_ppi_edges.csv')

In [None]:
# # Generate negative edges for SSI
# ssi_neg_df = generate_negative_edges(ssi_df, substrate_ids, substrate_ids, num_cores)
# ssi_neg_df.to_csv('/data/servilla/DT_HGNN/Edges/negative_ssi_edges.csv')

In [None]:
# # Generate negative edges for TP_S
# tp_s_neg_df = generate_negative_edges(tp_s_df, protein_ids, substrate_ids, num_cores)
# tp_s_neg_df.to_csv('/data/servilla/DT_HGNN/Edges/negative_tp_s_edges.csv')

In [None]:
# # Load negative edges
# ppi_neg_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/Negative_Edges/negative_ppi_edges.csv', index_col=0)
# ssi_neg_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/Negative_Edges/negative_ssi_edges.csv', index_col=0)
# tp_s_neg_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/Negative_Edges/negative_tp_s_edges.csv', index_col=0)

In [None]:
# def concatenate_edges(pos_df, neg_df):
#     # Add a label column to indicate positive (1) or negative (0) edges
#     pos_df['label'] = 1
#     neg_df['label'] = 0

#     # Concatenate the positive and negative edges
#     combined_df = pd.concat([pos_df, neg_df], ignore_index=True)
#     return combined_df

In [None]:
# # Concatenate edges for each type
# ppi_combined_df = concatenate_edges(ppi_df, ppi_neg_df)
# ssi_combined_df = concatenate_edges(ssi_df, ssi_neg_df)
# tp_s_combined_df = concatenate_edges(tp_s_df, tp_s_neg_df)

In [None]:
# # Save the combined dataframes to CSV files
# ppi_combined_df.to_csv('/data/servilla/DT_HGNN/Edges/combined_ppi_edges.csv', index=False)
# ssi_combined_df.to_csv('/data/servilla/DT_HGNN/Edges/combined_ssi_edges.csv', index=False)
# tp_s_combined_df.to_csv('/data/servilla/DT_HGNN/Edges/combined_tp_s_edges.csv', index=False)

In [87]:
# Load the combined edges
ppi_combined_df = pd.read_csv('/data/servilla/DT_HGNN//Model/Edges/combined_ppi_edges.csv')
ssi_combined_df = pd.read_csv('/data/servilla/DT_HGNN//Model/Edges/combined_ssi_edges.csv')
tp_s_train_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/combined_tp_s_edges_minus_50.csv')
tp_s_test_df = pd.read_csv('/data/servilla/DT_HGNN/Model/Edges/add_50_test_tp_s.csv')  # Add this line to load the test set


In [88]:
# def split_data(df, train_size=0.8, val_size=0.1, test_size=0.1):
#     # Split into train and temp (80% train, 20% temp)
#     train_df, temp_df = train_test_split(df, train_size=train_size, random_state=42)
    
#     # Calculate the size for validation and test splits
#     val_test_ratio = val_size / (val_size + test_size)  # 50% of temp goes to validation and 50% to test

#     # Split temp into validation and test (10% each)
#     val_df, test_df = train_test_split(temp_df, train_size=val_test_ratio, random_state=42)
    
#     return train_df, val_df, test_df

def split_train_val_data(df, train_size=0.9, val_size=0.1):
    # Split into train and validation (90% train, 10% validation)
    train_df, val_df = train_test_split(df, train_size=train_size, random_state=42)
    
    return train_df, val_df


In [89]:
# # Split data for each edge type
# ppi_train_df, ppi_val_df, ppi_test_df = split_data(ppi_combined_df)
# ssi_train_df, ssi_val_df, ssi_test_df = split_data(ssi_combined_df)
# tp_s_train_df, tp_s_val_df, tp_s_test_df = split_data(tp_s_combined_df)

# Split data for each edge type (without test, as test is predefined)
ppi_train_df, ppi_val_df = split_train_val_data(ppi_combined_df)
ssi_train_df, ssi_val_df = split_train_val_data(ssi_combined_df)
tp_s_train_df, tp_s_val_df = split_train_val_data(tp_s_train_df)  # Use remaining train data for tp_s


In [90]:
tp_s_train_df

Unnamed: 0,source,target,label
944,P55156,CHEBI:64615,1
5981,P41300,CHEBI:17562,0
8113,O09008,CHEBI:57762,0
9224,Q97ZQ3,CHEBI:17368,0
7134,P16291,CHEBI:17968,0
...,...,...,...
11964,Q31L27,CHEBI:82897,0
5191,Q7N6B7,CHEBI:16113,0
5390,P57860,CHEBI:57834,0
860,P58354,CHEBI:58539,1


In [60]:
# Check the columns of the substrate DataFrame (s_df)
print(s_df.columns)


Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '1526', '1527', '1528', '1529', '1530', '1531', '1532', '1533', '1534',
       '1535'],
      dtype='object', length=1536)


In [61]:
# Check the first few rows of s_df to understand its structure
print(s_df.head())

# Optionally, you can print the first few column names and their content
print(s_df.columns)


                     0         1         2         3         4         5  \
ChEBI ID                                                                   
CHEBI:64837   0.615461  0.182076 -0.157183 -0.101461 -0.146697 -0.613957   
CHEBI:58245   0.506397  0.400621  0.064001 -0.252644 -0.300555 -0.395209   
CHEBI:57673   0.438195  0.195334  0.072717 -0.337968 -0.261332 -0.498841   
CHEBI:58115   0.263242  0.269032  0.145805 -0.515321 -0.187021 -0.480719   
CHEBI:456215  0.285829  0.556172  0.108747 -0.328224 -0.301407 -0.238618   

                     6         7         8         9  ...      1526      1527  \
ChEBI ID                                              ...                       
CHEBI:64837  -0.450166 -0.183678 -0.624151 -0.271550  ...  0.107556 -0.293493   
CHEBI:58245  -0.638782 -0.311317 -1.088331 -0.142135  ...  0.350439 -0.128131   
CHEBI:57673  -0.686211 -0.416058 -0.848771 -0.195883  ...  0.379729 -0.210616   
CHEBI:58115  -0.610310 -0.469400 -0.936680 -0.147377  ...  0.1

In [62]:
# Access the ChEBI IDs from the index of s_df
complete_substrate_list = list(s_df.index) + list(missing_substrates)  # Combine the existing substrates with missing ones

# Generate the substrate mapping
substrate_mapping = {substrate: i for i, substrate in enumerate(complete_substrate_list)}

# Apply the updated mapping
tp_s_train_df['source_mapped'] = tp_s_train_df['source'].map(protein_mapping)
tp_s_train_df['target_mapped'] = tp_s_train_df['target'].map(substrate_mapping)

# Check for remaining NaNs after mapping
nan_sources = tp_s_train_df['source_mapped'].isna().sum()
nan_targets = tp_s_train_df['target_mapped'].isna().sum()

print(f"Remaining NaNs in source_mapped: {nan_sources}, Remaining NaNs in target_mapped: {nan_targets}")


Remaining NaNs in source_mapped: 0, Remaining NaNs in target_mapped: 0


In [63]:
# Print out the first few missing substrates to understand what's going wrong
missing_substrates = tp_s_train_df[tp_s_train_df['target_mapped'].isna()]['target'].unique()
print(f"Sample of missing substrate IDs: {missing_substrates[:10]}")
print(f"Total missing substrates: {len(missing_substrates)}")


Sample of missing substrate IDs: []
Total missing substrates: 0


In [64]:
# Check if the missing substrates are present in the s_df index
missing_in_s_df = [substrate for substrate in missing_substrates if substrate not in s_df.index]
print(f"Substrates missing in s_df: {missing_in_s_df}")


Substrates missing in s_df: []


In [65]:
def apply_correct_mapping(df, source_mapping, target_mapping):
    # Before mapping, check the size of the dataframe
    print(f"Original df size: {df.shape}")
    
    # Apply mapping
    df['source_mapped'] = df['source'].map(source_mapping)
    df['target_mapped'] = df['target'].map(target_mapping)
    
    # Print out any NaNs in the new columns
    print(f"NaNs after mapping - Source: {df['source_mapped'].isna().sum()}, Target: {df['target_mapped'].isna().sum()}")

    # Check the first few rows to verify mappings
    print(df.head())

    # Drop NaN values
    df.dropna(subset=['source_mapped', 'target_mapped'], inplace=True)
    
    # Check the size of the dataframe after dropping NaNs
    print(f"Mapped df size after dropping NaNs: {df.shape}")
    
    # Return the mapped dataframe with renamed columns
    return df[['source_mapped', 'target_mapped', 'label']].rename(columns={'source_mapped': 'source', 'target_mapped': 'target'})

# Apply the refined mapping function to the training set
tp_s_train_df = apply_correct_mapping(tp_s_train_df, protein_mapping, substrate_mapping)


Original df size: (11894, 5)
NaNs after mapping - Source: 0, Target: 0
      source       target  label  source_mapped  target_mapped
944   P55156  CHEBI:64615      1          17197             75
5981  P41300  CHEBI:17562      0         100093             94
8113  O09008  CHEBI:57762      0           2796            114
9224  Q97ZQ3  CHEBI:17368      0         149714            144
7134  P16291  CHEBI:17968      0          53203             44
Mapped df size after dropping NaNs: (11894, 5)


In [91]:
# Create separate mappings
protein_mapping = {node_id: i for i, node_id in enumerate(proteins_df.index)}
substrate_mapping = {node_id: i for i, node_id in enumerate(s_df.index)}

# Helper function to apply the correct mapping
def apply_correct_mapping(df, source_mapping, target_mapping):
    df['source'] = df['source'].map(source_mapping)
    df['target'] = df['target'].map(target_mapping)
    df.dropna(inplace=True)
    return df

# # Apply the correct mappings
# tp_s_train_df = apply_correct_mapping(tp_s_train_df, protein_mapping, substrate_mapping)
# tp_s_val_df = apply_correct_mapping(tp_s_val_df, protein_mapping, substrate_mapping)
# tp_s_test_df = apply_correct_mapping(tp_s_test_df, protein_mapping, substrate_mapping)

# ppi_train_df = apply_correct_mapping(ppi_train_df, protein_mapping, protein_mapping)
# ppi_val_df = apply_correct_mapping(ppi_val_df, protein_mapping, protein_mapping)
# ppi_test_df = apply_correct_mapping(ppi_test_df, protein_mapping, protein_mapping)

# ssi_train_df = apply_correct_mapping(ssi_train_df, substrate_mapping, substrate_mapping)
# ssi_val_df = apply_correct_mapping(ssi_val_df, substrate_mapping, substrate_mapping)
# ssi_test_df = apply_correct_mapping(ssi_test_df, substrate_mapping, substrate_mapping)

# Apply the correct mappings
tp_s_train_df = apply_correct_mapping(tp_s_train_df, protein_mapping, substrate_mapping)
tp_s_val_df = apply_correct_mapping(tp_s_val_df, protein_mapping, substrate_mapping)
# # The test set mappings have already been applied during selection
# tp_s_test_df = apply_correct_mapping(tp_s_train_df, protein_mapping, substrate_mapping)

ppi_train_df = apply_correct_mapping(ppi_train_df, protein_mapping, protein_mapping)
ppi_val_df = apply_correct_mapping(ppi_val_df, protein_mapping, protein_mapping)

ssi_train_df = apply_correct_mapping(ssi_train_df, substrate_mapping, substrate_mapping)
ssi_val_df = apply_correct_mapping(ssi_val_df, substrate_mapping, substrate_mapping)


In [66]:
# Check the protein and substrate mappings
print(f"Protein mapping: {list(protein_mapping.items())[:10]}")  # Print the first 10 items
print(f"Substrate mapping: {list(substrate_mapping.items())[:10]}")  # Print the first 10 items

# Check the unique values in the 'source' and 'target' columns of tp_s_train_df
print(f"Unique proteins in tp_s_train_df['source']: {tp_s_train_df['source'].unique()[:10]}")
print(f"Unique substrates in tp_s_train_df['target']: {tp_s_train_df['target'].unique()[:10]}")

# Check if there are any missing nodes in the mappings
missing_proteins = tp_s_train_df[~tp_s_train_df['source'].isin(protein_mapping.keys())]
missing_substrates = tp_s_train_df[~tp_s_train_df['target'].isin(substrate_mapping.keys())]

print(f"Missing proteins: {len(missing_proteins)}, Missing substrates: {len(missing_substrates)}")


Protein mapping: [('A0A061ACU2', 0), ('A0A061AE05', 1), ('A0A061I403', 2), ('A0A072ULZ1', 3), ('A0A072VDF2', 4), ('A0A075F7E9', 5), ('A0A075QQ08', 6), ('A0A087WPF7', 7), ('A0A088MLT8', 8), ('A0A089QRB9', 9)]
Substrate mapping: [('CHEBI:64837', 0), ('CHEBI:58245', 1), ('CHEBI:57673', 2), ('CHEBI:58115', 3), ('CHEBI:456215', 4), ('CHEBI:72999', 5), ('CHEBI:83228', 6), ('CHEBI:58950', 7), ('CHEBI:144584', 8), ('CHEBI:144582', 9)]
Unique proteins in tp_s_train_df['source']: [ 17197 100093   2796 149714  53203  43274    473 161953 193620 141444]
Unique substrates in tp_s_train_df['target']: [ 75  94 114 144  44  82 130  92  31  10]
Missing proteins: 11894, Missing substrates: 11894


In [67]:
# Check if the 'source' proteins are in the protein mapping
missing_proteins = tp_s_train_df[~tp_s_train_df['source'].isin(protein_mapping.keys())]
print(f"Number of missing proteins: {len(missing_proteins)}")

# Check if the 'target' substrates are in the substrate mapping
missing_substrates = tp_s_train_df[~tp_s_train_df['target'].isin(substrate_mapping.keys())]
print(f"Number of missing substrates: {len(missing_substrates)}")

# Optionally, print some of the missing values for further inspection
print("Sample missing proteins:")
print(missing_proteins['source'].unique()[:10])

print("Sample missing substrates:")
print(missing_substrates['target'].unique()[:10])


Number of missing proteins: 11894
Number of missing substrates: 11894
Sample missing proteins:
[ 17197 100093   2796 149714  53203  43274    473 161953 193620 141444]
Sample missing substrates:
[ 75  94 114 144  44  82 130  92  31  10]


In [92]:
tp_s_train_df

Unnamed: 0,source,target,label
944,17197,75,1
5981,100093,94,0
8113,2796,114,0
9224,149714,144,0
7134,53203,44,0
...,...,...,...
11964,200663,79,0
5191,136281,68,0
5390,101992,134,0
860,17540,62,1


In [93]:
tp_s_val_df

Unnamed: 0,source,target,label
8414,7731,74,0
4816,162104,5,0
6149,135519,62,0
1355,25592,31,1
4464,182814,11,0
...,...,...,...
4676,187964,41,0
1208,59015,13,1
2057,195614,13,1
424,14897,107,1


In [69]:
# Ensure that mapping creates the 'target_mapped' column within the function
tp_s_train_df['source_mapped'] = tp_s_train_df['source'].map(protein_mapping)
tp_s_train_df['target_mapped'] = tp_s_train_df['target'].map(substrate_mapping)

# Now, check for missing mappings after the columns are created
missing_substrates = tp_s_train_df[tp_s_train_df['target_mapped'].isna()]['target'].unique()
print(f"Missing substrate IDs: {missing_substrates[:10]}")  # Print a sample of missing IDs
print(f"Total missing substrates: {len(missing_substrates)}")

# Check remaining NaNs
nan_sources = tp_s_train_df['source_mapped'].isna().sum()
nan_targets = tp_s_train_df['target_mapped'].isna().sum()

print(f"Remaining NaNs in source_mapped: {nan_sources}, Remaining NaNs in target_mapped: {nan_targets}")


Missing substrate IDs: [ 75  94 114 144  44  82 130  92  31  10]
Total missing substrates: 153
Remaining NaNs in source_mapped: 11894, Remaining NaNs in target_mapped: 11894


In [96]:
tp_s_val_df

Unnamed: 0,source,target,label
8414,7731,74,0
4816,162104,5,0
6149,135519,62,0
1355,25592,31,1
4464,182814,11,0
...,...,...,...
4676,187964,41,0
1208,59015,13,1
2057,195614,13,1
424,14897,107,1


In [98]:
tp_s_test_df

Unnamed: 0,source,target,label,source_mapped,target_mapped


In [97]:
# Apply the correct mapping to the test set
tp_s_test_df['source_mapped'] = tp_s_test_df['source'].map(protein_mapping)
tp_s_test_df['target_mapped'] = tp_s_test_df['target'].map(substrate_mapping)

# Drop any rows where the mapping resulted in NaNs
tp_s_test_df.dropna(subset=['source_mapped', 'target_mapped'], inplace=True)

# Convert the mapped columns to integers
tp_s_test_df['source_mapped'] = tp_s_test_df['source_mapped'].astype(int)
tp_s_test_df['target_mapped'] = tp_s_test_df['target_mapped'].astype(int)

# Create edge index tensors
test_edges_tp_s = torch.tensor(tp_s_test_df[['source_mapped', 'target_mapped']].values.T, dtype=torch.long)

print("Test edge tensor created successfully.")


Test edge tensor created successfully.


In [94]:
# # Create edge index tensors
# train_edges_tp_s = torch.tensor(tp_s_train_df[['source', 'target']].values.T, dtype=torch.long)
# val_edges_tp_s = torch.tensor(tp_s_val_df[['source', 'target']].values.T, dtype=torch.long)
# test_edges_tp_s = torch.tensor(tp_s_test_df[['source', 'target']].values.T, dtype=torch.long)

# train_edges_ppi = torch.tensor(ppi_train_df[['source', 'target']].values.T, dtype=torch.long)
# val_edges_ppi = torch.tensor(ppi_val_df[['source', 'target']].values.T, dtype=torch.long)
# test_edges_ppi = torch.tensor(ppi_test_df[['source', 'target']].values.T, dtype=torch.long)

# train_edges_ssi = torch.tensor(ssi_train_df[['source', 'target']].values.T, dtype=torch.long)
# val_edges_ssi = torch.tensor(ssi_val_df[['source', 'target']].values.T, dtype=torch.long)
# test_edges_ssi = torch.tensor(ssi_test_df[['source', 'target']].values.T, dtype=torch.long)

# # Convert the labels to tensors
# train_labels_tp_s = torch.tensor(tp_s_train_df['label'].values, dtype=torch.float)
# val_labels_tp_s = torch.tensor(tp_s_val_df['label'].values, dtype=torch.float)
# test_labels_tp_s = torch.tensor(tp_s_test_df['label'].values, dtype=torch.float)

# train_labels_ppi = torch.tensor(ppi_train_df['label'].values, dtype=torch.float)
# val_labels_ppi = torch.tensor(ppi_val_df['label'].values, dtype=torch.float)
# test_labels_ppi = torch.tensor(ppi_test_df['label'].values, dtype=torch.float)

# train_labels_ssi = torch.tensor(ssi_train_df['label'].values, dtype=torch.float)
# val_labels_ssi = torch.tensor(ssi_val_df['label'].values, dtype=torch.float)
# test_labels_ssi = torch.tensor(ssi_test_df['label'].values, dtype=torch.float)


# Create edge index tensors for training, validation, and test
train_edges_tp_s = torch.tensor(tp_s_train_df[['source', 'target']].values.T, dtype=torch.long)
val_edges_tp_s = torch.tensor(tp_s_val_df[['source', 'target']].values.T, dtype=torch.long)
test_edges_tp_s = torch.tensor(tp_s_test_df[['source', 'target']].values.T, dtype=torch.long)

train_edges_ppi = torch.tensor(ppi_train_df[['source', 'target']].values.T, dtype=torch.long)
val_edges_ppi = torch.tensor(ppi_val_df[['source', 'target']].values.T, dtype=torch.long)

train_edges_ssi = torch.tensor(ssi_train_df[['source', 'target']].values.T, dtype=torch.long)
val_edges_ssi = torch.tensor(ssi_val_df[['source', 'target']].values.T, dtype=torch.long)

# Labels remain the same
train_labels_tp_s = torch.tensor(tp_s_train_df['label'].values, dtype=torch.float)
val_labels_tp_s = torch.tensor(tp_s_val_df['label'].values, dtype=torch.float)
test_labels_tp_s = torch.tensor(tp_s_test_df['label'].values, dtype=torch.float)


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [21]:
# Check for NaNs in node features
print(torch.isnan(p_features_tensor).any())  # Should return False
print(torch.isnan(s_features_tensor).any())  # Should return False

# Check for NaNs in edge indices
print(torch.isnan(train_edges_tp_s).any())  # Should return False
print(torch.isnan(val_edges_tp_s).any())    # Should return False

# Check for NaNs in labels
print(torch.isnan(train_labels_tp_s).any())  # Should return False
print(torch.isnan(val_labels_tp_s).any())    # Should return False


tensor(False)
tensor(False)
tensor(False)
tensor(False)
tensor(False)
tensor(False)


In [22]:
data = HeteroData()

# Assign node features
data['protein'].x = p_features_tensor
data['substrate'].x = s_features_tensor

# Assign training edges
data['protein', 'interacts_with', 'substrate'].edge_index = train_edges_tp_s
data['protein', 'interacts_with', 'protein'].edge_index = train_edges_ppi
data['substrate', 'interacts_with', 'substrate'].edge_index = train_edges_ssi



# Initialize the model
class GCNLinkPredictor(nn.Module):
    def __init__(self, protein_dim, substrate_dim, hidden_channels):
        super(GCNLinkPredictor, self).__init__()
        self.protein_conv1 = GCNConv(protein_dim, hidden_channels)
        self.substrate_conv1 = GCNConv(substrate_dim, hidden_channels)
        self.protein_conv2 = GCNConv(hidden_channels, hidden_channels)
        self.substrate_conv2 = GCNConv(hidden_channels, hidden_channels)
        self.link_predictor = nn.Sequential(
            nn.Linear(hidden_channels * 2, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, 1)
        )

    def encode(self, x_dict, edge_index_dict):
        z_protein = self.protein_conv1(x_dict['protein'], edge_index_dict[('protein', 'interacts_with', 'protein')])
        z_substrate = self.substrate_conv1(x_dict['substrate'], edge_index_dict[('substrate', 'interacts_with', 'substrate')])
        return z_protein, z_substrate

    # def forward(self, x_dict, edge_index_dict, edges):
    #     z_protein, z_substrate = self.encode(x_dict, edge_index_dict)
    #     z_combined = torch.cat([z_protein[edges[0]], z_substrate[edges[1]]], dim=-1)
    #     return self.link_predictor(z_combined).squeeze()

    def forward(self, x_dict, edge_index_dict, edges):
        if edges.size(1) == 0:  # Check if the edges tensor is empty
            print("Warning: Empty batch of edges!")
            return torch.tensor([], device=device)  # Return an empty tensor to handle this case
        
        z_protein, z_substrate = self.encode(x_dict, edge_index_dict)
        z_combined = torch.cat([z_protein[edges[0]], z_substrate[edges[1]]], dim=-1)
        output = self.link_predictor(z_combined).squeeze()
        
        return output



# Initialize the model
model = GCNLinkPredictor(protein_dim=2048, substrate_dim=2048, hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.BCEWithLogitsLoss()

# Early stopping parameters
patience = 10  # Number of epochs to wait before stopping if no improvement
best_val_loss = float('inf')
epochs_without_improvement = 0

# Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=4)


# Assuming `data` contains x_dict and edge_index_dict
x_dict = data.x_dict
edge_index_dict = data.edge_index_dict


# # Update the train, validate, and test functions to return predictions
# def train(x_dict, edge_index_dict, train_edges_tp_s, train_labels_tp_s): 
#     model.train()
#     optimizer.zero_grad()
#     out = model(x_dict, edge_index_dict, train_edges_tp_s)
#     loss = criterion(out, train_labels_tp_s)
#     loss.backward()
#     optimizer.step()

#     return loss.item(), out.detach()

# def validate():
#     model.eval()
#     with torch.no_grad():
#         out = model(data.x_dict, data.edge_index_dict, val_edges_tp_s)
#         loss = criterion(out, val_labels_tp_s)
#     return loss.item(), out

# def test():
#     model.eval()
#     with torch.no_grad():
#         out = model(data.x_dict, data.edge_index_dict, test_edges_tp_s)
#         loss = criterion(out, test_labels_tp_s)
#     return loss.item(), out

# Update the train, validate, and test functions as needed
def train(x_dict, edge_index_dict, train_edges_tp_s, train_labels_tp_s): 
    model.train()
    optimizer.zero_grad()
    out = model(x_dict, edge_index_dict, train_edges_tp_s)
    loss = criterion(out, train_labels_tp_s)
    loss.backward()
    optimizer.step()
    return loss.item(), out.detach()

def validate():
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict, val_edges_tp_s)
        loss = criterion(out, val_labels_tp_s)
    return loss.item(), out

def test():
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict, test_edges_tp_s)
        loss = criterion(out, test_labels_tp_s)
    return loss.item(), out

# Calculate additional metrics
def calculate_metrics(labels, preds):
    preds = torch.sigmoid(preds).cpu().numpy()
    preds_binary = (preds > 0.5).astype(int)
    labels = labels.cpu().numpy()

    accuracy = accuracy_score(labels, preds_binary)
    precision = precision_score(labels, preds_binary)
    recall = recall_score(labels, preds_binary)
    f1 = f1_score(labels, preds_binary)
    auc = roc_auc_score(labels, preds)

    return accuracy, precision, recall, f1, auc

# Modify the training loop to include metric calculation and visualization
train_losses = []
val_losses = []
val_accuracies = []

# Training loop
epochs = 800
for epoch in range(epochs):
    # Training step
    train_loss, train_preds = train(x_dict, edge_index_dict, train_edges_tp_s, train_labels_tp_s)
    # Validation step
    val_loss, val_preds = validate()

    # Store losses
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Calculate validation metrics
    accuracy, precision, recall, f1, auc = calculate_metrics(val_labels_tp_s, val_preds)
    val_accuracies.append(accuracy)

    # Print metrics
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, "
          f"Val Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
          f"F1: {f1:.4f}, AUC: {auc:.4f}, LR: {scheduler.get_last_lr()[0]}")

    # Step the LR scheduler
    scheduler.step(val_loss)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), '/data/servilla/DT_HGNN/data/Models_saves/best_model.pth')  # Save the best model
    else:
        epochs_without_improvement += 1
    
    if epochs_without_improvement >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

# Load the best model
model.load_state_dict(torch.load('/data/servilla/DT_HGNN/data/Models_saves/best_model.pth'))

# Testing
test_loss, test_preds = test()
test_accuracy, test_precision, test_recall, test_f1, test_auc = calculate_metrics(test_labels_tp_s, test_preds)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}, AUC: {test_auc:.4f}")

# Step 4: Plot loss curves
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss Curves")
plt.legend()
plt.show()

# Step 5: Plot validation accuracy
plt.figure(figsize=(10, 5))
plt.plot(val_accuracies, label="Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Validation Accuracy")
plt.legend()
plt.show()


Model output contains NaNs: False
tensor([], grad_fn=<SqueezeBackward0>)
Model output contains NaNs: False
tensor([0.0591, 0.1030, 0.1702,  ..., 0.0687, 0.0838, 0.0791])
Epoch 1, Train Loss: nan, Validation Loss: 0.7220, Val Accuracy: 0.1901, Precision: 0.1655, Recall: 0.9953, F1: 0.2838, AUC: 0.5560, LR: 0.0001
Model output contains NaNs: False
tensor([], grad_fn=<SqueezeBackward0>)
Model output contains NaNs: False
tensor([0.0591, 0.1030, 0.1702,  ..., 0.0687, 0.0838, 0.0791])
Epoch 2, Train Loss: nan, Validation Loss: 0.7220, Val Accuracy: 0.1901, Precision: 0.1655, Recall: 0.9953, F1: 0.2838, AUC: 0.5560, LR: 0.0001
Model output contains NaNs: False
tensor([], grad_fn=<SqueezeBackward0>)
Model output contains NaNs: False
tensor([0.0591, 0.1030, 0.1702,  ..., 0.0687, 0.0838, 0.0791])
Epoch 3, Train Loss: nan, Validation Loss: 0.7220, Val Accuracy: 0.1901, Precision: 0.1655, Recall: 0.9953, F1: 0.2838, AUC: 0.5560, LR: 0.0001
Model output contains NaNs: False
tensor([], grad_fn=<Sque

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.