In [1]:
import os
import torch
from torch.cuda.amp import autocast, GradScaler
from torch_geometric.data import Data
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import copy
import random
import multiprocessing as mp
from torch_geometric.data import HeteroData
import torch
from torch_geometric.nn import GCNConv
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, auc
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.loader import DataLoader
from torch_geometric.utils import negative_sampling
from torch_geometric.utils import to_undirected
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau
from datetime import datetime
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader, TensorDataset


### Preliminaries:

In [2]:
# Load the node feature tables from CSV. The first CSV column becomes the
# DataFrame index, which later serves as the node identifier.
s_nodes_csv = '/data/servilla/DT_HGNN/Model/Nodes/s_emb_full_183.csv'
p_nodes_csv = '/data/servilla/DT_HGNN/Model/Nodes/Other/tp_only_12301.csv'
s_df = pd.read_csv(s_nodes_csv, index_col=0)
p_df = pd.read_csv(p_nodes_csv, index_col=0)


In [3]:
# Load the labelled edge list from CSV (default integer index is kept).
tp_s_edges_csv = '/data/servilla/DT_HGNN/Model/Edges/distributed_combined_tp_s_edges_13340.csv'
tp_s_df = pd.read_csv(tp_s_edges_csv)


In [4]:
# Preview the raw edge table: one row per (source, target) pair with a binary label.
tp_s_df

Unnamed: 0,source,target,label
0,Q1CR30,CHEBI:30616,1
1,Q2A1I2,CHEBI:30616,1
2,O67337,CHEBI:132124,1
3,B1MPH4,CHEBI:132124,1
4,Q6HP89,CHEBI:30616,1
...,...,...,...
26675,Q3JC24,CHEBI:456216,0
26676,B2U7R1,CHEBI:30616,0
26677,B2VIV5,CHEBI:15378,0
26678,P0C323,CHEBI:132124,0


In [5]:
# Select the compute device: the first CUDA GPU when one is available,
# otherwise the CPU. The device is only printed in the GPU case.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print(device)
else:
    device = torch.device("cpu")
    

cuda:0


In [6]:
# Same edge table as displayed in cell 4; nothing has modified it in between.
tp_s_df

Unnamed: 0,source,target,label
0,Q1CR30,CHEBI:30616,1
1,Q2A1I2,CHEBI:30616,1
2,O67337,CHEBI:132124,1
3,B1MPH4,CHEBI:132124,1
4,Q6HP89,CHEBI:30616,1
...,...,...,...
26675,Q3JC24,CHEBI:456216,0
26676,B2U7R1,CHEBI:30616,0
26677,B2VIV5,CHEBI:15378,0
26678,P0C323,CHEBI:132124,0


### Function Definitions:

In [7]:
# Function to get the current date and time for the file name and log
def get_timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD_HH-MM-SS``.

    Returns:
        str: The timestamp, with date and time joined by an underscore and
        the time fields separated by hyphens.
    """
    stamp_format = '%Y-%m-%d_%H-%M-%S'
    current_time = datetime.now()
    return current_time.strftime(stamp_format)

In [8]:
# Inspect and clean the data, converts non-numeric columns to numeric and fills NaN values with 0
def inspect_and_clean(df):
    """Return a fully numeric, NaN-free version of ``df``.

    Non-numeric columns are reported on stdout and converted with
    ``pd.to_numeric(errors='coerce')`` (unparseable values become NaN);
    every NaN is then replaced by 0.

    The input frame is never modified: the conversion runs on a copy, so the
    caller's original data stays available and the call can be repeated.

    Args:
        df (pd.DataFrame): Table to clean.

    Returns:
        pd.DataFrame: A new frame with the same shape, index and columns.
    """
    non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
    print(f"Non-numeric columns: {non_numeric_columns}")
    if len(non_numeric_columns) > 0:
        # Copy only when something has to be rewritten, so the common
        # all-numeric case does not duplicate a large feature matrix.
        df = df.copy()
        df[non_numeric_columns] = df[non_numeric_columns].apply(pd.to_numeric, errors='coerce')
    # fillna returns a new frame, so the caller's object is untouched here too.
    return df.fillna(0)


In [9]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU, the
    current CUDA device and all CUDA devices), then switches cuDNN to
    deterministic mode with benchmarking disabled.

    Args:
        seed (int): Seed value shared by all generators.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
        np.random.seed,
        random.seed,
    )
    for seeder in seeders:
        seeder(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [10]:
def apply_correct_mapping(df, source_mapping, target_mapping):
    """Translate the 'source' and 'target' columns of an edge table.

    Each 'source' value is looked up in ``source_mapping`` and each 'target'
    value in ``target_mapping``. Rows for which either lookup fails are
    reported on stdout and dropped.

    The input frame is left untouched; a new frame is returned. Lookups are
    repeated on the surviving rows only, so the mapped columns keep the
    natural dtype of the mapping values (e.g. int64 for integer indices)
    instead of being promoted to float by the NaNs of failed lookups.

    Args:
        df (pd.DataFrame): Edge table with 'source' and 'target' columns.
        source_mapping: Mapping accepted by ``Series.map`` for 'source'.
        target_mapping: Mapping accepted by ``Series.map`` for 'target'.

    Returns:
        pd.DataFrame: Copy of the mappable rows with 'source' and 'target'
        replaced by their mapped values; other columns and the original
        row index labels are preserved.
    """
    # First pass: find out which rows can be mapped at all.
    source_missing = df['source'].map(source_mapping).isna()
    target_missing = df['target'].map(target_mapping).isna()

    unmapped_sources = df.loc[source_missing, 'source'].unique()
    unmapped_targets = df.loc[target_missing, 'target'].unique()

    # Log or print unmapped elements
    if len(unmapped_sources) > 0:
        print(f"Unmapped sources: {unmapped_sources}")
    if len(unmapped_targets) > 0:
        print(f"Unmapped targets: {unmapped_targets}")

    # Keep only rows where both endpoints were found; copy so the caller's
    # frame is never modified.
    mapped_df = df.loc[~(source_missing | target_missing)].copy()

    # Second pass on the clean rows: no NaN can appear, so no float promotion.
    mapped_df['source'] = mapped_df['source'].map(source_mapping)
    mapped_df['target'] = mapped_df['target'].map(target_mapping)

    return mapped_df


### Code to manipulate data:

In [11]:
# Fix the Python, NumPy and PyTorch seeds before any stochastic step runs.
set_seed(42)


In [12]:
# Clean both node tables (substrates first, then proteins) and rebind the names.
s_df, p_df = inspect_and_clean(s_df), inspect_and_clean(p_df)


Non-numeric columns: Index([], dtype='object')
Non-numeric columns: Index([], dtype='object')


In [13]:
# Expose the cleaned node tables as plain NumPy feature matrices.
s_features, p_features = s_df.values, p_df.values


In [14]:
# Sanity-check the feature matrices: each shape is (rows in the node table, feature width).
# The feature widths printed here are the input sizes the model must accept.
print(f"Shape of s_features: {s_features.shape}")
print(f"Shape of p_features: {p_features.shape}")


Shape of s_features: (183, 1536)
Shape of p_features: (12301, 2048)


In [15]:
# Build one lookup per node type: node id (table index label) -> row position
# in the corresponding feature matrix.
protein_mapping = dict(zip(p_df.index, range(len(p_df.index))))
substrate_mapping = dict(zip(s_df.index, range(len(s_df.index))))


In [16]:
# Apply mappings to the full edge DataFrames before splitting
# NOTE: this rebinds tp_s_df to the mapped frame, so the cell is not re-runnable:
# a second run would look up the already-mapped positions in mappings keyed by
# the original node ids, find none, and drop every row. Reload tp_s_df from the
# CSV before running this cell again.
tp_s_df = apply_correct_mapping(tp_s_df, protein_mapping, substrate_mapping)


In [17]:
# Inspect the edge table after mapping: source/target now hold row positions
# into the protein and substrate feature matrices.
tp_s_df

Unnamed: 0,source,target,label
0,6048,0,1
1,6389,0,1
2,4380,63,1
3,3165,63,1
4,8328,0,1
...,...,...,...
26675,7165,18,0
26676,3351,0,0
26677,3375,19,0
26678,4590,63,0


In [18]:
# Materialise the node features as float32 tensors directly on the selected device.
s_np, p_np = s_df.values, p_df.values

s_features_tensor = torch.tensor(s_np, dtype=torch.float, device=device)
p_features_tensor = torch.tensor(p_np, dtype=torch.float, device=device)


In [19]:
# Display the protein feature tensor (rows follow the order of p_df).
p_features_tensor

tensor([[ 0.0171, -0.0576, -0.0224,  ...,  0.3624, -0.1807,  0.1883],
        [-0.0125, -0.0156, -0.0371,  ...,  0.4206, -0.2489,  0.1974],
        [ 0.0028, -0.0836, -0.0325,  ...,  0.3974, -0.2052,  0.1842],
        ...,
        [ 0.0979, -0.0349,  0.0161,  ...,  0.4393, -0.1220,  0.1718],
        [ 0.0939, -0.1774,  0.2202,  ...,  0.4017, -0.1780,  0.1699],
        [ 0.0565, -0.0809,  0.1186,  ...,  0.3937, -0.1902,  0.1530]],
       device='cuda:0')

In [20]:
# Display the substrate feature tensor (rows follow the order of s_df).
s_features_tensor

tensor([[ 0.4558,  0.2683, -0.3065,  ..., -0.3660, -0.8484, -0.2447],
        [ 0.6155,  0.1821, -0.1572,  ...,  0.0450, -0.3862, -0.4749],
        [ 0.5064,  0.4006,  0.0640,  ..., -0.5685, -0.8873, -0.2104],
        ...,
        [ 0.6212,  0.5304, -0.4603,  ..., -0.2120,  0.0052, -0.5082],
        [-0.1077,  0.3367, -0.0385,  ...,  0.0214, -0.0697, -0.3905],
        [ 0.4057, -0.2626, -0.5516,  ..., -0.1872, -0.2486, -0.3938]],
       device='cuda:0')

In [21]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Subset, TensorDataset

# Assuming tp_s_df['source'] and tp_s_df['target'] contain valid mapped indices

# One index per edge row, pointing into the protein / substrate feature tensors.
protein_indices = torch.tensor(tp_s_df['source'].values, dtype=torch.long).to(p_features_tensor.device)
substrate_indices = torch.tensor(tp_s_df['target'].values, dtype=torch.long).to(s_features_tensor.device)

# Gather the per-edge feature rows (row i describes edge i).
p_features_tensor_mapped = p_features_tensor[protein_indices]
s_features_tensor_mapped = s_features_tensor[substrate_indices]

# Convert labels to tensor
labels_tensor = torch.tensor(tp_s_df['label'].values, dtype=torch.float32).to(p_features_tensor.device)

# Each sample is (protein features, substrate features, label, protein index, substrate index);
# the two indices are carried along so predictions can be traced back to nodes.
dataset = TensorDataset(p_features_tensor_mapped, s_features_tensor_mapped, labels_tensor, protein_indices, substrate_indices)

# Split row positions instead of the dataset itself. Passing a TensorDataset to
# train_test_split makes scikit-learn index it one sample at a time and build a
# Python list of tensor tuples; splitting positions and wrapping them in Subset
# avoids that. scikit-learn shuffles by position, so with the same sizes and
# random_state the resulting partition is unchanged.
# NOTE(review): the split is per edge, so the same protein or substrate can
# occur in train, validation and test - confirm this is the intended protocol.
all_positions = np.arange(len(dataset))

# Split the dataset into training (60%), validation (20%), and test (20%) sets
train_val_positions, test_positions = train_test_split(all_positions, test_size=0.2, random_state=42)  # 80% train+val, 20% test
train_positions, val_positions = train_test_split(train_val_positions, test_size=0.25, random_state=42)  # 60% train, 20% val

train_val_data = Subset(dataset, train_val_positions.tolist())
test_data = Subset(dataset, test_positions.tolist())
train_data = Subset(dataset, train_positions.tolist())
val_data = Subset(dataset, val_positions.tolist())

# The three subsets must partition the dataset exactly.
assert len(train_data) + len(val_data) + len(test_data) == len(dataset)

# Create DataLoaders for training, validation, and test sets
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)  # No shuffling for validation
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)  # No shuffling for test set



In [33]:
# Number of edge rows fed into the dataset (one protein index per row of tp_s_df).
len(protein_indices)

26680

In [22]:
# from sklearn.model_selection import train_test_split
# import torch
# from torch.utils.data import DataLoader, TensorDataset

# # Assuming tp_s_df['source'], tp_s_df['target'], and tp_s_df['label'] are valid

# # Step 1: Split off 20% for a naive test set (this is the first step before any other splits)
# tp_s_train_val_df, tp_s_test_df = train_test_split(tp_s_df, test_size=0.2, random_state=42)

# # Step 2: Prepare the naive test set (20% held out)
# test_protein_indices = torch.tensor(tp_s_test_df['source'].values, dtype=torch.long).to(p_features_tensor.device)
# test_substrate_indices = torch.tensor(tp_s_test_df['target'].values, dtype=torch.long).to(s_features_tensor.device)

# test_p_features_tensor_mapped = p_features_tensor[test_protein_indices]
# test_s_features_tensor_mapped = s_features_tensor[test_substrate_indices]

# test_labels_tensor = torch.tensor(tp_s_test_df['label'].values, dtype=torch.float32).to(p_features_tensor.device)

# # Create a TensorDataset and DataLoader for the naive test set
# test_dataset = TensorDataset(test_p_features_tensor_mapped, test_s_features_tensor_mapped, test_labels_tensor)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# # Step 3: Now split the remaining 80% into training (60%) and validation (20%)
# protein_indices_train_val = torch.tensor(tp_s_train_val_df['source'].values, dtype=torch.long).to(p_features_tensor.device)
# substrate_indices_train_val = torch.tensor(tp_s_train_val_df['target'].values, dtype=torch.long).to(s_features_tensor.device)

# p_features_tensor_mapped_train_val = p_features_tensor[protein_indices_train_val]
# s_features_tensor_mapped_train_val = s_features_tensor[substrate_indices_train_val]
# labels_tensor_train_val = torch.tensor(tp_s_train_val_df['label'].values, dtype=torch.float32).to(p_features_tensor.device)

# # Create the TensorDataset for training and validation
# train_val_dataset = TensorDataset(p_features_tensor_mapped_train_val, s_features_tensor_mapped_train_val, labels_tensor_train_val)

# # Step 4: Split the remaining 80% into 60% training and 20% validation
# train_data, val_data = train_test_split(train_val_dataset, test_size=0.25, random_state=42)  # 60% train, 20% val

# # Step 5: Create DataLoaders for training and validation
# train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
# val_loader = DataLoader(val_data, batch_size=32, shuffle=False)


In [23]:
# import torch

# # Assuming tp_s_df has already been mapped (with 'source', 'target', and 'label' columns)

# # Step 1: Shuffle the dataset to ensure even distribution
# tp_s_df = tp_s_df.sample(frac=1, random_state=42).reset_index(drop=True)

# # Step 2: Calculate the split index for 20%
# split_index = int(len(tp_s_df) * 0.8)

# # Step 3: Split the dataset into 80% for training/validation and 20% for testing
# train_val_df = tp_s_df[:split_index]
# test_df = tp_s_df[split_index:]

# # Step 4: Convert the train/validation and test sets into PyTorch tensors

# # For training/validation set
# train_protein_indices = torch.tensor(train_val_df['source'].values, dtype=torch.long).to(p_features_tensor.device)
# train_substrate_indices = torch.tensor(train_val_df['target'].values, dtype=torch.long).to(s_features_tensor.device)

# train_p_features_tensor_mapped = p_features_tensor[train_protein_indices]
# train_s_features_tensor_mapped = s_features_tensor[train_substrate_indices]
# train_labels_tensor = torch.tensor(train_val_df['label'].values, dtype=torch.float32).to(p_features_tensor.device)

# # Create TensorDataset for training/validation
# train_val_dataset = TensorDataset(train_p_features_tensor_mapped, train_s_features_tensor_mapped, train_labels_tensor)

# # For test set
# test_protein_indices = torch.tensor(test_df['source'].values, dtype=torch.long).to(p_features_tensor.device)
# test_substrate_indices = torch.tensor(test_df['target'].values, dtype=torch.long).to(s_features_tensor.device)

# test_p_features_tensor_mapped = p_features_tensor[test_protein_indices]
# test_s_features_tensor_mapped = s_features_tensor[test_substrate_indices]
# test_labels_tensor = torch.tensor(test_df['label'].values, dtype=torch.float32).to(p_features_tensor.device)

# # Create TensorDataset for testing
# test_dataset = TensorDataset(test_p_features_tensor_mapped, test_s_features_tensor_mapped, test_labels_tensor)

# # Step 5: Create DataLoaders for training/validation and test sets
# train_loader = DataLoader(train_val_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NonGraphModel(nn.Module):
    """MLP baseline that scores a (protein, substrate) feature pair.

    Each input is projected to ``hidden_dim`` by its own linear layer,
    standardised per feature across the batch, concatenated, and passed
    through three fully connected layers with ReLU and dropout after the
    first two. The output is a raw logit (no sigmoid is applied here).

    Note:
        Standardisation uses the statistics of the *current batch* in both
        train and eval mode, so predictions depend on which samples share a
        batch. A batch with a single row carries no variance information;
        its standardised features are all zero.

    Args:
        protein_input_dim (int): Width of the protein feature vectors.
        substrate_input_dim (int): Width of the substrate feature vectors.
        hidden_dim (int): Width of the projections and hidden layers.
        output_dim (int): Number of output logits per pair.
        dropout_prob (float): Dropout probability after fc1 and fc2.
        norm_eps (float): Lower bound applied to the batch standard deviation
            so that zero-variance features do not divide by zero.
    """

    def __init__(self, protein_input_dim, substrate_input_dim, hidden_dim, output_dim,
                 dropout_prob=0.5, norm_eps=1e-8):
        super(NonGraphModel, self).__init__()

        # Transformation layers to match input dimensions
        self.transform_p = nn.Linear(protein_input_dim, hidden_dim)    # protein -> hidden_dim
        self.transform_s = nn.Linear(substrate_input_dim, hidden_dim)  # substrate -> hidden_dim

        # Additional fully connected layers
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)  # Combine protein + substrate, then reduce dimension
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

        # Dropout layer
        self.dropout = nn.Dropout(p=dropout_prob)

        self.norm_eps = norm_eps

    def _standardize(self, x):
        """Z-score ``x`` per feature over the batch dimension, without NaNs."""
        mean = x.mean(dim=0, keepdim=True)
        # The unbiased estimator is NaN for a single row; fall back to the
        # population estimator (which is 0) in that case.
        std = x.std(dim=0, unbiased=x.size(0) > 1, keepdim=True)
        # Clamping leaves ordinary batches numerically unchanged and only
        # intervenes when a feature has (near-)zero variance.
        return (x - mean) / std.clamp_min(self.norm_eps)

    def forward(self, p_features, s_features):
        """Return logits of shape ``(batch, output_dim)`` for paired inputs.

        Args:
            p_features (Tensor): ``(batch, protein_input_dim)`` protein features.
            s_features (Tensor): ``(batch, substrate_input_dim)`` substrate features.
        """
        # Apply transformations to ensure both node types have the same dimensionality
        p_transformed = self._standardize(self.transform_p(p_features))
        s_transformed = self._standardize(self.transform_s(s_features))

        # Concatenate the transformed protein and substrate features
        combined_features = torch.cat([p_transformed, s_transformed], dim=1)  # Concatenating along feature dimension

        # Pass through fully connected layers
        out = F.relu(self.fc1(combined_features))
        out = self.dropout(out)  # Apply dropout after the first layer

        out = F.relu(self.fc2(out))
        out = self.dropout(out)

        out = self.fc3(out)  # Output layer
        return out


In [25]:
# Initialize model, optimizer, and loss function
# Input widths are read from the feature tensors rather than hard-coded, so the
# model stays in sync if the embedding files change size.
protein_input_dim = p_features_tensor.shape[1]
substrate_input_dim = s_features_tensor.shape[1]
hidden_dim = 128
output_dim = 1  # Binary classification (interaction or no interaction)
dropout_prob = 0.5

model = NonGraphModel(protein_input_dim, substrate_input_dim, hidden_dim, output_dim, dropout_prob).to(device)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()  # Binary classification loss; expects raw logits


In [27]:
# # Check if there are common indices between train/val and test sets
# common_indices = set(tp_s_test_df.index).intersection(tp_s_train_val_df.index)
# if common_indices:
#     print("Data leakage detected!")
# else:
#     print("No data leakage found.")


In [29]:
num_epochs = 20
for epoch in range(num_epochs):
    # Re-enter training mode every epoch so dropout is active even if an
    # evaluation call switched the model to eval mode in between.
    model.train()
    total_loss = 0
    # NOTE: the loop variables below rebind the notebook-level names
    # p_features / s_features to the most recent batch.
    for p_features, s_features, labels, _, _ in train_loader:  # Ignore protein and substrate indices by using '_'
        # Move data to the same device as the model
        p_features, s_features, labels = p_features.to(device), s_features.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(p_features, s_features)

        # Compute the loss. Drop only the trailing logit dimension: a bare
        # squeeze() would also remove the batch dimension of a one-sample
        # batch and make the shapes of logits and labels disagree.
        loss = criterion(outputs.squeeze(-1), labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Track the total loss
        total_loss += loss.item()

    # Mean of the per-batch losses (each batch counts equally).
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')


Epoch [1/20], Loss: 0.4267
Epoch [2/20], Loss: 0.1740
Epoch [3/20], Loss: 0.1151
Epoch [4/20], Loss: 0.0877
Epoch [5/20], Loss: 0.0831
Epoch [6/20], Loss: 0.0751
Epoch [7/20], Loss: 0.0657
Epoch [8/20], Loss: 0.0636
Epoch [9/20], Loss: 0.0555
Epoch [10/20], Loss: 0.0510
Epoch [11/20], Loss: 0.0486
Epoch [12/20], Loss: 0.0537
Epoch [13/20], Loss: 0.0480
Epoch [14/20], Loss: 0.0447
Epoch [15/20], Loss: 0.0391
Epoch [16/20], Loss: 0.0406
Epoch [17/20], Loss: 0.0417
Epoch [18/20], Loss: 0.0333
Epoch [19/20], Loss: 0.0364
Epoch [20/20], Loss: 0.0369


In [30]:
from sklearn.metrics import roc_auc_score

def evaluate(model, loader):
    """Compute the mean batch loss and ROC AUC of ``model`` over ``loader``.

    Relies on the notebook-level ``criterion`` (loss on raw logits) and
    ``device`` globals. The model is switched to eval mode and left there;
    callers that continue training must call ``model.train()`` again.

    Args:
        model: Module called as ``model(p_features, s_features)`` and
            returning one logit per sample in a trailing dimension of size 1.
        loader: Iterable of 5-tuples ``(p_features, s_features, labels,
            protein_index, substrate_index)``; the two indices are ignored.

    Returns:
        tuple: ``(avg_loss, auc)`` where ``avg_loss`` is the mean of the
        per-batch losses and ``auc`` is the ROC AUC over all samples.
    """
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    all_outputs = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation for evaluation
        for p_features, s_features, labels, _, _ in loader:  # Ignore protein and substrate indices
            p_features, s_features, labels = p_features.to(device), s_features.to(device), labels.to(device)

            # Forward pass. Remove only the trailing logit dimension so a
            # one-sample batch keeps shape (1,) and still matches its labels.
            logits = model(p_features, s_features).squeeze(-1)

            # Compute loss
            loss = criterion(logits, labels)
            total_loss += loss.item()

            # Collect probabilities and labels as 1-D arrays for the AUC
            all_outputs.append(torch.sigmoid(logits).cpu().numpy())  # Sigmoid to convert logits to probabilities
            all_labels.append(labels.cpu().numpy())

    avg_loss = total_loss / len(loader)

    # Flatten the list of arrays into a single array
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)

    # Calculate AUC score
    auc_score = roc_auc_score(all_labels, all_outputs)

    return avg_loss, auc_score

# Score the trained model on the validation split.
# evaluate() leaves the model in eval mode (dropout disabled).
val_loss, val_auc = evaluate(model, val_loader)
print(f'Validation Loss: {val_loss:.4f}, Validation AUC: {val_auc:.4f}')


Validation Loss: 0.0419, Validation AUC: 0.9988


In [31]:
# test_loader was created together with the train/validation loaders when
# tp_s_df was split, so no further preparation is needed here.

# Evaluate on the held-out test split.
test_loss, test_auc = evaluate(model, test_loader)
print(f'Test Loss: {test_loss:.4f}, Test AUC: {test_auc:.4f}')


Test Loss: 0.0407, Test AUC: 0.9989
