In [13]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

In [14]:
# Read the three competition CSV files (training features, training targets, test features)
data_dir = "/home/vivian.chu/vivian-sandbox/other/xAI-cancer-competition/.data"
train_csv = f"{data_dir}/train.csv"
train_targets_csv = f"{data_dir}/train_targets.csv"
test_csv = f"{data_dir}/test.csv"
train_data, train_targets, test_data = map(
    pd.read_csv, (train_csv, train_targets_csv, test_csv)
)

In [15]:
# Clean and preprocess data
# Expose the 'Unnamed: 0' column as 'sample' so the training features can be merged
# with the targets on that key; both key columns are cast to str so they compare equal.
train_data = train_data.rename(columns={'Unnamed: 0': 'sample'})
train_data['sample'] = train_data['sample'].astype(str)
train_targets['sample'] = train_targets['sample'].astype(str)

test_data = test_data.rename(columns={'Unnamed: 0': 'sample'})
# Read the test identifier from 'sampleId' when that column exists; otherwise fall back
# to the 'sample' column produced by the rename above instead of raising a KeyError.
# NOTE(review): confirm which identifier column test.csv actually ships with.
test_id_column = 'sampleId' if 'sampleId' in test_data.columns else 'sample'
test_data['sample'] = test_data[test_id_column].astype(str)

In [16]:
# Join every training sample with its target row (inner join on the 'sample' key)
train_merged = train_data.merge(train_targets, on='sample')

# Pull the model inputs and labels out of the merged frame as arrays
aac_column = train_merged['AAC']
tissue_column = train_merged['tissue']
# Positional slice: skips the first column and the last two columns of the merged frame
# (presumably the sample id and the two target-file columns -- verify against the CSVs).
features = train_merged.iloc[:, 1:-2].values
targets = aac_column.values          # AAC values (regression target)
tissue_types = tissue_column.values  # Tissue types

In [17]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the split repeatable
split_arrays = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_arrays

In [None]:
# Creating a dummy adjacency matrix (you can replace this with a biologically meaningful one)
num_genes = features.shape[1]
# Identity matrix as a placeholder: the only edges are self-loops, so no information is
# exchanged between different genes until a real adjacency is supplied.
# A boolean dtype is enough to mark edge presence and needs 1/8 of the memory of the
# default float64 matrix, which matters because the matrix has num_genes**2 entries.
adjacency_matrix = np.eye(num_genes, dtype=bool)

In [None]:
# Turn the adjacency matrix into a (2, num_edges) index tensor for PyTorch Geometric:
# row 0 holds the row indices of the non-zero entries, row 1 the column indices.
source_nodes, target_nodes = np.nonzero(adjacency_matrix)
edge_pairs = np.array((source_nodes, target_nodes))
edge_index = torch.tensor(edge_pairs, dtype=torch.long)

In [18]:
# Create PyTorch Geometric data objects for training and validation
def build_graph_list(feature_rows, target_values):
    """Wrap each sample as a PyTorch Geometric ``Data`` graph.

    Every sample becomes one graph in which each feature value is a node carrying a
    single scalar feature; all graphs share the module-level ``edge_index``.

    Args:
        feature_rows: 2-D array-like with one row of feature values per sample.
        target_values: 1-D array-like with one target (AAC) value per row.

    Returns:
        A list with one ``Data`` object per sample, in input order.

    Raises:
        ValueError: if the number of rows and the number of targets differ.
    """
    if len(feature_rows) != len(target_values):
        raise ValueError(
            f"Got {len(feature_rows)} feature rows but {len(target_values)} targets"
        )

    graphs = []
    for row, target in zip(feature_rows, target_values):
        # One node per feature value, each with a single feature -> shape (num_nodes, 1)
        node_features = torch.tensor(row, dtype=torch.float).view(-1, 1)
        # Graph-level label stored as a 1-element tensor
        label = torch.tensor([target], dtype=torch.float)
        graphs.append(Data(x=node_features, edge_index=edge_index, y=label))
    return graphs


graph_data_list_train = build_graph_list(X_train, y_train)
graph_data_list_val = build_graph_list(X_val, y_val)

In [None]:
# Mini-batch loaders over the graph lists; only the training loader shuffles its samples
batch_size = 32
data_loader_train = DataLoader(graph_data_list_train, batch_size=batch_size, shuffle=True)
data_loader_val = DataLoader(graph_data_list_val, batch_size=batch_size, shuffle=False)

In [20]:
# Graph Neural Network used for graph-level regression
class GNN(torch.nn.Module):
    """Two GCN layers, mean pooling over nodes, then a two-layer regression head.

    Args:
        num_node_features: number of input features carried by each node.
    """

    def __init__(self, num_node_features):
        super().__init__()
        # Message-passing layers (64 hidden channels each)
        self.conv1 = GCNConv(num_node_features, 64)
        self.conv2 = GCNConv(64, 64)
        # Regression head applied to the pooled graph embedding
        self.fc1 = torch.nn.Linear(64, 32)
        self.fc2 = torch.nn.Linear(32, 1)

    def forward(self, data):
        """Compute one scalar output per graph in ``data``.

        Args:
            data: object exposing ``x``, ``edge_index`` and ``batch`` attributes.

        Returns:
            Output of the final linear layer (last dimension of size 1).
        """
        node_embeddings = data.x

        # Graph convolutions, each followed by a ReLU
        for conv in (self.conv1, self.conv2):
            node_embeddings = F.relu(conv(node_embeddings, data.edge_index))

        # Average node embeddings per graph, as assigned by data.batch
        graph_embeddings = global_mean_pool(node_embeddings, data.batch)

        # Fully connected head; no activation on the final regression output
        hidden = F.relu(self.fc1(graph_embeddings))
        return self.fc2(hidden)

In [21]:
# Training the model
NUM_EPOCHS = 100
LEARNING_RATE = 0.001

# Seed torch's global RNG before the model is built so the initial weights are repeatable.
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNN(num_node_features=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.MSELoss()

# Per-epoch Spearman history (one entry per completed epoch), used for plotting later.
train_spearman = []
val_spearman = []

for epoch in range(NUM_EPOCHS):
    # ---- Training phase ----
    model.train()
    total_loss = 0.0
    true_values_train = []
    predictions_train = []
    for batch in data_loader_train:
        batch = batch.to(device)
        optimizer.zero_grad()
        # The model emits one column per graph; flatten so it lines up with batch.y.
        out = model(batch).flatten()
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Store plain floats so spearmanr receives two 1-D sequences.
        true_values_train.extend(batch.y.detach().cpu().tolist())
        predictions_train.extend(out.detach().cpu().tolist())

    # Spearman correlation on the training set (predictions gathered while weights update)
    spearman_corr_train, _ = spearmanr(true_values_train, predictions_train)
    train_spearman.append(spearman_corr_train)

    # ---- Validation phase ----
    model.eval()
    true_values_val = []
    predictions_val = []
    with torch.no_grad():
        for batch in data_loader_val:
            batch = batch.to(device)
            out = model(batch).flatten()
            true_values_val.extend(batch.y.cpu().tolist())
            predictions_val.extend(out.cpu().tolist())

    # Spearman correlation on the validation set
    spearman_corr_val, _ = spearmanr(true_values_val, predictions_val)
    val_spearman.append(spearman_corr_val)

    # Reported loss is the mean of the per-batch MSE values for this epoch.
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(data_loader_train):.4f}, Spearman Train: {spearman_corr_train:.4f}, Spearman Val: {spearman_corr_val:.4f}')

Epoch 1, Loss: 0.0252, Spearman Train: 0.0215, Spearman Val: -0.0961
Epoch 2, Loss: 0.0116, Spearman Train: 0.0228, Spearman Val: -0.0970
Epoch 3, Loss: 0.0113, Spearman Train: 0.0286, Spearman Val: 0.0966
Epoch 4, Loss: 0.0120, Spearman Train: 0.0660, Spearman Val: -0.1204
Epoch 5, Loss: 0.0118, Spearman Train: -0.0114, Spearman Val: 0.0964
Epoch 6, Loss: 0.0119, Spearman Train: 0.0125, Spearman Val: -0.0161
Epoch 7, Loss: 0.0114, Spearman Train: 0.0049, Spearman Val: -0.0967
Epoch 8, Loss: 0.0139, Spearman Train: 0.0050, Spearman Val: 0.0793
Epoch 9, Loss: 0.0117, Spearman Train: 0.0998, Spearman Val: -0.0975
Epoch 10, Loss: 0.0153, Spearman Train: -0.0211, Spearman Val: -0.0996


KeyboardInterrupt: 

In [None]:
# Predictions on the test set
# Select the test features by the training feature column names (same positional slice
# that produced `features`) so the node order matches training and identifier columns
# such as 'sample' / 'sampleId' never end up in the model inputs.
# NOTE(review): assumes test.csv contains every training feature column -- a missing
# column raises a KeyError here rather than silently misaligning the inputs.
feature_columns = train_merged.columns[1:-2]
test_features = test_data[feature_columns].to_numpy(dtype=np.float32)

# One graph per test sample: each feature value is a node with a single scalar feature.
graph_data_list_test = [
    Data(x=torch.tensor(row, dtype=torch.float).view(-1, 1), edge_index=edge_index)
    for row in test_features
]

data_loader_test = DataLoader(graph_data_list_test, batch_size=32, shuffle=False)

model.eval()
true_values_test = []  # Placeholder as we do not have true AAC values for test set
predictions_test = []
with torch.no_grad():
    for batch in data_loader_test:
        batch = batch.to(device)
        out = model(batch)
        # Flatten the single output column so predictions are stored as plain floats.
        predictions_test.extend(out.flatten().cpu().tolist())

# Plotting Spearman correlation of train and val (the test set has no targets to score)
# The x-axis is derived from the recorded history, so the plot also works when training
# stopped before the planned number of epochs (e.g. after a manual interrupt).
fig, ax = plt.subplots()
ax.plot(range(1, len(train_spearman) + 1), train_spearman, label='Train Spearman Correlation')
ax.plot(range(1, len(val_spearman) + 1), val_spearman, label='Validation Spearman Correlation')
ax.set_xlabel('Epochs')
ax.set_ylabel('Spearman Correlation')
ax.set_title('Spearman Correlation of Train and Validation Sets Over Epochs')
ax.legend()
plt.show()