In [1]:
import os

import numpy as np
from scipy.sparse import load_npz
from scipy.stats import pearsonr
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import pandas as pd

import torch
import torch_geometric
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch_geometric.loader import DataLoader
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn.conv import GATConv

import matplotlib.pyplot as plt

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    for i in range(gpu_count):
        device_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {device_name}")
else:
    print("No GPU available")

cell_line = 'E116'
regression_flag = 0
chip_res = 10000
hic_res = 10000
num_hm = 6
num_feat = int((hic_res/chip_res)*num_hm)
num_classes = 2 if regression_flag == 0 else 1

#define data paths
src_dir = os.getcwd()
#src_dir = os.path.dirname(base_path)
save_dir = os.path.join(src_dir, 'data', cell_line, 'saved_runs')
hic_sparse_mat_file = os.path.join(src_dir, 'data', cell_line, 'hic_sparse.npz')
np_nodes_lab_genes_file = os.path.join(src_dir, 'data',  cell_line, \
    'np_nodes_lab_genes_reg' + str(regression_flag) + '.npy')
np_hmods_norm_all_file = os.path.join(src_dir, 'data', cell_line, \
    'np_hmods_norm_chip_' + str(chip_res) + 'bp.npy')

mat = load_npz(hic_sparse_mat_file)
allNodes_hms = np.load(np_hmods_norm_all_file) #contains 6 histone marks for all 279606 regions + id (Shape = [279606, 7])
hms = allNodes_hms[:, 1:] #only includes features, not node ids (Shape = [279606, 6])
X = torch.tensor(hms).float().reshape(-1, num_feat) #convert hms to tensor (Shape = [279606, 6])
allNodes = allNodes_hms[:, 0].astype(int) #contains ids of all regions (Shape = [279606, 1])

geneNodes_labs = np.load(np_nodes_lab_genes_file) #contains the expression level of each gene (Shape = [16699, 2])
geneNodes = geneNodes_labs[:, -2].astype(int) #contains ids of regions that encode a gene (Shape = [16699, 1])

allLabs = -1*np.ones(np.shape(allNodes))
targetNode_mask = torch.tensor(geneNodes).long()
geneLabs = geneNodes_labs[:, -1].astype(int)
allLabs[geneNodes] = geneLabs #contains expression level for each region (-1 if region doesn't encode gene, 1 if gene is expressed, 0 if not)
Y = torch.tensor(allLabs).long()

extract = torch_geometric.utils.from_scipy_sparse_matrix(mat)

data = torch_geometric.data.Data(edge_index = extract[0], edge_attr = extract[1], x = X, y = Y)
G = data

pred_idx_shuff = torch.randperm(targetNode_mask.shape[0])
fin_train = np.floor(0.7*pred_idx_shuff.shape[0]).astype(int)
fin_valid = np.floor(0.85*pred_idx_shuff.shape[0]).astype(int)
train_idx = pred_idx_shuff[:fin_train]
valid_idx = pred_idx_shuff[fin_train:fin_valid]
test_idx = pred_idx_shuff[fin_valid:]

train_n_loader = NeighborLoader(G, num_neighbors = [10, 10], batch_size = 64, input_nodes = targetNode_mask[train_idx], shuffle = True)
valid_n_loader = NeighborLoader(G, num_neighbors = [10, 10], batch_size = 64, input_nodes = targetNode_mask[valid_idx], shuffle = False)

GPU 0: NVIDIA RTX A5000




In [3]:
def to_cpu_npy(x):
    return x.cpu().detach().numpy()

In [4]:
def train_model_classification(model, loss, train_loader, valid_loader, max_epoch, optimizer, train_idx = train_idx, valid_idx = valid_idx):
    model = model.to(device)

    train_losses = []
    valid_losses = []
    for epoch in range(max_epoch):
        model.train()
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            train_batch_mask = torch.isin(batch.n_id.to(device), targetNode_mask.to(device))
            train_batch_scores = model(batch)[train_batch_mask]
            train_batch_labels = to_cpu_npy(batch.y[train_batch_mask])
            train_batch_loss = loss(train_batch_scores, torch.LongTensor(train_batch_labels).to(device))
            train_losses.append(train_batch_loss.item())
            train_batch_loss.backward()
            optimizer.step()
        
        model.eval()
        with torch.no_grad():
            for batch in valid_loader:
                batch = batch.to(device)
                valid_batch_mask = torch.isin(batch.n_id.to(device), targetNode_mask.to(device))
                valid_batch_scores = model(batch)[valid_batch_mask]
                valid_batch_labels = to_cpu_npy(batch.y[valid_batch_mask])
                valid_batch_loss = loss(valid_batch_scores, torch.LongTensor(valid_batch_labels).to(device))
                valid_losses.append(valid_batch_loss.item())
                
        print(f'Epoch {epoch}: Train Loss = {train_batch_loss}, Valid Loss = {valid_batch_loss}')
    return train_losses, valid_losses

In [5]:
def eval_model_classification(model, graph, targetNode_mask, train_idx, valid_idx, test_idx):
    model = model.to(device)
    graph = graph.to(device)
    test_labels = to_cpu_npy(graph.y[targetNode_mask[test_idx]])
    
    model.eval()

    forward_scores = model(G)[targetNode_mask]

    test_scores = forward_scores[test_idx]
    test_softmax = F.softmax(test_scores, dim=1)
    test_preds = torch.argmax(test_softmax, dim=1)
    
    test_softmax = to_cpu_npy(test_softmax)
    test_preds = to_cpu_npy(test_preds)
    test_AUROC = roc_auc_score(test_labels, test_softmax[:,1], average="micro")
    test_acc = np.mean(test_preds == test_labels)

    train_labels = to_cpu_npy(graph.y[targetNode_mask[train_idx]])
    train_scores = forward_scores[train_idx]
    train_softmax = F.softmax(train_scores, dim=1)
    train_preds = torch.argmax(train_softmax, dim=1)
    
    train_softmax = to_cpu_npy(train_softmax)
    train_preds = to_cpu_npy(train_preds)
    train_AUROC = roc_auc_score(train_labels, train_softmax[:,1], average="micro")
    train_acc = np.mean(train_preds == train_labels)


    return {'train_AUROC': train_AUROC, 'train_acc': train_acc, 'test_AUROC': test_AUROC, 'test_acc': test_acc}

In [6]:
class GAT(nn.Module):
    def __init__(self, in_channels, hidden_channels, num_heads):
        super(GAT, self).__init__()
        # First GAT layer
        self.gat1 = GATConv(in_channels, hidden_channels[0], heads=num_heads, concat=True)
        # Second GAT layer
        self.gat2 = GATConv(hidden_channels[0] * num_heads, hidden_channels[1], heads=num_heads, concat=False)
        # Third GAT layer
        # self.gat3 = GATConv(hidden_channels[1] * num_heads, hidden_channels[2], heads=num_heads, concat=False)

        # Fully connected layers
        self.ff1 = nn.Linear(hidden_channels[1], hidden_channels[1] // 2)
        self.ff2 = nn.Linear(hidden_channels[1] // 2, 2)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        # Pass through GAT layers
        x = torch.relu(self.gat1(x, edge_index, edge_attr))
        x = torch.relu(self.gat2(x, edge_index, edge_attr))
        # x = torch.relu(self.gat3(x, edge_index, edge_attr))
        # Apply dropout and pass through the fully connected layers
        x = self.dropout(x)
        x = torch.relu(self.ff1(x))
        x = self.ff2(x)
        return x

In [7]:
num_heads = 4
learning_rate = 0.001
max_epoch = 20
loss = nn.CrossEntropyLoss()
hidden_channels=[6, 30]
wd = 1e-05

gat = GAT(in_channels=6, hidden_channels=hidden_channels, num_heads = num_heads)

optimizer = torch.optim.Adam(filter(lambda p : p.requires_grad, gat.parameters()), lr = learning_rate, weight_decay = wd)

train_losses, valid_losses = train_model_classification(gat, loss, train_n_loader, valid_n_loader, max_epoch, optimizer)

Epoch 0: Train Loss = 0.48731252551078796, Valid Loss = 0.4492831826210022
Epoch 1: Train Loss = 0.48146113753318787, Valid Loss = 0.4704888164997101
Epoch 2: Train Loss = 0.44378769397735596, Valid Loss = 0.42149537801742554
Epoch 3: Train Loss = 0.3518732488155365, Valid Loss = 0.4540923237800598
Epoch 4: Train Loss = 0.4634682238101959, Valid Loss = 0.45065540075302124
Epoch 5: Train Loss = 0.4096794128417969, Valid Loss = 0.42113691568374634
Epoch 6: Train Loss = 0.4551752209663391, Valid Loss = 0.4474544823169708
Epoch 7: Train Loss = 0.39338433742523193, Valid Loss = 0.42444440722465515
Epoch 8: Train Loss = 0.4476798474788666, Valid Loss = 0.43534204363822937
Epoch 9: Train Loss = 0.47263863682746887, Valid Loss = 0.4376838207244873
Epoch 10: Train Loss = 0.41997382044792175, Valid Loss = 0.44391730427742004
Epoch 11: Train Loss = 0.4752104878425598, Valid Loss = 0.48312488198280334
Epoch 12: Train Loss = 0.4388042390346527, Valid Loss = 0.4019983410835266
Epoch 13: Train Loss =

In [8]:
out = eval_model_classification(gat, G, targetNode_mask, train_idx, valid_idx, test_idx)
out

{'train_AUROC': 0.8255024340693489,
 'train_acc': 0.7594319445632646,
 'test_AUROC': 0.8338861856766094,
 'test_acc': 0.7588822355289421}

In [11]:
G

Data(x=[279606, 6], edge_index=[2, 3906914], edge_attr=[3906914], y=[279606])

In [12]:
import torch
from torch_geometric.utils import subgraph

def get_neighbors_at_hops(edge_index, num_nodes, max_hops=3):
    # Initialize a list to store the number of neighbors at each hop for each node
    neighbors_per_hop = {hop: torch.zeros(num_nodes, dtype=torch.long) for hop in range(max_hops)}

    # Create a graph with edge_index
    edge_list = edge_index.t().contiguous()

    for hop in range(max_hops):
        # Initialize a set of nodes to start from
        visited_nodes = torch.zeros(num_nodes, dtype=torch.bool)
        
        # Start from all nodes and find their neighbors
        for node in range(num_nodes):
            if not visited_nodes[node]:
                neighbors = set()
                # Find neighbors at current hop
                neighbors = set(edge_list[edge_list[:, 0] == node, 1].tolist())
                neighbors_per_hop[hop][node] = len(neighbors)
                
                visited_nodes[node] = True
                for neighbor in neighbors:
                    visited_nodes[neighbor] = True
    return neighbors_per_hop

# Example usage
num_nodes = 279606  # Number of nodes
max_hops = 3  # Maximum number of hops to explore
edge_index = G.edge_index  # Assuming G is your graph object
neighbors_per_hop = get_neighbors_at_hops(edge_index, num_nodes, max_hops)

# Print the number of neighbors per hop for each node
for hop in range(max_hops):
    print(f"Neighbors at hop {hop + 1}: {neighbors_per_hop[hop]}")


Neighbors at hop 1: tensor([ 1, 11, 11,  ...,  0,  0,  0])
Neighbors at hop 2: tensor([ 1, 11, 11,  ...,  0,  0,  0])
Neighbors at hop 3: tensor([ 1, 11, 11,  ...,  0,  0,  0])


In [29]:
torch.mean(neighbors_per_hop[0].float())

tensor(2.7335)

In [39]:
neighbors_per_hop[0][targetNode_mask]

tensor([10,  0,  4,  ...,  0,  0,  0])

In [40]:
targetNode_mask

tensor([     3,      6,     36,  ..., 279593, 279599, 279604])

In [49]:
G.edge_index

tensor([[     0,      1,      1,  ..., 279605, 279605, 279605],
        [  8765,     24,     67,  ..., 279595, 279598, 279599]],
       device='cuda:0')

In [45]:
G

Data(x=[279606, 6], edge_index=[2, 3906914], edge_attr=[3906914], y=[279606])

In [53]:
import torch

# Assuming G is your graph with edge_index
edge_index = G.edge_index  # (2, num_edges)
device = edge_index.device  # Get the device of edge_index (GPU or CPU)

num_nodes = 279606  # Number of nodes in your graph

# Create a sparse adjacency matrix from edge_index, ensuring both are on the same device
edge_values = torch.ones(edge_index.size(1), device=device)  # Edge values tensor on the same device
adj_matrix = torch.sparse_coo_tensor(
    edge_index,  # The (source, target) pairs of edges
    edge_values,  # The values of the edges (1 for each edge)
    (num_nodes, num_nodes),  # The shape of the matrix
    device=device  # Ensure the adjacency matrix is created on the same device
)

# Convert to dense format (if needed)
# dense_adj_matrix = adj_matrix.to_dense()

# If the graph is undirected, symmetrize the adjacency matrix
adj_matrix = adj_matrix + adj_matrix.t()

# Print the adjacency matrix (for small graphs; use only if the graph is small to avoid memory issues)
print(adj_matrix)


tensor(indices=tensor([[     0,      1,      1,  ..., 279595, 279598, 279599],
                       [  8765,     24,     67,  ..., 279605, 279605, 279605]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       device='cuda:0', size=(279606, 279606), nnz=7813828,
       layout=torch.sparse_coo)
