In [1]:
import os

import numpy as np
from scipy.sparse import load_npz
from scipy.stats import pearsonr
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import pandas as pd

import torch
import torch_geometric
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch_geometric.loader import DataLoader
from torch_geometric.nn.conv import GATConv
from model_classes_ import GCN_classification, GCN_regression

import matplotlib.pyplot as plt

torch.manual_seed(0)

%load_ext autoreload
%autoreload 2

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
regression_flag = 0
chip_res = 10000
hic_res = 10000
num_hm = 6
num_feat = int((hic_res/chip_res)*num_hm)
num_classes = 2 if regression_flag == 0 else 1
src_dir = os.getcwd()

In [4]:
def prepare_data(cell_line, regression_flag, base_path):
    save_dir = os.path.join(base_path, 'data', cell_line, 'saved_runs')
    hic_sparse_mat_file = os.path.join(base_path, 'data', cell_line, 'hic_sparse.npz')
    np_nodes_lab_genes_file = os.path.join(base_path, 'data',  cell_line, \
        'np_nodes_lab_genes_reg' + str(regression_flag) + '.npy')
    np_hmods_norm_all_file = os.path.join(base_path, 'data', cell_line, \
        'np_hmods_norm_chip_' + str(chip_res) + 'bp.npy')
    df_genes_file = os.path.join(base_path, 'data', cell_line, 'df_genes_reg' + str(regression_flag) + '.pkl')
    df_genes = pd.read_pickle(df_genes_file)
    
    mat = load_npz(hic_sparse_mat_file)
    allNodes_hms = np.load(np_hmods_norm_all_file)
    hms = allNodes_hms[:, 1:] #only includes features, not node ids
    X = torch.tensor(hms).float().reshape(-1, num_feat) 
    allNodes = allNodes_hms[:, 0].astype(int)
    geneNodes_labs = np.load(np_nodes_lab_genes_file)

    geneNodes = geneNodes_labs[:, -2].astype(int)
    allLabs = -1*np.ones(np.shape(allNodes))

    targetNode_mask = torch.tensor(geneNodes).long()

    if regression_flag == 0:
        geneLabs = geneNodes_labs[:, -1].astype(int)
        allLabs[geneNodes] = geneLabs
        Y = torch.tensor(allLabs).long()
    else:
        geneLabs = geneNodes_labs[:, -1].astype(float)
        allLabs[geneNodes] = geneLabs
        Y = torch.tensor(allLabs).float()

    extract = torch_geometric.utils.from_scipy_sparse_matrix(mat)
    data = torch_geometric.data.Data(edge_index = extract[0], edge_attr = extract[1], x = X, y = Y)
    G = data
    
    return G, targetNode_mask

In [5]:
def to_cpu_npy(x):
    return x.cpu().detach().numpy()
    
def train_model_classification(model, loss, graph, max_epoch, learning_rate, targetNode_mask, train_idx, valid_idx, optimizer):
    model = model.to(device)
    graph = graph.to(device)

    optimizer = optimizer
    
    train_labels = to_cpu_npy(graph.y[targetNode_mask[train_idx]])
    valid_labels = to_cpu_npy(graph.y[targetNode_mask[valid_idx]])

    model.train()
    train_status = True

    train_losses = []
    valid_losses = []
    
    print('\n')
    for e in list(range(max_epoch)):
        
        model.train()
        train_status = True
        
        optimizer.zero_grad()
        
        all_scores = model(graph.x.float(), graph.edge_index, train_status)[targetNode_mask]
        train_scores = all_scores[train_idx]
        
        train_loss = loss(train_scores, torch.LongTensor(train_labels).to(device))
        train_losses.append(train_loss.item())

        train_loss.backward()
        optimizer.step()

        model.eval()
        valid_scores = all_scores[valid_idx]
        valid_loss = loss(valid_scores, torch.LongTensor(valid_labels).to(device))
        valid_losses.append(valid_loss.item())

        if e%100 == 0:
            print(f'Epoch {e}: Train Loss = {train_loss}, Valid Loss = {valid_loss}')

    return train_losses, valid_losses

def eval_model_classification(model, graph, targetNode_mask, train_idx, valid_idx, test_idx):
    model = model.to(device)
    graph = graph.to(device)
    test_labels = to_cpu_npy(graph.y[targetNode_mask[test_idx]])
    
    model.eval()
    train_status=False

    forward_scores = model(graph.x.float(), graph.edge_index, train_status)[targetNode_mask]

    test_scores = forward_scores[test_idx]
    test_softmax = F.softmax(test_scores, dim=1)
    test_preds = torch.argmax(test_softmax, dim=1)
    
    test_softmax = to_cpu_npy(test_softmax)
    test_preds = to_cpu_npy(test_preds)
    test_AUROC = roc_auc_score(test_labels, test_softmax[:,1], average="micro")
    test_acc = np.mean(test_preds == test_labels)

    train_labels = to_cpu_npy(graph.y[targetNode_mask[train_idx]])
    train_scores = forward_scores[train_idx]
    train_softmax = F.softmax(train_scores, dim=1)
    train_preds = torch.argmax(train_softmax, dim=1)
    
    train_softmax = to_cpu_npy(train_softmax)
    train_preds = to_cpu_npy(train_preds)
    train_AUROC = roc_auc_score(train_labels, train_softmax[:,1], average="micro")
    train_acc = np.mean(train_preds == train_labels)


    return {'train_AUROC': train_AUROC, 'train_acc': train_acc, 'test_AUROC': test_AUROC, 'test_acc': test_acc}

In [6]:
graph_conv_embed_size = 256
num_lin_layers = 2
lin_hidden_size = 256
num_graph_conv_layers = 2
learning_rate = 1e-4
max_epoch = 1000

graph_conv_layer_sizes = [num_feat] + \
    [int(max(graph_conv_embed_size, lin_hidden_size)) \
          for i in np.arange(1, num_graph_conv_layers, 1)] + [lin_hidden_size]

graph_lin_hidden_sizes = [graph_conv_layer_sizes[-1]] + \
    [int(max(lin_hidden_size, num_classes)) \
          for i in np.arange(1, num_lin_layers, 1)] + [num_classes]

In [7]:
cell_lines = ['E116', 'E122', 'E123']
classification_res = pd.DataFrame(columns=cell_lines)

In [None]:
gcn_auroc = []

for cell_line in cell_lines:
    print(f'\nTraining Cell Line {cell_line}...')

    train_idx = torch.load(f'train-test-split/{cell_line}/train_idx.pt')
    valid_idx = torch.load(f'train-test-split/{cell_line}/valid_idx.pt')
    test_idx = torch.load(f'train-test-split/{cell_line}/test_idx.pt')
    
    G, targetNode_mask = prepare_data(cell_line = cell_line, regression_flag = regression_flag, base_path = src_dir)
    
    model = GCN_classification(num_feat, num_graph_conv_layers, graph_conv_layer_sizes, num_lin_layers, graph_lin_hidden_sizes, num_classes, num_nodes=G.x.shape[0], edge_attr=G.edge_attr)
    optimizer = torch.optim.Adam(filter(lambda p : p.requires_grad, model.parameters()), lr = learning_rate)
    loss = nn.CrossEntropyLoss()

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    print(f'Number of Parameters: {sum([np.prod(p.size()) for p in model_parameters])}')

    train_out = train_model_classification(model, loss, G, max_epoch, learning_rate, targetNode_mask, train_idx, valid_idx, optimizer)
    test_out = eval_model_classification(model, G, targetNode_mask, train_idx, valid_idx, test_idx)

    gcn_auroc.append(test_out['test_AUROC'])

classification_res.loc['weighted_GCN'] = gcn_auroc
classification_res


Training Cell Line E116...
Number of Parameters: 8014790


Epoch 0: Train Loss = 0.6954270005226135, Valid Loss = 0.6960689425468445
Epoch 100: Train Loss = 0.44232800602912903, Valid Loss = 0.4565892517566681
Epoch 200: Train Loss = 0.3872204124927521, Valid Loss = 0.4094761610031128
Epoch 300: Train Loss = 0.37637436389923096, Valid Loss = 0.3996974229812622
Epoch 400: Train Loss = 0.3711467981338501, Valid Loss = 0.39356479048728943
Epoch 500: Train Loss = 0.3659803867340088, Valid Loss = 0.3928886950016022
Epoch 600: Train Loss = 0.36519986391067505, Valid Loss = 0.39297550916671753
Epoch 700: Train Loss = 0.3611716330051422, Valid Loss = 0.39059409499168396
Epoch 800: Train Loss = 0.3580845296382904, Valid Loss = 0.3909675180912018
Epoch 900: Train Loss = 0.35573840141296387, Valid Loss = 0.3900192677974701

Training Cell Line E122...
Number of Parameters: 7025674


Epoch 0: Train Loss = 0.6941266655921936, Valid Loss = 0.6948403120040894
Epoch 100: Train Loss = 0.468001484870910

In [None]:
vanilla_gcn_res = pd.read_csv('results.csv', index_col=0)
res = pd.concat([classification_res, vanilla_gcn_res])


In [None]:
res

In [None]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
sum([np.prod(p.size()) for p in model_parameters])