# Model Evaluation Notebook

### Importing Necessary Packages

In [1]:
# importing torch packages
import torch
import torch_geometric
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import OptPairTensor, Adj, Size
from torch_sparse import SparseTensor, matmul
from torch_geometric.nn.conv import GATConv
from torch_geometric.loader import NeighborLoader

# importing utility packages
import os
import argparse
import time
from datetime import datetime
import random
from typing import Union, Tuple
import matplotlib.pyplot as plt

# importing evaluation packages
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, auc
from scipy.stats import pearsonr

# importing training/eval functions
from training_eval import train_model_classification, eval_model_classification, train_model_regression, eval_model_regression

# importing models
from final_model_classes.gcn_model import GCN_classification, GCN_regression
from final_model_classes.mlp_model import MLP_classification, MLP_regression
from final_model_classes.cnn_model import CNN
from final_model_classes.weighted_gcn_model import GCN_classification_weighted
from final_model_classes.gat_model import GAT

### Setting torch device (CAUTION: Make sure one GPU is available)

In [2]:
# Use the CUDA device when one is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

#### Helper Functions

In [3]:
def to_cpu_npy(x):
    """Convert a tensor to a NumPy array.

    The tensor is moved to the CPU, detached from the autograd graph and
    then converted with ``.numpy()``.
    """
    host_tensor = x.cpu()
    graph_free = host_tensor.detach()
    return graph_free.numpy()

In [4]:
def calc_pearson(predictions, labels):
    """Return the Pearson correlation between ``predictions`` and ``labels``.

    Only the first element of the ``scipy.stats.pearsonr`` result (the
    correlation coefficient) is returned; the p-value is discarded.
    """
    correlation_result = pearsonr(predictions, labels)
    return correlation_result[0]

In [5]:
def prepare_data(cell_line, regression_flag, base_path):
    """Build the graph object and gene-node index tensor for one cell line.

    Reads three files from ``<base_path>/data/<cell_line>/``:
    ``hic_sparse.npz`` (sparse adjacency), ``np_hmods_norm_chip_<chip_res>bp.npy``
    (node id + histone-mark features) and
    ``np_nodes_lab_genes_reg<regression_flag>.npy`` (gene node ids + labels).

    NOTE: this function reads the notebook-level globals ``chip_res`` and
    ``num_feat``; both must be defined before it is called.

    Parameters
    ----------
    cell_line : str
        Name of the sub-directory under ``<base_path>/data``.
    regression_flag : int
        ``0`` produces integer (long) labels; any other value produces
        float labels. It also selects which label file is read.
    base_path : str
        Directory that contains the ``data`` folder.

    Returns
    -------
    G : torch_geometric.data.Data
        Graph with ``edge_index``/``edge_attr`` taken from the sparse matrix,
        ``x`` the float node features and ``y`` the per-node labels
        (``-1`` for every node that is not listed in the label file).
    targetNode_mask : torch.Tensor
        Long tensor holding the ids of the nodes listed in the label file.

    Side effects
    ------------
    Calls ``torch.manual_seed(0)`` and draws one ``torch.randperm``, so the
    global torch RNG is in a fixed state when the function returns.
    """
    data_dir = os.path.join(base_path, 'data', cell_line)

    hic_sparse_mat_file = os.path.join(data_dir, 'hic_sparse.npz')
    np_nodes_lab_genes_file = os.path.join(
        data_dir, 'np_nodes_lab_genes_reg' + str(regression_flag) + '.npy')
    np_hmods_norm_all_file = os.path.join(
        data_dir, 'np_hmods_norm_chip_' + str(chip_res) + 'bp.npy')

    mat = load_npz(hic_sparse_mat_file)
    allNodes_hms = np.load(np_hmods_norm_all_file) #contains 6 histone marks for all 279606 regions + id (Shape = [279606, 7])
    hms = allNodes_hms[:, 1:] #only includes features, not node ids (Shape = [279606, 6])
    X = torch.tensor(hms).float().reshape(-1, num_feat) #convert hms to tensor (Shape = [279606, 6])
    allNodes = allNodes_hms[:, 0].astype(int) #contains ids of all regions
    geneNodes_labs = np.load(np_nodes_lab_genes_file)  #contains the expression level of each gene (Shape = [16699, 2])

    geneNodes = geneNodes_labs[:, -2].astype(int)  #contains ids of regions that encode a gene
    allLabs = -1*np.ones(np.shape(allNodes))  # -1 marks nodes without a label

    targetNode_mask = torch.tensor(geneNodes).long()

    if regression_flag == 0:
        # Classification: integer label per gene node (-1 if region doesn't encode gene, 1 if gene is expressed, 0 if not)
        allLabs[geneNodes] = geneNodes_labs[:, -1].astype(int)
        Y = torch.tensor(allLabs).long()
    else:
        # Regression: float expression value per gene node (-1 if region doesn't encode a gene)
        allLabs[geneNodes] = geneNodes_labs[:, -1].astype(float)
        Y = torch.tensor(allLabs).float()

    extract = torch_geometric.utils.from_scipy_sparse_matrix(mat)
    G = torch_geometric.data.Data(edge_index = extract[0], edge_attr = extract[1], x = X, y = Y)

    # The train/valid/test split itself is loaded from disk by the callers, so
    # the permutation drawn here is discarded. The seed and the draw are kept
    # on purpose: callers build their model right after this function returns,
    # and removing either call would change the RNG state used to initialise
    # the model weights.
    torch.manual_seed(0)
    torch.randperm(targetNode_mask.shape[0])

    return G, targetNode_mask

### Setting Global Variables

In [6]:
# ---- Data selection and paths ------------------------------------------
cell_lines = ['E116', 'E122', 'E123']
regression_flag = 0  # 0 -> classification labels/outputs; regression cells pass 1 explicitly
current_dir = os.getcwd()
src_dir = os.path.dirname(current_dir)  # parent of the working directory; prepare_data reads <src_dir>/data/...

# ---- Input feature layout ----------------------------------------------
chip_res = 10000
hic_res = 10000
num_hm = 6
num_feat = int((hic_res/chip_res)*num_hm)  # features per node = marks * (hic_res / chip_res)

# ---- Shared optimisation settings --------------------------------------
lin_hidden_size = 256
learning_rate = 1e-4

# ---- GAT-specific hyper-parameters -------------------------------------
GAT_hidden_channels=[6, 30]
GAT_dropout = 0.5
GAT_wd = 1e-05
GAT_num_heads = 4
GAT_lr = 0.002
loss = nn.CrossEntropyLoss()

max_epoch = 1000
max_epoch_weighted_agg = 1500 # increasing epochs due to the increase in model parameters

# ---- Layer-size lists ---------------------------------------------------
num_graph_conv_layers = 2
graph_conv_embed_size = 256
num_classes = 2 if regression_flag == 0 else 1

num_lin_layers = 3

# [input] + (num_graph_conv_layers - 1) hidden widths + [output width]
graph_conv_layer_sizes = (
    [num_feat]
    + [int(max(graph_conv_embed_size, lin_hidden_size))] * (num_graph_conv_layers - 1)
    + [lin_hidden_size]
)

# Linear head on top of the graph convolutions (classification).
graph_lin_hidden_sizes = (
    [graph_conv_layer_sizes[-1]]
    + [int(max(lin_hidden_size, num_classes))] * (num_lin_layers - 1)
    + [num_classes]
)

# Plain MLP acting directly on the node features (classification).
lin_hidden_sizes = (
    [num_feat]
    + [int(max(lin_hidden_size, num_classes))] * (num_lin_layers - 1)
    + [num_classes]
)

num_classes_reg = 1

# Regression counterparts: identical layout, single output unit.
lin_hidden_sizes_reg = (
    [num_feat]
    + [int(max(lin_hidden_size, num_classes_reg))] * (num_lin_layers - 1)
    + [num_classes_reg]
)

# Uses num_classes_reg throughout so it stays consistent with
# lin_hidden_sizes_reg if the output width is ever changed.
graph_lin_hidden_sizes_reg = (
    [graph_conv_layer_sizes[-1]]
    + [int(max(lin_hidden_size, num_classes_reg))] * (num_lin_layers - 1)
    + [num_classes_reg]
)

### Classification Models

#### GCN Classification

In [7]:
# One entry per cell line, in the order of `cell_lines`.
gcn_auroc, gcn_acc, all_gcn_losses = [], [], []

for cell_line in cell_lines:
    print(f'\nTraining Cell Line {cell_line}...')

    # Index tensors for the saved train / validation / test split of this cell line.
    train_idx = torch.load(f'../train-test-split/{cell_line}/train_idx.pt')
    valid_idx = torch.load(f'../train-test-split/{cell_line}/valid_idx.pt')
    test_idx = torch.load(f'../train-test-split/{cell_line}/test_idx.pt')

    G, targetNode_mask = prepare_data(
        cell_line=cell_line,
        regression_flag=regression_flag,
        base_path=src_dir,
    )
    model = GCN_classification(
        num_feat,
        num_graph_conv_layers,
        graph_conv_layer_sizes,
        num_lin_layers,
        graph_lin_hidden_sizes,
        num_classes,
    )
    # Only parameters that require gradients are handed to the optimizer.
    optimizer = torch.optim.Adam(
        (p for p in model.parameters() if p.requires_grad),
        lr=learning_rate,
    )

    gcn_train_losses, gcn_valid_losses = train_model_classification(
        model, loss, G, max_epoch, learning_rate,
        targetNode_mask, train_idx, valid_idx, optimizer,
    )
    gcn_out = eval_model_classification(model, G, targetNode_mask, train_idx, valid_idx, test_idx)

    all_gcn_losses.append([gcn_train_losses, gcn_valid_losses])

    # Keep only the held-out test metrics for the summary tables.
    gcn_test_AUROC = gcn_out["test_AUROC"]
    gcn_auroc.append(gcn_test_AUROC)

    gcn_test_acc = gcn_out["test_acc"]
    gcn_acc.append(gcn_test_acc)

    print(f"{cell_line} Test AUROC: {gcn_test_AUROC}, Test Accuracy: {gcn_test_acc}")


Training Cell Line E116...


Epoch 0: Train Loss = 0.6935580372810364, Valid Loss = 0.6936882734298706
Epoch 100: Train Loss = 0.42203575372695923, Valid Loss = 0.435624897480011
Epoch 200: Train Loss = 0.37795814871788025, Valid Loss = 0.3977866470813751
Epoch 300: Train Loss = 0.36957046389579773, Valid Loss = 0.3947499394416809
Epoch 400: Train Loss = 0.362991601228714, Valid Loss = 0.388906866312027
Epoch 500: Train Loss = 0.3603886067867279, Valid Loss = 0.39000430703163147
Epoch 600: Train Loss = 0.35582607984542847, Valid Loss = 0.385602742433548
Epoch 700: Train Loss = 0.35273268818855286, Valid Loss = 0.38617581129074097
Epoch 800: Train Loss = 0.3484351634979248, Valid Loss = 0.38903307914733887
Epoch 900: Train Loss = 0.3459504246711731, Valid Loss = 0.3883751928806305
E116 Test AUROC: 0.9099551151761517, Test Accuracy: 0.844311377245509

Training Cell Line E122...


Epoch 0: Train Loss = 0.6971882581710815, Valid Loss = 0.6985690593719482
Epoch 100: Train Loss = 0.44630101

#### MLP Classification

In [8]:
# One entry per cell line, in the order of `cell_lines`.
mlp_auroc, mlp_acc, all_mlp_losses = [], [], []

for cell_line in cell_lines:
    print(f'\nTraining Cell Line {cell_line}...')

    # Index tensors for the saved train / validation / test split of this cell line.
    train_idx = torch.load(f'../train-test-split/{cell_line}/train_idx.pt')
    valid_idx = torch.load(f'../train-test-split/{cell_line}/valid_idx.pt')
    test_idx = torch.load(f'../train-test-split/{cell_line}/test_idx.pt')

    G, targetNode_mask = prepare_data(
        cell_line=cell_line,
        regression_flag=regression_flag,
        base_path=src_dir,
    )
    model = MLP_classification(num_feat, num_lin_layers, lin_hidden_sizes, num_classes)
    # Only parameters that require gradients are handed to the optimizer.
    optimizer = torch.optim.Adam(
        (p for p in model.parameters() if p.requires_grad),
        lr=learning_rate,
    )

    mlp_train_losses, mlp_valid_losses = train_model_classification(
        model, loss, G, max_epoch, learning_rate,
        targetNode_mask, train_idx, valid_idx, optimizer,
    )
    mlp_out = eval_model_classification(model, G, targetNode_mask, train_idx, valid_idx, test_idx)

    all_mlp_losses.append([mlp_train_losses, mlp_valid_losses])

    # Keep only the held-out test metrics for the summary tables.
    mlp_test_AUROC = mlp_out["test_AUROC"]
    mlp_auroc.append(mlp_test_AUROC)

    mlp_test_acc = mlp_out["test_acc"]
    mlp_acc.append(mlp_test_acc)

    print(f"{cell_line} Test AUROC: {mlp_test_AUROC}, Test Accuracy: {mlp_test_acc}")


Training Cell Line E116...


Epoch 0: Train Loss = 0.6892363429069519, Valid Loss = 0.6896421313285828
Epoch 100: Train Loss = 0.541705310344696, Valid Loss = 0.5454736948013306
Epoch 200: Train Loss = 0.4273169934749603, Valid Loss = 0.4391532838344574
Epoch 300: Train Loss = 0.40098610520362854, Valid Loss = 0.4161554276943207
Epoch 400: Train Loss = 0.3905988037586212, Valid Loss = 0.4068722128868103
Epoch 500: Train Loss = 0.3847435712814331, Valid Loss = 0.40175551176071167
Epoch 600: Train Loss = 0.38063669204711914, Valid Loss = 0.39862364530563354
Epoch 700: Train Loss = 0.3776399791240692, Valid Loss = 0.3967739939689636
Epoch 800: Train Loss = 0.3754136860370636, Valid Loss = 0.3956705927848816
Epoch 900: Train Loss = 0.3736962676048279, Valid Loss = 0.39498934149742126
E116 Test AUROC: 0.9089087008294325, Test Accuracy: 0.8407185628742515

Training Cell Line E122...


Epoch 0: Train Loss = 0.6940168738365173, Valid Loss = 0.6946899890899658
Epoch 100: Train Loss = 0.5908188

#### Weighted Aggregation GCN Classification

In [10]:
# One entry per cell line, in the order of `cell_lines`.
w_gcn_auroc, w_gcn_acc, all_w_gcn_losses = [], [], []

for cell_line in cell_lines:
    print(f'\nTraining Cell Line {cell_line}...')

    # Index tensors for the saved train / validation / test split of this cell line.
    train_idx = torch.load(f'../train-test-split/{cell_line}/train_idx.pt')
    valid_idx = torch.load(f'../train-test-split/{cell_line}/valid_idx.pt')
    test_idx = torch.load(f'../train-test-split/{cell_line}/test_idx.pt')

    G, targetNode_mask = prepare_data(
        cell_line=cell_line,
        regression_flag=regression_flag,
        base_path=src_dir,
    )
    # Unlike the plain GCN, this model is also given the node count and the edge attributes.
    model = GCN_classification_weighted(
        num_feat,
        num_graph_conv_layers,
        graph_conv_layer_sizes,
        num_lin_layers,
        graph_lin_hidden_sizes,
        num_classes,
        num_nodes=G.x.shape[0],
        edge_attr=G.edge_attr,
    )
    # Only parameters that require gradients are handed to the optimizer.
    optimizer = torch.optim.Adam(
        (p for p in model.parameters() if p.requires_grad),
        lr=learning_rate,
    )

    # Trained for max_epoch_weighted_agg epochs rather than max_epoch.
    w_gcn_train_losses, w_gcn_valid_losses = train_model_classification(
        model, loss, G, max_epoch_weighted_agg, learning_rate,
        targetNode_mask, train_idx, valid_idx, optimizer,
    )
    w_gcn_out = eval_model_classification(model, G, targetNode_mask, train_idx, valid_idx, test_idx)

    all_w_gcn_losses.append([w_gcn_train_losses, w_gcn_valid_losses])

    # Keep only the held-out test metrics for the summary tables.
    w_gcn_test_AUROC = w_gcn_out["test_AUROC"]
    w_gcn_auroc.append(w_gcn_test_AUROC)

    w_gcn_test_acc = w_gcn_out["test_acc"]
    w_gcn_acc.append(w_gcn_test_acc)

    print(f"{cell_line} Test AUROC: {w_gcn_test_AUROC}, Test Accuracy: {w_gcn_test_acc}")


Training Cell Line E116...


Epoch 0: Train Loss = 0.6935580372810364, Valid Loss = 0.6936882734298706
Epoch 100: Train Loss = 0.42202967405319214, Valid Loss = 0.4356260597705841
Epoch 200: Train Loss = 0.37797003984451294, Valid Loss = 0.3978056311607361
Epoch 300: Train Loss = 0.369561105966568, Valid Loss = 0.39496418833732605
Epoch 400: Train Loss = 0.363079696893692, Valid Loss = 0.38908401131629944
Epoch 500: Train Loss = 0.3603517413139343, Valid Loss = 0.39001116156578064
Epoch 600: Train Loss = 0.3558855652809143, Valid Loss = 0.3855525851249695
Epoch 700: Train Loss = 0.35269609093666077, Valid Loss = 0.38606080412864685
Epoch 800: Train Loss = 0.34844595193862915, Valid Loss = 0.389275461435318
Epoch 900: Train Loss = 0.34594160318374634, Valid Loss = 0.3892585337162018
Epoch 1000: Train Loss = 0.3426258862018585, Valid Loss = 0.38658204674720764
Epoch 1100: Train Loss = 0.3379587233066559, Valid Loss = 0.3919593095779419
Epoch 1200: Train Loss = 0.33729061484336853, Valid

#### GAT Classification

In [11]:
# One entry per cell line, in the order of `cell_lines`.
gat_auroc = []
gat_acc = []
all_gat_losses = []

for cell_line in cell_lines:
    print(f'\nTraining Cell Line {cell_line}...')

    # Index tensors for the saved train / validation / test split of this cell line.
    train_idx = torch.load(f'../train-test-split/{cell_line}/train_idx.pt')
    valid_idx = torch.load(f'../train-test-split/{cell_line}/valid_idx.pt')
    test_idx = torch.load(f'../train-test-split/{cell_line}/test_idx.pt')

    G, targetNode_mask = prepare_data(cell_line = cell_line, regression_flag = regression_flag, base_path = src_dir)

    # The input width comes from the shared config (num_feat) instead of a
    # hard-coded 6, so it always matches the feature matrix built by prepare_data.
    gat = GAT(in_channels = num_feat, hidden_channels = GAT_hidden_channels, num_heads = GAT_num_heads, dropout = GAT_dropout)

    # The GAT uses its own learning rate and weight decay.
    optimizer = torch.optim.Adam(filter(lambda p : p.requires_grad, gat.parameters()), lr = GAT_lr, weight_decay = GAT_wd)

    # NOTE(review): `learning_rate` (not GAT_lr) is forwarded to the training
    # function while the optimizer above was built with GAT_lr. Confirm that
    # train_model_classification ignores this argument when an optimizer is given.
    gat_train_losses, gat_valid_losses = train_model_classification(gat, loss, G, max_epoch, learning_rate, targetNode_mask, train_idx, valid_idx, optimizer)
    gat_out = eval_model_classification(gat, G, targetNode_mask, train_idx, valid_idx, test_idx)

    all_gat_losses.append([gat_train_losses, gat_valid_losses])

    # Keep only the held-out test metrics for the summary tables.
    gat_test_AUROC = gat_out["test_AUROC"]
    gat_auroc.append(gat_test_AUROC)

    gat_test_acc = gat_out["test_acc"]
    gat_acc.append(gat_test_acc)
    print(f"{cell_line} Test AUROC: {gat_test_AUROC}, Test Accuracy: {gat_test_acc}")


Training Cell Line E116...


Epoch 0: Train Loss = 0.7010973691940308, Valid Loss = 0.6971675157546997
Epoch 100: Train Loss = 0.5876685976982117, Valid Loss = 0.5974135398864746
Epoch 200: Train Loss = 0.5228026509284973, Valid Loss = 0.5355121493339539
Epoch 300: Train Loss = 0.49715372920036316, Valid Loss = 0.5143245458602905
Epoch 400: Train Loss = 0.45887693762779236, Valid Loss = 0.48942384123802185
Epoch 500: Train Loss = 0.4416870176792145, Valid Loss = 0.4674643874168396
Epoch 600: Train Loss = 0.42829152941703796, Valid Loss = 0.46398159861564636
Epoch 700: Train Loss = 0.42272740602493286, Valid Loss = 0.45701536536216736
Epoch 800: Train Loss = 0.4185510277748108, Valid Loss = 0.45829907059669495
Epoch 900: Train Loss = 0.4136381447315216, Valid Loss = 0.4563336670398712
E116 Test AUROC: 0.8798400673400675, Test Accuracy: 0.8135728542914171

Training Cell Line E122...


Epoch 0: Train Loss = 0.6988705396652222, Valid Loss = 0.7006978988647461
Epoch 100: Train Loss = 0.530

#### Classification Results

In [12]:
# Test AUROC table: one row per model, one column per cell line.
classification_auroc_res = pd.DataFrame(columns = cell_lines)
for model_name, auroc_per_cell_line in (
    ('GCN', gcn_auroc),
    ('MLP', mlp_auroc),
    ('Weighted_Agg_GCN', w_gcn_auroc),
    ('GAT', gat_auroc),
):
    classification_auroc_res.loc[model_name] = auroc_per_cell_line
classification_auroc_res.to_csv('classification_AUROC_res.csv')
classification_auroc_res

Unnamed: 0,E116,E122,E123
GCN,0.909955,0.903264,0.923019
MLP,0.908909,0.896096,0.918319
Weighted_Agg_GCN,0.90226,0.901844,0.919657
GAT,0.87984,0.878976,0.8944


In [13]:
# Test accuracy table: one row per model, one column per cell line.
classification_acc_res = pd.DataFrame(columns = cell_lines)
for model_name, acc_per_cell_line in (
    ('GCN', gcn_acc),
    ('MLP', mlp_acc),
    ('Weighted_Agg_GCN', w_gcn_acc),
    ('GAT', gat_acc),
):
    classification_acc_res.loc[model_name] = acc_per_cell_line
classification_acc_res.to_csv('classification_accuracy_res.csv')
classification_acc_res

Unnamed: 0,E116,E122,E123
GCN,0.844311,0.831003,0.861022
MLP,0.840719,0.826209,0.853834
Weighted_Agg_GCN,0.841916,0.836197,0.857029
GAT,0.813573,0.811027,0.819089


In [14]:
# Loss-history table: each cell holds the [train_losses, valid_losses] pair
# collected for that model / cell line.
all_losses = pd.DataFrame(columns = cell_lines)
for model_name, losses_per_cell_line in (
    ('GCN', all_gcn_losses),
    ('MLP', all_mlp_losses),
    ('Weighted_Agg_GCN', all_w_gcn_losses),
    ('GAT', all_gat_losses),
):
    all_losses.loc[model_name] = losses_per_cell_line
all_losses.to_csv('all_losses.csv')
all_losses

Unnamed: 0,E116,E122,E123
GCN,"[[0.6935580372810364, 0.6931409239768982, 0.69...","[[0.6971882581710815, 0.6967305541038513, 0.69...","[[0.6946658492088318, 0.6942091584205627, 0.69..."
MLP,"[[0.6892363429069519, 0.6878113150596619, 0.68...","[[0.6940168738365173, 0.692836344242096, 0.691...","[[0.7027177810668945, 0.7012646794319153, 0.69..."
Weighted_Agg_GCN,"[[0.6935580372810364, 0.6931409239768982, 0.69...","[[0.6971882581710815, 0.6967305541038513, 0.69...","[[0.6946658492088318, 0.6942091584205627, 0.69..."
GAT,"[[0.7010973691940308, 0.7005693316459656, 0.70...","[[0.6988705396652222, 0.6982897520065308, 0.69...","[[0.696287214756012, 0.6958156228065491, 0.695..."


### Regression Models

#### GCN Regression

In [15]:
# Test Pearson correlation per cell line, plus the raw test predictions and
# labels that are saved later for the scatter plots.
gcn_pearson = {}
gcn_preds_reg_E116, gcn_preds_lab_E116 = [], []
gcn_preds_reg_E122, gcn_preds_lab_E122 = [], []
gcn_preds_reg_E123, gcn_preds_lab_E123 = [], []

# Routes each cell line's test output to its dedicated (predictions, labels) lists.
gcn_test_buckets = {
    'E116': (gcn_preds_reg_E116, gcn_preds_lab_E116),
    'E122': (gcn_preds_reg_E122, gcn_preds_lab_E122),
    'E123': (gcn_preds_reg_E123, gcn_preds_lab_E123),
}

for cell_line in cell_lines:

    # Index tensors for the saved train / validation / test split of this cell line.
    train_idx = torch.load(f'../train-test-split/{cell_line}/train_idx.pt')
    valid_idx = torch.load(f'../train-test-split/{cell_line}/valid_idx.pt')
    test_idx = torch.load(f'../train-test-split/{cell_line}/test_idx.pt')

    print(f'\nTraining Cell Line {cell_line}...')
    # regression_flag is fixed to 1 here, independent of the global setting.
    G, targetNode_mask = prepare_data(cell_line=cell_line, regression_flag=1, base_path=src_dir)

    model = GCN_regression(
        num_feat,
        num_graph_conv_layers,
        graph_conv_layer_sizes,
        num_lin_layers,
        graph_lin_hidden_sizes_reg,
        num_classes_reg,
    )
    # Only parameters that require gradients are handed to the optimizer.
    optimizer = torch.optim.Adam(
        (p for p in model.parameters() if p.requires_grad),
        lr=learning_rate,
    )

    (train_loss_vec, train_pearson_vec,
     valid_loss_vec, valid_pearson_vec) = train_model_regression(
        model, G, max_epoch, learning_rate, targetNode_mask, train_idx, valid_idx, optimizer)

    (test_pearson, test_pred, test_labels,
     train_pearson, train_pred, train_labels,
     valid_pearson, valid_pred, valid_labels) = eval_model_regression(
        model, G, targetNode_mask, train_idx, valid_idx, test_idx)

    # Cell lines without a bucket are skipped, as in an if/elif chain with no else.
    if cell_line in gcn_test_buckets:
        pred_bucket, label_bucket = gcn_test_buckets[cell_line]
        pred_bucket.append(test_pred)
        label_bucket.append(test_labels)

    gcn_pearson[cell_line] = test_pearson
    print(f"Pearson: {test_pearson}")


Training Cell Line E116...


Epoch 0 out of 1000
Epoch 100 out of 1000
Epoch 200 out of 1000
Epoch 300 out of 1000
Epoch 400 out of 1000
Epoch 500 out of 1000
Epoch 600 out of 1000
Epoch 700 out of 1000
Epoch 800 out of 1000
Epoch 900 out of 1000
Pearson: 0.7661645753475331

Training Cell Line E122...


Epoch 0 out of 1000
Epoch 100 out of 1000
Epoch 200 out of 1000
Epoch 300 out of 1000
Epoch 400 out of 1000
Epoch 500 out of 1000
Epoch 600 out of 1000
Epoch 700 out of 1000
Epoch 800 out of 1000
Epoch 900 out of 1000
Pearson: 0.7691094216951544

Training Cell Line E123...


Epoch 0 out of 1000
Epoch 100 out of 1000
Epoch 200 out of 1000
Epoch 300 out of 1000
Epoch 400 out of 1000
Epoch 500 out of 1000
Epoch 600 out of 1000
Epoch 700 out of 1000
Epoch 800 out of 1000
Epoch 900 out of 1000
Pearson: 0.7943608624082945


#### MLP Regression

In [17]:
# Test Pearson correlation per cell line, plus the raw test predictions and
# labels that are saved later for the scatter plots.
mlp_pearson = {}
mlp_preds_reg_E116, mlp_preds_lab_E116 = [], []
mlp_preds_reg_E122, mlp_preds_lab_E122 = [], []
mlp_preds_reg_E123, mlp_preds_lab_E123 = [], []

# Routes each cell line's test output to its dedicated (predictions, labels) lists.
mlp_test_buckets = {
    'E116': (mlp_preds_reg_E116, mlp_preds_lab_E116),
    'E122': (mlp_preds_reg_E122, mlp_preds_lab_E122),
    'E123': (mlp_preds_reg_E123, mlp_preds_lab_E123),
}

for cell_line in cell_lines:

    # Index tensors for the saved train / validation / test split of this cell line.
    train_idx = torch.load(f'../train-test-split/{cell_line}/train_idx.pt')
    valid_idx = torch.load(f'../train-test-split/{cell_line}/valid_idx.pt')
    test_idx = torch.load(f'../train-test-split/{cell_line}/test_idx.pt')

    print(f'\nTraining Cell Line {cell_line}...')
    # regression_flag is fixed to 1 here, independent of the global setting.
    G, targetNode_mask = prepare_data(cell_line=cell_line, regression_flag=1, base_path=src_dir)

    # Initialize MLP regression model.
    # The input width is taken from the shared config instead of a literal 6.
    # NOTE(review): the layer count and hidden sizes below are NOT the
    # config's `num_lin_layers` / `lin_hidden_sizes_reg`; they are kept as-is
    # because the reported results were produced with them. Confirm which
    # layout MLP_regression expects before unifying.
    model = MLP_regression(
        num_feat=num_feat,
        num_lin_layers=2,
        lin_hidden_sizes=[256, 128],
        num_classes=1)

    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

    train_loss_vec, train_pearson_vec, valid_loss_vec, valid_pearson_vec = \
        train_model_regression(model, G, max_epoch, learning_rate, targetNode_mask, train_idx, valid_idx, optimizer)

    test_pearson, test_pred, test_labels, train_pearson, train_pred, train_labels, \
        valid_pearson, valid_pred, valid_labels = \
            eval_model_regression(model, G, targetNode_mask, train_idx, valid_idx, test_idx)

    # Store predictions and labels for the corresponding cell line
    # (cell lines without a bucket are skipped).
    if cell_line in mlp_test_buckets:
        pred_bucket, label_bucket = mlp_test_buckets[cell_line]
        pred_bucket.append(test_pred)
        label_bucket.append(test_labels)

    # Update Pearson dictionary for MLP
    mlp_pearson[cell_line] = test_pearson
    print(f"Pearson: {test_pearson}")


Training Cell Line E116...


Epoch 0 out of 1000
Epoch 100 out of 1000
Epoch 200 out of 1000
Epoch 300 out of 1000
Epoch 400 out of 1000
Epoch 500 out of 1000
Epoch 600 out of 1000
Epoch 700 out of 1000
Epoch 800 out of 1000
Epoch 900 out of 1000
Pearson: 0.7244821373288937

Training Cell Line E122...


Epoch 0 out of 1000
Epoch 100 out of 1000
Epoch 200 out of 1000
Epoch 300 out of 1000
Epoch 400 out of 1000
Epoch 500 out of 1000
Epoch 600 out of 1000
Epoch 700 out of 1000
Epoch 800 out of 1000
Epoch 900 out of 1000
Pearson: 0.6731905849926105

Training Cell Line E123...


Epoch 0 out of 1000
Epoch 100 out of 1000
Epoch 200 out of 1000
Epoch 300 out of 1000
Epoch 400 out of 1000
Epoch 500 out of 1000
Epoch 600 out of 1000
Epoch 700 out of 1000
Epoch 800 out of 1000
Epoch 900 out of 1000
Pearson: 0.7235219911781169


#### Regression Results

In [18]:
# Test Pearson table: one row per model, one column per cell line.
# Each row is assigned from a dict keyed by cell line.
regression_res = pd.DataFrame(columns = cell_lines)
for model_name, pearson_by_cell_line in (
    ('GCN', gcn_pearson),
    ('MLP', mlp_pearson),
):
    regression_res.loc[model_name] = pearson_by_cell_line
regression_res.to_csv('regression_res.csv')
regression_res

Unnamed: 0,E116,E122,E123
GCN,0.766165,0.769109,0.794361
MLP,0.724482,0.673191,0.723522


#### Saving files for visualizations in figures.ipynb 

In [19]:
# Create a directory to save all npy arrays in one place.
# exist_ok=True makes the call safe to re-run and avoids the
# check-then-create race of `if not os.path.exists(...)`.
directory = 'scatter_data_regression'
os.makedirs(directory, exist_ok=True)

# (file name, list filled by the regression cells) in the order they are written.
# Only the first element of each list is saved.
scatter_outputs = [
    ('gcn_preds_reg_E116.npy', gcn_preds_reg_E116),
    ('gcn_preds_lab_E116.npy', gcn_preds_lab_E116),
    ('gcn_preds_reg_E123.npy', gcn_preds_reg_E123),
    ('gcn_preds_lab_E123.npy', gcn_preds_lab_E123),
    ('gcn_preds_reg_E122.npy', gcn_preds_reg_E122),
    ('gcn_preds_lab_E122.npy', gcn_preds_lab_E122),
    ('mlp_preds_reg_E116.npy', mlp_preds_reg_E116),
    ('mlp_preds_lab_E116.npy', mlp_preds_lab_E116),
    ('mlp_preds_reg_E123.npy', mlp_preds_reg_E123),
    ('mlp_preds_lab_E123.npy', mlp_preds_lab_E123),
    ('mlp_preds_reg_E122.npy', mlp_preds_reg_E122),
    ('mlp_preds_lab_E122.npy', mlp_preds_lab_E122),
]

# Save the files in the created directory
for file_name, collected in scatter_outputs:
    np.save(os.path.join(directory, file_name), collected[0])