In [83]:
import os
import argparse
import time
from datetime import datetime, date
import random

import numpy as np
from scipy.sparse import load_npz
from scipy.stats import pearsonr
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import pandas as pd

import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [3]:
cell_line = 'E116'
max_epoch = 1000
learning_rate = 1e-4
num_graph_conv_layers = 2
graph_conv_embed_size = 256
num_lin_layers = 3
lin_hidden_size = 256
regression_flag = 0
random_seed = 0

chip_res = 10000
hic_res = 10000
num_hm = 6
num_feat = int((hic_res/chip_res)*num_hm)
num_classes = 2 if regression_flag == 0 else 1

In [100]:
base_path = os.getcwd()
save_dir = os.path.join(base_path, 'data', cell_line, 'saved_runs')
hic_sparse_mat_file = os.path.join(base_path, 'data', cell_line, 'hic_sparse.npz')
np_nodes_lab_genes_file = os.path.join(base_path, 'data',  cell_line, \
    'np_nodes_lab_genes_reg' + str(regression_flag) + '.npy')
np_hmods_norm_all_file = os.path.join(base_path, 'data', cell_line, \
    'np_hmods_norm_chip_' + str(chip_res) + 'bp.npy')
df_genes_file = os.path.join(base_path, 'data', cell_line, 'df_genes_reg' + str(regression_flag) + '.pkl')
df_genes = pd.read_pickle(df_genes_file)

mat = load_npz(hic_sparse_mat_file)
allNodes_hms = np.load(np_hmods_norm_all_file)
hms = allNodes_hms[:, 1:] #only includes features, not node ids
X = torch.tensor(hms).float().reshape(-1, num_feat) 
allNodes = allNodes_hms[:, 0].astype(int)
geneNodes_labs = np.load(np_nodes_lab_genes_file)

geneNodes = geneNodes_labs[:, -2].astype(int)
allLabs = -1*np.ones(np.shape(allNodes))

targetNode_mask = torch.tensor(geneNodes).long()

if regression_flag == 0:
    geneLabs = geneNodes_labs[:, -1].astype(int)
    allLabs[geneNodes] = geneLabs
    Y = torch.tensor(allLabs).long()
else:
    geneLabs = geneNodes_labs[:, -1].astype(float)
    allLabs[geneNodes] = geneLabs
    Y = torch.tensor(allLabs).float()

In [101]:
pred_idx_shuff = torch.randperm(targetNode_mask.shape[0])
fin_train = np.floor(0.7*pred_idx_shuff.shape[0]).astype(int)
fin_valid = np.floor(0.85*pred_idx_shuff.shape[0]).astype(int)
train_idx = pred_idx_shuff[:fin_train]
valid_idx = pred_idx_shuff[fin_train:fin_valid]
test_idx = pred_idx_shuff[fin_valid:]

In [365]:
INPUT_LENGTH = 6
NUM_CLASSES = 2 
BATCH_SIZE = 64

class CNN(nn.Module):
    def __init__(self, num_conv_layers, num_linear_layers, dropout_rate=0.2):
        super(CNN, self).__init__()

        self.conv_layers = nn.ModuleList()
        in_channels = 1
        out_channels = 16
        current_length = INPUT_LENGTH

        for i in range(num_conv_layers):
            conv = nn.Sequential(
                nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=2, stride=2) if current_length // 2 > 0 else nn.Identity(),
            )
            self.conv_layers.append(conv)
            
            in_channels = out_channels
            out_channels *= 2
            current_length = max(1, current_length // 2)
        
        self.flatten = nn.Flatten()
        linear_input_size = in_channels * current_length

        self.linear_layers = nn.ModuleList()
        if num_linear_layers > 1:
            for i in range(num_linear_layers - 1):
                next_size = int(linear_input_size // 2)
                fc = nn.Sequential(
                    nn.Linear(int(linear_input_size), next_size),
                    nn.ReLU()
                )
                self.linear_layers.append(fc)
                linear_input_size = next_size

        self.final_linear = nn.Linear(linear_input_size, NUM_CLASSES)
        self.dropout = nn.Dropout(dropout_rate)
        

    def forward(self, x):
        out = self.conv_layers[0](x)
        
        for layer in self.conv_layers[1:]:
            out = layer(out)

        out = self.dropout(out)
        out = self.flatten(out)
        for layer in self.linear_layers:
            out = layer(out)
        
        out = self.final_linear(out)
        
        return out

    def calculate_accuracy(self, dataset):
        data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
        num_correct = 0
        with torch.no_grad():
            for batch_inputs, batch_labels in data_loader:
                optimizer.zero_grad()
                
                output = model(batch_inputs)
                pred = torch.argmax(output, dim=1)
                num_correct += torch.sum(pred == batch_labels)
            
        return float(num_correct / len(dataset))

In [366]:
train_data = X[targetNode_mask][train_idx]
train_labels = torch.tensor(geneNodes_labs[train_idx][:, 1]).long()

valid_data = X[targetNode_mask][valid_idx]
valid_labels = torch.tensor(geneNodes_labs[valid_idx][:, 1]).long()

test_data = X[targetNode_mask][test_idx]
test_labels = torch.tensor(geneNodes_labs[test_idx][:, 1]).long()

In [367]:
train_data = train_data.unsqueeze(1)
test_data = test_data.unsqueeze(1)
valid_data = valid_data.unsqueeze(1)

In [368]:
train_dataset = TensorDataset(train_data, train_labels)
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

valid_dataset = TensorDataset(valid_data, valid_labels)
valid_data_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = TensorDataset(test_data, valid_labels)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [369]:
def train(model, n_epochs):
    criterion = nn.CrossEntropyLoss()

    num_epochs = 100
    valid_accuracies = []
    for epoch in range(n_epochs):
        epoch_loss = 0
        for batch_inputs, batch_labels in train_data_loader:
            optimizer.zero_grad()
            
            output = model(batch_inputs)
            loss = criterion(output, batch_labels)
            
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        avg_loss = epoch_loss / len(train_data_loader)
        
        if epoch % 10 == 0:
            valid_accuracy = model.calculate_accuracy(valid_dataset)
            valid_accuracies.append(valid_accuracy)
            print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {avg_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}")
        else:
            print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {avg_loss:.4f}")

        if len(valid_accuracies) > 2:
            if (valid_accuracies[-1] < valid_accuracies[-2]) and (valid_accuracies[-1] < valid_accuracies[-3]):
                print(f'Training stopped due to early stopping at epoch {epoch}')
                break

    return valid_accuracies[-1]

In [371]:
model = CNN(num_conv_layers=3, num_linear_layers=2, dropout_rate=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

train(model, n_epochs=300)

Epoch 1/300, Loss: 0.5253, Validation Accuracy: 0.8251
Epoch 2/300, Loss: 0.4147
Epoch 3/300, Loss: 0.4094
Epoch 4/300, Loss: 0.4038
Epoch 5/300, Loss: 0.3938
Epoch 6/300, Loss: 0.3901
Epoch 7/300, Loss: 0.3860
Epoch 8/300, Loss: 0.3850
Epoch 9/300, Loss: 0.3855
Epoch 10/300, Loss: 0.3851
Epoch 11/300, Loss: 0.3824, Validation Accuracy: 0.8371


KeyboardInterrupt: 

In [372]:
num_conv_layers = [1, 2, 3]
num_linear_layers = [1, 2, 3]
dropout_rates = [0, 0.1, 0.2, 0.3]

results_df = pd.DataFrame(columns=['num_conv_layers', 'num_linear_layers', 'dropout_rate', 'validation_accuracy'])

for num_conv in num_conv_layers:
    for num_linear in num_linear_layers:
        for dropout_rate in dropout_rates:
            model = CNN(num_conv_layers=num_conv, num_linear_layers=num_linear, dropout_rate=dropout_rate)
            optimizer = optim.Adam(model.parameters(), lr=0.001)

            valid_accuracy = train(model, n_epochs=200)
            results_df.loc[len(results_df)] = [num_conv, num_linear, dropout_rate, valid_accuracy]

1 1 0
Epoch 1/200, Loss: 0.6515, Validation Accuracy: 0.7705
Epoch 2/200, Loss: 0.5252
Epoch 3/200, Loss: 0.4559
Epoch 4/200, Loss: 0.4370
Epoch 5/200, Loss: 0.4301
Epoch 6/200, Loss: 0.4245
Epoch 7/200, Loss: 0.4205
Epoch 8/200, Loss: 0.4170
Epoch 9/200, Loss: 0.4138
Epoch 10/200, Loss: 0.4106
Epoch 11/200, Loss: 0.4076, Validation Accuracy: 0.8323
Epoch 12/200, Loss: 0.4047
Epoch 13/200, Loss: 0.4027
Epoch 14/200, Loss: 0.4007
Epoch 15/200, Loss: 0.3993
Epoch 16/200, Loss: 0.3969
Epoch 17/200, Loss: 0.3952
Epoch 18/200, Loss: 0.3945
Epoch 19/200, Loss: 0.3933
Epoch 20/200, Loss: 0.3917
Epoch 21/200, Loss: 0.3910, Validation Accuracy: 0.8343
Epoch 22/200, Loss: 0.3903
Epoch 23/200, Loss: 0.3900
Epoch 24/200, Loss: 0.3894
Epoch 25/200, Loss: 0.3884
Epoch 26/200, Loss: 0.3879
Epoch 27/200, Loss: 0.3875
Epoch 28/200, Loss: 0.3869
Epoch 29/200, Loss: 0.3865
Epoch 30/200, Loss: 0.3865
Epoch 31/200, Loss: 0.3858, Validation Accuracy: 0.8383
Epoch 32/200, Loss: 0.3856
Epoch 33/200, Loss: 0.3

In [375]:
results_df[results_df['validation_accuracy'] == results_df['validation_accuracy'].max()]


Unnamed: 0,num_conv_layers,num_linear_layers,dropout_rate,validation_accuracy
27,3.0,1.0,0.3,0.846707


In [377]:
results_df.to_csv('grid_search.csv')