In [83]:
import os
import argparse
import time
from datetime import datetime, date
import random

import numpy as np
from scipy.sparse import load_npz
from scipy.stats import pearsonr
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import pandas as pd

import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [3]:
cell_line = 'E116'
max_epoch = 1000
learning_rate = 1e-4
num_graph_conv_layers = 2
graph_conv_embed_size = 256
num_lin_layers = 3
lin_hidden_size = 256
regression_flag = 0
random_seed = 0

chip_res = 10000
hic_res = 10000
num_hm = 6
num_feat = int((hic_res/chip_res)*num_hm)
num_classes = 2 if regression_flag == 0 else 1

In [100]:
base_path = os.getcwd()
save_dir = os.path.join(base_path, 'data', cell_line, 'saved_runs')
hic_sparse_mat_file = os.path.join(base_path, 'data', cell_line, 'hic_sparse.npz')
np_nodes_lab_genes_file = os.path.join(base_path, 'data',  cell_line, \
    'np_nodes_lab_genes_reg' + str(regression_flag) + '.npy')
np_hmods_norm_all_file = os.path.join(base_path, 'data', cell_line, \
    'np_hmods_norm_chip_' + str(chip_res) + 'bp.npy')
df_genes_file = os.path.join(base_path, 'data', cell_line, 'df_genes_reg' + str(regression_flag) + '.pkl')
df_genes = pd.read_pickle(df_genes_file)

mat = load_npz(hic_sparse_mat_file)
allNodes_hms = np.load(np_hmods_norm_all_file)
hms = allNodes_hms[:, 1:] #only includes features, not node ids
X = torch.tensor(hms).float().reshape(-1, num_feat) 
allNodes = allNodes_hms[:, 0].astype(int)
geneNodes_labs = np.load(np_nodes_lab_genes_file)

geneNodes = geneNodes_labs[:, -2].astype(int)
allLabs = -1*np.ones(np.shape(allNodes))

targetNode_mask = torch.tensor(geneNodes).long()

if regression_flag == 0:
    geneLabs = geneNodes_labs[:, -1].astype(int)
    allLabs[geneNodes] = geneLabs
    Y = torch.tensor(allLabs).long()
else:
    geneLabs = geneNodes_labs[:, -1].astype(float)
    allLabs[geneNodes] = geneLabs
    Y = torch.tensor(allLabs).float()

In [168]:
16699 / allLabs.shape[0]

0.05972332496441421

In [171]:
np.unique(allLabs, return_counts=True)

(array([-1.,  0.,  1.]), array([262907,   8847,   7852]))

In [172]:
train_data

tensor([[[0.3734, 0.1118, 0.1085, 0.2100, 0.4372, 0.1698]],

        [[0.0263, 0.2426, 0.0643, 0.0446, 0.1653, 0.0377]],

        [[0.0592, 0.0870, 0.2615, 0.5445, 0.2337, 0.0248]],

        ...,

        [[0.0009, 0.0110, 0.0024, 0.0000, 0.0026, 0.0032]],

        [[0.0083, 0.3929, 0.0428, 0.0320, 0.0372, 0.0700]],

        [[0.1449, 0.1034, 0.3776, 0.1160, 0.3957, 0.1118]]])

In [101]:
pred_idx_shuff = torch.randperm(targetNode_mask.shape[0])
fin_train = np.floor(0.7*pred_idx_shuff.shape[0]).astype(int)
fin_valid = np.floor(0.85*pred_idx_shuff.shape[0]).astype(int)
train_idx = pred_idx_shuff[:fin_train]
valid_idx = pred_idx_shuff[fin_train:fin_valid]
test_idx = pred_idx_shuff[fin_valid:]

In [292]:
INPUT_LENGTH = 6
NUM_CLASSES = 2 
BATCH_SIZE = 64

class CNN(nn.Module):
    def __init__(self, num_conv_layers, num_linear_layers, dropout_rate=0.2):
        super(CNN, self).__init__()

        self.conv_layers = nn.ModuleList()
        in_channels = 1
        out_channels = 16
        for i in range(num_conv_layers):
            conv = nn.Sequential(
                nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=1, stride=1),
            )
            self.conv_layers.append(conv)
            
            in_channels = out_channels
            out_channels *= 2
        

        self.linear_layers = nn.ModuleList()

        if num_linear_layers > 1:
            first_linear_out_size = 2**(4 + num_linear_layers)
            for i in range(num_linear_layers-1):
                fc = nn.Sequential(
                    nn.LazyLinear(first_linear_out_size),
                    nn.ReLU()
                )
                self.linear_layers.append(fc)

                first_linear_out_size /= 2
        
        self.final_linear = nn.LazyLinear(NUM_CLASSES)
        self.dropout = nn.Dropout(dropout_rate)
        self.flatten = nn.Flatten()
        

    def forward(self, x):
        out = self.conv_layers[0](x)
        
        for layer in self.conv_layers[1:]:
            out = layer(out)

        out = self.dropout(out)
        out = self.flatten(out)
        for layer in self.linear_layers:
            out = layer(out)
        
        out = self.final_linear(out)
        
        return out

    def calculate_accuracy(self, dataset):
        data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
        num_correct = 0
        with torch.no_grad(:)
            for batch_inputs, batch_labels in data_loader:
                optimizer.zero_grad()
                
                output = model(batch_inputs)
                pred = torch.argmax(output, dim=1)
                num_correct += torch.sum(pred == batch_labels)
            
        return float(num_correct / len(dataset))

In [293]:
train_data = X[targetNode_mask][train_idx]
train_labels = torch.tensor(geneNodes_labs[train_idx][:, 1]).long()

valid_data = X[targetNode_mask][valid_idx]
valid_labels = torch.tensor(geneNodes_labs[valid_idx][:, 1]).long()

test_data = X[targetNode_mask][test_idx]
test_labels = torch.tensor(geneNodes_labs[test_idx][:, 1]).long()

In [294]:
train_data = train_data.unsqueeze(1)
test_data = test_data.unsqueeze(1)
valid_data = valid_data.unsqueeze(1)

In [295]:
model = CNN(num_conv_layers=3, num_linear_layers=2)
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = TensorDataset(train_data, train_labels)
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

criterion = nn.CrossEntropyLoss()

num_epochs = 100
for epoch in range(num_epochs):
    epoch_loss = 0
    for batch_inputs, batch_labels in train_data_loader:
        optimizer.zero_grad()
        
        output = model(batch_inputs)
        loss = criterion(output, batch_labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(train_data_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")



Epoch 1/100, Loss: 0.4617
Epoch 2/100, Loss: 0.3871
Epoch 3/100, Loss: 0.3825
Epoch 4/100, Loss: 0.3823
Epoch 5/100, Loss: 0.3791
Epoch 6/100, Loss: 0.3783
Epoch 7/100, Loss: 0.3770
Epoch 8/100, Loss: 0.3776
Epoch 9/100, Loss: 0.3774
Epoch 10/100, Loss: 0.3757
Epoch 11/100, Loss: 0.3758
Epoch 12/100, Loss: 0.3761
Epoch 13/100, Loss: 0.3750
Epoch 14/100, Loss: 0.3743
Epoch 15/100, Loss: 0.3746
Epoch 16/100, Loss: 0.3742
Epoch 17/100, Loss: 0.3725
Epoch 18/100, Loss: 0.3740
Epoch 19/100, Loss: 0.3731
Epoch 20/100, Loss: 0.3733
Epoch 21/100, Loss: 0.3722
Epoch 22/100, Loss: 0.3710
Epoch 23/100, Loss: 0.3705
Epoch 24/100, Loss: 0.3711
Epoch 25/100, Loss: 0.3712
Epoch 26/100, Loss: 0.3700
Epoch 27/100, Loss: 0.3700
Epoch 28/100, Loss: 0.3700
Epoch 29/100, Loss: 0.3710
Epoch 30/100, Loss: 0.3688
Epoch 31/100, Loss: 0.3689
Epoch 32/100, Loss: 0.3692
Epoch 33/100, Loss: 0.3672
Epoch 34/100, Loss: 0.3683
Epoch 35/100, Loss: 0.3679
Epoch 36/100, Loss: 0.3676
Epoch 37/100, Loss: 0.3675
Epoch 38/1

In [289]:
valid_dataset = TensorDataset(valid_data, valid_labels)
valid_data_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)


0.8411177396774292


In [297]:
model.calculate_accuracy(valid_dataset)

0.8439121842384338

In [291]:
len(valid_dataset)

2505