In [1]:
import pnet_loader
import util
import torch
import seaborn as sns
import pandas as pd
import numpy as np
import Pnet
import pytorch_lightning as pl
import ReactomeNetwork
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
import os

%load_ext autoreload
%autoreload 2

In [2]:
# Maps each Variant_Classification label to a coarse effect class
# ('Silent', 'LOF' or 'Other_nonsynonymous'). Built from one tuple of labels
# per class so the grouping is visible at a glance.
MUTATIONS_DICT = {
    variant: effect
    for effect, variants in (
        ('Silent', (
            "3'Flank",
            "5'Flank",
            "5'UTR",
            "3'UTR",
            "IGR",
            "Intron",
            "lincRNA",
            "RNA",
            "Silent",
            "non_coding_transcript_exon",
            "upstream_gene",
            "Splice_Region",
            "Targeted_Region",
        )),
        ('LOF', (
            'Splice_Site',
            'Nonsense_Mutation',
            'Frame_Shift_Del',
            'Frame_Shift_Ins',
            'Stop_Codon_Del',
            'Stop_Codon_Ins',
            'Nonstop_Mutation',
            'Start_Codon_Del',
        )),
        ('Other_nonsynonymous', (
            'Missense_Mutation',
            'In_Frame_Del',
            'In_Frame_Ins',
            'De_novo_Start_InFrame',
            'Translation_Start_Site',
        )),
    )
    for variant in variants
}

In [3]:
# Every entry under the data root whose name contains 'tcga_pan_can_atlas',
# as a full path. os.listdir returns entries in arbitrary order, so positions
# in this list carry no meaning.
directories = [
    os.path.join('/mnt/disks/pancan/data/', entry)
    for entry in os.listdir('/mnt/disks/pancan/data/')
    if 'tcga_pan_can_atlas' in entry
]

In [4]:
def load_single_dataset(directory_path):
    """Load the mutation, RNA expression and copy-number tables of one cohort.

    Reads the tab-separated files ``data_mutations.txt``,
    ``data_mrna_seq_v2_rsem.txt`` and ``data_cna.txt`` from ``directory_path``.

    Parameters
    ----------
    directory_path : str
        Directory holding the three files. The tumor label is the part of the
        directory name before the first underscore.

    Returns
    -------
    tuple
        ``(rna_exp, cna, tumor_type, grouped_muts)``:

        * ``rna_exp`` / ``cna`` -- float frames, one row per sample column of
          the respective input file, restricted to the genes present in both.
        * ``tumor_type`` -- single-column frame (``'tumor'``) indexed by the
          samples present in both ``rna_exp`` and ``cna``.
        * ``grouped_muts`` -- number of non-silent mutation rows per
          ``(Tumor_Sample_Barcode, Hugo_Symbol)`` pair.
    """
    import warnings

    muts = pd.read_csv(os.path.join(directory_path, 'data_mutations.txt'), delimiter='\t')

    # Series.map leaves labels missing from MUTATIONS_DICT as NaN instead of
    # raising KeyError; such rows are reported and excluded.
    effect = muts['Variant_Classification'].map(MUTATIONS_DICT)
    unmapped = muts.loc[effect.isna(), 'Variant_Classification'].unique()
    if len(unmapped):
        warnings.warn(
            'Ignoring mutations with unmapped Variant_Classification values: '
            + ', '.join(sorted(str(label) for label in unmapped))
        )
    non_silent = muts.loc[effect.notna() & (effect != 'Silent'),
                          ['Hugo_Symbol', 'Variant_Classification', 'Tumor_Sample_Barcode']]
    grouped_muts = non_silent.groupby(['Tumor_Sample_Barcode', 'Hugo_Symbol']).count()

    rna_exp = pd.read_csv(os.path.join(directory_path, 'data_mrna_seq_v2_rsem.txt'),
                          sep='\t').dropna().set_index('Hugo_Symbol').drop(['Entrez_Gene_Id'], axis=1).T
    # Keep the first occurrence of any duplicated gene symbol.
    rna_exp = rna_exp.loc[:, ~rna_exp.columns.duplicated()].astype(float).copy()

    cna = pd.read_csv(os.path.join(directory_path, 'data_cna.txt'),
                      sep='\t').dropna().set_index('Hugo_Symbol').drop(['Entrez_Gene_Id'], axis=1).T
    # After the transpose a 'Cytoband' annotation column, if present, is a row.
    cna = cna.drop(index='Cytoband', errors='ignore')
    cna = cna.loc[:, ~cna.columns.duplicated()].astype(float).copy()

    # Sorted so that column and row order do not depend on set iteration order.
    genes = sorted(set(rna_exp.columns).intersection(cna.columns))
    indices = sorted(set(rna_exp.index).intersection(cna.index))

    # normpath drops a trailing slash, which would otherwise give an empty label.
    tumor_label = os.path.basename(os.path.normpath(directory_path)).split('_')[0]
    tumor_type = pd.DataFrame(len(indices) * [tumor_label], index=indices, columns=['tumor'])
    return rna_exp[genes], cna[genes], tumor_type, grouped_muts

In [5]:
# Show the lung cohort directories (LUAD, LUSC). They are picked by cohort name
# because os.listdir gives no ordering guarantee, so fixed positions in
# `directories` may point at other cohorts on another machine or run.
sorted(path for path in directories
       if path.split('/')[-1].split('_')[0] in ('luad', 'lusc'))

['/mnt/disks/pancan/data/luad_tcga_pan_can_atlas_2018',
 '/mnt/disks/pancan/data/lusc_tcga_pan_can_atlas_2018']

In [6]:
# Assemble RNA, CNA, mutation and tumor-type tables for the lung cohorts only.
# Cohorts are selected by name (luad, lusc): os.listdir gives no ordering
# guarantee, so positional indices into `directories` are not stable.
lung_paths = sorted(path for path in directories
                    if path.split('/')[-1].split('_')[0] in ('luad', 'lusc'))
if not lung_paths:
    raise FileNotFoundError('No luad/lusc cohort directory found in `directories`')

rna_parts, cna_parts, mut_parts, tumor_parts = [], [], [], []
for path in lung_paths:
    rna_, cna_, tumor_, grouped_muts_ = load_single_dataset(path)

    # Binary mutation matrix aligned with the RNA table: 1 where a
    # (sample, gene) pair has at least one non-silent mutation row.
    # Written through integer positions rather than chained label indexing,
    # which may assign into a temporary copy. get_indexer returns -1 for
    # samples/genes absent from the RNA table; those pairs are skipped.
    sample_pos = rna_.index.get_indexer(grouped_muts_.index.get_level_values(0))
    gene_pos = rna_.columns.get_indexer(grouped_muts_.index.get_level_values(1))
    known = (sample_pos >= 0) & (gene_pos >= 0)
    mut_values = np.zeros(rna_.shape, dtype=int)
    mut_values[sample_pos[known], gene_pos[known]] = 1
    mut_ = pd.DataFrame(mut_values, index=rna_.index, columns=rna_.columns)

    rna_parts.append(rna_)
    cna_parts.append(cna_)
    mut_parts.append(mut_)
    tumor_parts.append(tumor_)

# Concatenate once, last-loaded cohort first (same row order as prepending
# each cohort in turn).
rna = pd.concat(rna_parts[::-1])
cna = pd.concat(cna_parts[::-1])
mut = pd.concat(mut_parts[::-1])
tumor = pd.concat(tumor_parts[::-1])

# Genes missing from one of the cohorts show up as NaN columns after the
# concat; drop them so every remaining gene is observed in all samples.
rna = rna.dropna(axis=1)
cna = cna.dropna(axis=1)
mut = mut.dropna(axis=1)

# Per-gene z-score of the expression values.
# NOTE(review): a gene with zero variance yields NaN here -- confirm whether
# downstream code tolerates that or whether such genes should be removed.
rna = (rna - rna.mean()) / rna.std()

# rna = pd.read_csv('/mnt/disks/pancan/data/pancan_rna.csv').set_index('Unnamed: 0')
# cna = pd.read_csv('/mnt/disks/pancan/data/pancan_cna.csv').set_index('Unnamed: 0')
# mut = pd.read_csv('/mnt/disks/pancan/data/pancan_mut.csv').set_index('Unnamed: 0')

  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
def load_pathway2genes(filename='../data/reactome/ReactomePathways.gmt'):
    """Read a tab-separated GMT file into a long pathway/gene table.

    Each non-blank line is split on tabs; the second field is taken as the
    pathway identifier and every field from the third onwards as a gene.

    Parameters
    ----------
    filename : str, optional
        Path of the GMT file. Defaults to the Reactome pathway file used by
        this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per (pathway, gene) pair with columns ``'pathway'`` and
        ``'gene'``. The columns are present even when the file has no rows.
    """
    genes_start_col = 2
    pathway_col = 1
    pathway2genes_list = []
    with open(filename) as gmt:
        for line in gmt:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no pathway.
                continue
            line_list = stripped.split('\t')
            pathway = line_list[pathway_col]
            pathway2genes_list.extend(
                {'pathway': pathway, 'gene': gene}
                for gene in line_list[genes_start_col:]
            )

    # Explicit columns keep the schema stable for an empty file.
    return pd.DataFrame(pathway2genes_list, columns=['pathway', 'gene'])

In [8]:
# Gene lists: the 'genes' column of the cancer gene file, and the distinct
# genes that appear in the pathway table.
canc_genes = pd.read_csv('/mnt/disks/pancan/pnet_database/genes/cancer_genes.txt')['genes'].tolist()
pathway2genes = load_pathway2genes()
reactome_genes = pathway2genes['gene'].unique().tolist()

In [9]:
# Input modalities keyed by name; insertion order is rna, cna, mut.
genetic_data = dict(rna=rna, cna=cna, mut=mut)

In [10]:
# tumor_target = pd.DataFrame(pd.get_dummies(tumor)['tumor_luad'])
# tumor_target
# Random 0/1 labels, one per sample in `rna`. The generator is seeded so the
# same labels are drawn after every kernel restart.
some_target = pd.DataFrame(
    {'target': (np.random.RandomState(0).rand(len(rna.index)) > 0.5).astype(float)},
    index=rna.index,
)
some_target

Unnamed: 0,target
TCGA-18-3406-01,0.0
TCGA-18-3407-01,1.0
TCGA-18-3408-01,1.0
TCGA-18-3410-01,1.0
TCGA-18-3411-01,0.0
...,...
TCGA-NJ-A55O-01,0.0
TCGA-NJ-A55R-01,0.0
TCGA-NJ-A7XG-01,0.0
TCGA-O1-A52J-01,1.0


In [11]:
# Build the train and test datasets from the three modalities, restricted to
# the genes listed in canc_genes.
train_dataset, test_dataset = pnet_loader.generate_train_test(
    genetic_data,
    some_target,
    gene_set=canc_genes,
)

# Network built from the genes reported by the training dataset.
reactome_network = ReactomeNetwork.ReactomeNetwork(train_dataset.get_genes())

# nbr_gene_inputs follows the number of modalities in genetic_data.
model = Pnet.PNET_NN(
    reactome_network=reactome_network,
    nbr_gene_inputs=len(genetic_data),
    dropout=0.2,
    additional_dims=0,
    lr=1e-3,
    weight_decay=0.1,
    attn_agg=True,
)

# NOTE(review): the third positional argument is presumably the batch size --
# confirm against pnet_loader.to_dataloader.
train_loader, val_loader = pnet_loader.to_dataloader(train_dataset, test_dataset, 256)

Given 3 Input modalities
Found 991 overlapping indicies
Initializing Train Dataset
Found 687 overlapping genes
generated input DataFrame of size (793, 2061)
Initializing Test Dataset
Found 687 overlapping genes
generated input DataFrame of size (198, 2061)
Found 687 overlapping genes


In [12]:
# Display the weight parameter of the first module in model.input_layer.
# This cell sits before trainer.fit, so it shows the values prior to training.
model.input_layer[0].weight

Parameter containing:
tensor([[-0.0053, -0.0010,  0.0208,  ...,  0.0084,  0.0197, -0.0001],
        [-0.0131, -0.0089, -0.0026,  ..., -0.0166,  0.0019,  0.0218],
        [ 0.0026, -0.0174,  0.0009,  ..., -0.0096,  0.0092, -0.0163],
        ...,
        [ 0.0014,  0.0157, -0.0011,  ...,  0.0106,  0.0054, -0.0015],
        [ 0.0216,  0.0187,  0.0142,  ...,  0.0039,  0.0075,  0.0095],
        [-0.0019,  0.0003, -0.0115,  ...,  0.0163,  0.0166,  0.0057]],
       requires_grad=True)

In [13]:
# Short 3-epoch run on CUDA with checkpointing enabled; the learning rate is
# logged once per epoch through the LearningRateMonitor callback.
trainer = pl.Trainer(
    precision=32,
    accelerator='cuda',
    max_epochs=3,
    log_every_n_steps=50,
    enable_checkpointing=True,
    callbacks=[pl.callbacks.LearningRateMonitor(logging_interval='epoch')],
)

# val_loader comes from the same to_dataloader call as train_loader.
trainer.fit(
    model=model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name             | Type       | Params
------------------------------------------------
0 | layers           | ModuleList | 4.2 M 
1 | preds            | ModuleList | 2.2 K 
2 | input_layer      | Sequential | 2.8 M 
3 | first_gene_layer | Sequential | 1.0 M 
4 | drop1            | Dropout    | 0     
5 | attn             | Linear     | 5     
---------

Sanity Checking: 0it [00:00, ?it/s]

  "num_workers>0, persistent_workers=False, and strategy=ddp_spawn"


before:  before:  tensor([[-0.4898,  0.2298,  0.3140,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.8134,  0.7548, -0.5753,  ...,  0.0000,  0.0000,  0.0000],
        [-0.5219,  1.6974, -0.9188,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.7075, -1.4653, -0.3705,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.7728, -0.2099, -0.6515,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0800,  0.0320, -0.9400,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:1')
tensor([[-0.2435,  0.2605, -0.1863,  ...,  0.0000,  0.0000,  0.0000],
        [-0.8765, -1.6139,  2.9281,  ...,  0.0000,  0.0000,  0.0000],
        [-0.7316, -0.2879, -0.6402,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.1780, -0.0630,  0.2241,  ...,  0.0000,  0.0000,  0.0000],
        [-0.4339,  0.6982, -0.6218,  ...,  0.0000,  0.0000,  0.0000],
        [-0.5101,  0.6086,  0.6178,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')
after:  after:  tensor([[nan, nan, nan,  ..., nan, nan, nan],
    

ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 139, in _wrapping_function
    results = function(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _fit_impl
    self._run(model, ckpt_path=self.ckpt_path)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1112, in _run
    results = self._run_stage()
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1191, in _run_stage
    self._run_train()
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1204, in _run_train
    self._run_sanity_check()
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1276, in _run_sanity_check
    val_loop.run()
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 152, in advance
    dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 137, in advance
    output = self._evaluation_step(**kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 234, in _evaluation_step
    output = self.trainer._call_strategy_hook(hook_name, *kwargs.values())
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1494, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/strategies/ddp_spawn.py", line 288, in validation_step
    return self.model(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/overrides/base.py", line 110, in forward
    return self._forward_module.validation_step(*inputs, **kwargs)
  File "/mnt/disks/pancan/pnet/src/Pnet.py", line 129, in validation_step
    loss = self.step('val', batch, batch_nb)
  File "/mnt/disks/pancan/pnet/src/Pnet.py", line 113, in step
    pred_y = self(x, additional)
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/mnt/disks/pancan/pnet/src/Pnet.py", line 105, in forward
    raise Exception('Yallah want to stop here')
Exception: Yallah want to stop here


In [None]:
# Pull the inputs, additional inputs and targets out of each dataset.
x_train, additional_train, y_train = (
    train_dataset.x,
    train_dataset.additional,
    train_dataset.y,
)
x_test, additional_test, y_test = (
    test_dataset.x,
    test_dataset.additional,
    test_dataset.y,
)

In [None]:
# Score the test set in eval mode (the model is built with dropout=0.2, which
# would otherwise randomise predictions) and without autograd bookkeeping.
# The previous train/eval mode is restored afterwards.
was_training = model.training
model.eval()
with torch.no_grad():
    pred = model(x_test, additional_test)
model.train(was_training)

# .cpu() makes the conversion work wherever the output tensor lives.
y_pred_proba = pred.detach().cpu().numpy().squeeze()
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
test_auc = metrics.roc_auc_score(y_test, y_pred_proba)

# ROC curve of the model against the chance diagonal.
plt.plot(fpr, tpr, color="darkorange", label="ROC curve (area = %0.2f)" % test_auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot([0, 1], [0, 1], color="navy", linestyle="--")
plt.legend(loc="lower right")
plt.savefig('PNET_random_ROC_curve.pdf')
plt.show()