In [1]:
# Step up one directory; the cells below use paths relative to that location
# ('data/...' files and the 'src' package).
# NOTE: this is not idempotent -- re-running the cell moves up one more level.
%cd ..

/Users/marc/Documents/DFCI/pnet


In [2]:
from src import pnet_loader
from src import util
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd

In [4]:
# Coarse grouping of MAF Variant_Classification labels. Each label is listed
# under the group it collapses to; group order and member order determine the
# insertion order of the flattened lookup table below.
_variant_groups = {
    'Silent': [
        "3'Flank", "5'Flank", "5'UTR", "3'UTR", "IGR", "Intron", "lincRNA",
        "RNA", "Silent", "non_coding_transcript_exon", "upstream_gene",
        "Splice_Region", "Targeted_Region",
    ],
    'LOF': [
        'Splice_Site', 'Nonsense_Mutation', 'Frame_Shift_Del',
        'Frame_Shift_Ins', 'Stop_Codon_Del', 'Stop_Codon_Ins',
        'Nonstop_Mutation', 'Start_Codon_Del',
    ],
    'Other_nonsynonymous': [
        'Missense_Mutation', 'In_Frame_Del', 'In_Frame_Ins',
        'De_novo_Start_InFrame', 'De_novo_Start_OutOfFrame',
        'Start_Codon_Ins',
    ],
}

# Flat label -> group mapping used to recode the MAF column.
mutations_dict = {
    label: group
    for group, labels in _variant_groups.items()
    for label in labels
}

## Load mutation data

In [5]:
# Load the mutation calls (MAF) together with the survival and clinical-mapping
# tables; the latter two are indexed by Tumor_Sample_Barcode.
maf = pd.read_csv('data/m1000/M1000_CCF.maf', sep='\t')
survival_data = pd.read_csv('data/m1000/M1000_survival_data.txt', sep='\t').set_index('Tumor_Sample_Barcode')
clinical_mapping = pd.read_csv('data/m1000/TCGA_clinical_mapping_and_pathologic_M.txt', sep='\t').set_index('Tumor_Sample_Barcode')

# Keep only mutation calls whose sample appears in the clinical mapping.
maf = maf[maf['Tumor_Sample_Barcode'].isin(clinical_mapping.index)].copy()

# Recode variant classes to Silent / LOF / Other_nonsynonymous. An unknown
# label is still a hard error (KeyError, as before), but the message now lists
# every offending label instead of only the first one encountered.
variant_class = maf['Variant_Classification'].map(mutations_dict)
unmapped = maf.loc[variant_class.isna(), 'Variant_Classification'].unique()
if len(unmapped) > 0:
    raise KeyError(
        f'Variant_Classification values missing from mutations_dict: {sorted(map(str, unmapped))}'
    )
maf['Variant_Classification'] = variant_class
maf = maf[maf['Variant_Classification'] != 'Silent'].copy()

# Per-sample set of mutated genes (kept for inspection and for the row order below).
maf_grouped = maf.groupby('Tumor_Sample_Barcode')['Hugo_Symbol'].apply(set).reset_index(name='mut_list').set_index('Tumor_Sample_Barcode')

# Binary sample x gene indicator matrix (1 = at least one non-silent mutation).
# Built in one vectorised step: the previous `mutations.loc[i][mut] = 1` was a
# chained assignment, which pandas may apply to a temporary copy (and always
# does under Copy-on-Write), leaving the matrix all zeros.
mutations = (
    pd.crosstab(maf['Tumor_Sample_Barcode'], maf['Hugo_Symbol'])
    .clip(upper=1)  # counts -> presence/absence
    .reindex(index=maf_grouped.index, columns=maf['Hugo_Symbol'].unique(), fill_value=0)
    .rename_axis(columns=None)  # crosstab names the column axis; the original frame did not
)

# Re-key rows from sample barcode to the mapped patient id (inner join drops
# samples without a mapping).
mutations = mutations.join(clinical_mapping[['mapping_patient_id']], how='inner').set_index('mapping_patient_id')

In [6]:
# Preview the first rows of the patient-by-gene mutation indicator matrix.
mutations.head()

Unnamed: 0_level_0,PRAMEF12,AIM1L,GMEB1,GRIK3,RPE65,CLCA4,COL11A1,NRAS,FLG,KPRP,...,SLC25A5,RAP2C,VGLL1,SPANXN4,RPL10,AAGAB,HIST1H4B,RPS4X,ARMCX1,FATE1
mapping_patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-3N-A9WB,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
TCGA-3N-A9WC,0,0,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-3N-A9WD,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-BF-A1PU,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-BF-A1PV,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Load expression data

In [7]:
# Read the tab-separated mRNA z-score table.
expression_path = 'data/skcm_tcga_pan_can_atlas_2018/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt'
expression_table = pd.read_csv(expression_path, sep='\t')

# Drop rows with any missing value, index by gene symbol, discard the Entrez id
# column, then transpose so that gene symbols become the columns.
skcm_exp = (
    expression_table
    .dropna()
    .set_index('Hugo_Symbol')
    .drop(['Entrez_Gene_Id'], axis=1)
    .T
)

# Remove the last '-'-separated field from every row label.
trimmed_labels = []
for label in skcm_exp.index:
    fields = label.split('-')
    trimmed_labels.append('-'.join(fields[:-1]))
skcm_exp.index = trimmed_labels

# Filter columns with the project helper (see util.select_non_constant_genes).
skcm_exp = util.select_non_constant_genes(skcm_exp)

In [8]:
# Gene symbols reported by the project helper as highly variable.
genes = util.select_highly_variable_genes(skcm_exp)['Hugo_Symbol'].values

# Keep only those genes. A boolean mask preserves the existing column order;
# the previous list(set(...).intersection(...)) produced a column order that
# depends on string hashing and therefore changed from one kernel start to the
# next, making the resulting matrix non-reproducible.
skcm_exp = skcm_exp.loc[:, skcm_exp.columns.isin(genes)].copy()

In [9]:
# Preview the first rows of the filtered expression matrix.
skcm_exp.head()

Hugo_Symbol,TP53TG3B,STXBP5,LDHAL6B,C1orf131,MESDC2,MLL2,ZIC1,SLC19A2,RBMS3,LOC391322,...,GRASP,LINC02249,GSTT1,ZNF703,ZACN,M6PR,METTL21EP,CYP2J2,NUDT14,MAPK7
TCGA-3N-A9WB,0.337,0.1898,-1.3043,0.9921,-0.8049,-1.654,1.1852,0.9658,-0.4907,-1.944,...,0.6228,0.1991,-2.5119,1.213,0.0412,0.3093,0.24,0.8629,0.7322,-1.7623
TCGA-3N-A9WC,-0.2184,-0.3776,-1.3088,0.4857,0.1739,0.5817,-0.0931,0.6071,0.4306,1.4646,...,0.4761,0.4265,-0.199,-0.6891,-0.1805,-0.7579,1.3744,0.894,-0.3903,0.2881
TCGA-3N-A9WD,0.1969,0.3754,-2.1843,-0.1935,-1.3873,-1.2976,-1.8657,-0.2594,-0.4962,0.6409,...,-0.5616,-1.7812,0.0037,0.9625,-0.2929,-0.2182,0.4744,0.2672,0.3999,-0.6499
TCGA-BF-A1PU,-1.4306,-0.9686,-0.8863,-0.5319,0.4985,1.218,-0.1126,-0.0527,0.1785,-0.0055,...,0.1079,0.4623,-0.1952,1.1466,-0.3643,0.1819,-1.1038,-0.1251,0.9214,1.3793
TCGA-BF-A1PV,-2.1249,-2.0677,0.1864,0.2887,0.1747,-0.1898,0.6385,-0.2359,-1.578,0.4554,...,0.7593,-1.7812,0.893,0.0221,-1.0543,-0.4256,-1.1553,-1.725,-0.4575,0.2606


## Load prediction target

In [10]:
# Load the per-sample table that carries the 'heterogeneity' column.
mat_TCGA = pd.read_csv('data/m1000/mat_TCGA.tsv', sep='\t').set_index('Tumor_Sample_Barcode')

# Re-key rows from sample barcode to the mapped patient id (inner join drops
# samples without a mapping).
mat_TCGA = mat_TCGA.join(clinical_mapping[['mapping_patient_id']], how='inner').set_index('mapping_patient_id')

# Binary target: 1 when heterogeneity is strictly above the cohort mean, else 0
# (missing values compare False and therefore become 0, as before).
# The mean is computed once; previously it was re-evaluated for every row
# inside the list comprehension.
heterogeneity_mean = mat_TCGA['heterogeneity'].mean()
heterogeneity_y = (
    (mat_TCGA['heterogeneity'] > heterogeneity_mean)
    .astype(int)
    # Column key kept byte-identical (including its spelling) so that anything
    # keyed on it keeps working.
    .to_frame(name='dichtomized_heterogeneity')
)

In [11]:
# Preview the first rows of the binary heterogeneity labels.
heterogeneity_y.head()

Unnamed: 0_level_0,dichtomized_heterogeneity
mapping_patient_id,Unnamed: 1_level_1
TCGA-BF-A1PU,0
TCGA-BF-A1PV,1
TCGA-BF-A1PX,0
TCGA-BF-A1PZ,0
TCGA-BF-A1Q0,1


## Generate pnet_loader

In [12]:
# Input modalities keyed by name: expression under 'rna', mutations under 'mut'.
genetic_data = dict(rna=skcm_exp, mut=mutations)

# Build the train and test datasets from the modalities and the target frame.
dataset_split = pnet_loader.generate_train_test(genetic_data, heterogeneity_y)
train_dataset, test_dataset = dataset_split

Given 2 Input modalities
Found 421 overlapping indicies
Initializing Train Dataset
Found 688 overlapping genes
generated input DataFrame of size (295, 1376)
Initializing Test Dataset
Found 688 overlapping genes
generated input DataFrame of size (126, 1376)


## Train with run()

In [13]:
%cd src
import Pnet

/Users/marc/Documents/DFCI/pnet/src


In [18]:
# Train through the Pnet.run entry point on the modality dict and the target
# frame; it returns the model together with the train and test scores.
model, train_scores, test_scores = Pnet.run(genetic_data, heterogeneity_y, verbose=True)

Given 2 Input modalities
Found 421 overlapping indicies
Initializing Train Dataset
Found 688 overlapping genes
generated input DataFrame of size (337, 1376)
Initializing Test Dataset
Found 688 overlapping genes
generated input DataFrame of size (84, 1376)
Found 688 overlapping genes
passiert was?


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/marc/opt/anaconda3/envs/pnet/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/marc/opt/anaconda3/envs/pnet/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
ModuleNotFoundError: No module named 'pnet_loader'


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/marc/opt/anaconda3/envs/pnet/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3442, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/j8/df8v0y1j16ng5mlcnmr38r0r0000gp/T/ipykernel_48514/718278017.py", line 1, in <module>
    model, train_scores, test_scores = Pnet.run(genetic_data, heterogeneity_y, verbose=True)
  File "/Users/marc/Documents/DFCI/pnet/src/Pnet.py", line 196, in run
    model, train_scores, test_scores = train(model, train_loader, test_loader, lr, weight_decay, epochs, verbose)
  File "/Users/marc/Documents/DFCI/pnet/src/Pnet.py", line 171, in train
    train_epoch_scores = fit(model, train_loader, optimizer)
  File "/Users/marc/Documents/DFCI/pnet/src/Pnet.py", line 123, in fit
    for batch in dataloader:
  File "/Users/marc/opt/anaconda3/envs/pnet/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 430, in __iter__
    self._iterator = self._get_i

## Train with Lightning

In [None]:
%cd src
import Pnet
import ReactomeNetwork

In [None]:
import pytorch_lightning as pl

In [None]:
# Build the Reactome network object from the genes reported by the training dataset.
reactome_network = ReactomeNetwork.ReactomeNetwork(train_dataset.get_genes())

In [None]:
# Hyper-parameters handed to the network constructor; 'nbr_gene_inputs' is the
# number of input modalities in genetic_data.
pnet_hparams = {
    'reactome_network': reactome_network,
    'nbr_gene_inputs': len(genetic_data),
    'dropout': 0.2,
    'additional_dims': 0,
    'lr': 1e-3,
    'weight_decay': 1e-5,
}

model = Pnet.PNET_NN(hparams=pnet_hparams)

In [None]:
# Wrap the train and test datasets in data loaders; the test dataset is used as
# the validation loader here.
# NOTE(review): 56 is presumably the batch size -- confirm against
# pnet_loader.to_dataloader.
train_loader, val_loader = pnet_loader.to_dataloader(train_dataset, test_dataset, 56)

In [None]:
# Pull one batch from the training loader for inspection; a batch unpacks into
# three items.
batch_x, additional, batch_y = next(iter(train_loader))

In [None]:
# Shape of the second item of the batch.
additional.shape

In [None]:
# Trainer configuration: 16-bit precision on the 'mps' accelerator, up to 100
# epochs, logging every 50 steps, no checkpoint files.
trainer = pl.Trainer(
    accelerator='mps',
    precision=16,
    max_epochs=100,
    log_every_n_steps=50,
    enable_checkpointing=False,
)

# Fit the model, validating on val_loader.
trainer.fit(
    model=model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)