In [1]:
import pnet_loader
import util
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
# Lookup from a MAF `Variant_Classification` label to one of three coarse classes
# ('Silent', 'LOF', 'Other_nonsynonymous'). Built from per-class label tuples;
# insertion order is the same as listing every label individually.
mutations_dict = {
    label: coarse_class
    for coarse_class, labels in (
        ('Silent', (
            "3'Flank", "5'Flank", "5'UTR", "3'UTR", "IGR", "Intron", "lincRNA", "RNA",
            "Silent", "non_coding_transcript_exon", "upstream_gene", "Splice_Region",
            "Targeted_Region",
        )),
        ('LOF', (
            'Splice_Site', 'Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins',
            'Stop_Codon_Del', 'Stop_Codon_Ins', 'Nonstop_Mutation', 'Start_Codon_Del',
        )),
        ('Other_nonsynonymous', (
            'Missense_Mutation', 'In_Frame_Del', 'In_Frame_Ins', 'De_novo_Start_InFrame',
            'De_novo_Start_OutOfFrame', 'Start_Codon_Ins',
        )),
    )
    for label in labels
}

## Load mutation data

In [6]:
# Read the mutation calls (MAF) plus the survival and clinical-mapping tables; the latter
# two are indexed by 'Tumor_Sample_Barcode'.
maf = pd.read_csv('../data/m1000/M1000_CCF.maf', sep='\t')
survival_data = pd.read_csv('../data/m1000/M1000_survival_data.txt', sep='\t').set_index('Tumor_Sample_Barcode')
clinical_mapping = pd.read_csv('../data/m1000/TCGA_clinical_mapping_and_pathologic_M.txt', sep='\t').set_index('Tumor_Sample_Barcode')

# Keep only mutation rows whose sample appears in the clinical mapping.
maf = maf[maf['Tumor_Sample_Barcode'].isin(clinical_mapping.index)].copy()
# Collapse variant classes; a label missing from mutations_dict raises KeyError here.
maf['Variant_Classification'] = [mutations_dict[m] for m in maf['Variant_Classification']]
maf = maf[maf['Variant_Classification'] != 'Silent'].copy()

# One row per sample holding the set of genes with at least one non-silent mutation.
maf_grouped = maf.groupby('Tumor_Sample_Barcode')['Hugo_Symbol'].apply(set).reset_index(name='mut_list').set_index('Tumor_Sample_Barcode')

# Binary sample x gene matrix, created directly as integer zeros (no NaN frame + fillna).
mutations = pd.DataFrame(0, index=maf_grouped.index, columns=maf['Hugo_Symbol'].unique())
for barcode, mutated_genes in maf_grouped['mut_list'].items():
    # One .loc[row, cols] write per sample. The chained form `mutations.loc[i][mut] = 1`
    # assigns into a temporary row object and is silently lost under pandas copy-on-write.
    mutations.loc[barcode, list(mutated_genes)] = 1

# Re-key the matrix from tumor sample barcode to 'mapping_patient_id'.
mutations = mutations.join(clinical_mapping[['mapping_patient_id']], how='inner').set_index('mapping_patient_id')

In [7]:
# Preview the first rows of the mutation matrix (index: mapping_patient_id, columns: gene symbols).
mutations.head()

Unnamed: 0_level_0,PRAMEF12,AIM1L,GMEB1,GRIK3,RPE65,CLCA4,COL11A1,NRAS,FLG,KPRP,...,SLC25A5,RAP2C,VGLL1,SPANXN4,RPL10,AAGAB,HIST1H4B,RPS4X,ARMCX1,FATE1
mapping_patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-3N-A9WB,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
TCGA-3N-A9WC,0,0,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-3N-A9WD,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-BF-A1PU,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-BF-A1PV,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Load expression data

In [8]:
# Read the mRNA z-score table, drop every row containing a missing value, index by
# 'Hugo_Symbol', discard 'Entrez_Gene_Id', and transpose so the file's remaining
# columns (sample IDs) become the rows.
skcm_exp = (
    pd.read_csv(
        '../data/skcm_tcga_pan_can_atlas_2018/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt',
        sep='\t',
    )
    .dropna()
    .set_index('Hugo_Symbol')
    .drop(columns=['Entrez_Gene_Id'])
    .transpose()
)
# Remove the last '-'-separated field from each row label.
skcm_exp.index = ['-'.join(sample_id.split('-')[:-1]) for sample_id in skcm_exp.index]
# Project helper; presumably drops genes with no variation across samples — confirm in util.
skcm_exp = util.select_non_constant_genes(skcm_exp)

In [9]:
# Gene symbols reported by the project helper; only its 'Hugo_Symbol' column is used.
genes = util.select_highly_variable_genes(skcm_exp)['Hugo_Symbol'].values
# Select with a boolean column mask so the existing column order is kept. Going through
# list(set(...)) orders string labels by hash, which changes between kernel sessions
# and made the resulting column order irreproducible.
skcm_exp = skcm_exp.loc[:, skcm_exp.columns.isin(genes)].copy()

In [10]:
# Preview the first rows of the filtered expression matrix.
skcm_exp.head()

Hugo_Symbol,MTERF,GAB1,STK32A,GNB5,CDCP2,CCDC140,NOV,MRPL49,C18orf45,LCORL,...,MAPK14,MRPS27,MEGF8,PRX,DCTN4,ZNF134,TCEAL6,MAGEA3,TMEM9B,CYorf15A
TCGA-3N-A9WB,-0.6768,-1.2578,0.3126,0.5703,-1.9417,0.3433,0.163,0.037,0.4518,-0.0474,...,-1.3555,1.4033,-0.6356,-0.2611,0.4815,-1.3721,1.1232,-1.0529,0.8677,0.6837
TCGA-3N-A9WC,0.0724,0.6648,0.8809,0.5683,-1.9417,-1.055,1.2266,-1.2362,0.0158,1.2246,...,1.2361,0.6558,0.0953,-0.2083,0.4219,-0.1735,0.0095,0.6328,-0.5721,0.7574
TCGA-3N-A9WD,-0.2928,0.0355,0.0842,-0.4088,-0.0701,0.2022,0.0886,0.1468,1.5431,-0.3169,...,-0.6578,-0.3186,0.1513,0.6823,-0.0397,0.3092,1.2699,-0.7001,0.8854,0.4537
TCGA-BF-A1PU,-0.099,-0.3632,1.0418,0.7011,-1.9417,0.3586,1.3509,1.2616,0.2489,-1.9358,...,-0.5246,1.3873,1.3447,0.3906,0.3621,0.0557,0.7664,-2.0855,-0.5612,-2.1911
TCGA-BF-A1PV,0.1566,0.9984,0.8252,-0.5427,-1.0285,-0.0789,1.3616,0.1482,1.9437,0.5758,...,1.0616,1.1888,0.4733,-0.1056,0.3449,1.0154,-0.1344,-1.7231,-0.5193,-2.1911


## Load prediction target

In [11]:
# Per-sample table containing the 'heterogeneity' column, indexed by tumor sample barcode.
mat_TCGA = pd.read_csv('../data/m1000/mat_TCGA.tsv', sep='\t').set_index('Tumor_Sample_Barcode')

# Re-key from tumor sample barcode to 'mapping_patient_id'.
mat_TCGA = mat_TCGA.join(clinical_mapping[['mapping_patient_id']], how='inner').set_index('mapping_patient_id')

# Binary target: 1 where heterogeneity is strictly above the cohort mean, else 0.
# The mean is computed once (the per-element comprehension recomputed it for every row);
# missing values compare False and therefore map to 0, as before.
heterogeneity_y = (
    (mat_TCGA['heterogeneity'] > mat_TCGA['heterogeneity'].mean())
    .astype('int64')
    .to_frame('dichtomized_heterogeneity')
)

In [12]:
# Preview the first rows of the binary prediction target.
heterogeneity_y.head()

Unnamed: 0_level_0,dichtomized_heterogeneity
mapping_patient_id,Unnamed: 1_level_1
TCGA-BF-A1PU,0
TCGA-BF-A1PV,1
TCGA-BF-A1PX,0
TCGA-BF-A1PZ,0
TCGA-BF-A1Q0,1


## Generate pnet_loader

In [13]:
# Input modalities keyed by name: expression ('rna') and binary mutations ('mut').
genetic_data = dict(rna=skcm_exp, mut=mutations)

# Split the modalities and the target into train and test datasets (project helper).
(train_dataset, test_dataset) = pnet_loader.generate_train_test(
    genetic_data,
    heterogeneity_y,
)

Given 2 Input modalities
Found 421 overlapping indicies
Initializing Train Dataset
Found 688 overlapping genes
generated input DataFrame of size (295, 1376)
Initializing Test Dataset
Found 688 overlapping genes
generated input DataFrame of size (126, 1376)


## Train with run()

In [14]:
import Pnet

In [22]:
# Train via the project's one-call entry point; returns the model plus train and test scores.
# NOTE(review): the log printed by this call reports its own train/test split sizes, so it
# presumably does not reuse train_dataset/test_dataset built earlier — confirm in Pnet.run.
model, train_scores, test_scores = Pnet.run(genetic_data, heterogeneity_y, verbose=True)

Given 2 Input modalities
Found 421 overlapping indicies
Initializing Train Dataset
Found 688 overlapping genes
generated input DataFrame of size (337, 1376)
Initializing Test Dataset
Found 688 overlapping genes
generated input DataFrame of size (84, 1376)
Found 688 overlapping genes
Epoch 1 of 300
Train scores: (0.7167100566784776, 0.36795252225519287)
Test scores: (0.7231240953717913, 0.2619047619047619)
Epoch 2 of 300
Train scores: (0.7165288132800547, 0.37388724035608306)
Test scores: (0.7216729663667225, 0.2619047619047619)
Epoch 3 of 300
Train scores: (0.7121957020519042, 0.3798219584569733)
Test scores: (0.7201864946456182, 0.2619047619047619)
Epoch 4 of 300
Train scores: (0.7065655626246059, 0.4094955489614243)
Test scores: (0.7186891578492665, 0.2619047619047619)
Epoch 5 of 300
Train scores: (0.7040859131855498, 0.3857566765578635)
Test scores: (0.7177005041213262, 0.2619047619047619)
Epoch 6 of 300
Train scores: (0.7021093340231332, 0.44510385756676557)
Test scores: (0.7170596

Epoch 65 of 300
Train scores: (0.5971113963367677, 0.685459940652819)
Test scores: (0.5993810494740804, 0.75)
Epoch 66 of 300
Train scores: (0.5977256842819803, 0.6913946587537092)
Test scores: (0.6003375280471075, 0.75)
Epoch 67 of 300
Train scores: (0.5964528063992011, 0.6943620178041543)
Test scores: (0.5998532999129522, 0.75)
Epoch 68 of 300
Train scores: (0.5917142817103899, 0.7002967359050445)
Test scores: (0.5976692381359282, 0.75)
Epoch 69 of 300
Train scores: (0.592074312159145, 0.7062314540059347)
Test scores: (0.5962049393426805, 0.75)
Epoch 70 of 300
Train scores: (0.5935490039406615, 0.7032640949554896)
Test scores: (0.5964092754182362, 0.75)
Epoch 71 of 300
Train scores: (0.5864288828139489, 0.7062314540059347)
Test scores: (0.5964405536651611, 0.75)
Epoch 72 of 300
Train scores: (0.5887432749264319, 0.7091988130563798)
Test scores: (0.5956041131700788, 0.75)
Epoch 73 of 300
Train scores: (0.5865417191819551, 0.7151335311572701)
Test scores: (0.5946054004487538, 0.75)
Epo

Epoch 135 of 300
Train scores: (0.4897378672478107, 0.8338278931750742)
Test scores: (0.5770358812241327, 0.7738095238095238)
Epoch 136 of 300
Train scores: (0.49907707814293023, 0.8011869436201781)
Test scores: (0.57621963818868, 0.7857142857142857)
Epoch 137 of 300
Train scores: (0.49319644073350494, 0.8160237388724035)
Test scores: (0.575777428490775, 0.7738095238095238)
Epoch 138 of 300
Train scores: (0.493376344177065, 0.8189910979228486)
Test scores: (0.5743586335863385, 0.7857142857142857)
Epoch 139 of 300
Train scores: (0.4920742844261826, 0.8189910979228486)
Test scores: (0.5727239903949556, 0.7857142857142857)
Epoch 140 of 300
Train scores: (0.4866056314915507, 0.8219584569732937)
Test scores: (0.5748283295404344, 0.7857142857142857)
Epoch 141 of 300
Train scores: (0.4839732823810521, 0.8219584569732937)
Test scores: (0.5751519316718692, 0.7857142857142857)
Epoch 142 of 300
Train scores: (0.48642962198229145, 0.8130563798219584)
Test scores: (0.5733764625730968, 0.78571428571

Epoch 202 of 300
Train scores: (0.4094223028120726, 0.8724035608308606)
Test scores: (0.5696031933739072, 0.75)
Epoch 203 of 300
Train scores: (0.40759393652398085, 0.884272997032641)
Test scores: (0.5702960377647763, 0.75)
Epoch 204 of 300
Train scores: (0.4003850982167954, 0.9020771513353115)
Test scores: (0.5667069753011068, 0.75)
Epoch 205 of 300
Train scores: (0.4062529869532373, 0.8931750741839762)
Test scores: (0.5656527337573823, 0.75)
Epoch 206 of 300
Train scores: (0.403006620152414, 0.8813056379821959)
Test scores: (0.565029711950393, 0.75)
Epoch 207 of 300
Train scores: (0.40019535098655995, 0.8931750741839762)
Test scores: (0.5676513285863967, 0.75)
Epoch 208 of 300
Train scores: (0.39788072540781266, 0.9050445103857567)
Test scores: (0.5653846831548781, 0.75)
Epoch 209 of 300
Train scores: (0.3915881261627469, 0.9050445103857567)
Test scores: (0.5636448973701114, 0.7380952380952381)
Epoch 210 of 300
Train scores: (0.39128365644007834, 0.9020771513353115)
Test scores: (0.5

## Train with Lightning

In [None]:
# NOTE(review): `%cd src` changes the kernel's working directory. Earlier cells read
# '../data/...' relative paths and import pnet_loader/Pnet without it, so re-running the
# notebook after this cell may resolve paths differently — confirm this is still needed.
%cd src
import Pnet
import ReactomeNetwork

In [None]:
import pytorch_lightning as pl

In [None]:
# Build the Reactome network object from the genes returned by train_dataset.get_genes().
reactome_network = ReactomeNetwork.ReactomeNetwork(train_dataset.get_genes())

In [None]:
# Instantiate the network; 'nbr_gene_inputs' is the number of entries in genetic_data.
model = Pnet.PNET_NN(
    hparams=dict(
        reactome_network=reactome_network,
        nbr_gene_inputs=len(genetic_data),
        dropout=0.2,
        additional_dims=0,
        lr=1e-3,
        weight_decay=1e-5,
    )
)

In [None]:
# Wrap both datasets in data loaders. 56 is passed positionally; presumably the batch
# size — TODO confirm against pnet_loader.to_dataloader.
# NOTE(review): test_dataset is what backs val_loader here, so validation metrics are
# computed on the test split.
train_loader, val_loader = pnet_loader.to_dataloader(train_dataset, test_dataset, 56)

In [None]:
# Pull the first training batch for inspection; each batch unpacks into three items.
batch_x, additional, batch_y = next(iter(train_loader))

In [None]:
# Inspect the shape of the batch's second item.
additional.shape

In [None]:
# Lightning trainer: 16-bit precision, at most 100 epochs, no checkpoint files.
# NOTE(review): accelerator='mps' is hard-coded — presumably targets Apple-silicon GPUs;
# confirm before running on other hardware.
trainer = pl.Trainer(
    precision=16,
    accelerator='mps',
    max_epochs=100,
    log_every_n_steps=50,
    enable_checkpointing=False,
)

# Fit the model using the loaders built from the train and test datasets.
trainer.fit(
    model=model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)