In this notebook we analyse the stability of the layer conductance values from the model on a fixed train and test set. We are interested in how the random initialization of the weights influences the importance scores obtained for the genes.

Imports:

In [20]:
import pnet_loader
import util
import Pnet
import torch
import seaborn as sns
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Data Loading:

In [21]:
prostate_mutations = pd.read_csv('/mnt/disks/pancan/pnet_database/prostate/processed/P1000_final_analysis_set_cross_important_only.csv')
prostate_mutations.set_index('Tumor_Sample_Barcode', inplace=True)

prostate_cnv = pd.read_csv('/mnt/disks/pancan/pnet_database/prostate/processed/P1000_data_CNA_paper.csv')
prostate_cnv.rename(columns={"Unnamed: 0": "Tumor_Sample_Barcode"}, inplace=True)
prostate_cnv.set_index('Tumor_Sample_Barcode', inplace=True)

prostate_response = pd.read_csv('/mnt/disks/pancan/pnet_database/prostate/processed/response_paper.csv')
prostate_response.rename(columns={'id': "Tumor_Sample_Barcode"}, inplace=True)
prostate_response.set_index('Tumor_Sample_Barcode', inplace=True)

prostate_genes = pd.read_csv('/mnt/disks/pancan/pnet_database/genes/tcga_prostate_expressed_genes_and_cancer_genes.csv')
prostate_genes = list(set(prostate_genes['genes']).intersection(set(prostate_mutations.columns)).intersection(set(prostate_cnv.columns)))

# prostate_mutations = pd.read_csv('../../data/pnet_database/prostate/processed/P1000_final_analysis_set_cross_important_only.csv')
# prostate_mutations.set_index('Tumor_Sample_Barcode', inplace=True)

# prostate_cnv = pd.read_csv('../../data/pnet_database/prostate/processed/P1000_data_CNA_paper.csv')
# prostate_cnv.rename(columns={"Unnamed: 0": "Tumor_Sample_Barcode"}, inplace=True)
# prostate_cnv.set_index('Tumor_Sample_Barcode', inplace=True)

# prostate_response = pd.read_csv('../../data/pnet_database/prostate/processed/response_paper.csv')
# prostate_response.rename(columns={'id': "Tumor_Sample_Barcode"}, inplace=True)
# prostate_response.set_index('Tumor_Sample_Barcode', inplace=True)

# prostate_genes = pd.read_csv('../../data/pnet_database/genes/tcga_prostate_expressed_genes_and_cancer_genes.csv')
# prostate_genes = list(set(prostate_genes['genes']).intersection(set(prostate_mutations.columns)).intersection(set(prostate_cnv.columns)))

prostate_cnv = prostate_cnv[prostate_genes].copy()
prostate_mutations = prostate_mutations[prostate_genes].copy()

# prostate_genes = util.select_highly_variable_genes(prostate_mutations)
# prostate_genes = prostate_genes['level_1']
prostate_mutations = prostate_mutations[list(set(prostate_mutations.columns).intersection(prostate_genes))].copy()
prostate_cnv = prostate_cnv[list(set(prostate_cnv.columns).intersection(prostate_genes))].copy()

# Regenerate input as specified in prostate_paper
prostate_mutations = (prostate_mutations > 0).astype(int)
prostate_amp = (prostate_cnv > 1).astype(int)
prostate_del = (prostate_cnv < -1).astype(int)

In [22]:
genetic_data = {'mut': prostate_mutations, 'amp': prostate_amp, 'del': prostate_del}

In [23]:
test_inds = list(pd.read_csv('/mnt/disks/pancan/pnet_database/splits/test_set.csv')['id'])
train_inds = list(pd.read_csv('/mnt/disks/pancan/pnet_database/splits/training_set.csv')['id'])

Model training:

In [24]:
gene_imps = []
layerwise_imps = []
aucs = []
for r in range(20):
    model, train_scores, test_scores, train_dataset, test_dataset = Pnet.run(genetic_data,
                                                                         prostate_response,
                                                                         verbose=False,
                                                                         early_stopping=False,
                                                                         train_inds=train_inds,
                                                                         test_inds=test_inds)
    model.to('cpu')
    x_test = test_dataset.x
    additional_test = test_dataset.additional
    y_test = test_dataset.y
    pred = model(x_test, additional_test)
    y_pred_proba = pred.detach().numpy().squeeze()
    fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
    aucs.append(metrics.roc_auc_score(y_test, y_pred_proba))
    gene_imps.append(model.gene_importance(test_dataset))
    layerwise_imps.append(model.layerwise_importance(test_dataset))
    

Given 3 Input modalities
Found 1011 overlapping indicies
Initializing Train Dataset
Found 4854 overlapping genes
generated input DataFrame of size (807, 14562)
Initializing Test Dataset
Found 4854 overlapping genes
generated input DataFrame of size (102, 14562)
Found 4854 overlapping genes
Given 3 Input modalities
Found 1011 overlapping indicies
Initializing Train Dataset
Found 4854 overlapping genes
generated input DataFrame of size (807, 14562)
Initializing Test Dataset
Found 4854 overlapping genes
generated input DataFrame of size (102, 14562)
Found 4854 overlapping genes
Given 3 Input modalities
Found 1011 overlapping indicies
Initializing Train Dataset
Found 4854 overlapping genes
generated input DataFrame of size (807, 14562)
Initializing Test Dataset
Found 4854 overlapping genes
generated input DataFrame of size (102, 14562)
Found 4854 overlapping genes
Given 3 Input modalities
Found 1011 overlapping indicies
Initializing Train Dataset
Found 4854 overlapping genes
generated inpu

In [38]:
pd.concat(gene_imps, axis=1).std(axis=1).nlargest(20)

AR         4.853938
TP53       2.594748
PTEN       2.158988
FNDC1      1.951894
SLC45A4    1.296950
PHLDB2     1.131713
KLHDC8A    1.094294
GPBP1      1.047550
TATDN1     0.949547
PHF20L1    0.885988
ZNF34      0.835381
RAB14      0.804577
BACH2      0.790045
ZMIZ1      0.787808
MAP3K7     0.787134
HSPA6      0.775397
SCRT1      0.741248
FAT4       0.719318
ZNF250     0.706913
PXDNL      0.661662
dtype: float32

In [53]:
pd.concat([gis.rank(ascending=False) for gis in gene_imps], axis=1).mean(axis=1).nsmallest(20)

AR         1.00
TP53       2.15
PTEN       2.85
MDM4      12.50
MED30     12.80
ASB18     17.60
UBE2W     23.85
KBTBD8    27.80
ZNF92     38.65
RB1       41.65
MDM2      42.35
PSMD1     43.15
MGA       44.50
ZNF184    48.65
NR3C2     49.10
RBBP5     50.80
RNF123    52.75
MAML3     54.75
PTPRJ     55.85
CUL9      56.10
dtype: float64

In [32]:
pd.concat(gene_imps, axis=1).mean(axis=1).loc[['AR', 'TP53','PTEN', 'RB1', 'MDM4', 'FGFR1', 'MAML3', 'PDGFA', 'NOTCH1', 'EIF3E']]

AR        28.339069
TP53      10.345586
PTEN       8.134116
RB1        0.711618
MDM4       1.484305
FGFR1     -0.136983
MAML3      0.622123
PDGFA      0.456059
NOTCH1     0.000000
EIF3E      0.065069
dtype: float32

In [55]:
pd.concat([gis.rank(ascending=False) for gis in gene_imps], axis=1).mean(axis=1).rank(ascending=True).loc[['AR', 'TP53','PTEN', 'RB1', 'MDM4', 'FGFR1', 'MAML3', 'PDGFA', 'NOTCH1', 'EIF3E']]

AR           1.0
TP53         2.0
PTEN         3.0
RB1         10.0
MDM4         4.0
FGFR1     4764.0
MAML3       18.0
PDGFA       55.0
NOTCH1    2407.0
EIF3E      515.0
dtype: float64

In [39]:
pd.concat(gene_imps, axis=1).std(axis=1).rank(ascending=False).loc[['AR', 'TP53','PTEN', 'RB1', 'MDM4', 'FGFR1', 'MAML3', 'PDGFA', 'NOTCH1', 'EIF3E']]

AR           1.0
TP53         2.0
PTEN         3.0
RB1        155.0
MDM4        36.0
FGFR1      337.0
MAML3      136.0
PDGFA       70.0
NOTCH1    3969.5
EIF3E      419.0
dtype: float64