# PAM50 Exploratory analysis
---
## Description
<p>PAM50 (Prosigna®) is a tumor profiling test that helps show whether some breast cancers are likely to spread to other organs.</p>

## Related papers
<a href="https://openreview.net/pdf?id=rk4QYDkwz">CLUSTERING MEETS IMPLICIT GENERATIVE MODELS</a><br>
<a href="https://openreview.net/pdf?id=ByKWUeWA-">GANITE: ESTIMATION OF INDIVIDUALIZED TREATMENT EFFECTS USING GENERATIVE ADVERSARIAL NETS</a><br>

### Useful links
<a href="https://ww5.komen.org/BreastCancer/PAM50.html">Problem and data description on komen.org</a><br>
<a href="https://ru.wikipedia.org/wiki/Prosigna">Test description in Russian and Deutsch</a><br>
<a href="https://snet-bio-data.s3-us-west-2.amazonaws.com/example15bmc/ex15bmcMerged.csv.xz">Expression data (explored)</a><br>
<a href="https://github.com/singnet/cancer/blob/master/data/curatedBreastData/bcTabs.ods">Data columns meaning (explored)</a><br>

In [1]:
import numpy as np
import pandas as pd

In [2]:
pam50 = pd.read_csv('data/ex15bmcMerged.csv.xz')
trts = pd.read_csv('data/bmc15mldata1.csv')

### pam50 dataset

In [3]:
pam50.head()

Unnamed: 0,patient_ID,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,ZNF80,ZNF83,ZNF84,ZNF91,ZNHIT2,ZSCAN2,ZXDC,ZYX,ZZEF1,ZZZ3
0,22449,-0.118953,1.180345,0.252643,-0.262987,0.142903,0.167314,0.498846,0.774632,0.104353,...,-1.564143,0.466733,0.827552,-0.617981,0.303161,1.260602,-0.217995,0.219529,0.389849,1.313703
1,22450,0.423693,-0.922374,-1.202192,-0.105451,-0.061571,-0.093231,-0.09555,-0.481403,-0.214238,...,0.711752,0.358388,0.037911,2.304784,0.328942,-1.028791,-0.850002,-0.292574,-0.068982,0.722123
2,22451,-0.239183,-0.733389,0.523791,-0.081958,-0.004635,-0.008094,0.268636,-0.614192,0.027471,...,-0.011786,-0.474762,-0.349981,-0.097197,0.100946,-0.5547,-0.367363,0.094464,-0.372665,-0.790771
3,22452,0.500445,-0.177686,-0.216638,-0.13085,-0.261039,-0.048521,1.479664,-0.10012,0.233178,...,0.757255,0.590212,0.06015,2.287583,-0.108866,-1.1325,-0.106976,-0.216267,0.393671,-0.027349
4,22453,-0.609235,0.259494,-0.071802,0.027963,0.162509,0.112654,-0.239435,0.229737,-0.132271,...,0.407159,0.570637,0.851658,-0.41295,0.105692,-1.047445,0.08448,-0.224081,-0.021074,0.764555


### Treatments

In [4]:
trts.head()

Unnamed: 0,study,patient_ID,radio,surgery,chemo,hormone,pCR,RFS,DFS,posOutcome
0,study_1379_GPL1223_all-bmc15,22449,0,,0,1,,,0.0,0
1,study_1379_GPL1223_all-bmc15,22450,0,,0,1,,,0.0,0
2,study_1379_GPL1223_all-bmc15,22451,0,,0,1,,,0.0,0
3,study_1379_GPL1223_all-bmc15,22452,0,,0,1,,,0.0,0
4,study_1379_GPL1223_all-bmc15,22453,0,,0,1,,,1.0,1


### Merged dataset

In [5]:
tp50 = pd.merge(pam50, trts, left_index=True, right_index=True)
tp50.head(150)

Unnamed: 0,patient_ID_x,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,study,patient_ID_y,radio,surgery,chemo,hormone,pCR,RFS,DFS,posOutcome
0,22449,-0.118953,1.180345,0.252643,-0.262987,0.142903,0.167314,0.498846,0.774632,0.104353,...,study_1379_GPL1223_all-bmc15,22449,0,,0,1,,,0.0,0
1,22450,0.423693,-0.922374,-1.202192,-0.105451,-0.061571,-0.093231,-0.095550,-0.481403,-0.214238,...,study_1379_GPL1223_all-bmc15,22450,0,,0,1,,,0.0,0
2,22451,-0.239183,-0.733389,0.523791,-0.081958,-0.004635,-0.008094,0.268636,-0.614192,0.027471,...,study_1379_GPL1223_all-bmc15,22451,0,,0,1,,,0.0,0
3,22452,0.500445,-0.177686,-0.216638,-0.130850,-0.261039,-0.048521,1.479664,-0.100120,0.233178,...,study_1379_GPL1223_all-bmc15,22452,0,,0,1,,,0.0,0
4,22453,-0.609235,0.259494,-0.071802,0.027963,0.162509,0.112654,-0.239435,0.229737,-0.132271,...,study_1379_GPL1223_all-bmc15,22453,0,,0,1,,,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,36863,-0.267049,-0.036435,-0.014162,0.229711,-0.012054,1.084890,-0.001190,0.000000,0.006842,...,study_2034_GPL96_all-bmc15,36863,1,,0,0,,1.0,,1
146,36864,-0.267049,-0.036435,-0.014162,-0.030311,-0.012054,-1.278064,-0.001190,0.000000,-0.001621,...,study_2034_GPL96_all-bmc15,36864,1,,0,0,,1.0,,1
147,36865,-0.267049,-0.036435,-0.014162,-0.030311,-0.012054,-0.128375,-0.001190,0.000000,-0.001621,...,study_2034_GPL96_all-bmc15,36865,1,,0,0,,1.0,,1
148,36866,-0.267049,-0.036435,0.061103,-0.030311,-0.002059,-0.788651,-0.001190,0.000000,-0.001621,...,study_2034_GPL96_all-bmc15,36866,1,,0,0,,1.0,,1


In [6]:
tp50['posOutcome'].isnull().values.any()

False

In [7]:
batch = tp50.sample(5)

In [8]:
batch

Unnamed: 0,patient_ID_x,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,study,patient_ID_y,radio,surgery,chemo,hormone,pCR,RFS,DFS,posOutcome
427,249639,-0.283412,0.780998,0.427012,0.547383,-0.054761,1.741625,-0.332801,0.373853,-0.432119,...,study_9893_GPL5049_all-bmc15,249641,1,,0,1,,1.0,,1
1129,491238,-0.279146,-0.072587,-0.023172,-0.036032,-0.042167,-1.016067,0.770681,-0.004667,-0.002935,...,study_19615_GPL570_all-bmc15,491241,0,,1,0,,1.0,,1
139,36857,-0.264917,-0.036435,0.034185,-0.030311,-0.012054,0.997304,-0.00119,0.0,-0.001621,...,study_2034_GPL96_all-bmc15,36857,1,,0,0,,1.0,,1
1307,505370,-0.410973,0.02064,-0.091741,-0.055913,-0.023981,1.125274,-0.006776,-0.007957,-0.002143,...,study_20194_GPL96_all-bmc15,505382,0,,1,0,0.0,,,0
692,411402,-0.8265,1.813048,0.051341,-0.018589,-0.008299,-0.893171,-0.026835,-0.008385,-0.007893,...,study_16446_GPL570_all-bmc15,411394,0,,1,0,1.0,,1.0,1


In [9]:
tp50['study'].value_counts()

study_2034_GPL96_all-bmc15                           286
study_20194_GPL96_all-bmc15                          261
study_25055_GPL96_MDACC_M-bmc15                      221
study_17705_GPL96_MDACC_Tissue_BC_Tamoxifen-bmc15    195
study_9893_GPL5049_all-bmc15                         148
study_12093_GPL96_all-bmc15                          136
study_22226_GPL1708_all-bmc15                        128
study_22358_GPL5325_all-bmc15                        122
study_32646_GPL570_all-bmc15                         115
study_16446_GPL570_all-bmc15                         114
study_19615_GPL570_all-bmc15                         110
study_17705_GPL96_JBI_Tissue_BC_Tamoxifen-bmc15      103
study_25065_GPL96_MDACC-bmc15                         71
study_1379_GPL1223_all-bmc15                          60
study_25065_GPL96_USO-bmc15                           54
study_20181_GPL96_all-bmc15                           53
study_16391_GPL570_all-bmc15                          48
Name: study, dtype: int64

In [10]:
pam50.columns.values[1:]

array(['MAGEA12', 'MAGEA11', 'KLF1', ..., 'ZYX', 'ZZEF1', 'ZZZ3'],
      dtype=object)

In [11]:
batch_gex = batch[pam50.columns.values[1:]]
batch_gex

Unnamed: 0,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,SIGLEC9,...,ZNF80,ZNF83,ZNF84,ZNF91,ZNHIT2,ZSCAN2,ZXDC,ZYX,ZZEF1,ZZZ3
427,-0.283412,0.780998,0.427012,0.547383,-0.054761,1.741625,-0.332801,0.373853,-0.432119,1.370321,...,1.283541,0.303802,0.353321,-0.266744,0.263702,-0.019925,0.666583,-0.342816,0.800792,0.054286
1129,-0.279146,-0.072587,-0.023172,-0.036032,-0.042167,-1.016067,0.770681,-0.004667,-0.002935,-0.025417,...,-0.121864,-0.097302,-0.808033,0.17129,-0.309876,-0.189628,0.994878,0.685528,-0.417114,-0.317502
139,-0.264917,-0.036435,0.034185,-0.030311,-0.012054,0.997304,-0.00119,0.0,-0.001621,0.067521,...,-0.019418,-1.233128,-0.795826,1.513205,-0.093811,-0.015096,0.218607,0.747372,-0.071563,-1.538727
1307,-0.410973,0.02064,-0.091741,-0.055913,-0.023981,1.125274,-0.006776,-0.007957,-0.002143,-0.04143,...,-0.01706,-0.613895,-0.02047,-0.497363,-0.078408,-0.003522,-0.394458,-0.577251,-0.145349,0.464157
692,-0.8265,1.813048,0.051341,-0.018589,-0.008299,-0.893171,-0.026835,-0.008385,-0.007893,-0.116528,...,-0.027651,-1.833862,0.427216,1.108274,0.286461,0.718863,0.613895,-0.015771,-0.13426,-0.543886


In [12]:
batch_gex.values.shape

(5, 8832)

In [13]:
tp50['posOutcome'].describe()

count    2225.000000
mean        0.622921
std         0.484764
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: posOutcome, dtype: float64

In [14]:
tp50['radio'].isnull().values.any()

False

In [15]:
tp50['surgery'].isnull().values.any()

True

In [16]:
tp50['chemo'].isnull().values.any()

False

In [17]:
tp50['hormone'].isnull().values.any()

False

In [18]:
tp50['pCR'].isnull().values.any()

True

In [19]:
np.invert(tp50['pCR'].isnull().values)

array([False, False, False, ...,  True,  True,  True])

In [20]:
tp50.loc[np.invert(tp50['pCR'].isnull().values), ['pCR']]

Unnamed: 0,pCR
678,1.0
679,1.0
680,1.0
681,1.0
682,1.0
...,...
2220,1.0
2221,1.0
2222,0.0
2223,0.0


In [31]:
np.hstack([trts.columns.values[:1], trts.columns.values[2:]])

array(['study', 'radio', 'surgery', 'chemo', 'hormone', 'pCR', 'RFS',
       'DFS', 'posOutcome'], dtype=object)

In [35]:
batch.fillna(-1)

Unnamed: 0,patient_ID_x,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,study,patient_ID_y,radio,surgery,chemo,hormone,pCR,RFS,DFS,posOutcome
427,249639,-0.283412,0.780998,0.427012,0.547383,-0.054761,1.741625,-0.332801,0.373853,-0.432119,...,study_9893_GPL5049_all-bmc15,249641,1,-1,0,1,-1.0,1.0,-1.0,1
1129,491238,-0.279146,-0.072587,-0.023172,-0.036032,-0.042167,-1.016067,0.770681,-0.004667,-0.002935,...,study_19615_GPL570_all-bmc15,491241,0,-1,1,0,-1.0,1.0,-1.0,1
139,36857,-0.264917,-0.036435,0.034185,-0.030311,-0.012054,0.997304,-0.00119,0.0,-0.001621,...,study_2034_GPL96_all-bmc15,36857,1,-1,0,0,-1.0,1.0,-1.0,1
1307,505370,-0.410973,0.02064,-0.091741,-0.055913,-0.023981,1.125274,-0.006776,-0.007957,-0.002143,...,study_20194_GPL96_all-bmc15,505382,0,-1,1,0,0.0,-1.0,-1.0,0
692,411402,-0.8265,1.813048,0.051341,-0.018589,-0.008299,-0.893171,-0.026835,-0.008385,-0.007893,...,study_16446_GPL570_all-bmc15,411394,0,-1,1,0,1.0,-1.0,1.0,1


In [36]:
batch

Unnamed: 0,patient_ID_x,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,study,patient_ID_y,radio,surgery,chemo,hormone,pCR,RFS,DFS,posOutcome
427,249639,-0.283412,0.780998,0.427012,0.547383,-0.054761,1.741625,-0.332801,0.373853,-0.432119,...,study_9893_GPL5049_all-bmc15,249641,1,,0,1,,1.0,,1
1129,491238,-0.279146,-0.072587,-0.023172,-0.036032,-0.042167,-1.016067,0.770681,-0.004667,-0.002935,...,study_19615_GPL570_all-bmc15,491241,0,,1,0,,1.0,,1
139,36857,-0.264917,-0.036435,0.034185,-0.030311,-0.012054,0.997304,-0.00119,0.0,-0.001621,...,study_2034_GPL96_all-bmc15,36857,1,,0,0,,1.0,,1
1307,505370,-0.410973,0.02064,-0.091741,-0.055913,-0.023981,1.125274,-0.006776,-0.007957,-0.002143,...,study_20194_GPL96_all-bmc15,505382,0,,1,0,0.0,,,0
692,411402,-0.8265,1.813048,0.051341,-0.018589,-0.008299,-0.893171,-0.026835,-0.008385,-0.007893,...,study_16446_GPL570_all-bmc15,411394,0,,1,0,1.0,,1.0,1
