In [4]:
import pandas as pd
import numpy as np

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

from helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


+ **DHT:** Dihydrotestosterone (DHT) is an endogenous androgen sex steroid and hormone.
+ **E2:** Estradiol (E2), also spelled oestradiol, is an estrogen steroid hormone and the major female sex hormone.
+ **P4:** Progesterone (P4) is an endogenous steroid and progestogen sex hormone involved in the menstrual cycle, pregnancy, and embryogenesis of humans and other species.

In [5]:
# Raw gene table as returned by the project helper; the preview below shows
# one "up"/"down" column pair per hormone (dht, e2, p4).
genes = load_genes()
genes.head(n=5)

Unnamed: 0_level_0,dht,dht,e2,e2,p4,p4
Unnamed: 0_level_1,up,down,up,down,up,down
0,KLK3,CPB1,NTS,CPB1,KLK3,RAB31
1,GPC3,CITED1,DCX,AC062028.2,CXCL13,PGR
2,GPR88,GRIK3,COL22A1,SPINK4,DIO2,FLNB-AS1
3,UGT2B11,LYG1,IGSF1,POTEJ,TAT,ZNF71
4,KLK2,DCDC2,NECAB1,ALB,ADAMTS8,ZNF282


In [6]:
# Preprocessed, flattened gene list: one row per gene with a boolean flag per
# hormone (dht, e2, p4) and an `upregulated` flag, as shown in the preview.
genes_list = load_genes_list()
genes_list.head(n=5)

Unnamed: 0,genes,dht,e2,p4,upregulated
0,ABCA13,False,False,True,True
1,AC062028.2,False,True,False,False
2,ADAMTS1,False,False,True,True
3,ADAMTS8,False,False,True,True
4,AL591034.3,False,False,True,True


In [7]:
# Genes flagged for exactly two of the three hormones (dht, e2, p4):
# summing the boolean flags row-wise counts the hormones per gene.
genes_list.loc[genes_list[["dht", "e2", "p4"]].sum(axis="columns").eq(2)]

Unnamed: 0,genes,dht,e2,p4,upregulated
17,CPB1,True,True,False,False
20,CXCL13,True,False,True,True
46,HPGD,True,True,False,True
54,KLK3,True,False,True,True
60,MYBPC1,True,False,True,True
65,NTRK3,False,True,True,True
85,SLC26A3,True,False,True,True
101,UGT2B11,True,False,True,True


In [50]:
# Extract PDX tumor data, only retaining the selected genes.
pdx = load_pdx(genes_list.genes)

# The label is the last "_"-separated token of the sample ID in the index
# (e.g. "PL015_G3_M25_E2" -> "E2"). `n` must be passed by keyword: it is
# keyword-only in pandas >= 2.0. An ID without any "_" still raises
# IndexError here, so malformed sample names are not silently accepted.
pdx['label'] = (
    pdx.index.to_series()
    .str.rsplit("_", n=1)
    .map(lambda parts: parts[1])
)

# Remove the subjects with more than one label (label contains "+") and
# those whose label contains "CTRL" (presumably controls - confirm).
# regex=False matches the characters literally, which avoids the invalid
# '\+' escape sequence in a non-raw string.
has_combined_label = pdx['label'].str.contains('+', regex=False)
has_ctrl_label = pdx['label'].str.contains('CTRL', regex=False)
pdx = pdx[~(has_combined_label | has_ctrl_label)]
pdx

symbol,ABCA13,AC062028.2,ADAMTS1,ADAMTS8,AL591034.3,ALB,ALOX15B,ATP1A2,BICD1,BPIFA4P,...,TPSG1,TSPAN8,UGT2B11,UGT2B28,ZBTB16,ZNF107,ZNF282,ZNF71,ZNRF2P1,label
PL015_G3_M25_E2,3.955433,4.170904,5.847849,3.955433,4.552973,3.505313,9.932854,3.505313,5.966549,4.070573,...,5.265002,5.945713,3.73427,3.189617,3.817308,8.015305,7.73501,5.913858,5.057103,E2
PL015_G3_M26_E2,3.911396,3.849631,5.072529,3.911396,4.045105,3.609216,10.018629,3.486839,5.578606,3.816297,...,4.767569,4.835116,3.39997,3.486839,3.189617,8.102336,7.774623,6.038974,4.578719,E2
PL015_G3_M28_P4,3.707859,5.125263,5.210331,5.516734,5.849519,4.549657,11.584119,3.489893,5.98513,5.320656,...,5.629037,5.463621,5.516734,4.288425,5.071322,8.060938,7.70217,5.676089,4.549657,P4
PL015_G3_M29_P4,3.189617,5.191769,5.214663,5.423715,6.213327,4.666563,11.170545,3.439497,5.629137,5.637391,...,6.347538,6.202451,6.118063,4.04378,5.083292,8.390041,7.673496,5.561211,4.991885,P4
PL015_G3_M33_E2,3.7177,3.834615,6.320852,4.197351,3.834615,3.7177,10.47604,3.454762,5.614629,3.976385,...,4.345855,3.88535,4.474335,3.88535,3.454762,8.088938,7.775115,5.909965,4.873536,E2
PL015_G3_M35_P4,4.175138,5.357577,5.043679,6.085692,6.483086,4.221329,11.696666,3.506722,5.582941,5.540822,...,6.104709,5.890702,6.439134,4.422714,5.569048,8.274393,7.572967,5.324391,4.838715,P4
PL015_G3_M38_P4,3.799084,5.195061,5.067398,5.56927,6.423938,4.604564,11.588569,3.622165,5.807633,5.388019,...,5.681747,5.991532,5.607816,4.417988,5.229293,8.209544,7.64707,5.556172,4.604564,P4
PL015_G3_M39_E2,3.615771,3.615771,5.268289,3.922517,4.032967,3.881409,10.621543,3.537989,5.389631,3.790138,...,4.648828,4.536991,3.615771,3.436252,3.615771,7.895711,7.924608,6.156661,4.858474,E2
T110_G2_M03_E2,3.773092,3.189617,5.546014,3.528011,3.189617,3.939553,3.429171,4.356682,8.034773,3.189617,...,3.189617,8.819314,3.189617,3.189617,3.429171,9.298718,8.196189,6.56641,5.936962,E2
T110_G2_M05_P4,3.744575,3.189617,5.817817,3.744575,3.189617,3.867251,3.583234,3.189617,7.308321,3.189617,...,3.867251,6.848599,3.189617,3.189617,3.583234,9.303299,8.233088,6.136845,5.645886,P4


In [51]:
# Dimensions of the filtered PDX table as (rows, columns); the column count
# includes the `label` column added when the table was built.
pdx.shape

(23, 109)

In [None]:
# PCA decomposition of original gene list
# we want to verify that the pre-selected genes are linearly independent

# NOTE(review): `selectionPDX` was not defined in any visible cell. It is
# presumably the expression matrix of the selected genes, i.e. `pdx` without
# the string `label` column (which PCA cannot consume) - confirm.
selectionPDX = pdx.drop(columns='label')

# `PCA` is imported directly from sklearn.decomposition at the top of the
# notebook; the `decomposition` module itself is not in scope.
pca = PCA()
pca.fit(selectionPDX)
PCA(copy=True, iterated_power='auto', n_components=None,
    random_state=None, svd_solver='auto', tol=0.0, whiten=False)

# Explained variance by PCA
# print(pca.explained_variance_)

# Singular values of PCA
# print(pca.singular_values_)

# Bar chart of the singular values: one bar per principal component,
# positioned at 0..k-1 in the order PCA reports them.
y_pos = np.arange(len(pca.singular_values_))

fig, ax = plt.subplots()
ax.bar(y_pos, pca.singular_values_, align='center', alpha=0.5)
ax.set_ylabel('Values')
ax.set_xlabel('Principal components')
ax.set_title('PCA - Singular values')
plt.show()

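# Draft (disabled): project the data onto the first two principal components.
# NOTE: `X` is not defined in the visible cells - substitute the PCA input
# before re-enabling these lines.
#pca.n_components = 2
#X_reduced = pca.fit_transform(X)
# X_reduced.shape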