In [94]:
import pandas as pd
import numpy as np

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

from helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


+ **DHT:** Dihydrotestosterone (DHT) is an endogenous androgen sex steroid and hormone.
+ **E2:** Estradiol (E2), also spelled oestradiol, is an estrogen steroid hormone and the major female sex hormone.
+ **P4:** Progesterone (P4) is an endogenous steroid and progestogen sex hormone involved in the menstrual cycle, pregnancy, and embryogenesis of humans and other species.

In [95]:
# Raw information about genes.
# `load_genes` is presumably provided by `from helpers import *` -- verify.
# As the preview below shows, the table has two-level columns: for each hormone
# (dht, e2, p4) an `up` and a `down` column holding gene symbols.
genes = load_genes()
# Last expression of the cell: render the first rows as a table.
genes.head()

Unnamed: 0_level_0,dht,dht,e2,e2,p4,p4
Unnamed: 0_level_1,up,down,up,down,up,down
0,KLK3,CPB1,NTS,CPB1,KLK3,RAB31
1,GPC3,CITED1,DCX,AC062028.2,CXCL13,PGR
2,GPR88,GRIK3,COL22A1,SPINK4,DIO2,FLNB-AS1
3,UGT2B11,LYG1,IGSF1,POTEJ,TAT,ZNF71
4,KLK2,DCDC2,NECAB1,ALB,ADAMTS8,ZNF282


In [96]:
# Preprocessed list of genes.
# As the preview below shows, the table has a `genes` column of gene symbols,
# one boolean flag per hormone (`dht`, `e2`, `p4`) and a boolean `upregulated` column.
# `load_genes_list` is presumably provided by `from helpers import *` -- verify.
genes_list = load_genes_list()
# Last expression of the cell: render the first rows as a table.
genes_list.head()

Unnamed: 0,genes,dht,e2,p4,upregulated
0,ABCA13,False,False,True,True
1,AC062028.2,False,True,False,False
2,ADAMTS1,False,False,True,True
3,ADAMTS8,False,False,True,True
4,AL591034.3,False,False,True,True


In [97]:
# Genes flagged for exactly two of the three hormones (dht, e2, p4):
# the boolean flags are summed per row and rows whose total is 2 are kept.
two_hormone_mask = genes_list[["dht", "e2", "p4"]].sum(axis=1).eq(2)
genes_list.loc[two_hormone_mask]

Unnamed: 0,genes,dht,e2,p4,upregulated
17,CPB1,True,True,False,False
20,CXCL13,True,False,True,True
46,HPGD,True,True,False,True
54,KLK3,True,False,True,True
60,MYBPC1,True,False,True,True
65,NTRK3,False,True,True,True
85,SLC26A3,True,False,True,True
101,UGT2B11,True,False,True,True


In [98]:
# Extract PDX tumor data,
# only retaining selected genes (the `genes` column of `genes_list` is passed as the selection).
# As the preview below shows, rows are labelled by sample ID and columns by gene symbol.
# NOTE(review): values look like normalized expression levels -- confirm units with `load_pdx`.
pdx = load_pdx(genes_list.genes)
# Last expression of the cell: render the first rows as a table.
pdx.head()

symbol,ABCA13,AC062028.2,ADAMTS1,ADAMTS8,AL591034.3,ALB,ALOX15B,ATP1A2,BICD1,BPIFA4P,...,TP63,TPSG1,TSPAN8,UGT2B11,UGT2B28,ZBTB16,ZNF107,ZNF282,ZNF71,ZNRF2P1
PL015_G3_M25_E2,3.955433,4.170904,5.847849,3.955433,4.552973,3.505313,9.932854,3.505313,5.966549,4.070573,...,5.397508,5.265002,5.945713,3.73427,3.189617,3.817308,8.015305,7.73501,5.913858,5.057103
PL015_G3_M26_E2,3.911396,3.849631,5.072529,3.911396,4.045105,3.609216,10.018629,3.486839,5.578606,3.816297,...,5.457619,4.767569,4.835116,3.39997,3.486839,3.189617,8.102336,7.774623,6.038974,4.578719
PL015_G3_M27_E2+P4,6.049064,4.854421,4.917984,6.98131,6.227043,3.705484,12.228201,3.63696,6.327108,4.599693,...,4.174672,4.715925,5.23064,5.143476,4.142452,5.549342,8.154826,7.428064,5.806377,4.963539
PL015_G3_M28_P4,3.707859,5.125263,5.210331,5.516734,5.849519,4.549657,11.584119,3.489893,5.98513,5.320656,...,4.077971,5.629037,5.463621,5.516734,4.288425,5.071322,8.060938,7.70217,5.676089,4.549657
PL015_G3_M29_P4,3.189617,5.191769,5.214663,5.423715,6.213327,4.666563,11.170545,3.439497,5.629137,5.637391,...,4.200388,6.347538,6.202451,6.118063,4.04378,5.083292,8.390041,7.673496,5.561211,4.991885


In [None]:
# PCA decomposition of the selected genes.
# Goal: check whether the pre-selected genes are linearly independent --
# singular values that fall to (near) zero would point to redundant directions.

# `PCA` is imported directly from sklearn.decomposition in the first cell;
# the bare name `decomposition` is never imported, so `decomposition.PCA()`
# would fail on a fresh kernel.
pca = PCA()

# `selectionPDX` is not defined in any visible cell. `pdx` (loaded above with
# only the selected genes retained) is used as the input matrix instead.
# NOTE(review): confirm `pdx` is the intended input.
pca.fit(pdx)

# Bar chart of the singular values, one bar per principal component.
# (`pca.explained_variance_` holds the related per-component variance.)
y_pos = np.arange(len(pca.singular_values_))
fig, ax = plt.subplots()
ax.bar(y_pos, pca.singular_values_, align='center', alpha=0.5)
ax.set_ylabel('Values')
ax.set_xlabel('Principal components')
ax.set_title('PCA - Singular values')
plt.show()

# TODO: project the samples onto the first two components, e.g.
#   X_reduced = PCA(n_components=2).fit_transform(pdx)
#   X_reduced.shape