In [None]:
from helpers import *
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

%matplotlib inline
from sklearn import cluster
from sklearn.decomposition import PCA
from sklearn import metrics


%load_ext autoreload
%autoreload 2

+ **DHT:** Dihydrotestosterone (DHT) is an endogenous androgen sex steroid and hormone.
+ **E2:** Estradiol (E2), also spelled oestradiol, is an estrogen steroid hormone and the major female sex hormone.
+ **P4:** Progesterone (P4) is an endogenous steroid and progestogen sex hormone involved in the menstrual cycle, pregnancy, and embryogenesis of humans and other species.

In [None]:
# Raw information about genes.
# NOTE(review): load_genes is not defined in this notebook; presumably it comes
# from the `from helpers import *` star import — confirm.
genes = load_genes()
# Preview only the first rows instead of rendering the whole table.
genes.head()

In [None]:
# Preprocessed list of genes.
# Later cells read a `genes` column and the hormone columns `dht`, `e2`, `p4`
# from this table.
# NOTE(review): load_genes_list is presumably provided by `helpers` — confirm.
genes_list = load_genes_list()
# Display the table as the cell output.
genes_list

In [None]:
# Genes flagged for exactly two of the three hormones (dht, e2, p4).
# Assumes the hormone columns hold 0/1 (or boolean) flags, so the row-wise sum
# counts the hormones a gene is associated with.
genes_list.loc[lambda table: table[["dht", "e2", "p4"]].sum(axis=1).eq(2)]

In [None]:
# Extract PDX tumor data, only retaining the selected genes.
pdx = load_pdx(genes_list.genes)

# The label is whatever follows the LAST underscore of each index entry.
# `n` must be passed by keyword: pandas 2.x rejects it as a positional argument.
pdx['label'] = (
    pdx.index.to_series()
    .str.rsplit("_", n=1)
    .map(lambda parts: parts[1])
)

# Drop subjects carrying more than one label (marked with a literal '+') and
# the control subjects ('CTRL'). regex=False matches the characters literally,
# which avoids the invalid '\+' escape sequence in a non-raw string.
has_multiple_labels = pdx['label'].str.contains('+', regex=False)
is_control = pdx['label'].str.contains('CTRL', regex=False)
pdx = pdx[~(has_multiple_labels | is_control)]
pdx

In [None]:
# (rows, columns) of the filtered PDX table; the column count includes the
# helper 'label' column added when the table was loaded.
pdx.shape

In [None]:
# Tick labels for the correlation heatmap. Only numeric columns can take part
# in a correlation matrix, so the string-valued 'label' column is excluded;
# otherwise there would be one more tick label than matrix rows/columns.
labels = pdx.select_dtypes(include="number").columns.values

In [None]:
# Absolute pairwise correlation between the numeric columns of pdx.
# The string-valued 'label' column is excluded explicitly: pandas >= 2 raises on
# non-numeric columns in DataFrame.corr(), while older versions drop them
# silently and leave the tick labels out of sync with the matrix.
numeric_pdx = pdx.select_dtypes(include="number")
corr = np.abs(np.tril(numeric_pdx.corr()))

# The matrix is symmetric, so hide the redundant upper triangle. Masking by
# position (instead of `corr == 0`) keeps genuine zero correlations visible.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(25, 25))

heatmap = sns.heatmap(
    corr,
    square=True,
    linewidths=.005,
    xticklabels=numeric_pdx.columns.values,
    yticklabels=numeric_pdx.columns.values,
    mask=upper_triangle,
    ax=ax,
)
ax.set_title("Absolute pairwise correlation")
fig.savefig("corr.png")

In [None]:
clus = cluster.AgglomerativeClustering(n_clusters=2, affinity='manhattan', linkage='complete')
predicted = clus.fit_predict(pdx.transpose())
print(predicted)
accuracy, f2 = performance(predicted)

In [None]:
def performance(labels):
    '''Evaluate performance of predicted cluster compared to pre-selected gene list'''
    # get gene list
    geneNP = genes_list.loc[:,'dht':'p4'].astype(int).values  # replace with Boolean values
    
    nb_clusters = len(np.unique(labels))
    accuracy = np.zeros([nb_clusters, 3])
    f2 =  np.zeros([nb_clusters, 3])
    beta = 2
    for i in np.arange(nb_clusters):
        label = np.zeros_like(labels)
        label[labels == i] = 1
        for j in np.arange(geneNP.shape[1]):
            # plot confusion matrices
            
            #cm = metrics.confusion_matrix(geneNP[:,j], label)
            #cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            #fig, ax = plt.subplots()
            #im = ax.imshow(cm, interpolation='nearest')
            #ax.figure.colorbar(im, ax=ax)
            accuracy[i,j] = np.mean(geneNP[:,j]==label)
            f2[i,j] = metrics.fbeta_score(geneNP[:,j], label, beta)
    return accuracy, f2
        

In [None]:
# PCA decomposition of the original gene list.
# Goal: check for linear dependence — a singular value close to zero means
# the (centered) input columns are linearly dependent.
#
# PCA is fitted on the hormone flag columns only, selected the same way as in
# performance().
# NOTE(review): the `genes` column looks like an identifier rather than a
# feature, which is why it is left out — confirm.
hormone_flags = genes_list.loc[:, 'dht':'p4'].astype(int)

pca = PCA()
pca.fit(hormone_flags)

# One bar per principal component, height = singular value.
y_pos = np.arange(len(pca.singular_values_))
fig, ax = plt.subplots()
ax.bar(y_pos, pca.singular_values_, align='center', alpha=0.5)
ax.set_ylabel('Values')
ax.set_xlabel('Principal components')
ax.set_title('PCA - Singular values')
plt.show()