In [None]:
%matplotlib inline
# misc. libraries
import pandas as pd
import numpy as np

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# ml libraries
from sklearn import cluster, metrics
from sklearn.decomposition import PCA

import inspect

%load_ext autoreload
%autoreload 2

In [None]:
# local dependencies
from helpers import *

+ **DHT:** Dihydrotestosterone is an endogenous androgen sex steroid and hormone.
+ **E2:** Estradiol, also spelled oestradiol, is an estrogen steroid hormone and the major female sex hormone.
+ **P4:** Progesterone is an endogenous steroid and progestogen sex hormone involved in the menstrual cycle, pregnancy, and embryogenesis of humans and other species.

***
**Data loading and manipulation**
***

In [None]:
# Raw information about genes, as returned by the `load_genes` helper.
# The bare name on the last line makes the notebook render the object as the cell output.
genes = load_genes()
genes

In [None]:
# Preprocessed list of genes, as returned by the `load_genes_list` helper.
# Only the first two rows are shown to check the layout.
genes_list = load_genes_list()
genes_list.head(2)

In [None]:
# Rows whose dht/e2/p4 entries add up to 2, i.e. genes flagged as
# responding to two of the three hormones.
genes_list.loc[genes_list[["dht", "e2", "p4"]].sum(axis=1).eq(2)]

In [None]:
# Load TCGA patient data (first part) via the `load_patients` helper
# and preview the first two rows.
patients = load_patients()
patients.head(2)

In [None]:
# Per-column summary statistics of the first patients dataset.
patients.describe()

In [None]:
# Load TCGA patient data (second part) via the `load_patients2` helper
# and preview the first two rows.
patients2 = load_patients2()
patients2.head(2)

In [None]:
# Genes from the pre-selected list that have no matching column in `patients`.
# The set difference is sorted so that the printed list is identical on every
# run; plain set iteration order can change between kernel sessions.
# NOTE(review): only `patients` is checked here, not `patients2`, although the
# message below speaks of "datasets" - confirm that this is intended.
genes_not_found = sorted(set(genes_list.genes) - set(patients.columns))

print(f"Genes not found in the patients datasets:\n{genes_not_found}")

In [None]:
# Load PDX tumor data via the `load_pdx` helper and preview the first rows.
pdx = load_pdx()
pdx.head()

In [None]:
# Restrict the pre-selected genes to those that were found in the patients
# data, i.e. drop every gene listed in `genes_not_found`.
genes_expressed = genes_list.loc[~genes_list["genes"].isin(genes_not_found), "genes"]
genes_expressed

***
**Exploratory data analysis**
***

In [None]:
def plot_corr(corr, filename='corr.png'):
    """Draw the absolute lower-triangle correlations as a heatmap and save it.

    Parameters
    ----------
    corr : pandas.DataFrame or array-like
        Square correlation matrix. When it carries its own ``columns`` and
        ``index`` (a DataFrame), those labels are used for the ticks so they
        always line up with the plotted cells. Otherwise the notebook-level
        ``genes_expressed`` is used, as before.
    filename : str
        Name of the image file written below ``../data/``.

    Returns
    -------
    None
        The figure is drawn on a new 25x25 inch canvas and saved to disk.
    """
    # Only the strictly-lower triangle is informative: the matrix is symmetric
    # and the diagonal is the trivial self-correlation.
    tril_abs_corr = np.abs(np.tril(corr, k=-1))

    # Hide the upper triangle and the diagonal by position rather than by
    # value, so that a genuine correlation of exactly 0 is still displayed.
    hidden = np.triu(np.ones_like(tril_abs_corr, dtype=bool))

    # Take the tick labels from the matrix itself whenever possible; a global
    # label list can differ from the matrix in length or order.
    if hasattr(corr, "columns") and hasattr(corr, "index"):
        xlabels = list(corr.columns)
        ylabels = list(corr.index)
    else:
        xlabels = ylabels = genes_expressed

    fig, ax = plt.subplots(figsize=(25, 25))
    sns.heatmap(
        tril_abs_corr,
        square=True,
        linewidths=.005,
        xticklabels=xlabels,
        yticklabels=ylabels,
        mask=hidden,
        ax=ax,
    )
    fig.savefig(f"../data/{filename}")

In [None]:
# Pairwise column correlations of the PDX data, drawn as a heatmap and
# saved as ../data/corr_pdx.png by `plot_corr`.
pdx_corr = pdx.corr()
plot_corr(pdx_corr, 'corr_pdx.png')

In [None]:
# Pairwise column correlations of the first patients dataset, drawn as a
# heatmap and saved as ../data/corr_patients.png by `plot_corr`.
patients_corr = patients.corr()
plot_corr(patients_corr, 'corr_patients.png')

In [None]:
# Strong correlations in the patients data (|r| > 0.6, entries equal to 1.0
# left out) - these match the correlations found in the PDX gene data!
patients_corr.where((patients_corr.abs() > 0.6) & (patients_corr != 1.0)).stack()

In [None]:
# The second patients dataset shows even more matching correlations.
# Open question: does that mean we can expect better / more consistent results
# on this dataset when we run the methods trained on the PDX data?
patients2_corr = patients2.corr()
patients2_corr.where(patients2_corr.lt(-.3) & patients2_corr.ne(1.0)).stack()

***
**Feature processing**
***

Let's try to reduce the dimensionality of the input space, i.e. find the linear mapping of our $D$-dimensional input into a $K$-dimensional space (with $K \leq D$) that best represents the original data.

In [None]:
# PCA decomposition of the original gene list.
# We want to check for linear dependence: a (near-)zero singular value would
# reveal a redundant direction.

# NOTE(review): only the dht/e2/p4 columns are passed to PCA. The `genes`
# column holds identifiers, not measurements, so it must not enter the
# decomposition. Confirm that this is the matrix the analysis was meant for.
hormone_flags = genes_list[["dht", "e2", "p4"]].astype(int)

pca = PCA()
pca.fit(hormone_flags)

# One bar per principal component, showing its singular value.
y_pos = np.arange(len(pca.singular_values_))
plt.bar(y_pos, pca.singular_values_, align='center', alpha=0.5)
plt.ylabel('Values')
plt.xlabel('Principal components')
plt.title('PCA - Singular values')
plt.show()

***
**Clustering**
***

In [None]:
# Separate the reference labels from the remaining columns of the PDX data.
y = pdx["label"]
X = pdx.drop(columns="label")

# Agglomerative clustering into 7 groups with the estimator's default settings
# (affinity='manhattan' with linkage='average' was considered as an alternative).
clus = cluster.AgglomerativeClustering(n_clusters=7)
predicted = clus.fit_predict(X)

# Agreement between predicted clusters and reference labels (adjusted Rand index).
score = metrics.adjusted_rand_score(y, predicted)
print(score)

***
**Spectral Clustering**
***

In [None]:
# Spectral clustering of X (the PDX data without its label column), compared
# with the reference labels y through the adjusted Rand index.
# The reference labels are taken from `y`, the only label variable this
# notebook defines.
clustering = cluster.SpectralClustering(assign_labels="discretize", random_state=0).fit(X)
print("predicted labels : " + str(clustering.labels_))
print("true labels :      " + str(y.values))
print("Score : " + str(metrics.adjusted_rand_score(y, clustering.labels_)))

***
**K-Means**
***

In [None]:
# K-Means with 4 clusters on X (the PDX data without its label column),
# compared with the reference labels y through the adjusted Rand index.
# The reference labels are taken from `y`, the only label variable this
# notebook defines.
kmeans = cluster.KMeans(n_clusters=4, random_state=0).fit(X)
print("predicted labels : " + str(kmeans.labels_))
print("true labels :      " + str(y.values))
print("Score : " + str(metrics.adjusted_rand_score(y, kmeans.labels_)))

In [None]:
# we should rather evaluate with the metrics.adjusted_rand_score function 

def performance(labels):
    '''Evaluate predicted clusters against the pre-selected gene list.

    Every cluster is turned into a one-vs-rest 0/1 indicator vector and
    compared with each column between 'dht' and 'p4' of the notebook-level
    ``genes_list`` (cast to int).

    Parameters
    ----------
    labels : array-like
        One cluster label per row of ``genes_list``.

    Returns
    -------
    accuracy : numpy.ndarray
        Fraction of rows where indicator and column agree; one row per
        distinct cluster label (in sorted order), one column per compared
        ``genes_list`` column.
    f2 : numpy.ndarray
        F-beta score with beta=2 for the same pairs, same shape.
    '''
    # Accept plain lists too: the element-wise comparison below needs an array.
    labels = np.asarray(labels)

    # Reference 0/1 matrix from the gene list.
    geneNP = genes_list.loc[:, 'dht':'p4'].astype(int).values

    # Iterate over the labels that actually occur instead of assuming they
    # are exactly 0..k-1.
    cluster_ids = np.unique(labels)
    nb_targets = geneNP.shape[1]

    accuracy = np.zeros([len(cluster_ids), nb_targets])
    f2 = np.zeros([len(cluster_ids), nb_targets])
    beta = 2
    for i, cluster_id in enumerate(cluster_ids):
        # 1 for members of the current cluster, 0 for everything else.
        label = (labels == cluster_id).astype(int)
        for j in range(nb_targets):
            accuracy[i, j] = np.mean(geneNP[:, j] == label)
            # `beta` must be passed by keyword in current scikit-learn.
            f2[i, j] = metrics.fbeta_score(geneNP[:, j], label, beta=beta)
    return accuracy, f2
        

***
**Principal Component Analysis**
***