In [1]:
%matplotlib inline
# misc. libraries
import pandas as pd
import numpy as np

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# ml libraries
from sklearn import cluster, metrics
from sklearn.decomposition import PCA

%load_ext autoreload
%autoreload 2

In [2]:
# local dependencies
from helpers import *

+ **DHT:** Dihydrotestosterone (DHT) is an endogenous androgen sex steroid and hormone.
+ **E2:** Estradiol (E2), also spelled oestradiol, is an estrogen steroid hormone and the major female sex hormone.
+ **P4:** Progesterone (P4) is an endogenous steroid and progestogen sex hormone involved in the menstrual cycle, pregnancy, and embryogenesis of humans and other species.

***
**Data loading**
***

In [3]:
# Raw information about genes.
# The displayed table has one column per (hormone, direction) pair --
# dht / e2 / p4 crossed with up / down -- each listing gene symbols; the
# shorter lists are padded with empty cells.
genes = load_genes()
genes  # bare last expression -> rich display of the frame

Unnamed: 0_level_0,dht,dht,e2,e2,p4,p4
Unnamed: 0_level_1,up,down,up,down,up,down
0,KLK3,CPB1,NTS,CPB1,KLK3,RAB31
1,GPC3,CITED1,DCX,AC062028.2,CXCL13,PGR
2,GPR88,GRIK3,COL22A1,SPINK4,DIO2,FLNB-AS1
3,UGT2B11,LYG1,IGSF1,POTEJ,TAT,ZNF71
4,KLK2,DCDC2,NECAB1,ALB,ADAMTS8,ZNF282
5,UGT2B28,ZNRF2P1,PIEZO2,MS4A7,MYBPC1,QRICH1
6,SLC26A3,CISH,TP63,PLA2G3,UGT2B11,
7,TMEM176A,ZNF107,TMPRSS4,CACNA1H,NTRK3,
8,HPGD,GHR,KLK12,CYP4F30P,DLC1,
9,CYP4F8,NFKBIE,HPGD,MS4A14,ABCA13,


In [96]:
# Preprocessed list of genes.
# One row per gene symbol (`genes`) with boolean flags for the hormones it
# responds to (`dht`, `e2`, `p4`) and an `upregulated` flag.

genes_list = load_genes_list()
genes_list.head(2)  # preview only; the full frame is used by the cells below

Unnamed: 0,genes,dht,e2,p4,upregulated
0,ABCA13,False,False,True,True
1,AC062028.2,False,True,False,False


In [5]:
# Genes flagged for exactly two of the three hormones
# (row-wise count of True values across the hormone columns equals 2)

genes_list.loc[genes_list[["dht", "e2", "p4"]].sum(axis="columns").eq(2)]

Unnamed: 0,genes,dht,e2,p4,upregulated
17,CPB1,True,True,False,False
20,CXCL13,True,False,True,True
46,HPGD,True,True,False,True
54,KLK3,True,False,True,True
60,MYBPC1,True,False,True,True
65,NTRK3,False,True,True,True
85,SLC26A3,True,False,True,True
101,UGT2B11,True,False,True,True


In [99]:
# Label genes into three categories according to list of genes.
# Encoding: 0 = dht, 1 = p4, 2 = e2.
#
# A gene can carry only one label, but some genes respond to two hormones.
# np.select takes the FIRST matching condition, so the order below fixes the
# precedence e2 > p4 > dht -- the same result the previous sequential
# overwrites (dht, then p4, then e2) produced.
#
# The labels are built on the side rather than written into `genes_list`, so
# the preprocessed table keeps exactly the columns it was loaded with and this
# cell can be re-run without changing its own input.

hormone_label = np.select(
    [
        genes_list['e2'].astype(bool).to_numpy(),
        genes_list['p4'].astype(bool).to_numpy(),
        genes_list['dht'].astype(bool).to_numpy(),
    ],
    [2, 1, 0],
    default=-1,  # sentinel: gene is flagged for none of the three hormones
)
if (hormone_label == -1).any():
    # Such a gene has no valid class; fail loudly instead of inventing one.
    raise ValueError("every gene must respond to at least one of dht / p4 / e2")

genes_labeled = pd.DataFrame(
    {'genes': genes_list['genes'], 'label': hormone_label},
    index=genes_list.index,
)
genes_labeled.head()

Unnamed: 0,genes,label
0,ABCA13,1
1,AC062028.2,2
2,ADAMTS1,1
3,ADAMTS8,1
4,AL591034.3,1


In [6]:
# Load PDX tumor data for the genes in the preprocessed list
pdx = load_pdx(genes_list.genes)

# Remove control subjects and subjects exposed to two treatments
# (index contains "CTRL" or a "+").  The explicit copy gives the
# insert below an independent frame to modify.
pdx = pdx[~pdx.index.str.contains(r"\+|CTRL")].copy()

# Extract the label from the index: the text after the last underscore,
# lower-cased.  `n` must be passed by keyword -- pandas 2.x no longer accepts
# it positionally.  An index entry without an underscore is kept whole
# (lower-cased) instead of raising.
labels = pdx.index.str.rsplit("_", n=1).str[-1].str.lower()
pdx.insert(0, "label", labels)

In [7]:
# (rows, columns) of the filtered PDX frame; the columns include the
# inserted "label" column in addition to the gene columns.
pdx.shape

(23, 109)

In [58]:
# Load TCGA patient data (first part).
# The gene names from `genes_list` are handed to the loader -- presumably to
# select the matching columns; confirm in helpers.
patients = load_patients(genes_list.genes)
patients.head(2)  # preview the first two rows only

In [None]:
# Load TCGA patient data (second part).
# Same call pattern as the first part, through a separate loader; the gene
# names from `genes_list` are passed in -- presumably to select the matching
# columns; confirm in helpers.
patients2 = load_patients2(genes_list.genes)
patients2.head(2)  # preview the first two rows only

***
**Exploratory data analysis**
***

In [None]:
# Absolute pairwise correlation between the gene columns of the PDX data.
# "label" holds the treatment name (text), so it is dropped first: recent
# pandas raises when asked to correlate a non-numeric column.
expression = pdx.drop(columns="label")
corr = np.abs(np.tril(expression.corr()))

# The matrix is symmetric, so only the lower triangle (with the diagonal) is
# drawn.  The mask is built from the matrix geometry rather than from
# `corr == 0`, which would also hide genuine zero correlations.  Cells that
# are NaN are left blank by seaborn on its own.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool), k=1)

# Take the tick labels from the frame that was actually correlated so that
# names and cells cannot get out of step.
ticklabels = expression.columns

fig, ax = plt.subplots(figsize=(25, 25))
heatmap = sns.heatmap(
    corr,
    square=True,
    linewidths=.005,
    xticklabels=ticklabels,
    yticklabels=ticklabels,
    mask=upper_triangle,
    ax=ax,
)
ax.set_title("Absolute correlation between genes (PDX samples)")
fig.savefig("corr.png")

***
**Feature processing**
***

In [None]:
# todo

***
**Clustering**
***

In [151]:
# Cluster the genes: after dropping the treatment label and transposing,
# every row is one gene described by its values across the PDX samples.
gene_profiles = pdx.drop(labels='label', axis=1).transpose()

# scikit-learn renamed the distance argument of AgglomerativeClustering from
# `affinity` to `metric` and later removed the old name.  Try the current
# spelling first and fall back for installations that predate it.
clustering_params = dict(n_clusters=4, linkage='average')
try:
    clus = cluster.AgglomerativeClustering(metric='manhattan', **clustering_params)
except TypeError:
    clus = cluster.AgglomerativeClustering(affinity='manhattan', **clustering_params)

predicted = clus.fit_predict(gene_profiles)
print(predicted)

# calculate score
# The adjusted Rand index compares the two label vectors position by position.
# NOTE(review): this assumes the gene columns of `pdx` are in the same order
# as the rows of `genes_labeled` -- confirm against load_pdx.
score = metrics.adjusted_rand_score(genes_labeled['label'], predicted)
print(score)

[2 2 2 2 2 2 3 2 0 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 0 2
 2 0 0 2 2 2 0 0 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 0 2 2 0 2 0 2 0 2 2
 2 0 2 0 0 2 2 0 2 1 2 2 0 2 2 2 2 2 2 2 0 2 2 0 2 2 2 2 2 2 0 0 2 2]
0.059724796827055344


In [126]:
# Adjusted Rand index between the hormone labels and the cluster assignment.
# NOTE(review): this repeats the score computation from the clustering cell.
# The output recorded here differs from the one recorded there, so the two
# were evidently produced from different inputs -- restart the kernel and run
# top to bottom before quoting either number.
score = metrics.adjusted_rand_score(genes_labeled['label'], predicted)
print(score)

0.09395374302133423


In [None]:
# we should rather evaluate with the metrics.adjusted_rand_score function 

def performance(labels, beta=2, gene_table=None):
    '''Evaluate performance of predicted cluster compared to pre-selected gene list

    Each cluster is treated as a one-vs-rest binary prediction ("gene is in
    this cluster") and compared with each hormone indicator column in turn.

    Parameters
    ----------
    labels : array-like
        Cluster id of every gene, in the same row order as the gene table.
    beta : float, default 2
        Weight of recall relative to precision in the F-beta score.
    gene_table : DataFrame, optional
        Table whose columns from 'dht' through 'p4' hold the hormone
        indicators.  Defaults to the notebook-level `genes_list`.

    Returns
    -------
    accuracy, f2 : ndarray
        Both of shape (number of distinct clusters, number of hormone
        columns).  Row i refers to the i-th smallest cluster id; column j to
        the j-th hormone column.
    '''
    if gene_table is None:
        gene_table = genes_list

    # Hormone indicators as a 0/1 integer matrix (one column per hormone).
    truth = gene_table.loc[:, 'dht':'p4'].astype(int).values

    # Coerce once so the element-wise comparison below also works for lists.
    labels = np.asarray(labels)
    # Iterate over the ids that actually occur instead of assuming 0..k-1.
    cluster_ids = np.unique(labels)

    accuracy = np.zeros([len(cluster_ids), truth.shape[1]])
    f2 = np.zeros_like(accuracy)
    for i, cluster_id in enumerate(cluster_ids):
        member = (labels == cluster_id).astype(int)  # 1 where the gene is in this cluster
        for j in range(truth.shape[1]):
            accuracy[i, j] = np.mean(truth[:, j] == member)
            # `beta` is keyword-only in current scikit-learn.
            f2[i, j] = metrics.fbeta_score(truth[:, j], member, beta=beta)
    return accuracy, f2
        

***
**Principal Component Analysis**
***

In [None]:
# PCA decomposition of original gene list
# we want to verify that the pre-selected genes are linearly independent

# PCA needs a purely numeric matrix, and `genes_list` also carries the gene
# symbols as text, so the indicator columns are selected by name and cast to
# 0/1.  Naming them explicitly also keeps the input independent of any column
# another cell may have added to the frame.
# NOTE(review): confirm that these four flags are the intended feature matrix.
pca_input = genes_list[['dht', 'e2', 'p4', 'upregulated']].astype(int)

pca = PCA()
pca.fit(pca_input)

# One bar per principal component, height = its singular value
component_idx = np.arange(len(pca.singular_values_))
pca_fig, pca_ax = plt.subplots()
pca_ax.bar(component_idx, pca.singular_values_, align='center', alpha=0.5)
pca_ax.set_ylabel('Values')
pca_ax.set_xlabel('Principal components')
pca_ax.set_title('PCA - Singular values')
plt.show()

#
#pca.n_components = 2
#X_reduced = pca.fit_transform(X)
# X_reduced.shape