In [2]:
%matplotlib inline
# misc. libraries
import pandas as pd
import numpy as np

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# ml libraries
from sklearn import cluster, metrics
from sklearn.decomposition import PCA

import inspect

%load_ext autoreload
%autoreload 2

In [3]:
# local dependencies
from helpers import *

+ **DHT:** Dihydrotestosterone is an endogenous androgen sex steroid and hormone
+ **E2:** Estradiol (E2), also spelled oestradiol, is an estrogen steroid hormone and the major female sex hormone
+ **P4:**  Progesterone (P4) is an endogenous steroid and progestogen sex hormone involved in the menstrual cycle, pregnancy, and embryogenesis of humans and other species

***
**Data loading and manipulation**
***

In [4]:
# Raw information about genes
genes = load_genes()
genes

Unnamed: 0_level_0,dht,dht,e2,e2,p4,p4
Unnamed: 0_level_1,up,down,up,down,up,down
0,KLK3,CPB1,NTS,CPB1,KLK3,RAB31
1,GPC3,CITED1,DCX,AC062028.2,CXCL13,PGR
2,GPR88,GRIK3,COL22A1,SPINK4,DIO2,FLNB-AS1
3,UGT2B11,LYG1,IGSF1,POTEJ,TAT,ZNF71
4,KLK2,DCDC2,NECAB1,ALB,ADAMTS8,ZNF282
5,UGT2B28,ZNRF2P1,PIEZO2,MS4A7,MYBPC1,QRICH1
6,SLC26A3,CISH,TP63,PLA2G3,UGT2B11,
7,TMEM176A,ZNF107,TMPRSS4,CACNA1H,NTRK3,
8,HPGD,GHR,KLK12,CYP4F30P,DLC1,
9,CYP4F8,NFKBIE,HPGD,MS4A14,ABCA13,


In [5]:
# Preprocessed list of genes
genes_list = load_genes_list()
genes_list.head(2)

Unnamed: 0,genes,dht,e2,p4,upregulated
0,ABCA13,False,False,True,True
1,AC062028.2,False,True,False,False


In [6]:
# Genes showing response to two hormones
genes_list[genes_list[["dht", "e2", "p4"]].sum(axis=1) == 2]

Unnamed: 0,genes,dht,e2,p4,upregulated
17,CPB1,True,True,False,False
20,CXCL13,True,False,True,True
46,HPGD,True,True,False,True
54,KLK3,True,False,True,True
60,MYBPC1,True,False,True,True
65,NTRK3,False,True,True,True
85,SLC26A3,True,False,True,True
101,UGT2B11,True,False,True,True


In [7]:
# Load TCGA patient data (first part)
patients = load_patients()
patients.head(2)

Unnamed: 0,ABCA13,ADAMTS1,ADAMTS8,ALB,ALOX15B,ATP1A2,BICD1,CACNA1H,CISH,CITED1,...,TNFAIP3,TP63,TPSG1,TSPAN8,UGT2B11,UGT2B28,ZBTB16,ZNF107,ZNF282,ZNF71
0,0.9925,1658.0928,30.7667,448.2673,397.3203,102.5556,187.2467,1772.8691,1333.2231,105.533,...,1490.3647,1332.8922,2.9973,30.7667,30.105,13.8946,17.8645,542.5523,733.1073,186.5851
1,3.8556,1251.3669,40.2699,4.284,672.5936,46.696,188.0692,679.8765,1311.7718,113.0985,...,928.3506,864.09,299.8825,1.7136,13.7089,1.2852,509.8003,451.9658,1116.8481,193.21


In [8]:
patients.describe()

Unnamed: 0,ABCA13,ADAMTS1,ADAMTS8,ALB,ALOX15B,ATP1A2,BICD1,CACNA1H,CISH,CITED1,...,TNFAIP3,TP63,TPSG1,TSPAN8,UGT2B11,UGT2B28,ZBTB16,ZNF107,ZNF282,ZNF71
count,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,...,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0
mean,38.47764,1134.31713,43.996165,1794.785557,1213.34115,81.28451,147.01736,658.56715,1134.134003,103.441115,...,832.653793,335.672331,48.426685,160.287644,829.2068,125.02314,62.81036,653.155103,744.671385,211.228286
std,166.18712,1281.717976,109.820584,16558.914625,6138.701484,237.50572,197.90414,1178.658312,774.013538,361.007134,...,632.617182,507.052602,142.970546,822.585688,3386.427121,990.735383,138.974192,419.256811,266.685931,88.906534
min,0.0,79.9297,0.0,0.0,1.1806,0.0,2.2185,3.5738,78.6852,0.3625,...,67.7621,0.0,0.0,0.0,0.0,0.0,0.0,78.7097,203.9545,30.036
25%,0.4987,379.0386,4.7548,2.6344,32.4519,6.0191,76.418,77.8551,529.6902,5.0147,...,396.267,36.9554,0.8097,0.9696,0.3272,0.0,3.6891,388.5572,573.9045,156.8557
50%,1.6579,705.3606,11.8863,7.5019,87.2136,25.3684,115.0782,172.1232,985.4834,16.2277,...,655.2106,135.7196,5.9055,4.9003,3.6199,0.468,14.7479,566.161,704.2503,199.0017
75%,7.5944,1386.9048,32.5371,34.6868,329.4762,67.362,173.5798,669.8015,1522.5989,50.2522,...,1111.5835,445.5875,34.4119,30.335,50.4341,4.1732,56.8475,799.2235,873.3839,245.4456
max,1691.6674,10380.3655,1181.1585,283133.4623,84906.6607,2687.3713,4170.0329,8955.5357,5730.314,3623.3878,...,5139.758,4141.989,1818.7406,10664.9462,30588.5464,14153.4314,1713.3067,4294.5502,2152.3577,916.1518


In [9]:
# Load TCGA patient data (second part)
patients2 = load_patients2()
patients2.head(2)

gene_id,ABCA13,ADAMTS1,ADAMTS8,ALB,ALOX15B,ATP1A2,BICD1,CACNA1H,CISH,CITED1,...,TNFAIP3,TP63,TPSG1,TSPAN8,UGT2B11,UGT2B28,ZBTB16,ZNF107,ZNF282,ZNF71
0,0.0,234.0233,8.9611,10.3398,10.6844,14.131,77.8929,131.315,686.905,37.2232,...,242.9845,24.1261,0.3447,4.4806,0.3447,0.3447,156.1304,1718.4689,992.6174,338.455
1,1.6313,514.9538,6.5253,15.7694,36.4328,12.5068,51.1147,104.4046,566.6123,21.2072,...,540.5111,28.82,0.0,4.894,26.9222,40.2393,4.3502,516.0413,1320.8265,243.0669


In [10]:
genes_not_found = list(set(genes_list.genes) - set(patients.columns))

print(f"Genes not found in the patients datasets:\n{genes_not_found}")

Genes not found in the patients datasets:
['FAR2P3', 'FLNB-AS1', 'KLF2P3', 'SOCS2-AS1', 'RUBCNL', 'RFLNA', 'CYP4F30P', 'CYP4F62P', 'POTEJ', 'FAM217B', 'AL591034.3', 'PIEZO2', 'CYP4F29P', 'BPIFA4P', 'ZNRF2P1', 'FAM92A', 'AC062028.2']


In [11]:
# Load PDX tumor data
pdx = load_pdx()
pdx.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,label,ABCA13,ADAMTS1,ADAMTS8,ALB,ALOX15B,ATP1A2,BICD1,CACNA1H,CISH,...,TNFAIP3,TP63,TPSG1,TSPAN8,UGT2B11,UGT2B28,ZBTB16,ZNF107,ZNF282,ZNF71
treatment,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
dht,t111_g2_m09,0,3.189617,7.923171,8.423222,7.117144,3.614543,3.189617,5.325093,7.794179,7.166217,...,4.493391,9.518162,6.694106,8.178994,5.823731,5.844421,8.941439,5.823731,6.540557,8.834605
dht,t111_g2_m02,0,7.431527,3.189617,8.053738,3.189617,3.189617,3.189617,8.053738,6.306352,7.145168,...,3.189617,10.065067,9.24458,6.306352,8.053738,5.568833,8.716628,6.306352,6.786746,6.786746
dht,t111_g2_m06,0,6.357327,6.265651,8.954985,5.686476,4.23274,3.189617,6.872338,8.81834,9.442043,...,4.23274,10.280245,8.395725,6.872338,7.659201,5.163905,8.834178,7.341522,4.23274,9.039379
p4,t111_g2_m15,1,3.189617,8.62838,6.111433,7.388242,3.189617,3.189617,6.577726,5.618014,5.965769,...,3.189617,8.848252,8.121159,8.827714,6.927842,4.824432,9.125653,7.143498,5.965769,3.189617
p4,pl015_g3_m28,1,6.957474,6.553337,3.489893,6.290409,4.077971,3.189617,8.211545,4.077971,5.743738,...,5.365206,6.297802,8.221174,8.060938,9.544617,5.676089,8.591505,6.957474,3.707859,5.516734


In [16]:
# Label genes into three categories according to list of genes
# genes_labeled = pd.DataFrame(genes_list, columns=["genes"])
# genes_labeled["label"] = genes_list.replace(["dht", "p4", "e2"], [0, 1, 2])["label"]
# genes_labeled.head()
genes_expressed = genes_list.genes[~genes_list.genes.isin(genes_not_found)].reset_index(drop=True)
genes_expressed

0      ABCA13
1     ADAMTS1
2     ADAMTS8
3         ALB
4     ALOX15B
       ...   
86    UGT2B28
87     ZBTB16
88     ZNF107
89     ZNF282
90      ZNF71
Name: genes, Length: 91, dtype: object

***
**Exploratory data analysis**
***

In [None]:
def plot_corr(corr, filename='corr.png'):
    tril_abs_corr = np.abs(np.tril(corr, k=-1))

    plt.figure(figsize=(25, 25))

    heatmap = sns.heatmap(
        tril_abs_corr,
        square=True,
        linewidths=.005,
        xticklabels=genes_expressed,
        yticklabels=genes_expressed,
        mask=(tril_abs_corr == 0)  # mask cells with missing values
    )
    fig = heatmap.get_figure()
    fig.savefig(f"../data/{filename}")

In [None]:
pdx_corr = pdx.corr()
plot_corr(pdx_corr, 'corr_pdx.png')

In [None]:
patients_corr = patients.corr()
plot_corr(patients_corr, 'corr_patients.png')

We find that high correlations in the first patients dataset match with correlations found in the PDX genes-data.

In [None]:
patients_corr[((patients_corr < -0.6) | (0.6 < patients_corr)) & (patients_corr != 1.0)].stack()

There are appear to be even more correlations in the second patients dataset that match the PDX data.
Can we therefore conclude that we can expect better/consistent results for this dataset 
when we run the methods trained on the PDX data?

In [18]:
patients2_corr = patients2.corr()
patients2_corr[((patients2_corr < -0.6) | (0.6 < patients2_corr)) & (patients2_corr != 1.0)].stack()

gene_id  gene_id
ADAMTS1  DLC1       0.676533
         PDE2A      0.630073
         SPARCL1    0.604069
ALOX15B  HPGD       0.635834
         UGT2B28    0.766995
ATP1A2   MYBPC1     0.861628
         SYNPO2     0.839422
COL12A1  COL3A1     0.723924
COL3A1   COL12A1    0.723924
CYYR1    SPARCL1    0.626837
DLC1     ADAMTS1    0.676533
         GHR        0.641009
         PDE2A      0.743580
         SPARCL1    0.660314
GHR      DLC1       0.641009
         PDE2A      0.761749
GIMAP6   PDE2A      0.779010
         SPARCL1    0.696268
HPGD     ALOX15B    0.635834
KLK12    KLK14      0.975478
KLK14    KLK12      0.975478
KLK2     KLK3       0.745600
KLK3     KLK2       0.745600
MS4A14   MS4A7      0.755502
MS4A7    MS4A14     0.755502
MYBPC1   ATP1A2     0.861628
         SYNPO2     0.824060
PDE2A    ADAMTS1    0.630073
         DLC1       0.743580
         GHR        0.761749
         GIMAP6     0.779010
         SPARCL1    0.741066
         ZBTB16     0.636432
SPARCL1  ADAMTS1    0.6040

In [19]:
patients2_corr[(patients2_corr < -.3) & (patients2_corr != 1.0)].stack()

gene_id   gene_id 
LYG1      TMEM106B   -0.309033
NFKBIE    PER2       -0.343993
          QRICH1     -0.332949
          TMEM106B   -0.356903
PER2      NFKBIE     -0.343993
PLEKHO1   QRICH1     -0.310589
          TMEM106B   -0.416926
QRICH1    NFKBIE     -0.332949
          PLEKHO1    -0.310589
RIPK2     SPARCL1    -0.318804
SPARCL1   RIPK2      -0.318804
TMEM106B  LYG1       -0.309033
          NFKBIE     -0.356903
          PLEKHO1    -0.416926
          ZNF71      -0.327706
ZNF71     TMEM106B   -0.327706
dtype: float64

***
**Feature processing**
***

Let's try to reduce the dimensionality of the input space, i.e. the linear mapping of our D-dimensional input into a K-dimensional space $K\leq D$ that best represents the original data.

In [None]:
# PCA decomposition of original gene list
# we want to verify that the pre-selected genes are linearly independent

pca = PCA()
pca.fit(genes_list)
PCA(copy=True, iterated_power='auto', n_components=None,
    random_state=None, svd_solver='auto', tol=0.0, whiten=False)

# Explained variance by PCA
# print(pca.explained_variance_)

# Singular values of PCA
# print(pca.singular_values_)

y_pos = np.arange(len(pca.singular_values_))
plt.bar(y_pos, pca.singular_values_, align='center', alpha=0.5)
plt.ylabel('Values')
plt.xlabel('Principal components')
plt.title('PCA - Singular values')
plt.show()

#
#pca.n_components = 2
#X_reduced = pca.fit_transform(X)
# X_reduced.shape

***
**Clustering**
***

In [None]:
X = pdx.drop("label", axis=1)
y = pdx.label

clus = cluster.AgglomerativeClustering(n_clusters=7)  # , affinity='manhattan', linkage='average')
predicted = clus.fit_predict(X)

# calculate score
score = metrics.adjusted_rand_score(y, predicted)
print(score)
# accuracy, f2 = performance(predicted)

***
**Spectral Clustering**
***

In [None]:
clustering = cluster.SpectralClustering(assign_labels="discretize", random_state=0).fit(X)
print("predicted labels : " + str(clustering.labels_))
print("true labels :      " + str(pdx_labeled.values))
print("Score : " + str(metrics.adjusted_rand_score(x, clustering.labels_)))

***
**K-Means**
***

In [None]:
kmeans = cluster.KMeans(n_clusters=4, random_state=0).fit(X)
print("predicted labels : " + str(kmeans.labels_))
print("true labels :      " + str(pdx_labeled.values))
print("Score : " + str(metrics.adjusted_rand_score(y, kmeans.labels_)))

In [None]:
# we should rather evaluate with the metrics.adjusted_rand_score function 

def performance(labels):
    '''Evaluate performance of predicted cluster compared to pre-selected gene list'''
    # get gene list
    geneNP = genes_list.loc[:,'dht':'p4'].astype(int).values  # replace with Boolean values
    
    nb_clusters = len(np.unique(labels))
    accuracy = np.zeros([nb_clusters, 3])
    f2 =  np.zeros([nb_clusters, 3])
    beta = 2
    for i in np.arange(nb_clusters):
        label = np.zeros_like(labels)
        label[labels == i] = 1
        for j in np.arange(geneNP.shape[1]):
            # plot confusion matrices
            
            #cm = metrics.confusion_matrix(geneNP[:,j], label)
            #cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            #fig, ax = plt.subplots()
            #im = ax.imshow(cm, interpolation='nearest')
            #ax.figure.colorbar(im, ax=ax)
            accuracy[i,j] = np.mean(geneNP[:,j]==label)
            f2[i,j] = metrics.fbeta_score(geneNP[:,j], label, beta)
    return accuracy, f2
        

***
**Principal Component Analysis**
***