In [None]:
%matplotlib inline

# misc. libraries
import inspect
import pandas as pd
import numpy as np

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

# ml libraries
from sklearn import cluster, metrics
from sklearn.decomposition import PCA

%load_ext autoreload
%autoreload 2

In [None]:
# local dependencies
from load import *
from helpers import *
from plots import *
from constants import *

**General notes**
* The aim is to find out if the tumor is responding to a specific hormone. This response is "induced" in mice by treating them with the hormone. In humans we try to find similar expression patterns to determine if the tumor is driven by a certain hormone, in which case we can group such tumors together for more targeted and better treatment. We know that the patients have a certain type of cancer and this is recorded somewhere, although we don't have this information now. If we discover a clear clustering, then it will be valuable to see which cancer type each data point (patient) has. If we interpret each cluster as being a certain type of cancer instead of a hormone response, then we will misinterpret the results, since the genetic expressions (features) are not results.
* [Patient derived xenograft, Wikipedia](https://en.wikipedia.org/wiki/Patient_derived_xenograft)
* [Few useful things to know about ML](https://homes.cs.washington.edu/~pedrod/papers/cacm12.pdf)
* Useful magics:
  * `%pycat <filename>` to show a syntax-highlighted file
  * `%psource <object>` prints the source code for an object (function, for instance)

**Notes from Fabio**
* The data include: a matrix from our PDX models stimulated with different hormones (estrogen, progesterone and testosterone) - from which I estimated a list of differentially expressed genes to interrogate the patients' datasets - and a matrix from breast cancer patients retrieved from the TCGA (the Cancer Genome Atlas Consortium - published data).
* I suggest you to start testing the list of differentially expressed genes on your training data (PDXs, which is labelled with the correct treatment) that you can use as positive control in order to test the performance of your methods (you can estimate the sensitivity and specificity of the methods that you want to use, so that we can have an idea of which method should be in principle better to use in the patients dataset). Once you will be able to correctly discriminate the samples in the training set, then you can start interrogating the patients' matrix.
* Keep in mind that one of the early things that we do when dealing with sequencing data is to get rid of genes that show no or "minor" expression overall in the dataset. This means that, for those genes that do not reach a given threshold of expression in a number of patients, they are simply excluded from the analysis because they are supposedly not informative and just confounding the further analysis. Therefore, to some degree, we should expect to have differences in the genes that are expressed. 
* Regarding the patients data I already gave you, they should be already normalized, whereas for the new patients' dataset that you downloaded from the Internet, you should make sure it is correctly normalized (for this you should more or less have a normal distribution of your counts and more or less the samples - columns - should have a similar number of counts) and then run the analysis. However, the interpretation of the PCA plot will not be trivial: since these are patients samples they are likely to be heterogeneous and therefore I do not expect to have a clear clustering of the samples based on their cancer type.
* [Informations about drugs used to treat breast cancer](https://www.nature.com/articles/d41573-019-00201-w) over the last decades, to give you an idea of the need for a more personalized medicine.
* [Small review on personalized medicine and why it should be pursued](https://notendur.hi.is/~vol1/pdx-papers/nejmsb1503104.pdf)
* [Patient-derived xenograft models of breast cancer and their predictive power](https://notendur.hi.is/~vol1/pdx-papers/2015%20PDX%20Breast%20Cancer%20Research.pdf)
* [A paper from our lab in which there is the description of our PDX (Patient-Derived Xenograft) model - your training dataset](https://notendur.hi.is/~vol1/pdx-papers/2016%20MIND%20for%20Breast%20Cancer%20Sflomos%20Cancer%20Cell.pdf)

+ **DHT:** Dihydrotestosterone is an endogenous androgen sex steroid and hormone
+ **E2:** Estradiol (E2), also spelled oestradiol, is an estrogen steroid hormone and the major female sex hormone
+ **P4:**  Progesterone (P4) is an endogenous steroid and progestogen sex hormone involved in the menstrual cycle, pregnancy, and embryogenesis of humans and other species

***
**Data loading and manipulation**
***

*Load data about differentially expressed genes in the PDX experiment*

In [None]:
# %psource load_genes

In [None]:
# Raw information about genes, as returned by the project-local load_genes() helper
genes = load_genes()
# Bare last expression: the notebook renders the whole object as the cell output
genes

In [None]:
# Preprocessed list of genes, as returned by the project-local load_genes_list() helper.
# Later cells read its `genes` column and the columns named in HORMONES.
genes_list = load_genes_list()
# Preview only the first two rows
genes_list.head(2)

In [None]:
# Rows of the gene list whose hormone columns add up to exactly two,
# i.e. genes showing response to two hormones
genes_list.loc[genes_list[HORMONES].sum(axis=1).eq(2)]

*Load genetic expression levels from the patient derived xenograft (PDX) experiment on mice*

In [None]:
# Expression levels from the PDX experiment, via the project-local load_pdx() helper.
# Later cells rely on a "label" column being present in this frame.
pdx = load_pdx()
# Bare last expression: the notebook renders the whole object as the cell output
pdx

*Load genetic expression levels of tumor patients*

In [None]:
# Load TCGA first tumor-patients dataset (project-local load_patients() helper)
patients = load_patients()
# Report the number of rows, i.e. patient records
print(f"There are {len(patients)} records in the first dataframe")

In [None]:
# Load TCGA second tumor-patients dataset (project-local load_patients2() helper)
patients2 = load_patients2()
# Report the number of rows, i.e. patient records
print(f"There are {len(patients2)} records in the second dataframe")

We don't know beforehand if it is okay to merge the two datasets. They have the same 91 features but we don't know their distributions or how they were normalized. Let's investigate the distribution of features for both patient-datasets in order to evaluate the feasibility of merging the two sets:

In [None]:
# The three most highly expressed genes are left out here so that the
# remaining features stay readable when plotted together
highly_expressed_genes = ["COL12A1", "COL3A1", "CPB1"]

pat, pat2 = (
    frame.drop(columns=highly_expressed_genes) for frame in (patients, patients2)
)

# Per-gene mean and standard deviation of each dataset, keyed by statistic name
pat_mean_std = dict(mean=pat.mean(), std=pat.std())
pat2_mean_std = dict(mean=pat2.mean(), std=pat2.std())

In [None]:
# Hand both {"mean", "std"} summaries to the project-local plotting helper
# so the two patient datasets can be compared visually
plot_means_std_patients(pat_mean_std, pat2_mean_std)

Let's also compare the highly expressed genes: 

In [None]:
# Restrict both datasets to the highly expressed genes only
pat_high = patients.loc[:, highly_expressed_genes]
pat2_high = patients2.loc[:, highly_expressed_genes]

# One row per (statistic, dataset) combination, one column per gene
pd.DataFrame(
    [pat_high.mean(), pat2_high.mean(), pat_high.std(), pat2_high.std()],
    index=pd.MultiIndex.from_product([["mean", "std"], ["first", "second"]]),
)

We observe that the standard deviations of features are quite high, which indicates that the values are spread out over a wide range. Genetic expression levels in tumor patients don't appear to follow normal distributions, rather quite heavy-tailed distributions. We note that some features, like `MYBPC1` and `NTR`, have much greater variance in the second dataset. This can perhaps be explained by the fact that the second dataset has more datapoints, so the number of outliers will generally be higher which results in a higher standard deviation. We shall analyze the distribution in more detail just a bit later.

To conclude with the analysis of the above plot, we generally see that each feature seems to have a similar distribution in the two datasets. Based on this information we reason it is safe to merge the two sets. Let's call the new dataframe `pats`:

In [None]:
# Stack the two patient datasets row-wise and renumber the rows from zero
pats = pd.concat([patients, patients2], ignore_index=True)

In [None]:
# keep=False flags every member of a group of identical rows (not only the
# repeats), so this counts all rows that have at least one exact twin
num_duplicates = pats.duplicated(keep=False).sum()
print(f"There are {len(pats)} records in the merged dataframe")
print(f"There are {num_duplicates} duplicated records in the merged dataframe")

Interestingly, the two datasets have 104 entries in common! Presumably, these are 104 unique tumor patients.

However, we haven't discovered all duplicated records, since there are some *fuzzy* duplicates which contain floating point precision errors in one dataset and not the other. Here is one example:

In [None]:
# Two hand-picked rows of the merged frame used as an example of a "fuzzy" duplicate
# NOTE(review): the labels 197 and 1134 depend on the row order produced by the
# merge — re-check them if the loaders or the merge change
pats_fuzzy_duplicate_example = pats.loc[[197, 1134], :]
# Genes shown for the example pair; presumably the ones whose stored values differ — confirm
fuzzy_genes = ["TPSG1", "MAP3K14"]
# Print both rows' values for each of these genes side by side
for g in fuzzy_genes:
    print(f"{g}: {list(pats_fuzzy_duplicate_example[g].values)}")

We will fix this by rounding all values to 4 decimal places and see how many duplicates there are in total:

In [None]:
# Round every value to 4 decimal places so that records differing only by
# floating point noise compare equal
pats = pats.round(4)

In [None]:
# Count every row that has at least one identical twin (keep=False flags all members)
pats.duplicated(keep=False).sum()

Seems like there are 617 entries in common between the two datasets. But wait a minute... that's exactly the number of entries in the first dataset! Let's see if the second dataset contains the entire first dataset:

In [None]:
# Put the repeated rows of the merged frame next to the (equally rounded) first
# dataset, then count how many rows of that combination have a twin
duplicates = pd.concat(
    [pats.loc[pats.duplicated()], patients.round(4)], ignore_index=True
)
duplicates.duplicated(keep=False).sum()

When concatenating the duplicated entries in the merged dataframe and entries in the first dataframe, once again we get 617 entries in common.

Let's drop duplicated entries:

In [None]:
# Keep the first occurrence of every record and renumber the rows from zero.
# NOTE(review): `pats` is rebound in place, so re-running earlier cells out of
# order changes what this cell sees.
pats = pats.drop_duplicates().reset_index(drop=True)
print(f"Now, we have {len(pats)} unique records in the merged dataframe")

Finally, we verify that no expression values are negative or `NaN`:

In [None]:
# True if any expression value is negative or missing.
# Each comparison is parenthesized because `|` binds tighter than `<`: the
# unparenthesized form evaluated `0 | pats.isna()` first and therefore never
# flagged NaN entries. `axis=None` (passed by keyword) reduces over the whole frame.
((pats < 0) | pats.isna()).any(axis=None)

***
Let's now analyze the distribution of the features (genetic expressions) on boxplots. **Todo: Should we eliminate outliers? If yes, then how?**

In [None]:
# Project-local plotting helper; the second argument is presumably the name
# under which the figure is saved — confirm in plots.py
plot_feature_distributions(pats, "feature_distribution_p1+p2")

***
Not all of the genes that were strongly expressed in the PDX experiment also showed strong expression within a broad sample of tumor patients. This is to be expected, especially when taking into account that we are comparing data on human patients with data from a xenograft experiment. This is why the loading function of the patient dataset only keeps genes in common with the PDX dataset.

In [None]:
# Column labels of the first patients dataset, i.e. the genes present in it
genes_expressed = patients.columns
# Entries of the gene list that do not appear among those columns
genes_not_found = genes_list.loc[~genes_list.genes.isin(genes_expressed), "genes"]

In [None]:
# Summary of how the differentially expressed genes map onto the patients dataset.
# NOTE(review): the second line counts all columns of `patients`; that equals the
# number of genes found only if the loader keeps nothing but genes from the list — confirm.
print(f"Total number of differentially expressed genes:    {len(genes_list.genes)}")
print(f"Number of which found in the patients dataset:     {len(genes_expressed)}")
print(f"Number of which not found in the patients dataset: {len(genes_not_found)}")

***
**Exploratory data analysis**
***

Let's investigate the correlation between features in the merged patients dataset (`pats`). We shall plot a heatmap to visualize the lower triangular Pearson correlation matrix.

In [None]:
# %psource df_to_tril

In [None]:
# Pairwise correlation matrix of the merged patients frame (DataFrame.corr defaults
# to Pearson), passed through the project-local df_to_tril helper
pats_corr = df_to_tril(pats.corr())

In [None]:
# Project-local heatmap helper.
# NOTE(review): `genes_expressed` comes from `patients.columns` while `pats_corr`
# is built from the merged `pats` — presumably the same columns; confirm.
plot_corr(pats_corr, genes_expressed, "corr_patients")

We observe that most of the genes are relatively uncorrelated.

Which pairs of genes are highly correlated? We define our correlation threshold to be...

In [None]:
# Display the correlation threshold defined in the project-local constants module
CORR_THRESHOLD

We find all pairs of genes differentially expressed upon the same treatment:

In [None]:
# %psource gene_pairs_per_treatment

In [None]:
# Pairs of genes differentially expressed upon the same treatment, as built by the
# project-local gene_pairs_per_treatment() helper; joined onto the correlated pairs below
genes_pairs = gene_pairs_per_treatment()

Then we find the highly correlated pairs of genes in the patients dataset:

In [None]:
# Keep only the correlations above the threshold; stacking turns the matrix
# into one entry per (gene, gene) pair and drops the masked-out cells
pats_corr_genes = pats_corr.where(pats_corr > CORR_THRESHOLD).stack()

# Replace the two-level index by a flat index of (gene, gene) tuples
pats_corr_genes.index = pats_corr_genes.index.tolist()

# Name the series and order it from the strongest correlation downwards
pats_corr_genes = pats_corr_genes.rename("patients_correlation").sort_values(
    ascending=False
)

In [None]:
# Left-join the same-treatment gene pairs onto the highly correlated pairs:
# pairs that are absent from `genes_pairs` end up with missing values
pats_corr_genes.to_frame().join(genes_pairs)

There appear to be many correlations in the patient dataset that match the PDX data. Note that `NaN` means the two genes showed expressions from different hormone treatments in the PDX experiment, i.e. the correlation does not match the PDX results.

Does this not confirm the potential for transferring what has been learned in the PDX experiment to tumor patients? Can we conclude that we can expect consistent results when we run the methods trained on the PDX data?

***
**Feature processing**
***

Let's try to reduce the dimensionality of the input space, i.e. find the linear mapping of our $D$-dimensional input into a $K$-dimensional space ($K \le D$) that best represents the original data.

In [None]:
# PCA decomposition of original gene list
# we want to verify that the pre-selected genes are linearly independent

pca = PCA()
# Fit on the gene columns only: the "label" column holds the treatment label,
# not an expression level, so it must not enter the decomposition.
pca.fit(pdx.drop(columns="label"))

plot_pca_info(pca)

In [None]:
# plot data explained by 2nd and 3rd principal component

# Fresh 3-component PCA fitted on the gene columns only (the "label" column is
# the treatment label, not a feature)
pca = PCA(n_components=3)
X_reduced = pca.fit_transform(pdx.drop(columns="label"))
# Append the labels as a last column; -1 lets numpy infer the number of samples
X_reduced = np.append(X_reduced, pdx.label.values.reshape(-1, 1), axis=1)

# One colour per consecutive block of rows.
# NOTE(review): the row ranges assume the samples are stored grouped in the
# same order as HORMONES_CTRL — confirm against the loader.
plt.plot(X_reduced[:3, 1], X_reduced[:3, 2], "ro")
plt.plot(X_reduced[3:14, 1], X_reduced[3:14, 2], "bo")
plt.plot(X_reduced[14:23, 1], X_reduced[14:23, 2], "co")
plt.plot(X_reduced[23:, 1], X_reduced[23:, 2], "go")
plt.legend(HORMONES_CTRL)
plt.xlabel("2nd PC")
plt.ylabel("3rd PC")
plt.show()

In [None]:
# interactive 3D plot of first 3 principal components

# uncomment below line to have interactive plot!
# %matplotlib notebook

# Fresh 3-component PCA fitted on the gene columns only (the "label" column is
# the treatment label, not a feature)
pca = PCA(n_components=3)
X_reduced = pca.fit_transform(pdx.drop(columns="label"))
# -1 lets numpy infer the number of samples instead of hard-coding it
labels = pdx.label.values.reshape(-1, 1)
X_reduced = np.append(X_reduced, labels, axis=1)

fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
# One scatter per consecutive block of rows; the last block (rows 23 onwards) is
# not drawn, matching the legend built from HORMONES.
# NOTE(review): the row ranges assume the samples are stored grouped in the
# same order as HORMONES — confirm against the loader.
ax.scatter(X_reduced[:3, 0], X_reduced[:3, 1], X_reduced[:3, 2])
ax.scatter(X_reduced[3:14, 0], X_reduced[3:14, 1], X_reduced[3:14, 2])
ax.scatter(X_reduced[14:23, 0], X_reduced[14:23, 1], X_reduced[14:23, 2])
ax.set_xlabel("1st PC")
ax.set_ylabel("2nd PC")
ax.set_zlabel("3rd PC")
ax.legend(HORMONES)

# Set after the figure above was created, so this only affects later figures
plt.rcParams["figure.figsize"] = (12, 6)

In [None]:
# Full PCA on the gene columns only: the "label" column holds the treatment
# label and must not be decomposed together with the expression levels
pca = PCA()
pca.fit(pdx.drop(columns="label"))

plot_pca_expl_var(pca)

# Number of components actually estimated from the data
print(pca.n_components_)

***
**Clustering**
***

In [None]:
# Feature matrix (everything except the label) and the labels themselves
X = pdx.drop(columns="label")
pdx_labeled = pdx["label"]

# Hierarchical clustering into four groups with the estimator's default settings
# (the author's note mentions affinity='manhattan', linkage='average' as an alternative)
clus = cluster.AgglomerativeClustering(n_clusters=4)
predicted = clus.fit(X).labels_

# Agreement between the predicted clusters and the labels
score = metrics.adjusted_rand_score(pdx_labeled, predicted)
print(score)

***
**Spectral Clustering**
***

In [None]:
# Spectral clustering into four groups; the seed is fixed for reproducibility
clustering = cluster.SpectralClustering(
    n_clusters=4, assign_labels="discretize", random_state=0
)
clustering.fit(X)

print("predicted labels : " + str(clustering.labels_))
print("true labels :      " + str(pdx_labeled.values))
print("Score : " + str(metrics.adjusted_rand_score(pdx_labeled, clustering.labels_)))

***
**K-Means**
***

In [None]:
# K-means with four centroids; the seed is fixed for reproducibility
kmeans = cluster.KMeans(n_clusters=4, random_state=0)
kmeans.fit(X)

print("predicted labels : " + str(kmeans.labels_))
print("true labels :      " + str(pdx_labeled.values))
print("Score : " + str(metrics.adjusted_rand_score(pdx_labeled, kmeans.labels_)))

In [None]:
# we should rather evaluate with the metrics.adjusted_rand_score function


def performance(labels):
    """Evaluate performance of predicted cluster compared to pre-selected gene list.

    Every cluster found in ``labels`` is turned into a 0/1 membership vector
    and compared against every column of the module-level ``genes_list``
    between ``"dht"`` and ``"p4"`` (inclusive).

    Parameters
    ----------
    labels : array-like of int
        Cluster assignment, one entry per row of ``genes_list`` (the vectors
        are compared element-wise with its columns).

    Returns
    -------
    accuracy : numpy.ndarray, shape (n_clusters, n_columns)
        Fraction of positions where cluster membership equals the column value.
    f2 : numpy.ndarray, shape (n_clusters, n_columns)
        F-beta score (beta = 2) of cluster membership against the column.

    Rows follow the sorted distinct values of ``labels``; for labels numbered
    ``0..k-1`` this is the same layout as indexing by cluster number.
    """
    # Accept plain lists as well: `labels == value` must compare element-wise
    labels = np.asarray(labels)

    # One 0/1 column per selected gene-list column
    gene_flags = genes_list.loc[:, "dht":"p4"].astype(int).values

    cluster_ids = np.unique(labels)
    n_columns = gene_flags.shape[1]  # sized from the data instead of a fixed 3
    accuracy = np.zeros((len(cluster_ids), n_columns))
    f2 = np.zeros((len(cluster_ids), n_columns))
    beta = 2
    for row, cluster_id in enumerate(cluster_ids):
        # 1 where the sample belongs to this cluster, 0 elsewhere
        membership = (labels == cluster_id).astype(int)
        for col in range(n_columns):
            accuracy[row, col] = np.mean(gene_flags[:, col] == membership)
            # `beta` is keyword-only in current scikit-learn releases
            f2[row, col] = metrics.fbeta_score(
                gene_flags[:, col], membership, beta=beta
            )
    return accuracy, f2