In [None]:
%matplotlib inline

# misc. libraries
import pandas as pd
import numpy as np

# plotting libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

# ml libraries
from sklearn.decomposition import PCA

%load_ext autoreload
%autoreload 2

In [None]:
# local dependencies
from load import *
from helpers import *
from plots import *
from constants import *

***
**Data loading and manipulation**
***

*Load data about differentially expressed genes in the PDX experiment*

In [None]:
%pycat constants

In [None]:
%psource load_genes

In [None]:
# Raw information about genes, as returned by the project-local load_genes helper
genes = load_genes()
# Bare last expression: the object is rendered as the cell output
genes

In [None]:
%psource load_genes_list

In [None]:
# Preprocessed list of genes, as returned by the project-local load_genes_list helper
genes_list = load_genes_list()
# Preview only the first rows
genes_list.head()

In [None]:
# Genes showing response to two hormones:
# sum the HORMONES columns per row and keep the rows whose total is exactly 2
responds_to_two = genes_list[HORMONES].sum(axis=1).eq(2)
genes_list[responds_to_two]

*Load genetic expression levels from the patient derived xenograft (PDX) experiment on mice*

In [None]:
%psource load_pdx

In [None]:
# PDX expression data, as returned by the project-local load_pdx helper
pdx = load_pdx()
# Preview only the first rows
pdx.head()

**Note** that `ctrl` means that the subject is a control subject, i.e. it was not treated with a hormone. Also, `pl015` is a metastasis tumor whereas `t110` and `t111` are types of primary site tumors.

Let's now standardize the features per tumor (to eliminate bias introduced by different tumors being injected):

In [None]:
# Split the PDX frame into the label column and the remaining feature columns
y_pdx = pdx["label"]
X_pdx = pdx.drop(columns="label")

# For every tumor, select its rows (index level 1, keeping that level)
# and standardize them on their own; collect one frame per tumor
dfs_stdized = [
    df_standardize_columns(X_pdx.xs(tumor, level=1, drop_level=False))
    for tumor in TUMORS
]

In [None]:
# Re-assemble the per-tumor standardized frames into a single frame and order the rows
# by "treatment" then "tumor" (presumably index level names of the PDX frame — TODO confirm)
X_pdx_stdized = pd.concat(dfs_stdized).sort_values(["treatment", "tumor"])

*Load genetic expression levels of tumor patients*

In [None]:
%psource load_patients

In [None]:
%psource load_patients2

In [None]:
# Load TCGA first tumor-patients dataset
patients = load_patients()
# Report how many rows were loaded
print(f"There are {len(patients)} records in the first dataframe")

In [None]:
# Load TCGA second tumor-patients dataset
patients2 = load_patients2()
# Report how many rows were loaded
print(f"There are {len(patients2)} records in the second dataframe")

***
**Exploratory data analysis**
***

We don't know beforehand if it is okay to merge the two patient datasets. They have the same 91 features but we don't know their distributions or how they were normalized. Let's investigate the distribution of features for both datasets in order to evaluate the feasibility of merging the two sets:

In [None]:
# Genes with very high expression are left out so the remaining ones stay readable in the plot
highly_expressed_genes = ["COL12A1", "COL3A1", "CPB1"]

pat = patients.drop(highly_expressed_genes, axis=1)
pat2 = patients2.drop(highly_expressed_genes, axis=1)

# Per-column mean and standard deviation for each of the two datasets
pat_mean_std = dict(mean=pat.mean(), std=pat.std())
pat2_mean_std = dict(mean=pat2.mean(), std=pat2.std())

In [None]:
# Compare the per-gene mean/std of the two patient datasets (plot helper from the local modules)
plot_means_std_patients(pat_mean_std, pat2_mean_std)

Let's also compare the highly expressed genes: 

In [None]:
pat_high = patients.loc[:, highly_expressed_genes]
pat2_high = patients2.loc[:, highly_expressed_genes]

# One row per (statistic, dataset) combination, one column per highly expressed gene
summary_rows = [pat_high.mean(), pat2_high.mean(), pat_high.std(), pat2_high.std()]
summary_index = pd.MultiIndex.from_product([["mean", "std"], ["first", "second"]])
pd.DataFrame(summary_rows, index=summary_index)

We observe that the standard deviations of the features are quite high, which indicates that the values are spread out over a wide range. Genetic expression levels in tumor patients do not appear to follow normal distributions; instead, their distributions look quite heavy-tailed. We note that some features, like `MYBPC1` and `NTR`, have much greater variance in the second dataset. This can perhaps be explained by the fact that the second dataset has more datapoints, so the number of outliers will generally be higher, which results in a higher standard deviation.

To conclude with the analysis of the above plot, we generally see that each feature seems to have a similar distribution in the two datasets. Based on this information we reason it is safe to merge the two sets. Let's call the new dataframe `pats`:

In [None]:
# Stack the two patient frames row-wise and replace their indices with a fresh 0..n-1 range
pats = pd.concat([patients, patients2]).reset_index(drop=True)

In [None]:
# keep=False flags every member of a group of identical rows,
# so a record that occurs twice contributes 2 to this count
num_duplicates = pats.duplicated(keep=False).sum()
print(f"There are {len(pats)} records in the merged dataframe")
print(f"There are {num_duplicates} duplicated records in the merged dataframe")

Interestingly, the two datasets have 104 entries in common! Presumably, these are 104 unique tumor patients.

However, we haven't discovered all duplicated records, since there are some *fuzzy* duplicates which contain floating point precision errors in one dataset and not the other. Here is one example:

In [None]:
# Example pair of fuzzy duplicates: print both rows' values for two genes
fuzzy_genes = ["TPSG1", "MAP3K14"]
pats_fuzzy_duplicate_example = pats.loc[[197, 1134], :]
for gene in fuzzy_genes:
    values = list(pats_fuzzy_duplicate_example[gene].values)
    print(f"{gene}: {values}")

We will fix this by rounding all values to 4 decimal places and see how many duplicates there are in total:

In [None]:
# Round every value to 4 decimal places so fuzzy duplicates become exact duplicates
pats = pats.round(4)

In [None]:
# Number of rows that belong to a group of identical rows (keep=False flags every member)
pats.duplicated(keep=False).sum()

Seems like there are 617 entries in common between the two datasets. But wait a minute... that's exactly the number of entries in the first dataset! Let's see if the second dataset contains the entire first dataset:

In [None]:
# Rows of the merged frame that repeat an earlier row (default keep="first")
repeated_rows = pats[pats.duplicated()]
# Stack them on top of the rounded first dataset and count the rows that occur more than once
duplicates = pd.concat([repeated_rows, patients.round(4)]).reset_index(drop=True)
duplicates.duplicated(keep=False).sum()

When concatenating the duplicated entries in the merged dataframe and entries in the first dataframe, once again we get 617 entries in common.

Let's drop duplicated entries:

In [None]:
# Keep the first occurrence of every record (drop_duplicates default) and renumber the rows
pats = pats.drop_duplicates().reset_index(drop=True)
print(f"Now, we have {len(pats)} unique records in the merged dataframe")

Finally, we verify that no expression values are negative or `NaN`:

In [None]:
# Flag every cell that is negative or missing, then reduce over the whole frame.
# The comparison is parenthesised because `|` binds tighter than `<`; without the
# parentheses the expression is parsed as `pats < (0 | pats.isna())`.
# The boolean mask is reduced directly (instead of indexing `pats` with it) so that
# NaN cells count as hits rather than being skipped by `any`.
# Evaluates to True if at least one invalid value exists, False otherwise.
invalid_cells = (pats < 0) | pats.isna()
invalid_cells.any(axis=None)

***
The below boxplots further confirm the heavy-tailed nature of the feature distributions in the `pats` dataset:

In [None]:
# Per-feature distribution plot of the merged patients data (plot helper from the local modules)
plot_feature_distributions(pats)

We will take the log of all features to make the optimization behave better, since they appear to follow log-normal distributions:

In [None]:
# Natural log of every value, shifted by 0.1 so that zero expression stays finite
pats_log = np.log(pats + .1)  # add a small constant because log(0) is undefined
# Standardize the log-transformed columns with the project-local helper
pats_log_stdized = df_standardize_columns(pats_log)

In [None]:
# Same distribution plot for the log-transformed, standardized data;
# ylim=None presumably disables the helper's default y-axis limit — TODO confirm
plot_feature_distributions(pats_log_stdized, ylim=None)

***
Not all of the genes that were strongly expressed in the PDX experiment also showed strong expression within a broad sample of tumor patients. This is to be expected, especially when taking into account that we are comparing data on human patients with data from a xenograft experiment. This is why the loading function of the patient dataset only keeps genes in common with the PDX dataset.

In [None]:
# Columns of the patients frame are the genes available in the patient data
genes_expressed = patients.columns
# Genes from the preprocessed list that have no matching column in the patients frame
is_expressed = genes_list.genes.isin(genes_expressed)
genes_not_found = genes_list.genes[~is_expressed]

In [None]:
# Summary counts: all listed genes, patient-frame columns, and listed genes missing from the patient frame
print(f"Total number of differentially expressed genes:    {len(genes_list.genes)}")
print(f"Number of which found in the patients dataset:     {len(genes_expressed)}")
print(f"Number of which not found in the patients dataset: {len(genes_not_found)}")

***
Let's investigate the correlation between features in the merged patients dataset (`pats`). We shall plot a heatmap to visualize the lower triangular Pearson correlation matrix.

In [None]:
%psource df_to_tril

In [None]:
# Pairwise column correlations of the merged frame (DataFrame.corr defaults to Pearson),
# passed through the project-local df_to_tril helper
pats_corr = df_to_tril(pats.corr())

In [None]:
# Heatmap of the correlations; the last argument looks like an output name — TODO confirm
plot_corr(pats_corr, genes_expressed, "corr_patients")

We observe that most of the genes are relatively uncorrelated.

It is interesting to see which pairs of genes are highly correlated. We define the reference value for "high correlation" to be...

In [None]:
# Display the threshold used below to call a pair of genes "highly correlated"
CORR_THRESHOLD

We find all pairs of genes differentially expressed upon the same treatment:

In [None]:
%psource gene_pairs_per_treatment

In [None]:
# Gene pairs per treatment, as returned by the project-local helper
genes_pairs = gene_pairs_per_treatment()

Then we find the highly correlated pairs of genes in the patients dataset:

In [None]:
# Mask out every correlation that does not exceed the threshold,
# then stack the remaining entries into a Series indexed by gene pair
pats_corr_genes = pats_corr.where(pats_corr > CORR_THRESHOLD).stack()

# Replace the two-level index with a flat index of (gene, gene) tuples,
# name the series, and order it from the strongest correlation downwards
pats_corr_genes.index = pats_corr_genes.index.tolist()
pats_corr_genes.name = "patients_correlation"
pats_corr_genes = pats_corr_genes.sort_values(ascending=False)

In [None]:
# Merge the two sets of pairs to find pairs present in both sets.
# DataFrame.join defaults to a left join on the index, so every highly correlated
# pair is kept and pairs absent from genes_pairs get NaN in the joined columns.
(
    pd.DataFrame(pats_corr_genes)
    .join(genes_pairs)
)

There appear to be many correlations in the patient dataset that match the PDX data. Note that `NaN` means the two genes showed expressions from different hormone treatments in the PDX experiment, i.e. the correlation does not match the PDX results.

Does this not confirm the potential for transferring what has been learned in the PDX experiment to tumor patients? Can we conclude that we can expect consistent results when we run the methods trained on the PDX data?

***
*Visualizing patterns with PCA*

Let's visualize the PDX samples by dimensionality reduction using PCA:

**Original PDX data**

In [None]:
# 2D PCA view of the raw PDX features; the PDX index is passed as the second
# argument (presumably used as point labels — TODO confirm against the helper)
pca_visualize_2d(X_pdx, pdx.index, title="PCA visualization of PDX samples")

In [None]:
# 3D PCA view of the raw PDX features, labelled with the PDX index;
# filename is presumably the name under which the plot is saved/published — TODO confirm
pca_visualize_3d(X_pdx, labels=pdx.index, filename="pdx-original-pca-3d")

[See plot here](https://plot.ly/~valentin.loftsson/65/#/)

**Standardized PDX data**

In [None]:
# 2D PCA view of the per-tumor standardized PDX features.
# The labels are taken from the frame being plotted: X_pdx_stdized was re-assembled
# tumor by tumor and then re-sorted, so its row order is not guaranteed to match
# pdx.index; using its own index keeps every point paired with the right label.
pca_visualize_2d(
    X_pdx_stdized,
    X_pdx_stdized.index,
    title="PCA visualization of standardized PDX samples",
)

In [None]:
# 3D PCA view of the per-tumor standardized PDX features.
# The labels are taken from the frame being plotted: X_pdx_stdized was re-assembled
# tumor by tumor and then re-sorted, so its row order is not guaranteed to match
# pdx.index; using its own index keeps every point paired with the right label.
pca_visualize_3d(
    X_pdx_stdized,
    labels=X_pdx_stdized.index,
    filename="pdx-stdized-pca-3d",
)

[See plot here](https://plot.ly/~valentin.loftsson/63/#/)

***
Let's visualize the patient dataset in the same way:

**Original patients data**

In [None]:
# 2D PCA view of the merged, de-duplicated patients data (untransformed values)
pca_visualize_2d(pats)

In [None]:
# 3D PCA view of the merged, de-duplicated patients data (untransformed values)
pca_visualize_3d(pats)

[See plot here](https://plot.ly/~valentin.loftsson/8/#/)

**Log transformed and standardized patients data**

In [None]:
# 2D PCA view of the log-transformed, standardized patients data
pca_visualize_2d(pats_log_stdized)

In [None]:
# 3D PCA view of the log-transformed, standardized patients data;
# filename is presumably the name under which the plot is saved/published — TODO confirm
pca_visualize_3d(pats_log_stdized, filename="pats-log-stdized-pca-3d")

[See plot here](https://plot.ly/~valentin.loftsson/69/#/)