In [None]:
# local dependencies
from load import *
from plots import *
from clustering_helpers import *
from constants import *
from helpers import *

%load_ext autoreload
%autoreload 2

***
**Cluster analysis**
***

In this notebook, we will perform cluster analysis on both the PDX and the tumor patient datasets. Before reading this notebook, please make sure you have read the exploratory [data analysis](data_analysis.ipynb). To support the interpretation and evaluation of clustering models, we will compute the [silhouette coefficient](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html) and the [Davies-Bouldin index](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.davies_bouldin_score.html). For the labeled PDX dataset we will also compute the [adjusted Rand index](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html).
The clustering methods used here are [Agglomerative Clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html), [K-means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html?highlight=kmeans#sklearn.cluster.KMeans), and [Spectral Clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html?highlight=spectralclustering#sklearn.cluster.SpectralClustering).

***PDX dataset***

Let's first run the analysis on the raw data:

In [None]:
# Load the PDX dataset, then separate the target column from the feature columns.
pdx = load_pdx()
y_pdx = pdx["label"]
X_pdx = pdx.drop(columns=["label"])

In [None]:
# Run the cluster analysis on the raw (unstandardized) PDX features;
# the result is written to LaTeX and displayed in the next cell.
pdx_scores = run_cluster_analysis(X_pdx, y_pdx)

In [None]:
# Export the score table as LaTeX. The path is passed directly so that pandas
# opens, writes and closes the file itself; handing over a bare open(...)
# handle would leave it unclosed (and possibly unflushed).
pdx_scores.to_latex("../results/pdx_scores.tex")
pdx_scores

Now, let's run the analysis on the standardized data (per tumor):

In [None]:
# Standardize the PDX features (per tumor, according to the notebook text —
# the helper's implementation is not visible here).
X_pdx_stdized = pdx_standardize(X_pdx)

In [None]:
# Same analysis as for the raw data, now on the standardized features.
pdx_stdized_scores = run_cluster_analysis(X_pdx_stdized, y_pdx)

In [None]:
# Export the score table as LaTeX. The path is passed directly so that pandas
# opens, writes and closes the file itself; handing over a bare open(...)
# handle would leave it unclosed (and possibly unflushed).
pdx_stdized_scores.to_latex("../results/pdx_stdized_scores.tex")
pdx_stdized_scores

We observe that the ARI score is higher for the standardized data, especially for *K-means*.

The number of clusters we are looking for is 3, since we have three hormones/labels. Let's see how the methods perform without the `ctrl` label.

In [None]:
# Remove the rows indexed by 'ctrl' from both the features and the labels.
y_pdx_noctrl = y_pdx.drop(index='ctrl')
X_pdx_stdized_noctrl = X_pdx_stdized.drop(index='ctrl')

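First, we will search for the random state (which determines the initialization, e.g. the initial centroids of K-means) that gives the best ARI for K-means and Spectral clustering, and then rerun the analysis using those optimal random states.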

In [None]:
# Search for the best K-means random state with respect to the ARI.
# NOTE(review): presumably n=120 is the number of candidate states tried — confirm in the helper.
# The result is used below as the "kmeans" entry of opt_random_states.
kmeans_opt_state = optimize_ARI(X_pdx_stdized_noctrl, y_pdx_noctrl, n=120, method="kmeans")

In [None]:
# Same search as for K-means, this time for Spectral clustering.
# The result is used below as the "spectral" entry of opt_random_states.
spectral_opt_state = optimize_ARI(X_pdx_stdized_noctrl, y_pdx_noctrl, n=120, method="spectral")

In [None]:
# Collect the tuned random state of each method, keyed by method name,
# and rerun the analysis on the control-free data with them.
opt_random_states = dict(kmeans=kmeans_opt_state, spectral=spectral_opt_state)
pdx_stdized_noctrl_scores = run_cluster_analysis(
    X_pdx_stdized_noctrl,
    y_pdx_noctrl,
    opt_random_states,
)

In [None]:
# Export the score table as LaTeX. The path is passed directly so that pandas
# opens, writes and closes the file itself; handing over a bare open(...)
# handle would leave it unclosed (and possibly unflushed).
pdx_stdized_noctrl_scores.to_latex("../results/pdx_stdized_noctrl_scores.tex")
pdx_stdized_noctrl_scores

As expected, the methods cluster the data better when we drop the `ctrl` subjects and use the optimal initialization. We also observe that the optimal number of clusters is k=3, just as we expect.

***Patient dataset***

In [None]:
# Load the patient dataset and derive the transformed frame used by every
# patient-related cell below.
# NOTE(review): judging by its name the helper log-transforms and standardizes
# each column — confirm in helpers.
pats = load_patients2()
pats_log_stdized = df_log_standardize_cols(pats)

In [None]:
# NOTE(review): the unsupervised analysis of the patient data is disabled here;
# the reason is not recorded in this notebook — re-enable or remove.
#pats_scores = run_cluster_analysis(pats_log_stdized)

In [None]:
# NOTE(review): disabled together with the pats_scores computation. If re-enabled,
# pass the path string to to_latex instead of open(...), which is never closed.
#pats_scores.to_latex(open("../results/pats_scores.tex", "w"))
#pats_scores

***
**Applying the best method**
***

Here, we'll apply K-means using 3 clusters on both the PDX and patient datasets. Note that the optimal random state for the PDX data cannot be applied to the patient data.

***PDX dataset***

In [None]:
# Cluster the standardized, control-free PDX samples into 3 groups with K-means.
# NOTE(review): random_state=116 is hard-coded; presumably it is the state found
# by the optimize_ARI search above — confirm.
clus = cluster.KMeans(n_clusters=3, random_state=116)
predicted = clus.fit_predict(X_pdx_stdized_noctrl)

# Agreement between the cluster assignment and the true labels.
ari_score = metrics.adjusted_rand_score(y_pdx_noctrl, predicted)

describe_prediction(predicted, y_pdx_noctrl)

print(f"ARI score: {ari_score}")

# Project the samples onto their first three principal components for plotting.
pca = PCA()
pdx_components = pca.fit_transform(X_pdx_stdized_noctrl)
data = pd.DataFrame(pdx_components[:, :3], columns=["1st PC", "2nd PC", "3rd PC"])
# The "predicted" column used to be filled with the true labels, so the figure
# never showed the clustering. Store the cluster assignment there and keep the
# ground truth in its own column, shown through the marker symbol.
data["predicted"] = predicted
data["label"] = y_pdx_noctrl.values
px.scatter_3d(data, x="1st PC", y="2nd PC", z="3rd PC", color="predicted", symbol="label")

In [None]:
# NOTE(review): unlike the earlier optimize_ARI calls, no `method` is passed here
# (the helper's default applies) and `score` is not read by any later cell shown
# in this notebook — confirm whether this cell is still needed.
score = optimize_ARI(X_pdx_stdized_noctrl, y_pdx_noctrl, 120)

In [None]:
# Label the patients using centroids derived from the PDX data.
# NOTE(review): 116 matches the random_state used for K-means in this notebook;
# the meaning of the trailing 2 is not visible here — confirm against the helper.
patients_labels = apply_pdx_centroids_on_patients(X_pdx_stdized_noctrl, y_pdx_noctrl, pats_log_stdized, 116, 2)
print(patients_labels)

In [None]:
# NOTE(review): disabled call; the reason is not recorded here — re-enable or remove.
#get_gene_ratios(pats_log_stdized, patients_labels)

***Patient dataset***

In [None]:
# Fit a fresh PCA on the patient data instead of refitting the object created
# in the PDX cell, so this cell no longer depends on that cell having run.
# The name `pca` is kept because the following cell calls pca.transform.
pca = PCA().fit(pats_log_stdized)

# NOTE(review): the meaning of 91 is not visible here (presumably the number of
# components shown in the explained-variance plot) — confirm in plots.
plot_pca_expl_var(pca, 91)

We keep the first 67 principal components.

In [None]:
# Project the patients onto the principal components and keep the first 67,
# the number stated in the notebook text. The slice previously read [:, :68],
# which keeps 68 columns (off by one).
pats_components = pca.transform(pats_log_stdized)
pats_pca = pats_components[:, :67]

In [None]:
# Cluster the PCA-reduced patient data into 3 groups with K-means.
clus = cluster.KMeans(random_state=116, n_clusters=3)
predicted = clus.fit_predict(pats_pca)

# No ground truth exists for the patients, so only internal indices are reported.
db = metrics.davies_bouldin_score(pats_pca, predicted)
silhouette = metrics.silhouette_score(pats_pca, predicted, metric='euclidean')

print(f"silhouette is: {silhouette}")
print(f"db is: {db}")

In [None]:
# Show the patient clusters in the space of the first three principal components.
data = pd.DataFrame(
    pats_pca[:, :3],
    columns=["1st PC", "2nd PC", "3rd PC"],
).assign(predicted=predicted)
px.scatter_3d(
    data,
    x="1st PC",
    y="2nd PC",
    z="3rd PC",
    color='predicted',
)
    

Overall, we get results comparable to the standardized datasets.