In [1]:
# local dependencies
from load import *
from plots import *
from clustering_helpers import *
from constants import *
from helpers import *

***
**Cluster analysis**
***

In this notebook, we will perform cluster analysis on both the PDX and the tumor patient datasets. To support the interpretation and evaluation of the clustering models, we will compute the [silhouette coefficient](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html) and the [Davies-Bouldin index](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.davies_bouldin_score.html). For the PDX dataset, we will also compute the [adjusted Rand index](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html).
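The clustering methods used for the analysis are [Agglomerative Clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html), [K-means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html), and [Spectral Clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html). The evaluation output below also reports results for [Mean Shift](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html).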

*PDX dataset*

In [2]:
# Load the PDX table, then separate the "label" column (kept as the target)
# from the remaining columns (kept as the feature matrix).
pdx = load_pdx()
y_pdx = pdx["label"]
X_pdx = pdx.drop(columns=["label"])

In [None]:
# Run the clustering comparison on the unstandardized PDX features. The printed
# output reports, per method and for 2-6 clusters, a Davies-Bouldin ("db") value
# and a "score" value.
# NOTE(review): with_score=True presumably enables the label-based score
# (adjusted Rand index against y_pdx) — confirm in clustering_helpers.
test_all_methods(X_pdx, y_pdx, with_score=True)

REMINDER: Lower the DB index value, better is the clustering
agglomerative db for 2 clusters: 0.72
agglomerative db for 3 clusters: 1.03
agglomerative db for 4 clusters: 0.88
agglomerative db for 5 clusters: 0.77
agglomerative db for 6 clusters: 0.65
kmeans db for 2 clusters: 1.37
kmeans db for 3 clusters: 1.02
kmeans db for 4 clusters: 0.91
kmeans db for 5 clusters: 0.83
kmeans db for 6 clusters: 0.84
spectral db for 2 clusters: 2.24
spectral db for 3 clusters: 2.21
spectral db for 4 clusters: 2.13
spectral db for 5 clusters: 1.39
spectral db for 6 clusters: 1.56
meanshift db for 2 clusters: 0.77
meanshift db for 3 clusters: 0.77
meanshift db for 4 clusters: 0.77
meanshift db for 5 clusters: 0.77
meanshift db for 6 clusters: 0.77
agglomerative score for 2 clusters: 0.03
agglomerative score for 3 clusters: 0.02
agglomerative score for 4 clusters: -0.03
agglomerative score for 5 clusters: 0.02
agglomerative score for 6 clusters: 0.01
kmeans score for 2 clusters: 0.01
kmeans score for 3 

In [None]:
# Standardize the PDX features so the clustering comparison can be repeated on
# rescaled data.
# NOTE(review): the exact scaling applied by pdx_standardize is defined outside
# this notebook — confirm before interpreting the results.
X_pdx_stdized = pdx_standardize(X_pdx)

In [None]:
# Repeat the clustering comparison on the standardized PDX features, with the
# same labels and with_score=True as in the unstandardized run.
test_all_methods(X_pdx_stdized, y_pdx, with_score=True)

*Patient dataset*

In [None]:
# Patient data preparation: load, log-transform, then standardize.
# NOTE(review): `np` has no explicit import in the first cell; it presumably
# arrives through one of the star imports — confirm.
patients2 = load_patients2()

# Shift by a small constant before taking the log, because log(0) is undefined.
pats_log = np.log(patients2 + 0.1)

# Presumably column-wise standardization, going by the helper's name — confirm.
pats_log_stdized = df_standardize_columns(pats_log)

In [None]:
# Run the clustering comparison on the log-transformed, standardized patient
# data. No labels and no with_score flag are passed here, unlike the PDX calls.
test_all_methods(pats_log_stdized)