In [None]:
# local dependencies
from load import *
from plots import *
from clustering_helpers import *
from constants import *
from helpers import *

***
**Cluster analysis**
***

In this notebook, we perform cluster analysis on both the PDX and the tumor patient datasets. To support the interpretation and evaluation of the clustering models, we compute the [silhouette coefficient](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html) and the [Davies-Bouldin index](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.davies_bouldin_score.html). For the PDX dataset, we also compute the [adjusted Rand index](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html).
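The clustering methods used for the analysis are [Agglomerative Clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html), [K-means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html), and [Spectral Clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html).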

*PDX dataset*

In [None]:
# Load the PDX table, then split it into the "label" column (targets)
# and every remaining column (features).
pdx = load_pdx()
y_pdx = pdx["label"]
X_pdx = pdx.drop(columns=["label"])

In [None]:
# Run test_all_methods on the unstandardized PDX features, passing the
# "label" column as the second argument and requesting with_score=True.
# NOTE(review): presumably with_score enables a label-based score such as the
# adjusted Rand index — confirm in the helper's definition.
test_all_methods(X_pdx, y_pdx, with_score=True)

In [None]:
# Standardize the PDX feature table with the project helper pdx_standardize.
# NOTE(review): the exact scaling it applies is not shown here — confirm in
# the helper's definition.
X_pdx_stdized = pdx_standardize(X_pdx)

In [None]:
# Same call as for the raw PDX features, now on the standardized table, so the
# two sets of results can be compared.
test_all_methods(X_pdx_stdized, y_pdx, with_score=True)

*Patient dataset*

In [None]:
# Load the patient table, log-transform it, and pass the result through
# df_standardize_columns.
patients2 = load_patients2()
# Shift every value by 0.1 before taking the log, because log(0) is undefined.
pats_log = np.log(0.1 + patients2)
pats_log_stdized = df_standardize_columns(pats_log)

In [None]:
# Unlike the PDX calls, no labels are passed here and with_score is left at
# its default.
test_all_methods(pats_log_stdized)

### How about a little PCA?

#### pdx

In [None]:
# Fit a PCA with default settings on the standardized PDX features.
pca = PCA().fit(X_pdx_stdized)

# Show the explained-variance plot produced by the project helper.
plot_pca_expl_var(pca)

# Number of components the fitted PCA ended up with.
print(pca.n_components_)

Take 10 components

In [None]:
# Project the standardized PDX features with the fitted PCA, then keep only
# the leading columns of the projection.
# NOTE(review): the slice keeps 11 columns (indices 0-10), while the preceding
# note says 10 components — confirm which is intended.
pdx_components = pca.transform(X_pdx_stdized)
pdx_pca = pdx_components[:, 0:11]

In [None]:
# K-means with four clusters and a fixed seed on the PCA-reduced PDX data.
clus = cluster.KMeans(random_state=182, n_clusters=4)
predicted = clus.fit_predict(pdx_pca)

# Adjusted Rand index against the PDX labels, plus two label-free metrics
# computed from the reduced data and the predicted assignment.
score = metrics.adjusted_rand_score(y_pdx, predicted)
silhouette = metrics.silhouette_score(pdx_pca, predicted, metric="euclidean")
db = metrics.davies_bouldin_score(pdx_pca, predicted)

# `!s` forces str() on each value, matching plain string concatenation.
print(
    f"score is: {score!s}",
    f"silhouette is: {silhouette!s}",
    f"db is: {db!s}",
    sep="\n",
)

In [None]:
# optimize for ARI score
#
# Sweep K-means over several cluster counts and random seeds on the
# PCA-reduced PDX data, recording the adjusted Rand index (ARI) against the
# PDX labels for every combination.
# NOTE: this runs n_seeds * len(cluster_counts) K-means fits (5000 with the
# values below), so the cell is slow.
n_seeds = 1000
cluster_counts = range(2, 7)  # k = 2..6, one column of `score` per k

# score[i, j] holds the ARI for random_state=i and n_clusters=cluster_counts[j].
score = np.zeros((n_seeds, len(cluster_counts)))
for j, k in enumerate(cluster_counts):
    for i in range(n_seeds):
        clus = cluster.KMeans(n_clusters=k, random_state=i)
        predicted = clus.fit_predict(pdx_pca)
        score[i, j] = metrics.adjusted_rand_score(y_pdx, predicted)

# One curve per cluster count. The legend labels come from cluster_counts, so
# they cannot drift out of sync with the loop above.
for j, k in enumerate(cluster_counts):
    plt.plot(score[:, j], label=str(k))
plt.xlabel("random_state")
plt.ylabel("adjusted Rand index")
plt.legend(title="n_clusters")
plt.ylim(bottom=0)  # NOTE: ARI can be negative; such values are cut off from view
plt.show()

# Best ARI and its position(s) in `score`: row index = random_state,
# column index = position in cluster_counts (n_clusters = column + 2).
print(score.max())
print(np.where(score == score.max()))

#### patients

In [None]:
# Refit the existing `pca` object on the log-transformed, standardized patient
# data.
# NOTE: this is the same estimator object that was fitted on the PDX data
# earlier. After this cell runs, re-running an earlier cell that calls
# pca.transform would use the patient fit — run the notebook top to bottom.
pca.fit(pats_log_stdized)

# NOTE(review): the meaning of the second argument (91) is defined by
# plot_pca_expl_var — confirm there.
plot_pca_expl_var(pca, 91)

Take 67 components

In [None]:
# Project the patient data with the PCA fitted on it, then keep only the
# leading columns of that projection.
pats_components = pca.transform(pats_log_stdized)
# Slice the *patient* projection. Slicing `pdx_components` here would make
# every downstream "patient" metric silently run on the PDX data instead.
# NOTE(review): `:68` keeps 68 columns (indices 0-67), while the preceding
# note says 67 components — confirm which is intended.
pats_pca = pats_components[:, :68]

In [None]:
# K-means with four clusters and a fixed seed on the reduced patient matrix.
clus = cluster.KMeans(random_state=0, n_clusters=4)
predicted = clus.fit_predict(pats_pca)

# No labels are used in this cell, so only the two label-free metrics are
# computed.
silhouette = metrics.silhouette_score(pats_pca, predicted, metric="euclidean")
db = metrics.davies_bouldin_score(pats_pca, predicted)

# `!s` forces str() on each value, matching plain string concatenation.
print(
    f"silhouette is: {silhouette!s}",
    f"db is: {db!s}",
    sep="\n",
)

Overall, we get results comparable to those obtained on the standardized datasets.