In [None]:
# local dependencies
from load import *
from plots import *
from clustering_helpers import *
from constants import *
from helpers import *

***
**Cluster analysis**
***

In this notebook, we perform cluster analysis on both the PDX and the tumor patient datasets. To support the interpretation and evaluation of the clustering models, we compute the [silhouette coefficient](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html) and the [Davies-Bouldin index](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.davies_bouldin_score.html). For the PDX dataset, we also compute the [adjusted Rand index](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html).
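The clustering methods used for the analysis are [Agglomerative Clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html), [K-means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html), and [Spectral Clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html).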

*PDX dataset*

In [None]:
# Load the PDX table, then separate the "label" column (kept as y_pdx)
# from all remaining columns (kept as X_pdx).
pdx = load_pdx()
y_pdx = pdx["label"]
X_pdx = pdx.drop(columns=["label"])

In [None]:
# Evaluate the clustering helpers on the unstandardized PDX features.
# The known labels are passed along with with_score=True -- presumably this
# enables the label-based score (adjusted Rand index); confirm in the helper.
test_all_methods(
    X_pdx,
    y_pdx,
    with_score=True,
)

In [None]:
# Standardize the PDX features with the project helper `pdx_standardize`.
X_pdx_stdized = pdx_standardize(X_pdx)
# Bare last expression: the notebook renders the standardized table as the cell output.
X_pdx_stdized

In [None]:
# Same evaluation as for the raw features, this time on the standardized
# PDX features, again with the known labels and with_score=True.
test_all_methods(
    X_pdx_stdized,
    y_pdx,
    with_score=True,
)

*Patient dataset*

In [None]:
# Load the patient table, log-transform it, then pass the result through the
# project helper `df_standardize_columns`.
patients2 = load_patients2()

# log(0) is undefined, so every value is shifted by a small constant first.
pats_log = np.log(patients2 + 0.1)

pats_log_stdized = df_standardize_columns(pats_log)

#### note: the cell below takes a long time due to spectral clustering's O(n<sup>3</sup>) complexity

In [None]:
# Evaluate the clustering helpers on the log-transformed, standardized patient
# data. No labels are passed and with_score is left at its default.
test_all_methods(
    pats_log_stdized,
)

### How about a little PCA?

#### pdx

In [None]:
# Fit a PCA with default settings on the standardized PDX features
# (fit() returns the estimator itself, so construction and fitting can be chained).
pca = PCA().fit(X_pdx_stdized)

# Plot the explained variance via the project helper and report how many
# components the fitted model holds.
plot_pca_expl_var(pca)
print(pca.n_components_)

We keep the first 10 principal components.

In [None]:
# Remove the entry whose index label is 'ctrl' from both the standardized
# features and the labels, so the two stay aligned.
X_pdx_stdized_noctrl = X_pdx_stdized.drop(index='ctrl')
y_pdx_noctrl = y_pdx.drop(index='ctrl')

In [None]:
# Project the control-free PDX samples with the already-fitted PCA.
pdx_components = pca.transform(X_pdx_stdized_noctrl)

# Keep the first 10 principal components. Slice stops are exclusive, so
# `:10` selects exactly ten columns (indices 0 through 9).
pdx_pca = pdx_components[:, :10]

In [None]:
# K-means with 3 clusters on the standardized, control-free PDX samples.
# random_state is fixed so repeated runs give the same assignment.
clus = cluster.KMeans(n_clusters=3, random_state=33)
predicted = clus.fit_predict(X_pdx_stdized_noctrl)

# Adjusted Rand index compares the assignment with the known labels;
# silhouette and Davies-Bouldin are computed from the data and the assignment only.
score = metrics.adjusted_rand_score(y_pdx_noctrl, predicted)
silhouette = metrics.silhouette_score(X_pdx_stdized_noctrl, predicted, metric='euclidean')
db = metrics.davies_bouldin_score(X_pdx_stdized_noctrl, predicted)

describe_prediction(predicted, y_pdx_noctrl)

print(f"score is: {score!s}")
print(f"silhouette is: {silhouette!s}")
print(f"db is: {db!s}")

# 3-D view of the samples in the space of the first three principal components.
data = pd.DataFrame(pdx_pca[:, :3], columns=["1st PC", "2nd PC", "3rd PC"])
# 'predicted' must hold the cluster assignment, not the ground truth; cast to
# str so plotly uses a discrete colour per cluster instead of a continuous scale.
data['predicted'] = predicted.astype(str)
# The true label is kept in its own column; .values bypasses index alignment
# because `data` carries a fresh default index.
data['label'] = y_pdx_noctrl.values
# Colour = predicted cluster, marker symbol = true label.
px.scatter_3d(data, x="1st PC", y="2nd PC", z="3rd PC", color='predicted', symbol='label')

In [None]:
# Run the project helper `optimize_ARI` on the control-free PDX data.
# NOTE(review): the meaning of the third argument (120) is defined by the
# helper and is not visible here -- confirm.
score = optimize_ARI(
    X_pdx_stdized_noctrl,
    y_pdx_noctrl,
    120,
)
# Alternative call kept for reference (control row included, larger third argument):
#score = optimize_ARI(X_pdx_stdized, y_pdx, 400)


In [None]:
# Derive labels for the patients from the PDX data through the project helper.
# NOTE(review): the trailing 33 matches the random_state used for the PDX
# K-means run, so it is presumably a seed -- confirm against the helper.
patientsLabels = applyClusterCentersOnPatients(
    X_pdx_stdized_noctrl,
    y_pdx_noctrl,
    pats_log_stdized,
    33,
)
print(patientsLabels)

In [None]:
# Pass the standardized patient data together with the labels obtained for the
# patients to the project helper `get_gene_ratios`; its return value is the cell output.
get_gene_ratios(
    pats_log_stdized,
    patientsLabels,
)

#### patients

In [None]:
# Refit the existing `pca` object on the standardized patient data (this
# replaces the earlier PDX fit) and plot its explained variance.
# fit() returns the estimator itself, so it can be handed straight to the plot helper.
# NOTE(review): the meaning of the second argument (91) is defined by
# plot_pca_expl_var -- confirm.
plot_pca_expl_var(pca.fit(pats_log_stdized), 91)

We keep the first 67 principal components.

In [None]:
# Project the standardized patient data with the PCA fitted on it.
pats_components = pca.transform(pats_log_stdized)

# Keep the first 67 principal components. Slice stops are exclusive, so
# `:67` selects exactly 67 columns (indices 0 through 66).
pats_pca = pats_components[:, :67]

In [None]:
# K-means with 4 clusters on the PCA-reduced patient data; the seed is fixed
# so repeated runs give the same assignment.
clus = cluster.KMeans(n_clusters=4, random_state=78)
predicted = clus.fit_predict(pats_pca)

# No ground-truth labels are used here, so only the two label-free metrics are computed.
db = metrics.davies_bouldin_score(pats_pca, predicted)
silhouette = metrics.silhouette_score(pats_pca, predicted, metric='euclidean')

print(f"silhouette is: {silhouette!s}")
print(f"db is: {db!s}")

Overall, the results are comparable to those obtained on the standardized datasets.