In [None]:
import sys
sys.path.append("../")

import numpy as np
from sklearn.cluster import Birch
from sklearn import metrics
from sklearn.metrics import accuracy_score

from const import tsne_cluster
from utils import embedding

In [7]:
def get_label(spectra):
    """Derive a numeric class label for every spectrum from its SMILES.

    Parameters
    ----------
    spectra : sequence
        Objects exposing ``get("smiles")``; one label is produced per item.

    Returns
    -------
    labels : numpy.ndarray of float, shape (len(spectra),)
        ``labels[i]`` is the position of the i-th spectrum's SMILES inside
        ``unique_smiles``.
    unique_smiles : numpy.ndarray
        The distinct SMILES values, in the sorted order ``np.unique`` returns.
    """
    smiles_seq = np.array([s.get("smiles") for s in spectra])
    # return_inverse maps each entry to its index in the sorted unique array
    # in a single pass, replacing one full-array comparison per unique SMILES.
    unique_smiles, inverse = np.unique(smiles_seq, return_inverse=True)
    # Labels stay float-valued, as they were when built with np.zeros.
    labels = inverse.reshape(-1).astype(float)
    return labels, unique_smiles

In [8]:
def purity_score(y_true, y_pred):
    y_voted_labels = np.zeros(y_true.shape)
    labels = np.unique(y_true)
    ordered_labels = np.arange(labels.shape[0])
    for k in range(labels.shape[0]):
        y_true[y_true == labels[k]] = ordered_labels[k]
    # Update unique labels
    labels = np.unique(y_true)
    bins = np.concatenate((labels, [np.max(labels) + 1]), axis=0)

    for cluster in np.unique(y_pred):
        hist, _ = np.histogram(y_true[y_pred == cluster], bins=bins)
        # Find the most present label in the cluster
        winner = np.argmax(hist)
        y_voted_labels[y_pred == cluster] = winner

    return accuracy_score(y_true, y_voted_labels)

In [9]:
def CalEvaluate(labels_true, labels_pred, beta=0.5):
    """Evaluate a clustering against ground-truth labels.

    Parameters
    ----------
    labels_true : array-like of shape (n_samples,)
        Ground-truth class labels.
    labels_pred : array-like of shape (n_samples,)
        Predicted cluster identifiers.
    beta : float, default=0.5
        Weight forwarded to scikit-learn's V-measure. The default keeps the
        value this function has always used.

    Returns
    -------
    dict
        Scores keyed by ``'ARI'``, ``'purity'``, ``'homogeneity'``,
        ``'completeness'`` and ``'v_measure'``.
    """
    # Pass a private copy so the purity computation can never alter the
    # caller's labels or the inputs of the metrics computed below.
    purity = purity_score(np.array(labels_true), labels_pred)
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    # One call yields all three entropy-based scores instead of recomputing
    # the shared quantities three times.
    homogeneity, completeness, v_measure = metrics.homogeneity_completeness_v_measure(
        labels_true, labels_pred, beta=beta
    )
    return {
        'ARI': ari,
        "purity": purity,
        'homogeneity': homogeneity,
        'completeness': completeness,
        'v_measure': v_measure,
    }

In [None]:
# Evaluate BIRCH clustering on the SPECEMBEDDING_CLUSTER_RAW dataset.
# NOTE(review): `embedding` presumably loads the spectra at this path and
# computes their embedding vectors — confirm in utils.embedding.
spectra, spectra_embedding = embedding(str(tsne_cluster.SPECEMBEDDING_CLUSTER_RAW))
# One numeric label per spectrum; spectra sharing a SMILES share a label.
labels, unique_smiles = get_label(spectra)

# Ask BIRCH for as many clusters as there are distinct SMILES.
brc = Birch(threshold=0.5, n_clusters=len(unique_smiles))
pred_labels = brc.fit_predict(spectra_embedding)
# Last expression of the cell: the metrics dict is shown as the cell output.
CalEvaluate(labels, pred_labels)

Computing DreaMS embedding: 100%|██████████| 34122/34122 [00:35<00:00, 965.25it/s]


{'ARI': 0.1647235609708263,
 'purity': 0.6468554012074321,
 'homogeneity': 0.8505239088108061,
 'completeness': 0.7405298018757613,
 'v_measure': 0.8103998397939786}

In [None]:
# Evaluate BIRCH clustering on the MSBERT_CLUSTER_RAW dataset.
# NOTE(review): `embedding` presumably loads the spectra at this path and
# computes their embedding vectors — confirm in utils.embedding.
spectra, spectra_embedding = embedding(str(tsne_cluster.MSBERT_CLUSTER_RAW))
# One numeric label per spectrum; spectra sharing a SMILES share a label.
labels, unique_smiles = get_label(spectra)

# Ask BIRCH for as many clusters as there are distinct SMILES.
brc = Birch(threshold=0.5, n_clusters=len(unique_smiles))
pred_labels = brc.fit_predict(spectra_embedding)
# Last expression of the cell: the metrics dict is shown as the cell output.
CalEvaluate(labels, pred_labels)

Computing DreaMS embedding: 100%|██████████| 13217/13217 [00:13<00:00, 963.52it/s]


{'ARI': 0.17364648145255715,
 'purity': 0.7026556707270939,
 'homogeneity': 0.8576354697514671,
 'completeness': 0.7220907042226495,
 'v_measure': 0.8071327561091342}