In [None]:
import sys
sys.path.append("../")

from gensim.models import Word2Vec
import numpy as np
from sklearn.cluster import Birch
from sklearn import metrics
from sklearn.metrics import accuracy_score

from const import tsne_cluster
from utils import embedding

In [2]:
def get_label(spectra):
    """Derive one numeric class id per spectrum from its "smiles" entry.

    Args:
        spectra: sequence of objects exposing ``.get("smiles")``.

    Returns:
        A ``(labels, unique_smiles)`` tuple. ``unique_smiles`` holds the sorted
        distinct smiles values; ``labels`` is a float64 array of length
        ``len(spectra)`` where ``labels[i]`` is the position of the i-th
        spectrum's smiles inside ``unique_smiles``.
    """
    smiles_per_spectrum = np.array([spectrum.get("smiles") for spectrum in spectra])
    # return_inverse yields, for every element, its index in the sorted unique array.
    unique_smiles, class_index = np.unique(smiles_per_spectrum, return_inverse=True)
    # Keep the labels as a flat float64 vector.
    labels = class_index.astype(np.float64).reshape(len(spectra))
    return labels, unique_smiles

In [3]:
def purity_score(y_true, y_pred):
    """Compute the purity of a clustering against ground-truth classes.

    Every predicted cluster is assigned the ground-truth class that occurs most
    often among its members; purity is the fraction of samples whose class
    equals the class assigned to their cluster.

    Unlike an in-place relabelling approach, the inputs are never modified and
    arbitrary label values (negative numbers, floats, strings) are supported.

    Args:
        y_true: array-like of ground-truth class labels, one per sample.
        y_pred: array-like of predicted cluster ids, one per sample.

    Returns:
        Purity as a Python float in the interval (0, 1].

    Raises:
        ValueError: if the inputs have different sizes or are empty.
    """
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()
    if true_arr.size != pred_arr.size:
        raise ValueError(
            "y_true and y_pred must contain the same number of samples "
            "(got %d and %d)" % (true_arr.size, pred_arr.size)
        )
    if true_arr.size == 0:
        raise ValueError("purity_score requires at least one sample")

    # Encode both label sets as 0..k-1 ordinals on fresh arrays; the caller's
    # data is left untouched and distinct labels can never be merged.
    classes, class_idx = np.unique(true_arr, return_inverse=True)
    clusters, cluster_idx = np.unique(pred_arr, return_inverse=True)
    n_classes = classes.size
    n_clusters = clusters.size

    # Count each (cluster, class) pair that actually occurs. Each pair is
    # folded into a single integer key so memory stays proportional to the
    # number of samples rather than n_clusters * n_classes.
    pair_keys, pair_counts = np.unique(
        cluster_idx.astype(np.int64) * n_classes + class_idx, return_counts=True
    )

    # For every cluster keep the size of its most frequent class.
    majority_sizes = np.zeros(n_clusters, dtype=pair_counts.dtype)
    np.maximum.at(majority_sizes, pair_keys // n_classes, pair_counts)

    return float(majority_sizes.sum() / true_arr.size)

In [4]:
def CalEvaluate(labels_true, labels_pred):
    """Score a clustering against ground-truth labels with several metrics.

    Args:
        labels_true: ground-truth class label per sample.
        labels_pred: predicted cluster id per sample.

    Returns:
        dict with the keys ``'ARI'``, ``'purity'``, ``'homogeneity'``,
        ``'completeness'`` and ``'v_measure'``. The V-measure is computed with
        ``beta=0.5``, i.e. homogeneity is weighted more strongly than
        completeness.
    """
    # Give purity_score its own copy of the ground truth: should it relabel its
    # input in place, neither the caller's array nor the labels scored by the
    # metrics below are affected.
    purity = purity_score(np.array(labels_true), labels_pred)
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    homogeneity = metrics.homogeneity_score(labels_true, labels_pred)
    completeness = metrics.completeness_score(labels_true, labels_pred)
    v_measure = metrics.v_measure_score(labels_true, labels_pred, beta=0.5)
    result = {
        'ARI': ari,
        "purity": purity,
        'homogeneity': homogeneity,
        'completeness': completeness,
        'v_measure': v_measure
    }
    return result

In [5]:
# Passed as the third argument to the embedding(...) calls in the cells below.
# NOTE(review): presumably toggles a progress display -- confirm in utils.embedding.
show_progress_bar = False
# Load a previously saved gensim Word2Vec model; the path is relative to the
# working directory the notebook is run from.
model = Word2Vec.load("orbitrap.model")

In [6]:
# Evaluate Birch clustering on the data stored at tsne_cluster.SPECEMBEDDING_CLUSTER.
# NOTE(review): allow_pickle=True makes np.load unpickle Python objects, which can
# execute arbitrary code -- only load files from a trusted source.
spectra = np.load(tsne_cluster.SPECEMBEDDING_CLUSTER, allow_pickle=True)
# One numeric class id per spectrum, derived from its "smiles" value.
labels, unique_smiles = get_label(spectra)

# embedding() returns a pair; only its first element is used here.
# NOTE(review): presumably one embedding vector per spectrum -- confirm in utils.embedding.
spectra_embedding, _ = embedding(model,  spectra, show_progress_bar)

# Request as many clusters as there are distinct smiles values.
brc = Birch(threshold=0.5, n_clusters=len(unique_smiles))
pred_labels = brc.fit_predict(spectra_embedding)
# Last expression of the cell: the metrics dict is rendered as the cell output.
CalEvaluate(labels, pred_labels)

{'ARI': 0.08706158804987005,
 'purity': 0.5177011898481918,
 'homogeneity': 0.7167324859727832,
 'completeness': 0.7078143143384031,
 'v_measure': 0.713734896454794}

In [7]:
# Same evaluation as the previous cell, on the data stored at tsne_cluster.MSBERT_CLUSTER.
# NOTE(review): allow_pickle=True makes np.load unpickle Python objects, which can
# execute arbitrary code -- only load files from a trusted source.
spectra = np.load(tsne_cluster.MSBERT_CLUSTER, allow_pickle=True)
# One numeric class id per spectrum, derived from its "smiles" value.
labels, unique_smiles = get_label(spectra)

# embedding() returns a pair; only its first element is used here.
# NOTE(review): presumably one embedding vector per spectrum -- confirm in utils.embedding.
spectra_embedding, _ = embedding(model,  spectra, show_progress_bar)

# Request as many clusters as there are distinct smiles values.
brc = Birch(threshold=0.5, n_clusters=len(unique_smiles))
pred_labels = brc.fit_predict(spectra_embedding)
# Last expression of the cell: the metrics dict is rendered as the cell output.
CalEvaluate(labels, pred_labels)

{'ARI': 0.1006049521645423,
 'purity': 0.5697964742377242,
 'homogeneity': 0.7096235833528495,
 'completeness': 0.6920674683943767,
 'v_measure': 0.703673406631542}