In [1]:
import sys
sys.path.append("../")

import torch
import numpy as np
from sklearn.cluster import Birch
from sklearn import metrics
from sklearn.metrics import accuracy_score

from train import ModelTester
from const import tsne_cluster
from data import Tokenizer
from utils import embedding, load_transformer_model

In [2]:
def get_label(spectra):
    """Derive a numeric class label for every spectrum from its SMILES string.

    Parameters
    ----------
    spectra : sequence
        Items exposing ``.get("smiles")`` (e.g. dict-like spectrum objects).

    Returns
    -------
    labels : numpy.ndarray of float64, shape ``(len(spectra),)``
        ``labels[i]`` is the position of the SMILES of ``spectra[i]`` inside
        ``unique_smiles``.
    unique_smiles : numpy.ndarray
        The distinct SMILES values in sorted order, as returned by
        ``numpy.unique``.
    """
    smiles_seq = np.array([s.get("smiles") for s in spectra])
    # ``return_inverse`` gives the index of each element in the sorted unique
    # array in a single pass, replacing one boolean mask over the whole array
    # per distinct SMILES (O(n * n_classes)).
    unique_smiles, inverse = np.unique(smiles_seq, return_inverse=True)
    # Flatten defensively and keep the float64 dtype callers have always received.
    labels = inverse.reshape(-1).astype(np.float64)
    return labels, unique_smiles

In [3]:
def purity_score(y_true, y_pred):
    """Compute clustering purity.

    Every predicted cluster votes for the ground-truth class that occurs most
    often among its members; purity is the fraction of samples whose true
    class equals the winning class of their cluster, i.e.
    ``sum_k max_j |cluster_k ∩ class_j| / n_samples``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class label per sample. Any values ``numpy.unique`` can
        sort are accepted. The input is never modified.
    y_pred : array-like
        Predicted cluster id per sample, same shape as ``y_true``.

    Returns
    -------
    float
        Purity in ``[0, 1]``.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("purity_score requires at least one sample")

    # Encode labels as 0..k-1 on fresh arrays. Doing this with return_inverse
    # (instead of relabelling y_true in place, value by value) leaves the
    # caller's array untouched and cannot merge two classes when an original
    # label happens to equal an id that was already handed out.
    true_classes, true_codes = np.unique(y_true, return_inverse=True)
    pred_clusters, pred_codes = np.unique(y_pred, return_inverse=True)
    true_codes = true_codes.reshape(-1).astype(np.int64)
    pred_codes = pred_codes.reshape(-1).astype(np.int64)
    n_classes = true_classes.size
    n_clusters = pred_clusters.size

    # Count each (cluster, class) pair that actually occurs; this is a sparse
    # contingency table, so no n_clusters x n_classes matrix is materialised.
    pair_ids, pair_counts = np.unique(
        pred_codes * n_classes + true_codes, return_counts=True
    )
    cluster_of_pair = pair_ids // n_classes

    # Size of the majority class inside every cluster.
    majority = np.zeros(n_clusters, dtype=np.int64)
    np.maximum.at(majority, cluster_of_pair, pair_counts)

    return float(majority.sum()) / true_codes.size

In [4]:
def CalEvaluate(labels_true, labels_pred):
    """Score a clustering against ground-truth labels.

    Parameters
    ----------
    labels_true : array-like
        Ground-truth class label per sample.
    labels_pred : array-like
        Predicted cluster id per sample.

    Returns
    -------
    dict
        Keys ``'ARI'``, ``'purity'``, ``'homogeneity'``, ``'completeness'``
        and ``'v_measure'`` mapped to their scores. ``'v_measure'`` is
        computed with ``beta=0.5``.
    """
    # Hand purity_score its own copy so that any in-place relabelling it
    # performs cannot leak into the metrics below or into the caller's array.
    purity = purity_score(np.array(labels_true), labels_pred)
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    homogeneity = metrics.homogeneity_score(labels_true, labels_pred)
    completeness = metrics.completeness_score(labels_true, labels_pred)
    v_measure = metrics.v_measure_score(labels_true, labels_pred, beta=0.5)
    return {
        'ARI': ari,
        "purity": purity,
        'homogeneity': homogeneity,
        'completeness': completeness,
        'v_measure': v_measure,
    }

In [5]:
# Run configuration shared by the evaluation cells below.
show_progress_bar = False
is_augment = True
model_backbone = "transformer"  # NOTE(review): not referenced by any cell visible in this notebook
loss_type = "SupConWithTanimotoLoss"
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): the meaning of 100 is not visible here — TODO confirm against data.Tokenizer.
tokenizer = Tokenizer(100, show_progress_bar)
model = load_transformer_model(device, loss_type, is_augment)
tester = ModelTester(model, device, show_progress_bar)

In [6]:
# Load the spectra stored at tsne_cluster.SPECEMBEDDING_CLUSTER and derive one
# ground-truth class label per spectrum from its SMILES.
# NOTE(review): allow_pickle=True unpickles arbitrary objects — only load trusted files.
spectra = np.load(tsne_cluster.SPECEMBEDDING_CLUSTER, allow_pickle=True)
labels, unique_smiles = get_label(spectra)

# Compute embeddings via utils.embedding; its second return value is discarded.
# NOTE(review): 512 is presumably a batch size and False an option flag — TODO confirm against utils.embedding.
spectra_embedding, _ = embedding(
    tester, tokenizer,
    512, spectra,
    False
)

# Cluster the embeddings with Birch, requesting one cluster per distinct SMILES,
# then score the predicted cluster ids against the SMILES-derived labels.
brc = Birch(threshold=0.5, n_clusters=len(unique_smiles))
pred_labels = brc.fit_predict(spectra_embedding)
CalEvaluate(labels, pred_labels)

{'ARI': 0.5127917531520478,
 'purity': 0.9198171267803763,
 'homogeneity': 0.9748241693544608,
 'completeness': 0.8884910349058514,
 'v_measure': 0.9442407565840419}

In [7]:
# Number of distinct SMILES in the most recently loaded dataset; the same value is passed to Birch as n_clusters.
len(unique_smiles)

2043

In [8]:
# Same pipeline as the previous evaluation cell, run on the spectra stored at
# tsne_cluster.MSBERT_CLUSTER. This rebinds spectra, labels, unique_smiles,
# spectra_embedding, brc and pred_labels.
# NOTE(review): allow_pickle=True unpickles arbitrary objects — only load trusted files.
spectra = np.load(tsne_cluster.MSBERT_CLUSTER, allow_pickle=True)
labels, unique_smiles = get_label(spectra)

# Compute embeddings via utils.embedding; its second return value is discarded.
# NOTE(review): 512 is presumably a batch size and False an option flag — TODO confirm against utils.embedding.
spectra_embedding, _ = embedding(
    tester, tokenizer,
    512, spectra,
    False
)

# Cluster the embeddings with Birch, requesting one cluster per distinct SMILES,
# then score the predicted cluster ids against the SMILES-derived labels.
brc = Birch(threshold=0.5, n_clusters=len(unique_smiles))
pred_labels = brc.fit_predict(spectra_embedding)
CalEvaluate(labels, pred_labels)

{'ARI': 0.4988632034514985,
 'purity': 0.946583944919422,
 'homogeneity': 0.9828430936028273,
 'completeness': 0.8793606461515936,
 'v_measure': 0.9457449282963973}

In [None]:
# Number of distinct SMILES in the most recently loaded dataset; the same value is passed to Birch as n_clusters.
len(unique_smiles)