In [17]:
import sys
sys.path.append("../")
from pathlib import Path

import torch
import numpy as np
from sklearn.metrics import roc_curve, auc
from gensim.models import Word2Vec

from const import gnps
from utils import embedding, cosine_similarity

# Root directory for ROC artifacts; one sub-directory per
# "<db>-<desc>-<info>" combination is read from / written to below it.
ROC_DIR = Path("/data1/xp/code/specEmbedding/ROC")
# Create the directory (and any missing parents) up front; no-op if present.
ROC_DIR.mkdir(parents=True, exist_ok=True)

In [18]:
# Saved Word2Vec model file for each label (orbitrap / qtof / other).
# Every entry follows the "<label>.model" naming pattern; the file names are
# relative, so they resolve against the notebook's working directory.
model_files = {
    label: {"model": f"{label}.model"}
    for label in ("orbitrap", "qtof", "other")
}

# Spectra files to evaluate, keyed database -> label -> split.
# Each leaf is a (query_path, reference_path) pair that is later handed to
# np.load.
spectra_paths = {"gnps": {}}

# NOTE(review): the "train" split pairs the training queries with the *test*
# reference file; presumably intentional because the training references are
# prepended separately for orbitrap (see gnps_train_ref) -- confirm.
spectra_paths["gnps"]["orbitrap"] = {
    "train": (gnps.ORBITRAP_TRAIN_QUERY, gnps.ORBITRAP_TEST_REF),
    "test": (gnps.ORBITRAP_TEST_QUERY, gnps.ORBITRAP_TEST_REF),
}
spectra_paths["gnps"]["qtof"] = {
    "test": (gnps.QTOF_TEST_QUERY, gnps.QTOF_TEST_REF),
}
spectra_paths["gnps"]["other"] = {
    "test": (gnps.OTHER_TEST_QUERY, gnps.OTHER_TEST_REF),
}

# Orbitrap training reference spectra, loaded once and reused by later cells.
# allow_pickle=True unpickles arbitrary objects: only load trusted files.
gnps_train_ref = np.load(gnps.ORBITRAP_TRAIN_REF, allow_pickle=True)

In [19]:
# Not used anywhere in this cell; kept in case other cells rely on the name.
replica_df_seq = []

# Load every saved Word2Vec model once, keyed by the same label that
# `spectra_paths` uses, so the loops below only need a dict lookup.
models = {
    desc: Word2Vec.load(metadata["model"])
    for desc, metadata in model_files.items()
}
show_progress_bar = False

# For every (database, label, split) combination: embed the query and
# reference spectra, score all query/reference pairs by cosine similarity,
# and compute a ROC curve over a saved sample of (row, col) pairs, where a
# pair is positive when the two SMILES values are equal.
#
# NOTE(review): the np.load calls below use allow_pickle=True, which
# unpickles arbitrary objects -- only point these paths at trusted files.
for db, db_metadata in spectra_paths.items():
    for desc, path_metadata in db_metadata.items():
        model = models[desc]
        for info, paths in path_metadata.items():
            print("-" * 40, f"{db}-{desc}-{info}", "-" * 40)
            query_path, ref_path = paths

            # Read the saved (row, col) sample first so that a missing
            # random_indices.npy fails before any embedding work is done.
            dir_ = ROC_DIR / f"{db}-{desc}-{info}"
            dir_.mkdir(parents=True, exist_ok=True)
            # The file stores a pickled dict in a 0-d array; .item() unwraps it.
            indices = np.load(dir_ / "random_indices.npy", allow_pickle=True).item()
            random_rows = indices["row"]
            random_cols = indices["col"]

            ref_spectra = np.load(ref_path, allow_pickle=True)
            query_spectra = np.load(query_path, allow_pickle=True)
            if db == "gnps" and desc == "orbitrap":
                # For GNPS orbitrap the training references are prepended to
                # the listed reference spectra.
                ref_spectra = np.hstack((gnps_train_ref, ref_spectra))

            query_embedding, query_smiles = embedding(model, query_spectra, show_progress_bar)
            ref_embedding, ref_smiles = embedding(model, ref_spectra, show_progress_bar)
            cosine_score = cosine_similarity(query_embedding, ref_embedding)

            # mask[i, j] is True when query i and reference j have equal SMILES.
            mask = np.equal(query_smiles.reshape(-1, 1), ref_smiles.reshape(1, -1))

            fpr, tpr, _ = roc_curve(
                mask[random_rows, random_cols],
                cosine_score[random_rows, random_cols],
            )
            print(auc(fpr, tpr))
            # Saved as a pickled dict; reading it back needs allow_pickle=True.
            np.save(dir_ / "Spec2Vec.npy", {"fpr": fpr, "tpr": tpr})

---------------------------------------- gnps-orbitrap-train ----------------------------------------
0.8529426563034559
---------------------------------------- gnps-orbitrap-test ----------------------------------------
0.8182689606625946
---------------------------------------- gnps-qtof-test ----------------------------------------
0.944074365820332
---------------------------------------- gnps-other-test ----------------------------------------
0.9245773544093007
