In [1]:
import sys
sys.path.append("../")
from pathlib import Path

import numpy as np
from sklearn.metrics import roc_curve, auc

from const import gnps
from utils import embedding, cosine_similarity, get_smiles

ROC_DIR = Path("/data1/xp/code/specEmbedding/ROC")
ROC_DIR.mkdir(exist_ok=True, parents=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
spectra_paths = {
    "gnps":{
        "orbitrap": {
            "train": (gnps.ORBITRAP_TRAIN_QUERY, gnps.ORBITRAP_TEST_REF),
            "test": (gnps.ORBITRAP_TEST_QUERY, gnps.ORBITRAP_TEST_REF)
        },
        "qtof": {
            "test": (gnps.QTOF_TEST_QUERY, gnps.QTOF_TEST_REF)
        },
        "other": {
            "test": (gnps.OTHER_TEST_QUERY, gnps.OTHER_TEST_REF)
        }
    }
}

In [3]:
show_progress_bar = True
replica_suffix = "-replication-{}"
k_metric = [5, 1, 10]
batch_size = None

In [4]:
for db, db_metadata in spectra_paths.items():
    for desc, path_metadata in db_metadata.items():
        for info, paths in path_metadata.items():
            print("-" * 40, f"{db}-{desc}-{info}", "-" * 40)
            query_path, ref_path = paths
            if db == "gnps" and desc == "orbitrap":                
                train_ref_spectra, train_ref_embedding = embedding(
                    str(gnps.ORBITRAP_TRAIN_REF.with_suffix(".mgf")),
                    show_progress_bar
                )
                ref_spectra, ref_embedding = embedding(
                    str(ref_path.with_suffix(".mgf")),
                    show_progress_bar
                )
                query_spectra, query_embedding = embedding(
                    str(query_path.with_suffix(".mgf")),
                    show_progress_bar
                )
                ref_spectra = np.hstack((train_ref_spectra, ref_spectra))
                ref_embedding = np.concatenate((train_ref_embedding, ref_embedding))

            else:
                ref_spectra, ref_embedding = embedding(
                    str(ref_path.with_suffix(".mgf")),
                    show_progress_bar,
                )
                query_spectra, query_embedding = embedding(
                    str(query_path.with_suffix(".mgf")),
                    show_progress_bar
                )
            
            ref_smiles = get_smiles(ref_spectra)
            query_smiles = get_smiles(query_spectra)
            cosine_score = cosine_similarity(query_embedding, ref_embedding)
            mask = np.equal(query_smiles.reshape(-1, 1), ref_smiles.reshape(-1, 1).T)
            dir_ = ROC_DIR / f"{db}-{desc}-{info}"
            dir_.mkdir(parents=True, exist_ok=True)
            indices = np.load(dir_ / "random_indices.npy", allow_pickle=True).item()
            random_rows = indices["row"]
            random_cols = indices["col"]
            fpr, tpr, _ = roc_curve(mask[random_rows, random_cols], cosine_score[random_rows, random_cols])
            print(auc(fpr, tpr))
            np.save(dir_ / "DreaMS.npy", {"fpr": fpr, "tpr": tpr})

---------------------------------------- gnps-orbitrap-train ----------------------------------------


Computing DreaMS embedding: 100%|██████████| 122979/122979 [02:05<00:00, 983.01it/s] 
Computing DreaMS embedding: 100%|██████████| 32436/32436 [00:32<00:00, 983.30it/s] 
Computing DreaMS embedding: 100%|██████████| 6851/6851 [00:06<00:00, 984.51it/s] 


0.8381234949385583
---------------------------------------- gnps-orbitrap-test ----------------------------------------


Computing DreaMS embedding: 100%|██████████| 122979/122979 [02:03<00:00, 994.78it/s] 
Computing DreaMS embedding: 100%|██████████| 32436/32436 [00:32<00:00, 988.05it/s] 
Computing DreaMS embedding: 100%|██████████| 1686/1686 [00:01<00:00, 932.10it/s]


0.8394401668693443
---------------------------------------- gnps-qtof-test ----------------------------------------


Computing DreaMS embedding: 100%|██████████| 37040/37040 [00:37<00:00, 998.58it/s] 
Computing DreaMS embedding: 100%|██████████| 7520/7520 [00:07<00:00, 987.11it/s] 


0.9770307677765868
---------------------------------------- gnps-other-test ----------------------------------------


Computing DreaMS embedding: 100%|██████████| 44241/44241 [00:44<00:00, 997.35it/s] 
Computing DreaMS embedding: 100%|██████████| 6451/6451 [00:06<00:00, 986.71it/s] 


0.9466458857016649
