In [1]:
import sys
sys.path.append("../")
from pathlib import Path

from tqdm import tqdm
import pandas as pd
import numpy as np

from const import gnps, mona
from utils import embedding, most_similar, search_with_embedding, get_smiles, cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
spectra_paths = {
    "gnps":{
        "orbitrap": {
            "train": (gnps.ORBITRAP_TRAIN_QUERY, gnps.ORBITRAP_TEST_REF),
            "test": (gnps.ORBITRAP_TEST_QUERY, gnps.ORBITRAP_TEST_REF)
        },
        "qtof": {
            "test": (gnps.QTOF_TEST_QUERY, gnps.QTOF_TEST_REF)
        },
        "other": {
            "test": (gnps.OTHER_TEST_QUERY, gnps.OTHER_TEST_REF)
        }
    }
}

In [3]:
show_progress_bar = False
replica_suffix = "-replication-{}"
k_metric = [5, 1, 10]
batch_size = None

In [4]:
replica_df_seq = []

for i in tqdm(range(10)):
    df_seq = []
    for db, db_metadata in spectra_paths.items():
        for desc, path_metadata in db_metadata.items():
            for info, paths in path_metadata.items():
                print("-" * 40, f"{db}-{desc}-{info}", "-" * 40)
                query_path, ref_path = paths
                query_path = query_path.with_stem(query_path.stem + replica_suffix.format(i + 1))
                ref_path = ref_path.with_stem(ref_path.stem + replica_suffix.format(i + 1))
                if db == "gnps" and desc == "orbitrap":
                    if info == "train":
                        query_path = gnps.ORBITRAP_TRAIN_QUERY
                    
                    train_ref_spectra, train_ref_embedding = embedding(
                        str(gnps.ORBITRAP_TRAIN_REF.with_suffix(".mgf")),
                        show_progress_bar
                    )
                    ref_spectra, ref_embedding = embedding(
                        str(ref_path.with_suffix(".mgf")),
                        show_progress_bar
                    )
                    query_spectra, query_embedding = embedding(
                        str(query_path.with_suffix(".mgf")),
                        show_progress_bar
                    )
                    ref_spectra = np.hstack((train_ref_spectra, ref_spectra))
                    ref_embedding = np.concatenate((train_ref_embedding, ref_embedding))

                else:
                    ref_spectra, ref_embedding = embedding(
                        str(ref_path.with_suffix(".mgf")),
                        show_progress_bar,
                    )
                    query_spectra, query_embedding = embedding(
                        str(query_path.with_suffix(".mgf")),
                        show_progress_bar
                    )

                ref_smiles = get_smiles(ref_spectra)
                query_smiles = get_smiles(query_spectra)
                df = search_with_embedding(
                f"{db}-{desc}-{info}", k_metric,
                query_embedding, ref_embedding,
                query_smiles, ref_smiles,
                show_progress_bar, batch_size
                )
                df_seq.append(df)

    df = pd.concat(df_seq, axis=0)
    print(df)
    replica_df_seq.append(df)

  0%|          | 0/10 [00:00<?, ?it/s]

---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 10%|█         | 1/10 [09:29<1:25:24, 569.38s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.711283  0.835498  0.868048
gnps-orbitrap-test   0.720641  0.836892  0.866548
gnps-qtof-test       0.524468  0.708245  0.747340
gnps-other-test      0.746706  0.887614  0.914432
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 20%|██        | 2/10 [19:00<1:16:02, 570.30s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.712013  0.835061  0.868486
gnps-orbitrap-test   0.715896  0.844009  0.866548
gnps-qtof-test       0.521543  0.702793  0.742686
gnps-other-test      0.746861  0.887149  0.914897
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 30%|███       | 3/10 [28:21<1:06:02, 566.01s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.712743  0.834915  0.867757
gnps-orbitrap-test   0.720047  0.851127  0.877817
gnps-qtof-test       0.525000  0.700665  0.737633
gnps-other-test      0.739420  0.887304  0.913192
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 40%|████      | 4/10 [37:41<56:23, 563.83s/it]  

                         top1      top5     top10
gnps-orbitrap-train  0.712013  0.836082  0.867465
gnps-orbitrap-test   0.720641  0.835706  0.868327
gnps-qtof-test       0.528723  0.708644  0.747606
gnps-other-test      0.751976  0.892110  0.916292
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 50%|█████     | 5/10 [47:01<46:51, 562.28s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.711867  0.836228  0.867757
gnps-orbitrap-test   0.718268  0.844603  0.870107
gnps-qtof-test       0.523005  0.700798  0.740559
gnps-other-test      0.740040  0.883119  0.910557
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 60%|██████    | 6/10 [56:25<37:32, 563.08s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.711867  0.835644  0.867319
gnps-orbitrap-test   0.734282  0.854686  0.883749
gnps-qtof-test       0.528856  0.706915  0.743218
gnps-other-test      0.741280  0.885444  0.914122
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 70%|███████   | 7/10 [1:05:46<28:06, 562.17s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.711867  0.835644  0.868194
gnps-orbitrap-test   0.718268  0.840451  0.867141
gnps-qtof-test       0.519149  0.704122  0.740559
gnps-other-test      0.748101  0.886684  0.912417
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 80%|████████  | 8/10 [1:15:13<18:47, 563.95s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.712305  0.835207  0.867611
gnps-orbitrap-test   0.711744  0.829775  0.862989
gnps-qtof-test       0.522340  0.703723  0.743750
gnps-other-test      0.743916  0.888389  0.915517
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 90%|█████████ | 9/10 [1:24:41<09:24, 564.96s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.711867  0.835353  0.867611
gnps-orbitrap-test   0.714116  0.837485  0.864769
gnps-qtof-test       0.525798  0.702261  0.741489
gnps-other-test      0.741280  0.887769  0.913347
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


100%|██████████| 10/10 [1:34:07<00:00, 564.77s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.712597  0.835498  0.867902
gnps-orbitrap-test   0.716489  0.844603  0.874259
gnps-qtof-test       0.523803  0.701729  0.739761
gnps-other-test      0.739730  0.886839  0.913192





In [5]:
data = []
indices = replica_df_seq[0].index
columns = replica_df_seq[0].columns
for item in replica_df_seq:
    data.append([item.values])

In [6]:
data = np.concatenate(data, axis=0)
np.set_printoptions(precision=4, suppress=True)
np.mean(data, axis=0) * 100, np.std(data, axis=0) * 100

(array([[71.2042, 83.5513, 86.7815],
        [71.9039, 84.1934, 87.0225],
        [52.4269, 70.3989, 74.246 ],
        [74.3931, 88.7242, 91.3796]]),
 array([[0.0396, 0.0394, 0.0334],
        [0.5775, 0.7069, 0.6157],
        [0.2881, 0.2814, 0.3022],
        [0.4069, 0.2148, 0.1557]]))

In [7]:
pd.set_option('display.precision', 4)
mean_df = pd.DataFrame(np.mean(data, axis=0) * 100, index=indices, columns=columns)
std_df = pd.DataFrame(np.std(data, axis=0) * 100, index=indices, columns=columns)

In [8]:
mean_df.to_csv(f"./mean.tsv", sep='\t')
std_df.to_csv(f"./std.tsv", sep='\t')

In [2]:
show_progress_bar = False
k_metric = [5, 1, 10]
batch_size = None

dir_ = Path("/data1/xp/data/MSBert/MTBLS1572")
query_path = dir_ / "query.npy"
ref_path = dir_ / "ref.npy"

query_spectra, query_embedding = embedding(query_path.with_suffix(".mgf"), show_progress_bar)
ref_spectra, ref_embedding = embedding(ref_path.with_suffix(".mgf"), show_progress_bar)
query_smiles = get_smiles(query_spectra)
ref_smiles = get_smiles(ref_spectra)

search_with_embedding(
    "MTBLS1572", k_metric,
    query_embedding, ref_embedding,
    query_smiles, ref_smiles,
    show_progress_bar, batch_size
)

Unnamed: 0,top1,top5,top10
MTBLS1572,0.857143,1.0,1.0


In [4]:
cosine_score = cosine_similarity(
    query_embedding, ref_embedding
)
for i, j in enumerate(np.argmax(cosine_score, axis=1)):
    if i != j:
        print(f"{i}-th answer is [{ref_spectra[i].get("compound_name")}] but get {j}-th [{ref_spectra[j].get("compound_name")}]")

4-th answer is [Butyryl-L-carnitine] but get 11-th [Isovaleryl-L-carnitine]
17-th answer is [Pantothenic acid] but get 19-th [Propionyl-L-carnitine]
18-th answer is [Proline] but get 16-th [Ornithine]


In [6]:
cosine_score[4][4], cosine_score[4][11]

(0.4315794722035844, 0.6193784627802925)

In [2]:
show_progress_bar = True

orbitrap_ref_spectra, orbitrap_ref_embedding = embedding(
    str(mona.ORBITRAP_COMMON.with_suffix(".mgf")), 
    show_progress_bar
)
orbitrap_query_spectra, orbitrap_query_embedding = embedding(
    str(mona.ORBITRAP_UNIQUE.with_suffix(".mgf")), 
    show_progress_bar
)
qtof_ref_spectra, qtof_ref_embedding = embedding(
    str(mona.ORBITRAP_COMMON.with_suffix(".mgf")), 
    show_progress_bar
)
qtof_query_spectra, qtof_query_embedding = embedding(
    str(mona.QTOF_UNIQUE.with_suffix(".mgf")), 
    show_progress_bar
)

Computing DreaMS embedding: 100%|██████████| 3376/3376 [00:03<00:00, 980.10it/s]
Computing DreaMS embedding: 100%|██████████| 15655/15655 [00:16<00:00, 968.02it/s]
Computing DreaMS embedding: 100%|██████████| 3376/3376 [00:03<00:00, 954.85it/s]
Computing DreaMS embedding: 100%|██████████| 674/674 [00:00<00:00, 861.37it/s]


In [3]:
orbitrap_score, orbitrap_indices = most_similar(orbitrap_query_embedding, orbitrap_ref_embedding, 512, show_progress_bar)
qtof_score, qtof_indices = most_similar(qtof_query_embedding, qtof_ref_embedding, 512, show_progress_bar)

processing: 100%|██████████| 31/31 [00:05<00:00,  5.46it/s]
processing: 100%|██████████| 2/2 [00:00<00:00, 16.95it/s]


In [4]:
dir = Path("/data1/xp/code/specEmbedding/score_distribution/compound search/")
orbitrap_dir = dir / "orbitrap"
qtof_dir = dir / "qtof"

orbitrap_dir.mkdir(exist_ok=True, parents=True)
qtof_dir.mkdir(exist_ok=True, parents=True)

np.save(orbitrap_dir / "DreaMS_Score.npy", orbitrap_score)
np.save(orbitrap_dir / "DreaMS_Indices.npy", orbitrap_indices)
np.save(qtof_dir / "DreaMS_Score.npy", qtof_score)
np.save(qtof_dir / "DreaMS_Indices.npy", qtof_indices)

In [13]:
show_progress_bar = True
k_metric = [5, 1, 10]
batch_size = None

query_spectra, query_embedding = embedding(mona.ORBITRAP_COMMON.with_suffix(".mgf"), show_progress_bar)
ref_spectra, ref_embedding = embedding(gnps.ORBITRAP_ALL.with_suffix(".mgf"), show_progress_bar)
query_smiles = get_smiles(query_spectra)
ref_smiles = get_smiles(ref_spectra)

search_with_embedding(
    "Orbitrap Common", k_metric,
    query_embedding, ref_embedding,
    query_smiles, ref_smiles,
    show_progress_bar, batch_size
)

Computing DreaMS embedding: 100%|██████████| 3376/3376 [00:03<00:00, 937.08it/s]
Computing DreaMS embedding: 100%|██████████| 163952/163952 [02:49<00:00, 966.60it/s]
calculate hit and recall count: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Unnamed: 0,top1,top5,top10
Orbitrap Common,0.8178,0.878,0.9043


In [14]:
show_progress_bar = True
k_metric = [5, 1, 10]
batch_size = None

query_spectra, query_embedding = embedding(mona.QTOF_COMMON.with_suffix(".mgf"), show_progress_bar)
ref_spectra, ref_embedding = embedding(gnps.QTOF_ALL.with_suffix(".mgf"), show_progress_bar)
query_smiles = get_smiles(query_spectra)
ref_smiles = get_smiles(ref_spectra)

search_with_embedding(
    "QTOF Common", k_metric,
    query_embedding, ref_embedding,
    query_smiles, ref_smiles,
    show_progress_bar, batch_size
)

Computing DreaMS embedding: 100%|██████████| 7243/7243 [00:07<00:00, 976.21it/s] 
Computing DreaMS embedding: 100%|██████████| 44560/44560 [00:46<00:00, 968.13it/s] 
calculate hit and recall count: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]


Unnamed: 0,top1,top5,top10
QTOF Common,0.9873,0.9953,0.997
