In [1]:
import sys
sys.path.append("../")
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm import tqdm

from const import gnps, mona
from utils import search_with_spectra, load_model, most_similar, cosine_similarity

In [2]:
show_progress_bar = False
model = load_model()

batch_size = None
k_metric = [5, 1, 10]
loader_batch_size = 512

replica_suffix = "-replication-{}"

In [3]:
spectra_paths = {
    "gnps":{
        "orbitrap": {
            "train": (gnps.ORBITRAP_TRAIN_QUERY, gnps.ORBITRAP_TEST_REF),
            "test": (gnps.ORBITRAP_TEST_QUERY, gnps.ORBITRAP_TEST_REF)
        },
        "qtof": {
            "test": (gnps.QTOF_TEST_QUERY, gnps.QTOF_TEST_REF)
        },
        "other": {
            "test": (gnps.OTHER_TEST_QUERY, gnps.OTHER_TEST_REF)
        }
    },
}

gnps_train_ref = np.load(gnps.ORBITRAP_TRAIN_REF, allow_pickle=True)

In [4]:
replica_df_seq = []
for i in tqdm(range(10)):
    df_seq = []
    for db, db_metadata in spectra_paths.items():
        for desc, path_metadata in db_metadata.items():
            for info, paths in path_metadata.items():
                print("-" * 40, f"{db}-{desc}-{info}", "-" * 40)
                query_path, ref_path = paths
                query_path = query_path.with_stem(query_path.stem + replica_suffix.format(i + 1))
                ref_path = ref_path.with_stem(ref_path.stem + replica_suffix.format(i + 1))
                if info == "train":
                    query_path = gnps.ORBITRAP_TRAIN_QUERY
                
                ref_spectra = np.load(ref_path, allow_pickle=True)
                query_spectra = np.load(query_path, allow_pickle=True)
                if desc == "orbitrap":
                    ref_spectra = np.hstack((gnps_train_ref, ref_spectra))
                
                df = search_with_spectra(
                    f"{db}-{desc}-{info}", model, 
                    query_spectra, ref_spectra,
                    k_metric, batch_size,
                    show_progress_bar
                )
                df_seq.append(df)
    df = pd.concat(df_seq, axis=0)
    print(df)
    replica_df_seq.append(df)

  0%|          | 0/10 [00:00<?, ?it/s]

---------------------------------------- gnps-orbitrap-train ----------------------------------------


6851it [00:11, 597.07it/s]
155415it [03:26, 751.57it/s]


---------------------------------------- gnps-orbitrap-test ----------------------------------------


1686it [00:03, 549.86it/s]
155415it [03:28, 746.77it/s]


---------------------------------------- gnps-qtof-test ----------------------------------------


7520it [00:09, 758.98it/s] 
37040it [00:50, 736.02it/s]


---------------------------------------- gnps-other-test ----------------------------------------


6451it [00:08, 742.03it/s] 
44241it [01:01, 721.20it/s]
 10%|█         | 1/10 [09:44<1:27:37, 584.20s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.652459  0.798570  0.838126
gnps-orbitrap-test   0.641756  0.794187  0.832740
gnps-qtof-test       0.388165  0.555984  0.604122
gnps-other-test      0.507363  0.662223  0.711363
---------------------------------------- gnps-orbitrap-train ----------------------------------------


6851it [00:10, 641.55it/s]
155415it [03:49, 676.04it/s]


---------------------------------------- gnps-orbitrap-test ----------------------------------------


1686it [00:02, 617.84it/s]
155415it [03:18, 783.84it/s]


---------------------------------------- gnps-qtof-test ----------------------------------------


7520it [00:16, 444.23it/s]
37040it [01:13, 505.13it/s]


---------------------------------------- gnps-other-test ----------------------------------------


6451it [00:15, 415.10it/s]
44241it [01:30, 487.80it/s]
 20%|██        | 2/10 [20:45<1:23:54, 629.28s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.652751  0.799007  0.838272
gnps-orbitrap-test   0.625741  0.791815  0.830368
gnps-qtof-test       0.386835  0.553059  0.602793
gnps-other-test      0.505968  0.662843  0.709502
---------------------------------------- gnps-orbitrap-train ----------------------------------------


6851it [00:12, 558.75it/s] 
155415it [05:12, 497.31it/s]


---------------------------------------- gnps-orbitrap-test ----------------------------------------


1686it [00:03, 453.90it/s]
155415it [05:19, 485.69it/s]


---------------------------------------- gnps-qtof-test ----------------------------------------


7520it [00:17, 423.12it/s]
37040it [01:18, 472.40it/s]


---------------------------------------- gnps-other-test ----------------------------------------


6451it [00:15, 427.95it/s]
44241it [01:25, 517.95it/s]
 30%|███       | 3/10 [35:14<1:26:11, 738.82s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.653627  0.798716  0.837250
gnps-orbitrap-test   0.624555  0.794187  0.832740
gnps-qtof-test       0.384176  0.552394  0.604521
gnps-other-test      0.504263  0.659743  0.706712
---------------------------------------- gnps-orbitrap-train ----------------------------------------


6851it [00:16, 403.29it/s]
155415it [05:17, 488.77it/s]


---------------------------------------- gnps-orbitrap-test ----------------------------------------


1686it [00:03, 449.19it/s]
155415it [05:16, 491.81it/s]


---------------------------------------- gnps-qtof-test ----------------------------------------


7520it [00:15, 473.48it/s]
37040it [01:18, 474.64it/s]


---------------------------------------- gnps-other-test ----------------------------------------


6451it [00:14, 432.13it/s]
44241it [01:33, 474.33it/s]
 40%|████      | 4/10 [49:54<1:19:27, 794.63s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.652314  0.799007  0.838126
gnps-orbitrap-test   0.640569  0.788256  0.826809
gnps-qtof-test       0.377394  0.552128  0.601862
gnps-other-test      0.512944  0.665943  0.714773
---------------------------------------- gnps-orbitrap-train ----------------------------------------


6851it [00:09, 729.15it/s]
155415it [04:30, 573.54it/s] 


---------------------------------------- gnps-orbitrap-test ----------------------------------------


1686it [00:02, 592.43it/s]
155415it [03:35, 722.25it/s] 


---------------------------------------- gnps-qtof-test ----------------------------------------


7520it [00:11, 665.13it/s]
37040it [00:48, 758.99it/s] 


---------------------------------------- gnps-other-test ----------------------------------------


6451it [00:10, 625.15it/s]
44241it [01:04, 683.40it/s]
 50%|█████     | 5/10 [1:00:51<1:02:05, 745.18s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.652168  0.797986  0.838126
gnps-orbitrap-test   0.631673  0.791222  0.833333
gnps-qtof-test       0.376330  0.546941  0.601729
gnps-other-test      0.496822  0.653542  0.703612
---------------------------------------- gnps-orbitrap-train ----------------------------------------


6851it [00:09, 703.47it/s] 
155415it [03:40, 704.87it/s]


---------------------------------------- gnps-orbitrap-test ----------------------------------------


1686it [00:02, 673.51it/s]
155415it [03:36, 718.17it/s] 


---------------------------------------- gnps-qtof-test ----------------------------------------


7520it [00:12, 610.76it/s]
37040it [00:51, 716.87it/s]


---------------------------------------- gnps-other-test ----------------------------------------


6451it [00:10, 612.50it/s]
44241it [01:04, 681.83it/s]
 60%|██████    | 6/10 [1:11:04<46:40, 700.01s/it]  

                         top1      top5     top10
gnps-orbitrap-train  0.652605  0.798424  0.837688
gnps-orbitrap-test   0.631673  0.795374  0.842230
gnps-qtof-test       0.381782  0.549601  0.602660
gnps-other-test      0.511084  0.664548  0.711363
---------------------------------------- gnps-orbitrap-train ----------------------------------------


6851it [00:07, 902.35it/s]
155415it [03:43, 695.93it/s]


---------------------------------------- gnps-orbitrap-test ----------------------------------------


1686it [00:03, 518.62it/s]
155415it [04:14, 611.35it/s]


---------------------------------------- gnps-qtof-test ----------------------------------------


7520it [00:12, 615.19it/s]
37040it [00:52, 707.65it/s]


---------------------------------------- gnps-other-test ----------------------------------------


6451it [00:10, 640.64it/s]
44241it [01:02, 708.03it/s]
 70%|███████   | 7/10 [1:21:51<34:08, 682.82s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.652022  0.799591  0.838418
gnps-orbitrap-test   0.645907  0.803677  0.845789
gnps-qtof-test       0.385106  0.552793  0.604521
gnps-other-test      0.501473  0.656177  0.701287
---------------------------------------- gnps-orbitrap-train ----------------------------------------


6851it [00:07, 871.20it/s]
155415it [03:18, 781.96it/s] 


---------------------------------------- gnps-orbitrap-test ----------------------------------------


1686it [00:03, 486.85it/s]
155415it [03:15, 795.37it/s]


---------------------------------------- gnps-qtof-test ----------------------------------------


7520it [00:10, 739.28it/s]
37040it [00:52, 699.13it/s]


---------------------------------------- gnps-other-test ----------------------------------------


6451it [00:09, 672.79it/s]
44241it [01:04, 689.36it/s] 
 80%|████████  | 8/10 [1:31:13<21:28, 644.41s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.652751  0.797840  0.838564
gnps-orbitrap-test   0.633452  0.781732  0.828588
gnps-qtof-test       0.382979  0.549202  0.599069
gnps-other-test      0.514339  0.665168  0.715548
---------------------------------------- gnps-orbitrap-train ----------------------------------------


6851it [00:07, 889.98it/s]
155415it [04:17, 602.91it/s]


---------------------------------------- gnps-orbitrap-test ----------------------------------------


1686it [00:02, 574.53it/s]
155415it [03:37, 715.20it/s]


---------------------------------------- gnps-qtof-test ----------------------------------------


7520it [00:11, 649.55it/s]
37040it [00:54, 675.36it/s]


---------------------------------------- gnps-other-test ----------------------------------------


6451it [00:10, 621.50it/s]
44241it [01:03, 701.08it/s]
 90%|█████████ | 9/10 [1:42:00<10:45, 645.27s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.651876  0.799299  0.838126
gnps-orbitrap-test   0.622776  0.776394  0.828588
gnps-qtof-test       0.384043  0.550399  0.600665
gnps-other-test      0.508603  0.660363  0.706402
---------------------------------------- gnps-orbitrap-train ----------------------------------------


6851it [00:11, 622.00it/s]
155415it [02:34, 1006.89it/s]


---------------------------------------- gnps-orbitrap-test ----------------------------------------


1686it [00:01, 1173.50it/s]
155415it [02:19, 1116.09it/s]


---------------------------------------- gnps-qtof-test ----------------------------------------


7520it [00:05, 1300.57it/s]
37040it [00:32, 1152.05it/s]


---------------------------------------- gnps-other-test ----------------------------------------


6451it [00:06, 1051.58it/s]
44241it [00:40, 1084.50it/s]
100%|██████████| 10/10 [1:48:53<00:00, 653.31s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.652168  0.798716  0.838272
gnps-orbitrap-test   0.625148  0.781139  0.823843
gnps-qtof-test       0.380718  0.544548  0.597074
gnps-other-test      0.502868  0.654627  0.704077





In [5]:
data = []
indices = replica_df_seq[0].index
columns = replica_df_seq[0].columns
for item in replica_df_seq:
    data.append([item.values])
data = np.concatenate(data, axis=0)
np.set_printoptions(precision=2, suppress=True)
np.mean(data, axis=0) * 100, np.std(data, axis=0) * 100

(array([[65.25, 79.87, 83.81],
        [63.23, 78.98, 83.25],
        [38.28, 55.07, 60.19],
        [50.66, 66.05, 70.85]]),
 array([[0.05, 0.05, 0.04],
        [0.77, 0.77, 0.64],
        [0.36, 0.31, 0.23],
        [0.52, 0.42, 0.46]]))

In [6]:
pd.set_option('display.precision', 2)
mean_df = pd.DataFrame(np.mean(data, axis=0) * 100, index=indices, columns=columns)
std_df = pd.DataFrame(np.std(data, axis=0) * 100, index=indices, columns=columns)
mean_df.to_csv("./mean.tsv", sep='\t')
std_df.to_csv("./std.tsv", sep='\t')

In [2]:
show_progress_bar = False
model = load_model()

batch_size = None
k_metric = [5, 1, 10]
loader_batch_size = 512

query_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/query.npy", allow_pickle=True)
ref_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/ref.npy", allow_pickle=True)

search_with_spectra(
    "MTBLS1572",
    model,
    query_spectra,
    ref_spectra,
    k_metric,
    batch_size,
    show_progress_bar
)

21it [00:00, 26.32it/s]
21it [00:00, 1319.28it/s]


Unnamed: 0,top1,top5,top10
MTBLS1572,0.809524,1.0,1.0


In [3]:
show_progress_bar = False
model = load_model()

batch_size = None
k_metric = [5, 1, 10]
loader_batch_size = 512

query_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/query.npy", allow_pickle=True)
ref_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/ref.npy", allow_pickle=True)

query_embedding = model.get_embedding_array(query_spectra)
ref_embedding = model.get_embedding_array(ref_spectra)

cosine_score = cosine_similarity(
    query_embedding, ref_embedding
)
for i, j in enumerate(np.argmax(cosine_score, axis=1)):
    if i != j:
        print(f"{i}-th answer is [{ref_spectra[i].get("compound_name")}] but get {j}-th [{ref_spectra[j].get("compound_name")}]")

21it [00:00, 1425.43it/s]
21it [00:00, 1495.04it/s]

4-th answer is [Butyryl-L-carnitine] but get 11-th [Isovaleryl-L-carnitine]
17-th answer is [Pantothenic acid] but get 19-th [Propionyl-L-carnitine]
18-th answer is [Proline] but get 7-th [Cytosine]
19-th answer is [Propionyl-L-carnitine] but get 11-th [Isovaleryl-L-carnitine]





In [4]:
cosine_score[4][4], cosine_score[4][11]

(0.7985360549130454, 0.966494467784608)

In [9]:
from utils import cosine_similarity

cosine_score = cosine_similarity(query_embedding, ref_embedding)

In [10]:
np.argsort(cosine_score, axis=1)[:, ::-1][:, 0]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 16, 17, 20])

In [11]:
cosine_score[[18, 19], :]

array([[0.35, 0.31, 0.18, 0.2 , 0.19, 0.38, 0.48, 0.32, 0.11, 0.17, 0.56,
        0.2 , 0.67, 0.14, 0.15, 0.2 , 0.87, 0.58, 0.54, 0.31, 0.29],
       [0.3 , 0.48, 0.24, 0.32, 0.7 , 0.67, 0.59, 0.45, 0.17, 0.08, 0.68,
        0.72, 0.58, 0.18, 0.09, 0.16, 0.54, 0.98, 0.49, 0.85, 0.5 ]])

In [2]:
model = load_model()

orbitrap_ref_spectra = np.load(mona.ORBITRAP_COMMON, allow_pickle=True)
orbitrap_query_spectra = np.load(mona.ORBITRAP_UNIQUE, allow_pickle=True)

qtof_ref_spectra = np.load(mona.ORBITRAP_COMMON, allow_pickle=True)
qtof_query_spectra = np.load(mona.QTOF_UNIQUE, allow_pickle=True)

orbitrap_ref_embedding = model.get_embedding_array(orbitrap_ref_spectra)
orbitrap_query_embedding = model.get_embedding_array(orbitrap_query_spectra)

qtof_ref_embedding = model.get_embedding_array(qtof_ref_spectra)
qtof_query_embedding = model.get_embedding_array(qtof_query_spectra)

3376it [00:01, 1744.15it/s]
15655it [00:06, 2546.51it/s]
3376it [00:01, 2522.50it/s]
674it [00:00, 2528.02it/s]


In [3]:
orbitrap_score, orbitrap_indices = most_similar(orbitrap_query_embedding, orbitrap_ref_embedding, 512)
qtof_score, qtof_indices = most_similar(qtof_query_embedding, qtof_ref_embedding, 512)

processing: 100%|██████████| 31/31 [00:05<00:00,  5.82it/s]
processing: 100%|██████████| 2/2 [00:00<00:00, 25.84it/s]


In [4]:
dir = Path("/data1/xp/code/specEmbedding/score_distribution/compound search/")
orbitrap_dir = dir / "orbitrap"
qtof_dir = dir / "qtof"

orbitrap_dir.mkdir(exist_ok=True, parents=True)
qtof_dir.mkdir(exist_ok=True, parents=True)

np.save(orbitrap_dir / "MS2DeepScore_Score.npy", orbitrap_score)
np.save(orbitrap_dir / "MS2DeepScore_Indices.npy", orbitrap_indices)
np.save(qtof_dir / "MS2DeepScore_Score.npy", qtof_score)
np.save(qtof_dir / "MS2DeepScore_Indices.npy", qtof_indices)

In [5]:
show_progress_bar = False
model = load_model()

batch_size = None
k_metric = [5, 1, 10]
loader_batch_size = 512

In [6]:
query_spectra = np.load(mona.ORBITRAP_COMMON, allow_pickle=True)
ref_spectra = np.load(gnps.ORBITRAP_ALL, allow_pickle=True)

search_with_spectra(
    "Orbitrap Common",
    model,
    query_spectra,
    ref_spectra,
    k_metric,
    batch_size,
    show_progress_bar
)

3376it [00:01, 2039.29it/s]
163952it [01:09, 2374.03it/s]


Unnamed: 0,top1,top5,top10
Orbitrap Common,0.746149,0.784064,0.80628


In [7]:
query_spectra = np.load(mona.QTOF_COMMON, allow_pickle=True)
ref_spectra = np.load(gnps.QTOF_ALL, allow_pickle=True)

search_with_spectra(
    "QTOF Common",
    model,
    query_spectra,
    ref_spectra,
    k_metric,
    batch_size,
    show_progress_bar
)

7243it [00:03, 2408.12it/s]
44560it [00:19, 2236.87it/s]


Unnamed: 0,top1,top5,top10
QTOF Common,0.982052,0.993511,0.994754
