In [1]:
import sys
sys.path.append("../")
from pathlib import Path

import torch
import pandas as pd
import numpy as np
from tqdm import tqdm

from const import gnps, mona
from type import TokenizerConfig
from data import Tokenizer
from utils import load_model, search, search_with_spectra, embedding, most_similar, cosine_similarity

In [2]:
# --- Retrieval settings shared by the evaluation cells below ---
show_progress_bar = False
loader_batch_size = 512
batch_size = None
k_metric = [5, 1, 10]

# Template appended to a file stem to select one replica; formatted with the replica number.
replica_suffix = "-replication-{}"

# --- Model: GPU index 1 when CUDA is available, CPU otherwise ---
device = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")
model = load_model(device)

# --- Tokenizer, built from the keyword-only config so argument order cannot be mixed up ---
tokenizer_config = TokenizerConfig(max_len=100, n_decimals=2, show_progress_bar=show_progress_bar)
tokenizer = Tokenizer(**tokenizer_config)

In [3]:
# Query/reference spectrum files for every evaluation split, nested as
# database -> subset -> split -> (query_path, reference_path).
# The subset keys look like instrument types ("orbitrap", "qtof") plus a
# catch-all "other" -- presumably; confirm against the `gnps` constants module.
spectra_paths = {
    "gnps":{
        "orbitrap": {
            # "train" and "test" share the same test reference file; the evaluation
            # loop stacks gnps_train_ref in front of it for both orbitrap splits.
            "train": (gnps.ORBITRAP_TRAIN_QUERY, gnps.ORBITRAP_TEST_REF),
            "test": (gnps.ORBITRAP_TEST_QUERY, gnps.ORBITRAP_TEST_REF)
        },
        "qtof": {
            "test": (gnps.QTOF_TEST_QUERY, gnps.QTOF_TEST_REF)
        },
        "other": {
            "test": (gnps.OTHER_TEST_QUERY, gnps.OTHER_TEST_REF)
        }
    },
}

# Orbitrap training reference spectra, loaded once and reused by every replica.
# NOTE(review): allow_pickle=True makes np.load unpickle stored objects, which can
# execute arbitrary code -- only load files from a trusted source.
gnps_train_ref = np.load(gnps.ORBITRAP_TRAIN_REF, allow_pickle=True)

In [4]:
# Evaluate every split once per replica and keep one metrics table per replica.
replica_df_seq = []
for replica in tqdm(range(10)):
    # Replica files are numbered from 1, the loop counter from 0.
    suffix = replica_suffix.format(replica + 1)
    df_seq = []
    for db, db_metadata in spectra_paths.items():
        for desc, path_metadata in db_metadata.items():
            for info, (query_path, ref_path) in path_metadata.items():
                label = f"{db}-{desc}-{info}"
                print("-" * 40, label, "-" * 40)

                # Point both paths at this replica's files.
                query_path = query_path.with_stem(query_path.stem + suffix)
                ref_path = ref_path.with_stem(ref_path.stem + suffix)

                if not (db == "gnps" and desc == "orbitrap"):
                    # Non-orbitrap splits are searched straight from the files on disk.
                    df_seq.append(
                        search(
                            label, tokenizer,
                            model, device,
                            query_path, ref_path,
                            k_metric,
                            loader_batch_size, batch_size,
                            show_progress_bar
                        )
                    )
                    continue

                # Orbitrap: the train queries are not replicated, so fall back to
                # the single un-suffixed query file for the "train" split.
                if info == "train":
                    query_path = gnps.ORBITRAP_TRAIN_QUERY

                ref_spectra = np.load(ref_path, allow_pickle=True)
                query_spectra = np.load(query_path, allow_pickle=True)
                # The reference library is the training references followed by
                # this replica's test references.
                ref_spectra = np.hstack((gnps_train_ref, ref_spectra))
                df_seq.append(
                    search_with_spectra(
                        label, tokenizer,
                        model, device,
                        query_spectra, ref_spectra,
                        k_metric,
                        loader_batch_size, batch_size,
                        show_progress_bar
                    )
                )
    replica_df = pd.concat(df_seq, axis=0)
    print(replica_df)
    replica_df_seq.append(replica_df)

  0%|          | 0/10 [00:00<?, ?it/s]

---------------------------------------- gnps-orbitrap-train ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-orbitrap-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-qtof-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-other-test ----------------------------------------
tokenize the query and reference data success


 10%|█         | 1/10 [03:58<35:47, 238.64s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.764268  0.883959  0.906437
gnps-orbitrap-test   0.780546  0.892645  0.912811
gnps-qtof-test       0.497074  0.663032  0.704122
gnps-other-test      0.748876  0.886064  0.907611
---------------------------------------- gnps-orbitrap-train ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-orbitrap-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-qtof-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-other-test ----------------------------------------
tokenize the query and reference data success


 20%|██        | 2/10 [07:55<31:43, 237.88s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.764122  0.884250  0.906583
gnps-orbitrap-test   0.786477  0.895611  0.918149
gnps-qtof-test       0.494814  0.665293  0.705452
gnps-other-test      0.751046  0.882499  0.908386
---------------------------------------- gnps-orbitrap-train ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-orbitrap-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-qtof-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-other-test ----------------------------------------
tokenize the query and reference data success


 30%|███       | 3/10 [11:49<27:29, 235.71s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.764852  0.883959  0.906437
gnps-orbitrap-test   0.762752  0.892052  0.908660
gnps-qtof-test       0.488165  0.660771  0.698537
gnps-other-test      0.739730  0.880794  0.903736
---------------------------------------- gnps-orbitrap-train ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-orbitrap-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-qtof-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-other-test ----------------------------------------
tokenize the query and reference data success


 40%|████      | 4/10 [15:49<23:44, 237.44s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.764560  0.883959  0.906583
gnps-orbitrap-test   0.775801  0.889680  0.912811
gnps-qtof-test       0.498936  0.666755  0.706516
gnps-other-test      0.758177  0.891180  0.913037
---------------------------------------- gnps-orbitrap-train ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-orbitrap-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-qtof-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-other-test ----------------------------------------
tokenize the query and reference data success


 50%|█████     | 5/10 [19:17<18:54, 226.86s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.764268  0.884250  0.906729
gnps-orbitrap-test   0.777580  0.891459  0.912811
gnps-qtof-test       0.491356  0.661436  0.703856
gnps-other-test      0.747016  0.885754  0.906836
---------------------------------------- gnps-orbitrap-train ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-orbitrap-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-qtof-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-other-test ----------------------------------------
tokenize the query and reference data success


 60%|██████    | 6/10 [22:47<14:45, 221.32s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.764560  0.883375  0.906729
gnps-orbitrap-test   0.782325  0.900356  0.918149
gnps-qtof-test       0.493484  0.664495  0.702261
gnps-other-test      0.747791  0.882499  0.903116
---------------------------------------- gnps-orbitrap-train ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-orbitrap-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-qtof-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-other-test ----------------------------------------
tokenize the query and reference data success


 70%|███████   | 7/10 [26:19<10:54, 218.11s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.764852  0.883959  0.906437
gnps-orbitrap-test   0.778173  0.887900  0.914591
gnps-qtof-test       0.492553  0.664229  0.699734
gnps-other-test      0.749186  0.884824  0.910867
---------------------------------------- gnps-orbitrap-train ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-orbitrap-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-qtof-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-other-test ----------------------------------------
tokenize the query and reference data success


 80%|████████  | 8/10 [29:48<07:10, 215.16s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.764414  0.884834  0.906875
gnps-orbitrap-test   0.780546  0.891459  0.910439
gnps-qtof-test       0.494548  0.665160  0.703059
gnps-other-test      0.748721  0.883894  0.909626
---------------------------------------- gnps-orbitrap-train ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-orbitrap-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-qtof-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-other-test ----------------------------------------
tokenize the query and reference data success


 90%|█████████ | 9/10 [33:19<03:33, 213.86s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.764560  0.884105  0.906875
gnps-orbitrap-test   0.771056  0.887307  0.909253
gnps-qtof-test       0.496809  0.660771  0.703989
gnps-other-test      0.747171  0.881104  0.907301
---------------------------------------- gnps-orbitrap-train ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-orbitrap-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-qtof-test ----------------------------------------
tokenize the query and reference data success
---------------------------------------- gnps-other-test ----------------------------------------
tokenize the query and reference data success


100%|██████████| 10/10 [36:47<00:00, 220.77s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.765582  0.884542  0.906437
gnps-orbitrap-test   0.785884  0.890273  0.912811
gnps-qtof-test       0.494016  0.660904  0.702261
gnps-other-test      0.744071  0.884049  0.907611





In [5]:
# Row and column labels are taken from the first replica and reused when the
# aggregated statistics are turned back into DataFrames.
indices, columns = replica_df_seq[0].index, replica_df_seq[0].columns

# Each replica's values are wrapped in a one-element list, giving every entry
# a leading length-1 axis to concatenate along later.
data = [[item.values] for item in replica_df_seq]

In [6]:
# Stack the per-replica metric tables into one array with a leading replica axis.
# The array is rebuilt from replica_df_seq instead of from `data` itself so that
# re-running this cell is idempotent: concatenating an already-stacked array
# along axis 0 again would merge the replica axis into the row axis and silently
# corrupt the mean/std computed from `data` afterwards.
data = np.stack([item.values for item in replica_df_seq], axis=0)

In [7]:
# Print arrays with two decimals and without scientific notation.
np.set_printoptions(precision=2, suppress=True)

# Mean and standard deviation over axis 0 of `data`, scaled to percent.
# np.std is called with its default ddof (0).
mean_pct = np.mean(data, axis=0) * 100
std_pct = np.std(data, axis=0) * 100
mean_pct, std_pct

(array([[76.46, 88.41, 90.66],
        [77.81, 89.19, 91.3 ],
        [49.42, 66.33, 70.3 ],
        [74.82, 88.43, 90.78]]),
 array([[0.04, 0.04, 0.02],
        [0.67, 0.36, 0.31],
        [0.29, 0.21, 0.23],
        [0.45, 0.29, 0.28]]))

In [8]:
pd.set_option('display.precision', 2)

# Wrap the percent-scaled mean and standard deviation (taken over axis 0 of
# `data`) in DataFrames that carry the original row and column labels.
mean_df, std_df = (
    pd.DataFrame(stat(data, axis=0) * 100, index=indices, columns=columns)
    for stat in (np.mean, np.std)
)

In [9]:
# Write both summary tables as tab-separated files next to the notebook.
for frame, target in ((mean_df, "./mean.tsv"), (std_df, "./std.tsv")):
    frame.to_csv(target, sep='\t')

In [None]:
# --- Settings (same values as the configuration cell near the top) ---
show_progress_bar = False
loader_batch_size = 512
batch_size = None
k_metric = [5, 1, 10]
replica_suffix = "-replication-{}"

# --- Model and tokenizer ---
device = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")
model = load_model(device)

tokenizer_config = TokenizerConfig(max_len=100, n_decimals=2, show_progress_bar=show_progress_bar)
tokenizer = Tokenizer(**tokenizer_config)

# --- MTBLS1572 query and reference spectra ---
# NOTE(review): allow_pickle=True unpickles stored objects; only load trusted files.
query_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/query.npy", allow_pickle=True)
ref_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/ref.npy", allow_pickle=True)

# The call is the cell's last expression, so its return value is displayed.
search_with_spectra(
    "MTBLS1572", tokenizer,
    model, device,
    query_spectra, ref_spectra,
    k_metric,
    512
)

tokenize the query and reference data success


calculate hit and recall count: 100%|██████████| 1/1 [00:03<00:00,  3.86s/it]


Unnamed: 0,top1,top5,top10
MTBLS1572,0.904762,1.0,1.0


In [2]:
# --- Settings (same values as the configuration cell near the top) ---
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model = load_model(device)

batch_size = None
k_metric = [5, 1, 10]
loader_batch_size = 512
show_progress_bar = False

replica_suffix = "-replication-{}"

tokenizer_config = TokenizerConfig(
    max_len=100,
    n_decimals=2,
    show_progress_bar=show_progress_bar
)
tokenizer = Tokenizer(**tokenizer_config)

# --- MTBLS1572 query and reference spectra ---
# NOTE(review): allow_pickle=True unpickles stored objects; only load trusted files.
query_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/query.npy", allow_pickle=True)
ref_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/ref.npy", allow_pickle=True)

# Only the first element returned by embedding() is used here.
query_embedding, _ = embedding(model, device, tokenizer, 512, query_spectra, show_progress_bar)
ref_embedding, _ = embedding(model, device, tokenizer, 512, ref_spectra, show_progress_bar)

cosine_score = cosine_similarity(
    query_embedding, ref_embedding
)

# Report every query whose best-scoring reference is not the reference at the
# same position (query i is expected to match reference i).
for i, j in enumerate(np.argmax(cosine_score, axis=1)):
    if i != j:
        # The lookups are hoisted out of the f-string: reusing the enclosing
        # quote character inside an f-string expression is a SyntaxError on
        # Python versions before 3.12.
        expected_name = ref_spectra[i].get("compound_name")
        retrieved_name = ref_spectra[j].get("compound_name")
        print(f"{i}-th answer is [{expected_name}] but get {j}-th [{retrieved_name}]")

4-th answer is [Butyryl-L-carnitine] but get 11-th [Isovaleryl-L-carnitine]
18-th answer is [Proline] but get 12-th [Leucine]


In [3]:
# For query 4, compare the score of the expected reference (same index) with
# the score of the reference that was ranked first instead (index 11).
query_idx, expected_idx, retrieved_idx = 4, 4, 11
cosine_score[query_idx][expected_idx], cosine_score[query_idx][retrieved_idx]

(0.4265302738145448, 0.4380454200931992)

In [2]:
show_progress_bar = True
# Build the tokenizer from the keyword-based config, as every other cell in this
# notebook does, instead of from bare positional numbers whose meaning depends
# on the constructor's parameter order.
tokenizer_config = TokenizerConfig(
    max_len=100,
    n_decimals=2,
    show_progress_bar=show_progress_bar
)
tokenizer = Tokenizer(**tokenizer_config)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = load_model(device)

# MoNA spectra per subset: the COMMON file is used as the reference library and
# the UNIQUE file as the query set.
# NOTE(review): allow_pickle=True unpickles stored objects; only load trusted files.
orbitrap_ref_spectra = np.load(mona.ORBITRAP_COMMON, allow_pickle=True)
orbitrap_query_spectra = np.load(mona.ORBITRAP_UNIQUE, allow_pickle=True)

qtof_ref_spectra = np.load(mona.QTOF_COMMON, allow_pickle=True)
qtof_query_spectra = np.load(mona.QTOF_UNIQUE, allow_pickle=True)

# Only the first element returned by embedding() is kept for each spectrum set.
orbitrap_ref_embedding, _ = embedding(model, device, tokenizer, 512, orbitrap_ref_spectra, show_progress_bar)
orbitrap_query_embedding, _ = embedding(model, device, tokenizer, 512, orbitrap_query_spectra, show_progress_bar)

qtof_ref_embedding, _ = embedding(model, device, tokenizer, 512, qtof_ref_spectra, show_progress_bar)
qtof_query_embedding, _ = embedding(model, device, tokenizer, 512, qtof_query_spectra, show_progress_bar)

100%|██████████| 3376/3376 [00:01<00:00, 3193.80it/s]
100%|██████████| 7/7 [00:01<00:00,  6.84it/s]
100%|██████████| 15655/15655 [00:04<00:00, 3625.97it/s]
100%|██████████| 31/31 [00:02<00:00, 11.12it/s]
100%|██████████| 7243/7243 [00:03<00:00, 1928.92it/s]
100%|██████████| 15/15 [00:01<00:00, 11.83it/s]
100%|██████████| 674/674 [00:00<00:00, 3504.86it/s]
100%|██████████| 2/2 [00:00<00:00, 16.53it/s]


In [3]:
# Run most_similar() on the query and reference embeddings of each subset; the
# two returned values are kept as the score and index arrays saved afterwards.
orbitrap_score, orbitrap_indices = most_similar(
    orbitrap_query_embedding,
    orbitrap_ref_embedding,
    512,
    show_progress_bar,
)
qtof_score, qtof_indices = most_similar(
    qtof_query_embedding,
    qtof_ref_embedding,
    512,
    show_progress_bar,
)

processing: 100%|██████████| 31/31 [00:05<00:00,  5.82it/s]
processing: 100%|██████████| 2/2 [00:00<00:00, 13.47it/s]


In [4]:
# Output root for the score distributions. It is named `score_root` rather than
# `dir` so the builtin dir() is not shadowed for the rest of the kernel session.
score_root = Path("/data1/xp/code/specEmbedding/score_distribution/compound search/")
orbitrap_dir = score_root / "orbitrap"
qtof_dir = score_root / "qtof"

# Create both output folders (and any missing parents); existing ones are fine.
for out_dir in (orbitrap_dir, qtof_dir):
    out_dir.mkdir(exist_ok=True, parents=True)

# Persist the score and index arrays for each subset.
np.save(orbitrap_dir / "MSBERT_Score.npy", orbitrap_score)
np.save(orbitrap_dir / "MSBERT_Indices.npy", orbitrap_indices)
np.save(qtof_dir / "MSBERT_Score.npy", qtof_score)
np.save(qtof_dir / "MSBERT_Indices.npy", qtof_indices)

In [6]:
# --- Retrieval settings for the cross-library searches below (progress bars on) ---
show_progress_bar = True
loader_batch_size = 512
batch_size = None
k_metric = [5, 1, 10]

# --- Model: GPU index 1 when CUDA is available, CPU otherwise ---
device = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")
model = load_model(device)

# --- Tokenizer built from the keyword-based config ---
tokenizer_config = TokenizerConfig(max_len=100, n_decimals=2, show_progress_bar=show_progress_bar)
tokenizer = Tokenizer(**tokenizer_config)

In [7]:
# Queries come from the MoNA orbitrap "common" file, references from the full
# GNPS orbitrap file.
# NOTE(review): allow_pickle=True unpickles stored objects; only load trusted files.
query_spectra = np.load(mona.ORBITRAP_COMMON, allow_pickle=True)
ref_spectra = np.load(gnps.ORBITRAP_ALL, allow_pickle=True)

# The call is the cell's last expression, so its return value is displayed.
search_with_spectra(
    "Orbitrap Common", tokenizer,
    model, device,
    query_spectra, ref_spectra,
    k_metric,
    512
)

100%|██████████| 3376/3376 [00:01<00:00, 2380.91it/s]
100%|██████████| 163952/163952 [00:59<00:00, 2760.64it/s]


tokenize the query and reference data success


calculate hit and recall count: 100%|██████████| 1/1 [00:03<00:00,  3.34s/it]


Unnamed: 0,top1,top5,top10
Orbitrap Common,0.80154,0.856043,0.885664


In [5]:
# Queries come from the MoNA QTOF "common" file, references from the full GNPS
# QTOF file.
# NOTE(review): allow_pickle=True unpickles stored objects; only load trusted files.
query_spectra = np.load(mona.QTOF_COMMON, allow_pickle=True)
ref_spectra = np.load(gnps.QTOF_ALL, allow_pickle=True)

# The call is the cell's last expression, so its return value is displayed.
search_with_spectra(
    "QTOF Common", tokenizer,
    model, device,
    query_spectra, ref_spectra,
    k_metric,
    512
)

tokenize the query and reference data success


calculate hit and recall count: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]


Unnamed: 0,top1,top5,top10
QTOF Common,0.984261,0.996134,0.996686
