#### Test

Top-k accuracy

In [1]:
import sys
sys.path.append("../")
from pathlib import Path

import torch
from tqdm import tqdm
import pandas as pd
import numpy as np

from train import ModelTester
from const import gnps, mona
from data import Tokenizer
from utils import embedding, search_with_spectra, search, load_transformer_model, most_similar, cosine_similarity

In [6]:
spectra_paths = {
    "gnps":{
        "orbitrap": {
            "train": (gnps.ORBITRAP_TRAIN_QUERY, gnps.ORBITRAP_TEST_REF),
            "test": (gnps.ORBITRAP_TEST_QUERY, gnps.ORBITRAP_TEST_REF)
        },
        "qtof": {
            "test": (gnps.QTOF_TEST_QUERY, gnps.QTOF_TEST_REF)
        },
        "other": {
            "test": (gnps.OTHER_TEST_QUERY, gnps.OTHER_TEST_REF)
        }
    }
}
gnps_train_ref = np.load(gnps.ORBITRAP_TRAIN_REF, allow_pickle=True)

In [7]:
show_progress_bar = False
is_augment = True
model_backbone = "transformer"
loss_type = "SupConWithTanimotoLoss"
replica_suffix = "-replication-{}"
k_metric = [5, 1, 10]
batch_size = None
loader_batch_size = 4096
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = Tokenizer(100, show_progress_bar)
model = load_transformer_model(device, loss_type, is_augment)

tester = ModelTester(model, device, show_progress_bar)

In [8]:
replica_df_seq = []

for i in tqdm(range(10)):
    df_seq = []
    for db, db_metadata in spectra_paths.items():
        for desc, path_metadata in db_metadata.items():
            for info, paths in path_metadata.items():
                print("-" * 40, f"{db}-{desc}-{info}", "-" * 40)
                query_path, ref_path = paths
                query_path = query_path.with_stem(query_path.stem + replica_suffix.format(i + 1))
                ref_path = ref_path.with_stem(ref_path.stem + replica_suffix.format(i + 1))
                if db == "gnps" and desc == "orbitrap":
                    if info == "train":
                        query_path = gnps.ORBITRAP_TRAIN_QUERY
                    
                    ref_spectra = np.load(ref_path, allow_pickle=True)
                    query_spectra = np.load(query_path, allow_pickle=True)
                    ref_spectra = np.hstack((gnps_train_ref, ref_spectra))
                    df = search_with_spectra(
                        f"{db}-{desc}-{info}", tester,
                        k_metric, tokenizer,
                        query_spectra, ref_spectra,
                        loader_batch_size,
                        show_progress_bar, batch_size
                    )
                else:
                    df = search(
                    f"{db}-{desc}-{info}", tester, 
                    k_metric, tokenizer,
                    query_path, ref_path, 
                    loader_batch_size,
                    show_progress_bar, 512
                )
                df_seq.append(df)
    df = pd.concat(df_seq, axis=0)
    print(df)
    replica_df_seq.append(df)

  0%|          | 0/10 [00:00<?, ?it/s]

---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 10%|█         | 1/10 [01:49<16:22, 109.15s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.809225  0.917822  0.941176
gnps-orbitrap-test   0.823843  0.937129  0.953144
gnps-qtof-test       0.500266  0.690293  0.749734
gnps-other-test      0.813517  0.939234  0.957371
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 20%|██        | 2/10 [03:37<14:31, 108.96s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.808933  0.918260  0.941176
gnps-orbitrap-test   0.816133  0.931791  0.951957
gnps-qtof-test       0.502793  0.684973  0.744548
gnps-other-test      0.812587  0.941869  0.960316
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 30%|███       | 3/10 [05:26<12:40, 108.71s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.808787  0.918844  0.941322
gnps-orbitrap-test   0.814947  0.928233  0.948399
gnps-qtof-test       0.498404  0.688564  0.743617
gnps-other-test      0.814602  0.933189  0.955666
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 40%|████      | 4/10 [07:14<10:50, 108.46s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.808349  0.917968  0.940739
gnps-orbitrap-test   0.823250  0.931198  0.948992
gnps-qtof-test       0.501330  0.692287  0.752261
gnps-other-test      0.810727  0.940319  0.961091
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 50%|█████     | 5/10 [09:02<09:02, 108.43s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.808641  0.918844  0.940739
gnps-orbitrap-test   0.816133  0.930605  0.949585
gnps-qtof-test       0.498404  0.688032  0.744681
gnps-other-test      0.808402  0.940319  0.958611
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 60%|██████    | 6/10 [10:53<07:16, 109.15s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.808057  0.918844  0.941176
gnps-orbitrap-test   0.819098  0.929419  0.947212
gnps-qtof-test       0.501862  0.690293  0.746543
gnps-other-test      0.811192  0.940784  0.958766
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 70%|███████   | 7/10 [12:51<05:36, 112.19s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.808203  0.917822  0.941031
gnps-orbitrap-test   0.819098  0.931198  0.948992
gnps-qtof-test       0.502128  0.686436  0.744681
gnps-other-test      0.811812  0.937374  0.956906
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 80%|████████  | 8/10 [14:41<03:42, 111.41s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.809663  0.918698  0.941322
gnps-orbitrap-test   0.815540  0.927046  0.946026
gnps-qtof-test       0.498271  0.691755  0.746941
gnps-other-test      0.808557  0.935669  0.955511
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 90%|█████████ | 9/10 [16:26<01:49, 109.48s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.807181  0.918406  0.941031
gnps-orbitrap-test   0.805457  0.922894  0.942467
gnps-qtof-test       0.498404  0.690426  0.745080
gnps-other-test      0.805612  0.938149  0.957216
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


100%|██████████| 10/10 [18:14<00:00, 109.42s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.808349  0.918406  0.940885
gnps-orbitrap-test   0.819098  0.933571  0.944840
gnps-qtof-test       0.499335  0.689761  0.744149
gnps-other-test      0.806077  0.935204  0.956596





In [9]:
data = []
indices = replica_df_seq[0].index
columns = replica_df_seq[0].columns
for item in replica_df_seq:
    data.append([item.values])

In [10]:
data = np.concatenate(data, axis=0)
np.set_printoptions(precision=5, suppress=True)
np.mean(data, axis=0) * 100, np.std(data, axis=0) * 100

(array([[80.85389, 91.83915, 94.10597],
        [81.72598, 93.03084, 94.81613],
        [50.01197, 68.92819, 74.62234],
        [81.03085, 93.82111, 95.7805 ]]),
 array([[0.06497, 0.03941, 0.02043],
        [0.48942, 0.36297, 0.30267],
        [0.16899, 0.21787, 0.26337],
        [0.28951, 0.26717, 0.17699]]))

In [11]:
pd.set_option('display.precision', 4)
mean_df = pd.DataFrame(np.mean(data, axis=0) * 100, index=indices, columns=columns)
std_df = pd.DataFrame(np.std(data, axis=0) * 100, index=indices, columns=columns)

In [12]:
suffix = ""
if is_augment:
    suffix = "_Augmentation"

mean_df.to_csv(f"./result/{model_backbone}_{loss_type}{suffix}_mean.tsv", sep='\t')
std_df.to_csv(f"./result/{model_backbone}_{loss_type}{suffix}_std.tsv", sep='\t')

In [4]:
show_progress_bar = False
is_augment = True
model_backbone = "transformer"
loss_type = "SupConWithTanimotoLoss"
replica_suffix = "-replication-{}"
k_metric = [5, 1, 10]
batch_size = None
loader_batch_size = 512
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
tokenizer = Tokenizer(100, show_progress_bar)
model = load_transformer_model(device, loss_type, is_augment)
tester = ModelTester(model, device, show_progress_bar)

query_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/query.npy", allow_pickle=True)
ref_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/ref.npy", allow_pickle=True)

search_with_spectra(
    "MTBLS1572", tester,
    k_metric, tokenizer,
    query_spectra, ref_spectra,
    loader_batch_size,
    show_progress_bar, batch_size
)

Unnamed: 0,top1,top5,top10
MTBLS1572,1.0,1.0,1.0


In [8]:
from utils import cosine_similarity

query_embedding, _ = embedding(
    tester, tokenizer,
    512, query_spectra,
    False
)

ref_embedding, _ = embedding(
    tester, tokenizer,
    512, ref_spectra,
    False
)
cosine_score = cosine_similarity(
    query_embedding, ref_embedding
)
for i, j in enumerate(np.argmax(cosine_score, axis=1)):
    if i != j:
        print(f"{i}-th answer is [{ref_spectra[i].get("compound_name")}] but get [{ref_spectra[j].get("compound_name")}]")

In [9]:
cosine_score[4][4]

0.7911037987449514

In [2]:
show_progress_bar = True
tokenizer = Tokenizer(100, show_progress_bar)

is_augment = True
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = load_transformer_model(device, "SupConWithTanimotoLoss", is_augment)
tester = ModelTester(model, device, show_progress_bar)

orbitrap_ref_spectra = np.load(mona.ORBITRAP_COMMON, allow_pickle=True)
orbitrap_query_spectra = np.load(mona.ORBITRAP_UNIQUE, allow_pickle=True)

qtof_ref_spectra = np.load(mona.ORBITRAP_COMMON, allow_pickle=True)
qtof_query_spectra = np.load(mona.QTOF_UNIQUE, allow_pickle=True)

orbitrap_ref_embedding, _ = embedding(tester, tokenizer, 512, orbitrap_ref_spectra, show_progress_bar)
orbitrap_query_embedding, _ = embedding(tester, tokenizer, 512, orbitrap_query_spectra,  show_progress_bar)

qtof_ref_embedding, _ = embedding(tester, tokenizer, 512, qtof_ref_spectra, show_progress_bar)
qtof_query_embedding, _ = embedding(tester, tokenizer, 512, qtof_query_spectra,  show_progress_bar)

tokenization: 100%|██████████| 3376/3376 [00:00<00:00, 5468.01it/s]
get smiles: 100%|██████████| 3376/3376 [00:00<00:00, 2793995.72it/s]
embedding: 100%|██████████| 7/7 [00:00<00:00,  9.90it/s]
tokenization: 100%|██████████| 15655/15655 [00:01<00:00, 8458.12it/s]
get smiles: 100%|██████████| 15655/15655 [00:00<00:00, 3501030.61it/s]
embedding: 100%|██████████| 31/31 [00:01<00:00, 24.81it/s]
tokenization: 100%|██████████| 3376/3376 [00:00<00:00, 7742.36it/s]
get smiles: 100%|██████████| 3376/3376 [00:00<00:00, 3367412.68it/s]
embedding: 100%|██████████| 7/7 [00:00<00:00, 26.28it/s]
tokenization: 100%|██████████| 674/674 [00:00<00:00, 8342.07it/s]
get smiles: 100%|██████████| 674/674 [00:00<00:00, 2935577.25it/s]
embedding: 100%|██████████| 2/2 [00:00<00:00, 37.29it/s]


In [3]:
orbitrap_score, orbitrap_indices = most_similar(orbitrap_query_embedding, orbitrap_ref_embedding, 512, show_progress_bar)
qtof_score, qtof_indices = most_similar(qtof_query_embedding, qtof_ref_embedding, 512, show_progress_bar)

processing: 100%|██████████| 31/31 [00:05<00:00,  5.84it/s]
processing: 100%|██████████| 2/2 [00:00<00:00, 17.34it/s]


In [4]:
dir = Path("/data1/xp/code/specEmbedding/score_distribution/compound search/")
orbitrap_dir = dir / "orbitrap"
qtof_dir = dir / "qtof"

orbitrap_dir.mkdir(exist_ok=True, parents=True)
qtof_dir.mkdir(exist_ok=True, parents=True)

np.save(orbitrap_dir / "SpecEmbedding_Score.npy", orbitrap_score)
np.save(orbitrap_dir / "SpecEmbedding_Indices.npy", orbitrap_indices)
np.save(qtof_dir / "SpecEmbedding_Score.npy", qtof_score)
np.save(qtof_dir / "SpecEmbedding_Indices.npy", qtof_indices)

In [19]:
show_progress_bar = True
is_augment = True
model_backbone = "transformer"
loss_type = "SupConWithTanimotoLoss"
k_metric = [5, 1, 10]
batch_size = None
loader_batch_size = 512
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
tokenizer = Tokenizer(100, show_progress_bar)
model = load_transformer_model(device, loss_type, is_augment)
tester = ModelTester(model, device, show_progress_bar)

In [20]:
query_spectra = np.load(mona.ORBITRAP_COMMON, allow_pickle=True)
ref_spectra = np.load(gnps.ORBITRAP_ALL, allow_pickle=True)

search_with_spectra(
    "Orbitrap Common", tester,
    k_metric, tokenizer,
    query_spectra, ref_spectra,
    loader_batch_size,
    show_progress_bar, batch_size
)

tokenization: 100%|██████████| 3376/3376 [00:00<00:00, 7387.69it/s]
get smiles: 100%|██████████| 3376/3376 [00:00<00:00, 2351373.35it/s]
embedding: 100%|██████████| 7/7 [00:00<00:00, 23.60it/s]
tokenization: 100%|██████████| 163952/163952 [00:18<00:00, 9034.35it/s]
get smiles: 100%|██████████| 163952/163952 [00:00<00:00, 1379223.99it/s]
embedding: 100%|██████████| 321/321 [00:14<00:00, 21.74it/s]
calculate hit and recall count: 100%|██████████| 1/1 [00:02<00:00,  2.86s/it]


Unnamed: 0,top1,top5,top10
Orbitrap Common,0.8557,0.918,0.9375


In [21]:
query_spectra = np.load(mona.QTOF_COMMON, allow_pickle=True)
ref_spectra = np.load(gnps.QTOF_ALL, allow_pickle=True)

search_with_spectra(
    "QTOF Common", tester,
    k_metric, tokenizer,
    query_spectra, ref_spectra,
    loader_batch_size,
    show_progress_bar, batch_size
)

tokenization: 100%|██████████| 7243/7243 [00:01<00:00, 5954.90it/s]
get smiles: 100%|██████████| 7243/7243 [00:00<00:00, 2614401.37it/s]
embedding: 100%|██████████| 15/15 [00:00<00:00, 23.15it/s]
tokenization: 100%|██████████| 44560/44560 [00:07<00:00, 5845.26it/s]
get smiles: 100%|██████████| 44560/44560 [00:00<00:00, 1460678.42it/s]
embedding: 100%|██████████| 88/88 [00:03<00:00, 22.33it/s]
calculate hit and recall count: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]


Unnamed: 0,top1,top5,top10
QTOF Common,0.9861,0.9952,0.9982
