In [1]:
import sys
sys.path.append("../")
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Word2Vec

from const import gnps, mona
from utils import embedding, search, search_with_spectra, load_model, most_similar, cosine_similarity

In [None]:
# Word2Vec checkpoint file for each instrument type. The keys are the same
# instrument names used one level down in `spectra_paths`.
model_files = {
    instrument: {"model": f"{instrument}.model"}
    for instrument in ("orbitrap", "qtof", "other")
}

# Evaluation splits, laid out as
#   database -> instrument -> split name -> (query path, reference path).
spectra_paths = {
    "gnps": {
        "orbitrap": {
            "train": (gnps.ORBITRAP_TRAIN_QUERY, gnps.ORBITRAP_TEST_REF),
            "test": (gnps.ORBITRAP_TEST_QUERY, gnps.ORBITRAP_TEST_REF),
        },
        "qtof": {"test": (gnps.QTOF_TEST_QUERY, gnps.QTOF_TEST_REF)},
        "other": {"test": (gnps.OTHER_TEST_QUERY, gnps.OTHER_TEST_REF)},
    },
}

# Orbitrap training reference spectra, loaded once up front.
# NOTE: allow_pickle=True unpickles the file's contents, so only point this at
# trusted data.
gnps_train_ref = np.load(gnps.ORBITRAP_TRAIN_REF, allow_pickle=True)

In [4]:
# Settings forwarded unchanged to the search helpers.
k_metric = [5, 1, 10]  # the k values to evaluate
batch_size, show_progress_bar = None, False

# Template for the per-replica file-name suffix; "{}" receives the replica number.
replica_suffix = "-replication-{}"

In [5]:
# Load every instrument-specific Word2Vec model once, so the replica loop
# below only has to look a model up by instrument name.
models = {
    instrument: Word2Vec.load(files["model"])
    for instrument, files in model_files.items()
}

# One concatenated result table per replica, in replica order.
replica_df_seq = []

for replica_idx in tqdm(range(10)):
    # Replica files are numbered from 1, hence the +1.
    suffix = replica_suffix.format(replica_idx + 1)
    replica_results = []

    for db, instruments in spectra_paths.items():
        for desc, splits in instruments.items():
            model = models[desc]
            for info, (query_base, ref_base) in splits.items():
                label = f"{db}-{desc}-{info}"
                print("-" * 40, label, "-" * 40)

                # Put the replica suffix between the file stem and its extension.
                query_path = query_base.with_stem(query_base.stem + suffix)
                ref_path = ref_base.with_stem(ref_base.stem + suffix)

                if (db, desc) != ("gnps", "orbitrap"):
                    # Every other combination is searched directly from the
                    # replica query/reference files.
                    result = search(
                        label, model,
                        query_path, ref_path,
                        k_metric,
                        show_progress_bar, batch_size
                    )
                else:
                    # The train split always queries with the un-suffixed
                    # training query file instead of a replica file.
                    if info == "train":
                        query_path = gnps.ORBITRAP_TRAIN_QUERY

                    ref_spectra = np.load(ref_path, allow_pickle=True)
                    query_spectra = np.load(query_path, allow_pickle=True)
                    # GNPS Orbitrap searches run against the training
                    # reference spectra followed by the replica reference.
                    ref_spectra = np.hstack((gnps_train_ref, ref_spectra))
                    result = search_with_spectra(
                        label, model,
                        query_spectra, ref_spectra,
                        k_metric,
                        show_progress_bar, batch_size
                    )
                replica_results.append(result)

    # Stack this replica's per-split rows into a single table.
    replica_df = pd.concat(replica_results, axis=0)
    print(replica_df)
    replica_df_seq.append(replica_df)

  0%|          | 0/10 [00:00<?, ?it/s]

---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 10%|█         | 1/10 [06:38<59:43, 398.19s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.698000  0.821340  0.847905
gnps-orbitrap-test   0.701661  0.814947  0.840451
gnps-qtof-test       0.432846  0.596011  0.644016
gnps-other-test      0.694001  0.817238  0.842195
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 20%|██        | 2/10 [13:16<53:08, 398.51s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.697708  0.821340  0.847613
gnps-orbitrap-test   0.698695  0.821471  0.843416
gnps-qtof-test       0.436569  0.596676  0.644149
gnps-other-test      0.694466  0.820028  0.844055
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 30%|███       | 3/10 [20:55<49:40, 425.74s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.698000  0.820464  0.847322
gnps-orbitrap-test   0.706999  0.822657  0.848754
gnps-qtof-test       0.431649  0.585372  0.631383
gnps-other-test      0.691831  0.816463  0.841110
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 40%|████      | 4/10 [28:25<43:32, 435.47s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.697416  0.820902  0.847759
gnps-orbitrap-test   0.708778  0.841044  0.858837
gnps-qtof-test       0.432846  0.596941  0.643351
gnps-other-test      0.698186  0.824833  0.847000
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 50%|█████     | 5/10 [35:49<36:33, 438.63s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.697416  0.821048  0.848197
gnps-orbitrap-test   0.703440  0.825030  0.845789
gnps-qtof-test       0.434043  0.585904  0.634574
gnps-other-test      0.692141  0.820803  0.845760
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 60%|██████    | 6/10 [43:25<29:37, 444.45s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.698292  0.821340  0.847468
gnps-orbitrap-test   0.711151  0.826809  0.856465
gnps-qtof-test       0.433910  0.592952  0.640559
gnps-other-test      0.688730  0.817083  0.842660
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 70%|███████   | 7/10 [50:54<22:17, 445.98s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.698584  0.820902  0.847322
gnps-orbitrap-test   0.711744  0.831554  0.852313
gnps-qtof-test       0.433378  0.591090  0.638431
gnps-other-test      0.696326  0.819873  0.842815
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 80%|████████  | 8/10 [58:28<14:57, 448.58s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.697416  0.820610  0.847468
gnps-orbitrap-test   0.698102  0.819692  0.846975
gnps-qtof-test       0.428457  0.587766  0.636702
gnps-other-test      0.692141  0.820803  0.843900
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


 90%|█████████ | 9/10 [1:05:58<07:29, 449.07s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.698584  0.820756  0.847613
gnps-orbitrap-test   0.699881  0.823250  0.845789
gnps-qtof-test       0.434574  0.593750  0.641090
gnps-other-test      0.693536  0.818478  0.847310
---------------------------------------- gnps-orbitrap-train ----------------------------------------
---------------------------------------- gnps-orbitrap-test ----------------------------------------
---------------------------------------- gnps-qtof-test ----------------------------------------
---------------------------------------- gnps-other-test ----------------------------------------


100%|██████████| 10/10 [1:13:32<00:00, 441.20s/it]

                         top1      top5     top10
gnps-orbitrap-train  0.698146  0.820464  0.847905
gnps-orbitrap-test   0.704033  0.829775  0.854686
gnps-qtof-test       0.434176  0.592819  0.639761
gnps-other-test      0.693691  0.818943  0.843900





In [6]:
# Row and column labels are taken from the first replica's table.
indices = replica_df_seq[0].index
columns = replica_df_seq[0].columns

# Stack the per-replica tables along a new leading axis:
# (replica, row, column).
data = np.stack([item.values for item in replica_df_seq], axis=0)

# Mean and standard deviation across replicas, scaled to percentages.
# np.std is called with its default ddof=0.
mean_pct = np.mean(data, axis=0) * 100
std_pct = np.std(data, axis=0) * 100

np.set_printoptions(precision=2, suppress=True)
print(mean_pct)
print(std_pct)

# Persist both summaries as tab-separated files next to the notebook.
mean_df = pd.DataFrame(mean_pct, index=indices, columns=columns)
std_df = pd.DataFrame(std_pct, index=indices, columns=columns)
mean_df.to_csv("./mean.tsv", sep='\t')
std_df.to_csv("./std.tsv", sep='\t')

[[69.8  82.09 84.77]
 [70.44 82.56 84.93]
 [43.32 59.19 63.94]
 [69.35 81.95 84.41]]
[[0.04 0.03 0.03]
 [0.48 0.69 0.57]
 [0.2  0.41 0.4 ]
 [0.25 0.23 0.19]]


In [7]:
# Evaluation settings for this search.
batch_size, show_progress_bar = None, False
k_metric = [5, 1, 10]

model = Word2Vec.load("./orbitrap.model")

# Load the query set first, then the reference set.
query_spectra, ref_spectra = (
    np.load(spectra_file, allow_pickle=True)
    for spectra_file in (
        "/data1/xp/data/MSBert/MTBLS1572/query.npy",
        "/data1/xp/data/MSBert/MTBLS1572/ref.npy",
    )
)

# Left as the bare last expression so the notebook displays the result.
search_with_spectra(
    "MTBLS1572", model,
    query_spectra, ref_spectra,
    k_metric, show_progress_bar, batch_size,
)

Unnamed: 0,top1,top5,top10
MTBLS1572,0.904762,1.0,1.0


In [2]:
model = Word2Vec.load("./orbitrap.model")
batch_size = None
k_metric = [5, 1, 10]
show_progress_bar = False

query_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/query.npy", allow_pickle=True)
ref_spectra = np.load("/data1/xp/data/MSBert/MTBLS1572/ref.npy", allow_pickle=True)

# Embed both sets with the progress bar switched off (third argument).
# The second return value of `embedding` is not needed here.
query_embedding, _ = embedding(model, query_spectra, False)
ref_embedding, _ = embedding(model, ref_spectra, False)

# One row per query, one column per reference (argmax below runs over axis 1).
cosine_score = cosine_similarity(query_embedding, ref_embedding)

# This check treats reference i as the expected match for query i and reports
# every query whose best-scoring reference has a different index.
for i, j in enumerate(np.argmax(cosine_score, axis=1)):
    if i != j:
        # Look the names up outside the f-string: reusing the enclosing double
        # quote inside a replacement field is a SyntaxError before Python 3.12.
        expected_name = ref_spectra[i].get("compound_name")
        found_name = ref_spectra[j].get("compound_name")
        print(f"{i}-th answer is [{expected_name}] but get {j}-th [{found_name}]")

4-th answer is [Butyryl-L-carnitine] but get 11-th [Isovaleryl-L-carnitine]
17-th answer is [Pantothenic acid] but get 19-th [Propionyl-L-carnitine]


In [3]:
# Query 4 (flagged in the recorded mismatch output): its score against the
# same-index reference 4, next to its score against reference 11, the index
# argmax selected for it.
cosine_score[4][4], cosine_score[4][11]

(0.7996484088097755, 0.9023208424890726)

In [3]:
show_progress_bar = True

model = load_model("./orbitrap.model")

# For each instrument, the MoNA "common" file is the reference set and the
# "unique" file is the query set.
orbitrap_ref_spectra = np.load(mona.ORBITRAP_COMMON, allow_pickle=True)
orbitrap_query_spectra = np.load(mona.ORBITRAP_UNIQUE, allow_pickle=True)

# The QTOF reference must come from the QTOF file. Loading
# mona.ORBITRAP_COMMON here would score QTOF queries against Orbitrap spectra
# embedded with the QTOF model.
# NOTE(review): this changes the saved QTOF scores/indices — confirm intent.
qtof_ref_spectra = np.load(mona.QTOF_COMMON, allow_pickle=True)
qtof_query_spectra = np.load(mona.QTOF_UNIQUE, allow_pickle=True)

# Orbitrap spectra are embedded with the Orbitrap model. The second return
# value of `embedding` is not needed here.
orbitrap_ref_embedding, _ = embedding(model, orbitrap_ref_spectra, show_progress_bar)
orbitrap_query_embedding, _ = embedding(model, orbitrap_query_spectra, show_progress_bar)

# Swap in the QTOF model before embedding the QTOF spectra.
model = load_model("./qtof.model")
qtof_ref_embedding, _ = embedding(model, qtof_ref_spectra, show_progress_bar)
qtof_query_embedding, _ = embedding(model, qtof_query_spectra, show_progress_bar)

get smiles, embedding: 100%|██████████| 3376/3376 [00:03<00:00, 859.04it/s] 
get smiles, embedding: 100%|██████████| 15655/15655 [00:06<00:00, 2422.93it/s]
get smiles, embedding: 100%|██████████| 3376/3376 [00:03<00:00, 914.11it/s] 
get smiles, embedding: 100%|██████████| 674/674 [00:00<00:00, 2244.17it/s]


In [4]:
# `most_similar` returns a (score, indices) pair for each query/reference
# embedding set.
# The third argument (512) is presumably a batch size — TODO confirm against
# utils.most_similar.
orbitrap_score, orbitrap_indices = most_similar(
    orbitrap_query_embedding,
    orbitrap_ref_embedding,
    512,
    show_progress_bar,
)
qtof_score, qtof_indices = most_similar(
    qtof_query_embedding,
    qtof_ref_embedding,
    512,
    show_progress_bar,
)

processing: 100%|██████████| 31/31 [00:05<00:00,  5.81it/s]
processing: 100%|██████████| 2/2 [00:00<00:00, 27.42it/s]


In [5]:
# Root folder for the saved results. Named `score_root` rather than `dir` so
# the builtin dir() is not shadowed for the rest of the kernel session.
score_root = Path("/data1/xp/code/specEmbedding/score_distribution/compound search/")
orbitrap_dir = score_root / "orbitrap"
qtof_dir = score_root / "qtof"

# Create the output folders (and any missing parents); existing folders are fine.
orbitrap_dir.mkdir(exist_ok=True, parents=True)
qtof_dir.mkdir(exist_ok=True, parents=True)

# Save the scores and indices for each instrument.
np.save(orbitrap_dir / "Spec2Vec_Score.npy", orbitrap_score)
np.save(orbitrap_dir / "Spec2Vec_Indices.npy", orbitrap_indices)
np.save(qtof_dir / "Spec2Vec_Score.npy", qtof_score)
np.save(qtof_dir / "Spec2Vec_Indices.npy", qtof_indices)

In [2]:
# Settings forwarded unchanged to the search helper calls that follow.
k_metric = [5, 1, 10]  # the k values to evaluate
batch_size, show_progress_bar = None, False

In [3]:
model = Word2Vec.load("./orbitrap.model")

# MoNA Orbitrap "common" spectra are the queries; the full GNPS Orbitrap set
# is the reference. The query file is loaded first, then the reference file.
query_spectra, ref_spectra = (
    np.load(spectra_file, allow_pickle=True)
    for spectra_file in (mona.ORBITRAP_COMMON, gnps.ORBITRAP_ALL)
)

# Left as the bare last expression so the notebook displays the result.
search_with_spectra(
    "Orbitrap Common", model,
    query_spectra, ref_spectra,
    k_metric, show_progress_bar, batch_size,
)

Unnamed: 0,top1,top5,top10
Orbitrap Common,0.76955,0.81487,0.84064


In [4]:
model = Word2Vec.load("./qtof.model")

# MoNA QTOF "common" spectra are the queries; the full GNPS QTOF set is the
# reference. The query file is loaded first, then the reference file.
query_spectra, ref_spectra = (
    np.load(spectra_file, allow_pickle=True)
    for spectra_file in (mona.QTOF_COMMON, gnps.QTOF_ALL)
)

# Left as the bare last expression so the notebook displays the result.
search_with_spectra(
    "QTOF Common", model,
    query_spectra, ref_spectra,
    k_metric, show_progress_bar, batch_size,
)

Unnamed: 0,top1,top5,top10
QTOF Common,0.983156,0.993235,0.994615
