In [None]:
import torch

from utils import embedding
from train import ModelTester
from data import Tokenizer
from model import SiameseModel
from utils import read_raw_spectra, cosine_similarity

# Embed the query and reference spectra with the pretrained Siamese model,
# then compute cosine scores between the two sets of embeddings.
show_progress_bar = True

# The raw spectra get their own names so they are not overwritten by the
# embeddings below (q / r are reserved for the embedded representation).
q_spectra = read_raw_spectra("./q.msp")
r_spectra = read_raw_spectra("./r.msp")

# NOTE(review): the meaning of 100 is not visible in this notebook — confirm
# against data.Tokenizer before changing it.
tokenizer = Tokenizer(100, show_progress_bar)
device = torch.device("cpu")

# Hyperparameters must match the ones the checkpoint was trained with,
# otherwise load_state_dict below will reject the state dict.
model = SiameseModel(
    embedding_dim=512,
    n_head=16,
    n_layer=4,
    dim_feedward=512,
    dim_target=512,
    feedward_activation="selu"
)

# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source. map_location remaps stored tensors onto `device`.
model_state = torch.load("./model.ckpt", map_location=device)
model.load_state_dict(model_state)
tester = ModelTester(model, device, show_progress_bar)

# embedding() returns a pair; only the first element is used here.
# NOTE(review): the 512 passed here is presumably a batch size rather than the
# model width — confirm against utils.embedding.
q, _ = embedding(tester, tokenizer, 512, q_spectra, show_progress_bar)
r, _ = embedding(tester, tokenizer, 512, r_spectra, show_progress_bar)

cosine_scores = cosine_similarity(q, r)

tokenization: 100%|██████████| 21/21 [00:00<00:00, 4235.24it/s]
get smiles: 100%|██████████| 21/21 [00:00<00:00, 307973.37it/s]
embedding: 100%|██████████| 1/1 [00:00<00:00, 12.31it/s]
tokenization: 100%|██████████| 21/21 [00:00<00:00, 5186.08it/s]
get smiles: 100%|██████████| 21/21 [00:00<00:00, 248113.76it/s]
embedding: 100%|██████████| 1/1 [00:00<00:00, 22.72it/s]


In [None]:
import torch

from utils import embedding
from train import ModelTester
from data import Tokenizer
from model import SiameseModel
from utils import read_raw_spectra, cosine_similarity, top_k_indices

# Report, for every query spectrum, the reference spectrum whose embedding
# scores highest against it.
show_progress_bar = False
device = torch.device("cpu")

# --- inputs -----------------------------------------------------------------
# Raw spectra stay available under *_spectra; the report loop reads their
# 'smiles' entry after the embeddings have been computed.
q_spectra = read_raw_spectra("./q.msp")
r_spectra = read_raw_spectra("./r.msp")

# --- model ------------------------------------------------------------------
tokenizer = Tokenizer(100, show_progress_bar)
model = SiameseModel(
    embedding_dim=512,
    n_head=16,
    n_layer=4,
    dim_feedward=512,
    dim_target=512,
    feedward_activation="selu"
)
# Second positional argument of torch.load is map_location.
model_state = torch.load("./model.ckpt", device)
model.load_state_dict(model_state)
tester = ModelTester(model, device, show_progress_bar)

# --- embeddings and scores ----------------------------------------------------
# Only the first element of each returned pair is kept.
q, _ = embedding(tester, tokenizer, 512, q_spectra, show_progress_bar)
r, _ = embedding(tester, tokenizer, 512, r_spectra, show_progress_bar)
cosine_scores = cosine_similarity(q, r)

# --- top-1 report -----------------------------------------------------------
# k=1, so column 0 holds the single selected reference index per query row.
indices = top_k_indices(cosine_scores, 1)
best_reference = indices[:, 0]
for i, index in enumerate(best_reference):
    print(f"The {i}-th {q_spectra[i].get('smiles')} spectra most similar compound is {r_spectra[index].get('smiles')}")

tokenization: 100%|██████████| 21/21 [00:00<00:00, 4817.87it/s]
tokenization: 100%|██████████| 21/21 [00:00<00:00, 4623.64it/s]


The 0-th OC(=O)c1ccc(O)cc1 spectra most similar compound is OC(=O)c1ccc(O)cc1
The 1-th CSCC[C@H](NC(C)=O)C(O)=O spectra most similar compound is CSCC[C@H](NC(C)=O)C(O)=O
The 2-th OC(=O)CC(=CC(O)=O)C(O)=O spectra most similar compound is OC(=O)CC(=CC(O)=O)C(O)=O
The 3-th Nc1ccccc1C(O)=O spectra most similar compound is Nc1ccccc1C(O)=O
The 4-th CCCC(=O)O[C@H](CC([O-])=O)C[N+](C)(C)C spectra most similar compound is CCCC(=O)O[C@H](CC([O-])=O)C[N+](C)(C)C
The 5-th C[N+](C)(C)CC(O)CC([O-])=O spectra most similar compound is C[N+](C)(C)CC(O)CC([O-])=O
The 6-th C[N+](C)(C)CCO spectra most similar compound is C[N+](C)(C)CCO
The 7-th Nc1cc[nH]c(=O)n1 spectra most similar compound is Nc1cc[nH]c(=O)n1
The 8-th C1=CC(=C(C(=C1)O)O)C(=O)O spectra most similar compound is C1=CC(=C(C(=C1)O)O)C(=O)O
The 9-th CC1=CC2=C(C=C1C)N(C3=NC(=O)NC(=O)C3=N2)CC(C(C(COP(=O)(O)OP(=O)(O)OCC4C(C(C(O4)N5C=NC6=C(N=CN=C65)N)O)O)O)O)O spectra most similar compound is CC1=CC2=C(C=C1C)N(C3=NC(=O)NC(=O)C3=N2)CC(C(C(COP(=O)(O