In [1]:
# %%
import warnings
# Silence two known PyTorch warnings (the torch.load `weights_only=False`
# FutureWarning and the nested-tensor prototype UserWarning) so they do not
# flood the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning, message="You are using `torch.load` with `weights_only=False`")
warnings.filterwarnings("ignore", category=UserWarning, message="The PyTorch API of nested tensors is in prototype stage and will change in the near future.")



import sys, os

# Read from the environment; None when MODEL_NUMBER is unset. It is not
# referenced again in the cells visible in this notebook.
model_number = os.environ.get("MODEL_NUMBER")
# Put the Spectre checkout first on sys.path -- presumably so the `datasets`
# and `inference` imports below resolve to it. This must run before them.
sys.path.insert(0,"/root/gurusmart/MorganFP_prediction/reproduce_previous_works/Spectre")

import torch
# Print tensors with 10 decimal places.
torch.set_printoptions(precision=10)


import yaml
# Request the 'high' float32 matmul precision setting.
torch.set_float32_matmul_precision('high')
from pathlib import Path

from rdkit import Chem
from rdkit.Chem import Draw
# Configure the fingerprint loader shared by the cells below.
from datasets.dataset_utils import  fp_loader_configer

# Select the "Hash_Entropy" fingerprint version and keep a short alias to the
# resulting loader.
fp_loader_configer.select_version("Hash_Entropy")
fp_loader = fp_loader_configer.fp_loader

import numpy as np 
import random

# Seed every random number generator this notebook can touch so repeated runs
# are comparable.
seed=2
# torch.manual_seed seeds the default (CPU) torch generator; without it only
# the CUDA generators were seeded and CPU-side torch randomness stayed unseeded.
torch.manual_seed(seed)
# Kept explicitly for the CUDA generators on every visible device.
torch.cuda.manual_seed_all(seed) 
np.random.seed(seed)
random.seed(seed)



# %%
import pickle

# Load the inference metadata pickle and report how many entries it holds.
# NOTE: pickle.load can execute arbitrary code; only point this at a trusted file.
metadata_path = '/root/gurusmart/MorganFP_prediction/inference_data/coconut_loutus_hyun_training/inference_metadata_latest_RDkit.pkl'
with open(metadata_path, 'rb') as metadata_file:
    smiles_and_names = pickle.load(metadata_file)
print(len(smiles_and_names))

# %%
from inference.inference_utils import choose_model , plot_topk_accuracy
from inference.inference_utils import retrieve_top_k_by_rankingset, compute_cos_sim, unpack_inputs_no_delimiter, build_input, inference_topK


# Earlier set of display names, keyed by consecutive integer index starting at 0.
old_loader_idx_to_name_mapping = dict(enumerate((
    "All Inputs",
    "HSQC and H NMR",
    "HSQC and C NMR",
    "HSQC Only",
    "C NMR and H NMR",
    "H NMR Only",
    "C NMR Only",
    "Standard HSQC (Non-Multiplicity-Edited)",
)))


# Current display names, keyed by consecutive integer index starting at 0.
# These strings are also the dictionary keys of the saved result files, so
# they must not be reworded casually.
loader_idx_to_name_mapping = dict(enumerate((
    "eHSQC, H NMR, and C NMR",
    "eHSQC and H NMR",
    "eHSQC and C NMR",
    "eHSQC Only",
    "C NMR and H NMR",
    "H NMR Only",
    "C NMR Only",
    "Standard HSQC only",
    "Standard HSQC, H NMR, and C NMR",
    "Standard HSQC and H NMR",
    "Standard HSQC and C NMR",
)))
import tqdm


choosing Hash_Entropy_FP_loader
526316


In [2]:
from torchmetrics.classification import BinaryRecall, BinaryPrecision, BinaryF1Score, BinaryAccuracy
import torch.nn as nn

# Cosine similarity taken along dim=1 of its two inputs.
do_cos = nn.CosineSimilarity(dim=1)
# Binary F1 metric object, moved to the 'cuda' device (a GPU is required here).
do_f1 = BinaryF1Score().to('cuda')


In [3]:

def get_mean(x):
    """Return a new dict mapping each key of ``x`` to ``np.mean`` of its values.

    Key order follows ``x``; the input mapping is not modified.
    """
    averaged = {}
    for key, values in x.items():
        averaged[key] = np.mean(values)
    return averaged

    
def _cos_and_f1_for_prediction(model, inputs, nmr_type_indicator, fp_label):
    """Run ``model`` once and score its binarized output against ``fp_label``.

    The raw output is binarized with ``pred >= 0`` (non-negative -> 1.0), then
    compared to ``fp_label`` with the notebook-level ``do_cos`` and ``do_f1``.

    Returns:
        (cosine, f1) as Python floats.
    """
    pred = model(inputs, nmr_type_indicator)
    fp_pred = (pred >= 0).float()
    cosine = do_cos(fp_label, fp_pred).cpu().item()
    # Argument order (label first) is kept exactly as in the original cell.
    f1 = do_f1(fp_label, fp_pred).cpu().item()
    return cosine, f1


def get_single_model_cos_f1(model_select):
    """Compute mean cosine similarity and F1 of predicted fingerprints for one model.

    For every entry of ``loader_idx_to_name_mapping`` the matching predict
    dataloader is iterated one sample at a time (batch_size=1). Each sample is
    scored twice: once with the full input and once after dropping the last
    entry along dim 1 of the input (the "without MW" variant).

    Args:
        model_select: model identifier forwarded to ``choose_model``.

    Returns:
        A 4-tuple of dicts keyed by input-combination display name:
        (mean cosine with MW, mean F1 with MW,
         mean cosine without MW, mean F1 without MW).
    """
    from collections import defaultdict
    from datasets.optional_2d_folder_dataset import OptionalInputDataModule

    hparams, model = choose_model(model_select, return_data_loader=False)
    # NOTE(review): model.eval() is not called here; confirm that choose_model
    # already returns the model in evaluation mode.

    datamodule = OptionalInputDataModule(dir="/workspace/SMILES_dataset", FP_choice=hparams["FP_choice"], input_src=["HSQC", "oneD_NMR"], fp_loader = fp_loader_configer.fp_loader, batch_size=1, parser_args=hparams)
    datamodule.setup("predict")
    test_dataloaders = datamodule.predict_dataloader()

    # The "Standard HSQC" settings have no dataloader of their own: they reuse
    # an eHSQC dataloader and zero the last input column.
    standard_hsqc_source_loader = {
        7: 3,   # Standard HSQC only               <- eHSQC Only
        8: 0,   # Standard HSQC, H NMR, and C NMR  <- eHSQC, H NMR, and C NMR
        9: 1,   # Standard HSQC and H NMR          <- eHSQC and H NMR
        10: 2,  # Standard HSQC and C NMR          <- eHSQC and C NMR
    }

    mean_rank_records_cos_with_mw = defaultdict(list)
    mean_rank_records_f1_with_mw = defaultdict(list)
    mean_rank_records_cos_without_mw = defaultdict(list)
    mean_rank_records_f1_without_mw = defaultdict(list)

    for setting_idx, setting_name in loader_idx_to_name_mapping.items():
        zero_last_column = setting_idx in standard_hsqc_source_loader
        loader_idx = standard_hsqc_source_loader.get(setting_idx, setting_idx)

        for batch in tqdm.tqdm(test_dataloaders[loader_idx], desc=setting_name):
            inputs, extra_info = batch
            if zero_last_column:
                # In-place: the last column is set to all zeros.
                inputs[:, :, -1] = 0

            with torch.no_grad():
                # Each extra_info entry must unpack into exactly five fields;
                # only the SMILES and the NMR type indicator are used.
                ground_smiles, _, NMR_type_indicator, _, _ = zip(*extra_info)
                fp_label = torch.unsqueeze(fp_loader.build_mfp_for_new_SMILES(ground_smiles[0]), dim=0).to(model.device)

                # with mw
                inputs = inputs.to(model.device)
                NMR_type_indicator = NMR_type_indicator[0].to(model.device)
                cosine, f1 = _cos_and_f1_for_prediction(model, inputs, NMR_type_indicator, fp_label)
                mean_rank_records_cos_with_mw[setting_name].append(cosine)
                mean_rank_records_f1_with_mw[setting_name].append(f1)

                # remove mw: drop the last entry along dim 1 of the input and
                # the last entry of the indicator.
                # NOTE(review): this assumes the MW entry is always last and
                # always present -- TODO confirm against the dataset.
                inputs = inputs[:, :-1]
                NMR_type_indicator = NMR_type_indicator[:-1]
                cosine, f1 = _cos_and_f1_for_prediction(model, inputs, NMR_type_indicator, fp_label)
                mean_rank_records_cos_without_mw[setting_name].append(cosine)
                mean_rank_records_f1_without_mw[setting_name].append(f1)

    return get_mean(mean_rank_records_cos_with_mw), get_mean(mean_rank_records_f1_with_mw), get_mean(mean_rank_records_cos_without_mw), get_mean(mean_rank_records_f1_without_mw)

    
# Evaluate each selected model and pickle its 4-tuple of result dicts.
# NOTE: save_dir has no trailing separator, so the file is written as
# ".../Spectre/inferencecos_f1_<model>.pkl". The loading cells build the path
# the same way, so the two must be changed together if this is ever cleaned up.
models_to_evaluate = ("optional3",)
save_dir = "/root/gurusmart/MorganFP_prediction/reproduce_previous_works/Spectre/inference"
for model_select in models_to_evaluate:
    dat = get_single_model_cos_f1(model_select)
    print(dat)
    results_path = f"{save_dir}cos_f1_{model_select}.pkl"
    with open(results_path, 'wb') as file:
        pickle.dump(dat, file)

loading model from:  /root/gurusmart/MorganFP_prediction/reproduce_previous_works/entropy_on_hashes/flexible_models_jittering_flexible_MW_flexible_normal_hsqc/r0_r6_trial_3/checkpoints/epoch=67-step=46104.ckpt


Initialized SignCoordinateEncoder[784] with dims [365, 365, 54] and 2 positional encoders. 54 bits are reserved for encoding the final bit


HsqcRankedTransformer saving args
finish entropy list
Hash_Entropy_FP_loader is setup, out_dim=16384, max_radius=6


  result = torch.sparse_compressed_tensor(
100%|██████████| 4056/4056 [03:28<00:00, 19.41it/s]
100%|██████████| 4056/4056 [03:25<00:00, 19.76it/s]
100%|██████████| 4056/4056 [03:27<00:00, 19.59it/s]
100%|██████████| 4056/4056 [03:34<00:00, 18.92it/s]
100%|██████████| 4056/4056 [03:27<00:00, 19.57it/s]
100%|██████████| 4056/4056 [03:26<00:00, 19.67it/s]
100%|██████████| 4056/4056 [03:30<00:00, 19.30it/s]
100%|██████████| 4056/4056 [03:32<00:00, 19.07it/s]
100%|██████████| 4056/4056 [03:31<00:00, 19.20it/s]
100%|██████████| 4056/4056 [03:29<00:00, 19.39it/s]
100%|██████████| 4056/4056 [03:24<00:00, 19.85it/s]


({'eHSQC, H NMR, and C NMR': np.float64(0.8970229703477443), 'eHSQC and H NMR': np.float64(0.8791183323601235), 'eHSQC and C NMR': np.float64(0.8954051761966956), 'eHSQC Only': np.float64(0.871503987470807), 'C NMR and H NMR': np.float64(0.80099688388804), 'H NMR Only': np.float64(0.5640236688975274), 'C NMR Only': np.float64(0.7620332616802004), 'Standard HSQC only': np.float64(0.8524614887411249), 'Standard HSQC, H NMR, and C NMR': np.float64(0.887710772649453), 'Standard HSQC and H NMR': np.float64(0.8641952941014158), 'Standard HSQC and C NMR': np.float64(0.8850914266316819)}, {'eHSQC, H NMR, and C NMR': np.float64(0.8956385006250068), 'eHSQC and H NMR': np.float64(0.8773942493339468), 'eHSQC and C NMR': np.float64(0.8939942494034767), 'eHSQC Only': np.float64(0.869539353439926), 'C NMR and H NMR': np.float64(0.7968193151100853), 'H NMR Only': np.float64(0.5423581875994787), 'C NMR Only': np.float64(0.7566894247055171), 'Standard HSQC only': np.float64(0.8499520587251767), 'Standar

## Load the saved cosine-similarity and F1 results

In [1]:
import pickle

# Load the saved result tuple of every model in turn. Each iteration
# overwrites the previous one, so after the loop the four dictionaries hold
# the values of the last model in the list ("optional3").
# NOTE: save_dir has no trailing separator; the files live at
# ".../Spectre/inferencecos_f1_<model>.pkl", matching how they were written.
save_dir = "/root/gurusmart/MorganFP_prediction/reproduce_previous_works/Spectre/inference"
for model_select in ('optional', "optional2", "optional3"):
    with open(f"{save_dir}cos_f1_{model_select}.pkl", 'rb') as file:
        dat = pickle.load(file)

    (
        mean_rank_records_cos_with_mw,
        mean_rank_records_f1_with_mw,
        mean_rank_records_cos_without_mw,
        mean_rank_records_f1_without_mw,
    ) = dat


In [None]:
import pickle
import numpy as np

# Ordered keys (these define the desired output order)
ordered_keys = [
    "eHSQC, H NMR, and C NMR",
    "eHSQC and C NMR",
    "eHSQC and H NMR",
    "eHSQC Only",
    "C NMR and H NMR",
    "C NMR Only",
    "H NMR Only",
    "Standard HSQC only",
    "Standard HSQC and C NMR",
    "Standard HSQC and H NMR",
    "Standard HSQC, H NMR, and C NMR"
]


def _percent_mean_and_std(stacked):
    """Return (mean, std) over axis 0 of ``stacked`` as percentage lists.

    ``stacked`` is a 2-D array with one row per model and one column per entry
    of ``ordered_keys``. Values are multiplied by 100 and rounded to two
    decimals. The std uses NumPy's default ``ddof=0``.
    """
    mean_pct = np.round(stacked.mean(axis=0) * 100, 2).tolist()
    std_pct = np.round(stacked.std(axis=0) * 100, 2).tolist()
    return mean_pct, std_pct


# Container to collect values
all_cos_with, all_cos_without = [], []
all_f1_with, all_f1_without = [], []

# Load and collect values from each model.
# NOTE: save_dir has no trailing separator; the files live at
# ".../Spectre/inferencecos_f1_<model>.pkl", matching how they were written.
# NOTE: pickle.load can execute arbitrary code; only load trusted result files.
save_dir = "/root/gurusmart/MorganFP_prediction/reproduce_previous_works/Spectre/inference"
for model_select in ['optional', "optional2", "optional3"]:
    with open(save_dir + f"cos_f1_{model_select}.pkl", 'rb') as file:
        dat = pickle.load(file)

    mean_rank_cos_with, mean_rank_f1_with, mean_rank_cos_without, mean_rank_f1_without = dat

    # Collect in the correct order (a missing key raises KeyError)
    all_cos_with.append([mean_rank_cos_with[k] for k in ordered_keys])
    all_f1_with.append([mean_rank_f1_with[k] for k in ordered_keys])
    all_cos_without.append([mean_rank_cos_without[k] for k in ordered_keys])
    all_f1_without.append([mean_rank_f1_without[k] for k in ordered_keys])

# Convert to numpy arrays for easy mean/std computation
all_cos_with = np.array(all_cos_with)
all_f1_with = np.array(all_f1_with)
all_cos_without = np.array(all_cos_without)
all_f1_without = np.array(all_f1_without)

# Final lists (without MW)
cosine_flexible, cosine_flexible_err = _percent_mean_and_std(all_cos_without)
f1_flexible, f1_flexible_err = _percent_mean_and_std(all_f1_without)

# Same statistics for the with-MW variants
cosine_flexible_with_mw, cosine_flexible_err_with_mw = _percent_mean_and_std(all_cos_with)
f1_flexible_with_mw, f1_flexible_err_with_mw = _percent_mean_and_std(all_f1_with)

# Print results. Each line is labelled with the name of the variable it shows,
# so the with-MW lines can no longer be mistaken for the without-MW ones.
print("without MW")
print("cosine_flexible =", cosine_flexible)
print("cosine_flexible_err =", cosine_flexible_err)
print("f1_flexible =", f1_flexible)
print("f1_flexible_err =", f1_flexible_err)

print("\nwith MW")
print("\ncosine_flexible_with_mw =", cosine_flexible_with_mw)
print("cosine_flexible_err_with_mw =", cosine_flexible_err_with_mw)
print("f1_flexible_with_mw =", f1_flexible_with_mw)
print("f1_flexible_err_with_mw =", f1_flexible_err_with_mw)


without MW
cosine_flexible = [89.2, 89.11, 87.34, 86.49, 79.47, 75.48, 56.91, 84.78, 88.17, 85.94, 88.34]
cosine_flexible_err = [0.36, 0.31, 0.41, 0.46, 0.44, 0.51, 0.36, 0.34, 0.25, 0.34, 0.3]
f1_flexible = [89.07, 88.98, 87.18, 86.31, 79.11, 75.02, 54.94, 84.55, 88.02, 85.75, 88.2]
f1_flexible_err = [0.36, 0.31, 0.4, 0.45, 0.41, 0.46, 0.5, 0.33, 0.25, 0.33, 0.3]

with MW

cosine_flexible = [89.2, 89.11, 87.34, 86.49, 79.47, 75.48, 56.91, 84.78, 88.17, 85.94, 88.34]
cosine_flexible_err = [0.36, 0.31, 0.41, 0.46, 0.44, 0.51, 0.36, 0.34, 0.25, 0.34, 0.3]
f1_flexible = [89.07, 88.98, 87.18, 86.31, 79.11, 75.02, 54.94, 84.55, 88.02, 85.75, 88.2]
f1_flexible = [0.36, 0.31, 0.4, 0.45, 0.41, 0.46, 0.5, 0.33, 0.25, 0.33, 0.3]


In [5]:
# Mean cosine similarity in percent (with MW), one value per entry of
# `ordered_keys`. The previous name, `cosine_specialized_with_mw`, is not
# defined by any cell of this notebook and fails on a fresh kernel.
cosine_flexible_with_mw

[89.2, 89.11, 87.34, 86.49, 79.47, 75.48, 56.91, 84.78, 88.17, 85.94, 88.34]

In [7]:
# Per-input-combination mean cosine similarity (with MW) of the last model
# loaded by the aggregation loop ("optional3").
mean_rank_cos_with

{'eHSQC, H NMR, and C NMR': np.float64(0.897035877031129),
 'eHSQC and H NMR': np.float64(0.8791289882444183),
 'eHSQC and C NMR': np.float64(0.8953874333958184),
 'eHSQC Only': np.float64(0.8714794737086641),
 'C NMR and H NMR': np.float64(0.8009897376864384),
 'H NMR Only': np.float64(0.564067937980665),
 'C NMR Only': np.float64(0.7620080567848606),
 'Standard HSQC only': np.float64(0.8524294680855155),
 'Standard HSQC, H NMR, and C NMR': np.float64(0.887700769517578),
 'Standard HSQC and H NMR': np.float64(0.8641958430710688),
 'Standard HSQC and C NMR': np.float64(0.8851275568041781)}

In [8]:
# Per-input-combination mean cosine similarity (without MW) of the last model
# loaded by the aggregation loop ("optional3").
# NOTE(review): the stored output of this cell is identical to the with-MW
# dictionary shown by the preceding cell -- confirm that the MW-removal branch
# in get_single_model_cos_f1 really changes what the model receives.
mean_rank_cos_without

{'eHSQC, H NMR, and C NMR': np.float64(0.897035877031129),
 'eHSQC and H NMR': np.float64(0.8791289882444183),
 'eHSQC and C NMR': np.float64(0.8953874333958184),
 'eHSQC Only': np.float64(0.8714794737086641),
 'C NMR and H NMR': np.float64(0.8009897376864384),
 'H NMR Only': np.float64(0.564067937980665),
 'C NMR Only': np.float64(0.7620080567848606),
 'Standard HSQC only': np.float64(0.8524294680855155),
 'Standard HSQC, H NMR, and C NMR': np.float64(0.887700769517578),
 'Standard HSQC and H NMR': np.float64(0.8641958430710688),
 'Standard HSQC and C NMR': np.float64(0.8851275568041781)}