# 04/14/23

This notebook tests loading a Chemformer (BART) checkpoint and evaluating it on a single test batch of the USPTO-50 dataset.

In [1]:
%load_ext autoreload
%autoreload 2

from models.chemformer.molbart import BARTModel
from models.chemformer.tokeniser import MolEncTokeniser
from models.chemformer.utils import REGEX, DEFAULT_MAX_SEQ_LEN
from models.chemformer.molbart_dataset import Uspto50
from models.chemformer.molbart_datamodule import FineTuneReactionDataModule
from models.chemformer.decoder import DecodeSampler

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger

Load the model, using code from the original Chemformer repo (`molbart/util.py`).

In [2]:
# Beam count assigned to `model.num_beams` after the checkpoint is loaded.
# NOTE(review): presumably the beam width used when decoding -- confirm in BARTModel.
DEFAULT_NUM_BEAMS: int = 10

In [3]:
# ---- Settings ---------------------------------------------------------------
# Plain values first, so everything a reader might tune sits in one place.
vocab_path = "tempdata/chemformer/bart_vocab.txt"
chem_token_start = 272  # third argument to MolEncTokeniser.from_vocab_file
data_path = "tempdata/chemformer/uspto_50.pickle"
aug_prob = 0.0  # NOTE(review): presumably the augmentation probability -- confirm in Uspto50

model_type = "bart"
uni_model = (model_type == "unified")  # evaluates to False for "bart"
batch_size = 8
train_tokens = None
num_buckets = None

# ---- Tokeniser and dataset --------------------------------------------------
tokeniser = MolEncTokeniser.from_vocab_file(vocab_path, REGEX, chem_token_start)
dataset = Uspto50(data_path, aug_prob, forward=True)

# ---- Data module ------------------------------------------------------------
# The validation / test indices are taken from the dataset object itself.
dm = FineTuneReactionDataModule(
    dataset,
    tokeniser,
    batch_size,
    DEFAULT_MAX_SEQ_LEN,
    forward_pred=True,
    val_idxs=dataset.val_idxs,
    test_idxs=dataset.test_idxs,
    train_token_batch_size=train_tokens,
    num_buckets=num_buckets,
    unified_model=uni_model,
)

# ---- Decode sampler (handed to the model when the checkpoint is loaded) -----
sampler = DecodeSampler(tokeniser, DEFAULT_MAX_SEQ_LEN)


Using a batch size of 8.
Building data module for forward prediction task...


In [4]:
# Restore the BART model from its checkpoint, passing in the decode sampler
# created in the data-setup cell.
model_path = "tempdata/chemformer/model.ckpt"
model = BARTModel.load_from_checkpoint(model_path, decode_sampler=sampler)

# Move the model to the GPU and switch it to eval mode, then set the beam count.
model.cuda().eval()
model.num_beams = DEFAULT_NUM_BEAMS

In [5]:
# Evaluation-only Trainer that logs to TensorBoard (save dir "tb_logs",
# experiment name "eval_bart_uspto_50").
# The name is a plain string: the original f-string had no placeholders.
logger = TensorBoardLogger("tb_logs", name="eval_bart_uspto_50")
trainer = Trainer(
    logger=logger,
    limit_test_batches=1,  # only one test batch is run
    # `gpus=1` is deprecated since PyTorch Lightning 1.7 and removed in 2.0;
    # `accelerator="gpu", devices=1` is the equivalent, forward-compatible form.
    accelerator="gpu",
    devices=1,
)

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_test_batches=1)` was configured so 1 batch will be used.


In [7]:
# Run the test loop for `model` on the data provided by `dm`. The Trainer was
# created with limit_test_batches=1, so only one test batch is evaluated.
results = trainer.test(model, datamodule=dm)
def print_results(results):
    """Print a two-column table of metric names and values.

    Reads the notebook-level ``model_path`` for the heading line.

    Args:
        results: mapping of metric name -> numeric value; every value is
            formatted with four decimal places.
    """
    print("Results for model: {}".format(model_path))
    print("{:<25}Result".format("Item"))
    for metric_name in results:
        metric_value = results[metric_name]
        print("{:<25} {:.4f}".format(metric_name, metric_value))
# Print the first entry of `results`.
# NOTE(review): presumably one metrics dict per test dataloader -- confirm
# against the Lightning version in use.
print_results(results[0])

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]Test Step
Target Smiles: 
['COC(=O)CCC(=O)c1ccc(OC2CCCCO2)cc1O', 'COC(=O)c1cccc(-c2nc3cccnc3[nH]2)c1', 'CON(C)C(=O)C1CCC(NC(=O)OC(C)(C)C)CC1', 'O=[N+]([O-])c1ccc(Cl)nc1Nc1ccc(O)cc1', 'NCC1=CC[C@@H](c2ccc(Cl)cc2Cl)[C@H]([N+](=O)[O-])C1', 'CCc1oc(-c2ccc(C(F)(F)F)cc2)nc1COC[C@@H]1CCC[C@H](COC(C)(C)C(=O)O)C1', 'Cc1noc(C)c1NC(=O)Nc1ccc2nc(N[C@@H]3CCc4ccccc43)ccc2c1', 'Cc1cnc(N2CCN(C(=O)c3ccc(N4CCCS4(=O)=O)nc3C)CC2)c(C)c1']
Mol_strs: 
[['C1=C(COC(=O)CCC(c2ccc(O)cc2O)=O)CCO1', 'c1(O)ccc(C(=O)CCC(=O)OCC2=COCC2)c(O)c1', 'C1=C(COC(=O)CCC(=O)c2ccc(O)cc2O)CCO1', 'C1=C(COC(=O)CCC(c2c(O)cc(O)cc2)=O)CCO1', 'C1(COC(=O)CCC(c2ccc(O)cc2O)=O)=COCC1', 'C1=C(COC(CCC(=O)c2ccc(O)cc2O)=O)CCO1', 'c1cc(O)cc(O)c1C(=O)CCC(=O)OCC1=COCC1', 'c1cc(O)cc(O)c1C(=O)CCC(OCC1=COCC1)=O', 'C1=C(COC(=O)CCC(=O)c2c(O)cc(O)cc2)CCO1', 'C1(COC(=O)CCC(=O)c2ccc(O)cc2O)=COCC1'], ['c1(Nc2cccnc2N)c(C(OC)=O)cccc1C(O)=O', 'c1(Nc2cccnc2N)c(C(OC)=O)cccc1C(=O)O', 'c1ccnc(N)c1Nc1c(C(O