# 04/14/23

This notebook is intended to test chemformer loading. 

In [12]:
%load_ext autoreload
%autoreload 2

from models.chemformer.molbart import BARTModel
from models.chemformer.tokeniser import MolEncTokeniser
from models.chemformer.utils import REGEX, DEFAULT_MAX_SEQ_LEN
from models.chemformer.molbart_dataset import Uspto50
from models.chemformer.molbart_datamodule import FineTuneReactionDataModule
from models.chemformer.decoder import DecodeSampler

from models.ranked_transformer import Moonshot

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Load the model. Using code from the original chemformer repo (```molbart/util.py```)

In [2]:
# Vocabulary
from pathlib import Path
vocab_path = "tempdata/chemformer/bart_vocab.txt"
chem_token_start = 272
tokeniser = MolEncTokeniser.from_vocab_file(
  vocab_path, REGEX, chem_token_start
)

In [3]:
# Dataset
data_path = "tempdata/chemformer/uspto_50.pickle"
aug_prob = 0.0
dataset = Uspto50(
  data_path, aug_prob, forward=True
)


Uspto50 __init()__: 
[DS] <class 'pandas.core.frame.DataFrame'> 50037


In [4]:

# Model
model_type = "bart"
uni_model = model_type == "unified"
batch_size = 2
train_tokens = None
num_buckets = None

dm = FineTuneReactionDataModule(
    dataset,
    tokeniser,
    batch_size,
    DEFAULT_MAX_SEQ_LEN,
    forward_pred=True,
    val_idxs=dataset.val_idxs,
    test_idxs=dataset.test_idxs,
    train_token_batch_size=train_tokens,
    num_buckets=num_buckets,
    unified_model=uni_model
)
sampler = DecodeSampler(tokeniser, DEFAULT_MAX_SEQ_LEN)

Using a batch size of 2.
Building data module for forward prediction task...


### Chemformer Hacking

In [17]:
import torch
model_path = "tempdata/chemformer/model.ckpt"
obj = torch.load(model_path)

In [26]:
obj["state_dict"]["decoder.layers.1.norm1.bias"][:5]

tensor([ 0.0847, -0.0391,  0.0424, -0.0030, -0.0634], device='cuda:0',
       dtype=torch.float16)

In [16]:
model = Moonshot.load_from_checkpoint(
  model_path, strict=False, module_only=True
)

Lightning automatically upgraded your loaded checkpoint from v1.2.3 to v2.0.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file tempdata/chemformer/model.ckpt`


Pushed an encoder with no defined bounds
Pushed an encoder with no defined bounds
Pushed an encoder with no defined bounds


  rank_zero_warn(
  rank_zero_warn(


In [32]:
model.decoder.layers[1].norm1.bias[:5]

tensor([ 0.0847, -0.0391,  0.0424, -0.0030, -0.0634], grad_fn=<SliceBackward0>)

### Continuing the normal code...

In [40]:
DEFAULT_NUM_BEAMS = 10
model_path = "tempdata/chemformer/model.ckpt"
model = BARTModel.load_from_checkpoint(
  model_path,
  decode_sampler = sampler,
  my_tokeniser = tokeniser,
)
model.cuda().eval()
model.num_beams = DEFAULT_NUM_BEAMS

Lightning automatically upgraded your loaded checkpoint from v1.2.3 to v2.0.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file tempdata/chemformer/model.ckpt`


In [35]:
logger = TensorBoardLogger("tb_logs", name=f"eval_bart_uspto_50")
trainer = Trainer(
    logger=logger,
    limit_test_batches=1,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_test_batches=1)` was configured so 1 batch will be used.


In [74]:
results = trainer.test(model, datamodule=dm)
def print_results(results):
  print(f"Results for model: {model_path}")
  print(f"{'Item':<25}Result")
  for key, val in results.items():
    print(f"{key:<25} {val:.4f}")
print_results(results[0])

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0:   0%|          | 0/1 [00:55<?, ?it/s]

=== Describing forward input ===
==> Key: encoder_input
==> Value: <class 'torch.Tensor'>
==> Size: torch.Size([38, 2]) torch.cuda.LongTensor torch.int64
==> Decoded:  ['C1=COCCC1.COC(=O)CCC(=O)c1ccc(O)cc1O', 'COC(=O)c1cccc(C(=O)O)c1.Nc1cccnc1N']
==> value
	tensor([[  2,   2],
	        [272, 272],
	        [274, 285],
	        [280, 272],
	        [272, 275]], device='cuda:0')
==> Key: encoder_pad_mask
==> Value: <class 'torch.Tensor'>
==> Size: torch.Size([38, 2]) torch.cuda.BoolTensor torch.bool
==> value
	tensor([[False, False],
	        [False, False],
	        [False, False],
	        [False, False],
	        [False, False]], device='cuda:0')
==> Key: decoder_input
==> Value: <class 'torch.Tensor'>
==> Size: torch.Size([35, 2]) torch.cuda.LongTensor torch.int64
==> Decoded:  ['COC(=O)CCC(=O)c1ccc(OC2CCCCO2)cc1O', 'COC(=O)c1cccc(-c2nc3cccnc3[nH]2)c1']
==> value
	tensor([[  2,   2],
	        [272, 272],
	        [285, 285]