# Inference Sample

In [18]:
import warnings
from rdkit import Chem

warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
from typing import List
from pathlib import Path
import os

# Read the BioNeMo installation root from the environment. If the variable is
# missing, tell the user what to do and re-raise the original KeyError.
try:
    _bionemo_home_raw = os.environ['BIONEMO_HOME']
except KeyError:
    print("Must have BIONEMO_HOME set in the environment! See docs for instructions.")
    raise
BIONEMO_HOME: Path = Path(_bionemo_home_raw).absolute()

# Directory holding the MolMIM example configuration files; it must already exist.
config_path = BIONEMO_HOME.joinpath("examples", "molecule", "molmim", "conf")
print(f"Using model configuration at: {config_path}")
assert config_path.is_dir()

Using model configuration at: /workspace/bionemo/examples/molecule/molmim/conf


## Set up and test data

In [23]:
# Example inputs: SMILES strings for two widely used antimalarial drugs.
smis = [
    'OC(c1cc(C(F)(F)F)nc2c(C(F)(F)F)cccc12)C1CCCCN1',  # Mefloquine
    'CCN(CCO)CCCC(C)Nc1ccnc2cc(Cl)ccc12',              # Hydroxychloroquine
]

In [7]:
from bionemo.utils.hydra import load_model_config

# Load the "infer.yaml" configuration from the MolMIM example config directory.
cfg = load_model_config(
    config_path=config_path,
    config_name="infer.yaml",
)

In [8]:
from bionemo.triton.utils import load_model_for_inference
from bionemo.model.molecule.molmim.infer import MolMIMInference

# Build the inference wrapper described by `cfg`.
# NOTE(review): per the logged output, interactive mode selects strategy='auto' --
# confirm against load_model_for_inference.
inferer = load_model_for_inference(cfg, interactive=True)

# Report the concrete class that was loaded and fail fast if it is not the
# MolMIM inference wrapper the rest of the notebook relies on.
inferer_type = type(inferer)
print(f"Loaded a {inferer_type}")
assert isinstance(inferer, MolMIMInference)

INFO:rdkit:Enabling RDKit 2023.09.1 jupyter extensions
INFO:datasets:PyTorch version 2.1.0a0+32f93b1 available.


[NeMo I 2024-05-21 19:17:11 megatron_hiddens:110] Registered hidden transform sampled_var_cond_gaussian at bionemo.model.core.hiddens_support.SampledVarGaussianHiddenTransform
[NeMo I 2024-05-21 19:17:11 megatron_hiddens:110] Registered hidden transform interp_var_cond_gaussian at bionemo.model.core.hiddens_support.InterpVarGaussianHiddenTransform
[NeMo I 2024-05-21 19:17:11 utils:490] pytorch DDP is not initialized. Initializing with pytorch-lightening...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


[NeMo I 2024-05-21 19:17:11 utils:333] Restoring model from /workspace/bionemo/models/molecule/molmim/molmim_70m_24_3.nemo
[NeMo I 2024-05-21 19:17:11 utils:337] Loading model class: bionemo.model.molecule.molmim.molmim_model.MolMIMModel


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Interactive mode selected, using strategy='auto'
[NeMo I 2024-05-21 19:17:12 exp_manager:394] Experiments will be logged at /workspace/bionemo/examples/molecule/molmim/nemo_experiments/MolMIM_Inference/2024-05-21_19-17-11
[NeMo I 2024-05-21 19:17:12 exp_manager:835] TensorboardLogger has been set up
[NeMo I 2024-05-21 19:17:12 utils:306] 
    
    ************** Trainer configuration ***********
[NeMo I 2024-05-21 19:17:12 utils:307] 
    name: MolMIM_Inference
    desc: Minimum configuration for initializing a MolMIM model for inference.
    trainer:
      precision: 16-mixed
      devices: 1
      num_nodes: 1
      accelerator: gpu
      logger: false
      accumulate_grad_batches: 1
    exp_manager:
      explicit_log_dir: null
      exp_dir: null
      name: ${name}
      create_checkpoint_callback: false
    model:
      encoder:
        num_layers: 6
        hidden_size: 512
        ffn_hidden_size: 2048
        num_attention_heads: 8
        init_method_std: 0.02
        hidden

[NeMo W 2024-05-21 19:17:12 megatron_base_model:821] The model: MolMIMModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-21 19:17:12 megatron_base_model:821] The model: MolMIMModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-21 19:17:12 megatron_base_model:821] The model: MolMIMModel() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-21 19:17:12 megatron_base_model:821] The model: MolMIMModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-21 19:17:12 megatron_base_model:821] The model: MolMIMModel() does not have field.name: use_cpu_initialization in its cfg. Add this 

[NeMo I 2024-05-21 19:17:12 megatron_init:234] Rank 0 has data parallel group: [0]
[NeMo I 2024-05-21 19:17:12 megatron_init:237] All data parallel group ranks: [[0]]
[NeMo I 2024-05-21 19:17:12 megatron_init:238] Ranks 0 has data parallel rank: 0
[NeMo I 2024-05-21 19:17:12 megatron_init:246] Rank 0 has model parallel group: [0]
[NeMo I 2024-05-21 19:17:12 megatron_init:247] All model parallel group ranks: [[0]]
[NeMo I 2024-05-21 19:17:12 megatron_init:257] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-05-21 19:17:12 megatron_init:261] All tensor model parallel group ranks: [[0]]
[NeMo I 2024-05-21 19:17:12 megatron_init:262] Rank 0 has tensor model parallel rank: 0
[NeMo I 2024-05-21 19:17:12 megatron_init:276] Rank 0 has pipeline model parallel group: [0]
[NeMo I 2024-05-21 19:17:12 megatron_init:288] Rank 0 has embedding group: [0]
[NeMo I 2024-05-21 19:17:12 megatron_init:294] All pipeline model parallel group ranks: [[0]]
[NeMo I 2024-05-21 19:17:12 megatron_init:295]

[NeMo W 2024-05-21 19:17:12 megatron_base_model:821] The model: MolMIMModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-21 19:17:12 megatron_base_model:821] The model: MolMIMModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-21 19:17:12 megatron_base_model:821] The model: MolMIMModel() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-21 19:17:12 megatron_base_model:821] The model: MolMIMModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-05-21 19:17:12 megatron_base_model:821] The model: MolMIMModel() does not have field.name: use_cpu_initialization in its cfg. Add this 

[NeMo I 2024-05-21 19:17:12 tokenizer_utils:199] Using regex tokenization
[NeMo I 2024-05-21 19:17:12 regex_tokenizer:240] Loading vocabulary from file = /tmp/tmpxac6vhjf/dd344353154640acbbaea1d4536fa7d0_molmim.vocab
[NeMo I 2024-05-21 19:17:12 regex_tokenizer:254] Loading regex from file = /tmp/tmpxac6vhjf/048c1f797f464dd5b6a90f60f9405827_molmim.model
[NeMo I 2024-05-21 19:17:12 megatron_base_model:315] Padded vocab_size: 640, original vocab_size: 523, dummy tokens: 117.
[NeMo I 2024-05-21 19:17:12 megatron_hiddens:121] NOTE: Adding hiddens transforms and losses
[NeMo I 2024-05-21 19:17:12 megatron_hiddens:149] Added transform q_z_given_x with cfg={'cls_name': 'sampled_var_cond_gaussian', 'hidden_size': 512, 'min_logvar': -6.0, 'max_logvar': 0.0, 'map_var_to_hiddens': False}
[NeMo I 2024-05-21 19:17:12 megatron_hiddens:177] Added loss mim with cfg={'cls_name': 'a_mim', 'loss_weight': 1.0}
[NeMo I 2024-05-21 19:17:12 nlp_overrides:752] Model MolMIMModel was successfully restored from /

## SMILES to hidden state

Because MolMIM uses a Perceiver encoder, its hidden state has a controlled (fixed) number of tokens, so no pooling is necessary to create a fixed-size embedding.


In [11]:
# Encode the SMILES batch into hidden states and their accompanying padding masks.
hidden_states, pad_masks = inferer.seq_to_hiddens(smis)
print(f"{hidden_states.shape=}", f"{pad_masks.shape=}", sep="\n")

# Sanity-check the result shapes for the two example SMILES.
# NOTE(review): axes are presumably (num SMILES, hidden tokens, hidden size) -- confirm.
assert (2, 1, 512) == tuple(hidden_states.shape)
assert (2, 1) == tuple(pad_masks.shape)

hidden_states.shape=torch.Size([2, 1, 512])
pad_masks.shape=torch.Size([2, 1])


## SMILES to embedding

In [12]:
# Encode the same SMILES batch directly into embeddings.
embedding = inferer.seq_to_embeddings(smis)
print(f"{embedding.shape=}")

# Expect a 2-D result with one row for each of the two example SMILES.
assert (2, 512) == tuple(embedding.shape)

embedding.shape=torch.Size([2, 512])


In [13]:
# Display the hidden-state tensor computed above (asserted shape: (2, 1, 512)).
hidden_states

tensor([[[-0.2774,  0.0274, -0.0518,  ..., -0.5057,  0.1316,  0.4939]],

        [[-0.1865,  0.1537,  0.4342,  ...,  0.4155, -0.5273,  0.3010]]],
       device='cuda:0')

In [14]:
# Display the embedding tensor computed above (asserted shape: (2, 512)).
embedding

tensor([[-0.2774,  0.0274, -0.0518,  ..., -0.5057,  0.1316,  0.4939],
        [-0.1865,  0.1537,  0.4342,  ...,  0.4155, -0.5273,  0.3010]],
       device='cuda:0')

## Hidden state to SMILES

In [15]:
# Decode SMILES strings back from the hidden states (and their padding masks)
# using the `hiddens_to_seq` method.
inferred_smis = inferer.hiddens_to_seq(hidden_states, pad_masks)

# Examine the inferred SMILES.
# NOTE(review): in the recorded output the first decoded string differs from its
# input (ring `C1CCCC1` vs. `C1CCCCN1`), so the round trip is not guaranteed to be exact.
inferred_smis

[NeMo I 2024-05-21 20:08:16 megatron_lm_encoder_decoder_model:1195] Decoding using the greedy-search method...


['OC(c1cc(C(F)(F)F)nc2c(C(F)(F)F)cccc12)C1CCCC1',
 'CCN(CCO)CCCC(C)Nc1ccnc2cc(Cl)ccc12']

## Sampling: Generate SMILES

In [25]:
# Generate candidate molecules around each query SMILES.
samples = inferer.sample(
    num_samples=20,       # Maximum number of generated molecules per query compound
    scaled_radius=0.7,    # Radius of exploration [range: 0.0 - 1.0] --- the extent of perturbation of the original hidden state for sampling
    sampling_method="beam-search-perturbate",
    sampler_kwargs={
        "beam_size": 3, "keep_only_best_tokens": True, "return_scores": False
    },
    seqs=smis,
)

print(f"Generated {len(samples)} samples")

# Post-process each query's samples into a list of unique, valid, canonical SMILES.
# Canonicalisation happens BEFORE de-duplication and before removing the query
# molecule: two different raw strings can describe the same molecule, so comparing
# raw strings would let duplicates (or the query itself) slip through.
uniq_canonical_smiles = []
for raw_samples, original in zip(samples, smis):
    original_mol = Chem.MolFromSmiles(original)
    # Fall back to the raw query string if the query itself cannot be parsed.
    original_canonical = (
        Chem.MolToSmiles(original_mol, True) if original_mol is not None else original
    )

    seen = {original_canonical}  # canonical forms already emitted (or excluded)
    valid_molecules = []
    for sample in raw_samples:  # keep generation order so the output is reproducible
        mol = Chem.MolFromSmiles(sample)
        if mol is None:  # MolFromSmiles returns None for unparsable SMILES
            continue
        canonical = Chem.MolToSmiles(mol, True)  # True -> isomeric SMILES
        if canonical in seen:
            continue
        seen.add(canonical)
        valid_molecules.append(canonical)
    uniq_canonical_smiles.append(valid_molecules)

for input_number, molecules in enumerate(uniq_canonical_smiles, start=1):
    print(f'Number of mols generated for input {input_number}: {len(molecules)}')
    print('Generated molecules: ')
    print(molecules)

[NeMo I 2024-05-21 21:01:11 megatron_lm_encoder_decoder_model:1192] Decoding using the beam search method with beam size=5...


Generated 2 samples
Number of mols generated for input 1: 19
Generated molecules: 
['OC(c1ccc(C(F)(F)F)nc1)C(F)(F)F', 'OCc1nsc(N2CCC(CO)(c3ccccc3)CC2)n1', 'COc1ncc([C@H](O)C(=O)c2cc(C(F)(F)F)ccc2F)cc1Cl', 'OC(c1cc(F)c(Cl)cc1Br)N(CC(F)(F)F)C1CC1', 'CCOc1cc(C(=O)N2CC[C@@](O)(C(F)(F)F)C2)ccc1C', 'OC(c1ccccc1)c1cn(C(F)(F)F)nc1C(F)(F)F', 'OCc1ccc(C(F)(F)F)nc1N(Cc1c(F)cccc1F)C1CCCCC1', 'COc1ncc(CN2CC([C@H](C)NC(=O)[C@]3(C)CCC[C@H]3C)C2)c(C)n1', 'OC(c1cc(C(F)(F)F)nc2c(C(F)(F)F)cccc12)C1CCCC1', 'OC[C@@]1(CN(CC(F)(F)F)C2CCC2)CCCO1', 'OC(c1cc(C(F)(F)F)nc(-c2ccc(Cl)cc2)n1)C1CCNCC1', 'Oc1nc(C(O)c2nc3ccccc3s2)cc(C(F)(F)F)n1', 'O=C(c1cccnn1)N1CCCC12CN(C(O)c1cc(C(F)(F)F)ncc1Cl)C2', 'OCc1ccc(C(F)(F)F)nc1N(CC(F)(F)F)C1CCCC1', 'OCCc1cc(C(F)(F)F)nc2c(C(F)(F)F)cccc12', 'OC(c1cc(C(F)(F)F)n[nH]1)(C1CC1)C(F)(F)F', 'OC[C@@]1(O)CC[C@H]2CN(Cc3nc4c(C(F)(F)F)cccc4o3)C[C@@H]2C1', 'OCc1ccc(C(F)(F)F)nc1N(Cc1ccccc1)C[C@@H]1CCCO1', 'OC(c1cc(C(F)(F)F)nc2c(C(F)(F)F)cccc12)C1CCNCC1']
Number of mols generated for input 2: