# Generate molecules

In [34]:
# Imports

from hyformer.configs.tokenizer import TokenizerConfig
from hyformer.configs.model import ModelConfig

from hyformer.utils.tokenizers.auto import AutoTokenizer
from hyformer.models.auto import AutoModel

from hyformer.utils.chemistry import is_valid


# auxiliary imports
import torch
import torch.nn.functional as F

# autoreload magic
%reload_ext autoreload
%autoreload 2


In [35]:
# Paths
#
# MODEL_NAME is the single source of truth for the model directory name: the
# model config path and the checkpoint path are both derived from it, so
# switching models only requires editing one line.

MODEL_NAME = 'hyformer_small'
TASK_NAME = 'lm'

# Root directory holding training results (cluster-specific absolute path;
# adjust when running elsewhere).
RESULTS_DIR = '/lustre/groups/aih/hyformer/results'

TOKENIZER_CONFIG_PATH = 'configs/tokenizers/smiles/config.json'
MODEL_CONFIG_PATH = f'configs/models/{MODEL_NAME}/config.json'
MODEL_CHECKPOINT_PATH = f'{RESULTS_DIR}/distribution_learning/guacamol/{MODEL_NAME}/lm_enumerated/checkpoint.pt'

In [42]:
# Load tokenizer and model

# Tokenizer: read its config from TOKENIZER_CONFIG_PATH and build it from that config.
tokenizer_config = TokenizerConfig.from_config_filepath(TOKENIZER_CONFIG_PATH)
tokenizer = AutoTokenizer.from_config(tokenizer_config)

# Model: read its config from MODEL_CONFIG_PATH, build the model, then load the
# checkpoint at MODEL_CHECKPOINT_PATH.
# NOTE(review): the return value of load_pretrained is discarded, so it
# presumably updates `model` in place — confirm against hyformer.
model_config = ModelConfig.from_config_filepath(MODEL_CONFIG_PATH)
model = AutoModel.from_config(model_config)
model.load_pretrained(filepath=MODEL_CHECKPOINT_PATH)




In [43]:
# Sampling configuration — everything a reader might want to tune.
batch_size = 64
temperature = 1.2
top_k = None  # presumably disables top-k filtering — TODO confirm against the generator
top_p = 0.9
max_sequence_length = 100
number_samples = 1000
seed = 0

# Use the first GPU when one is available; otherwise fall back to the CPU so
# the cell still runs on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Sampling is stochastic; seed torch so that a Restart & Run All can reproduce
# the numbers below.
# NOTE(review): assumes the generator draws from torch's global RNG — confirm.
torch.manual_seed(seed)

generator = model.to_generator(
    tokenizer=tokenizer,
    batch_size=batch_size,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
    max_sequence_length=max_sequence_length,
    device=device,
    use_cache=False
    )

# Generate the samples and report the fraction for which is_valid returns a truthy value.
samples = generator.generate(number_samples=number_samples)
is_valid_smiles = [is_valid(sample) for sample in samples]
print("Validity: ", sum(is_valid_smiles) / len(is_valid_smiles))

Generating samples: 100%|██████████| 16/16 [00:15<00:00,  1.01it/s]

Validity:  0.995





In [51]:
# Uniqueness: fraction of distinct entries among the generated samples
# (1.0 means no duplicates). A set already has a length, so no intermediate
# list is needed.
len(set(samples)) / len(samples)

1.0