# Generate molecules

In [1]:
# Imports

from hyformer.configs.tokenizer import TokenizerConfig
from hyformer.configs.model import ModelConfig

from hyformer.utils.tokenizers.auto import AutoTokenizer
from hyformer.models.auto import AutoModel

from hyformer.utils.chemistry import is_valid


# auxiliary imports
import torch
import torch.nn.functional as F

# autoreload magic
%reload_ext autoreload
%autoreload 2


In [7]:
# Paths

MODEL_NAME = 'hyformer_small'
TASK_NAME = 'combined'

TOKENIZER_CONFIG_PATH = 'configs/tokenizers/smiles/guacamol/config.json'
MODEL_CONFIG_PATH = 'configs/models/guacamol/hyformer_small/config.json'
MODEL_CHECKPOINT_PATH = f'/lustre/groups/aih/hyformer/results/distribution_learning/guacamol/{MODEL_NAME}/{TASK_NAME}/ckpt.pt'

In [8]:
# Load model

tokenizer_config = TokenizerConfig.from_config_filepath(TOKENIZER_CONFIG_PATH)
tokenizer = AutoTokenizer.from_config(tokenizer_config)

# Load model
model_config = ModelConfig.from_config_filepath(MODEL_CONFIG_PATH)
model = AutoModel.from_config(model_config)
model.load_pretrained(filepath=MODEL_CHECKPOINT_PATH)




In [28]:
batch_size = 64
temperature = 1.0
top_k = None
top_p = None
max_sequence_length = 100

device = 'cuda:0'

generator = model.to_generator(
    tokenizer=tokenizer,
    batch_size=batch_size,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
    max_sequence_length=max_sequence_length,
    device=device
    )

samples = generator.generate(number_samples=1000)
is_valid_smiles = [is_valid(sample) for sample in samples]
print("Validity: ", sum(is_valid_smiles) / len(is_valid_smiles))

Generating samples:   0%|          | 0/16 [00:00<?, ?it/s]

Generating samples: 100%|██████████| 16/16 [00:09<00:00,  1.61it/s]


Validity:  0.948


In [29]:
samples

['CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CC=C(CNC(=O)c4cc(Cl)cc(Cl)c4)C3)cc21',
 'CN1C2CCCC1CC(N1CCN(c3ccc(Cl)cc3)C1=O)C2',
 'CCCn1c(Cc2cc(OC)ccc2OC)nnc1SCC(=O)Nc1ccc2c(c1)CCC2',
 'COc1cc(C)c(C)cc1S(=O)(=O)NCC1OC(n2cnc3c(N)ncnc32)C(O)C1O',
 'N=C1SC(=Cc2cccc(OC(F)F)c2Cl)C(=O)N1c1ccc(Cl)cc1',
 'COc1ccc(OCC(=O)Nc2ccc3nc(C)sc3c2)cc1',
 'Cc1ccc2c(-n3ccc(OCc4ccccc4)cc3=O)cc(N3CCCCC3)nc2c1',
 'CC1CCCC(C)N1Cc1coc(-c2ccccc2O)n1',
 'Cc1cccc(C(=N)NOC(=O)c2cccc(C(F)(F)F)c2)c1',
 'c1ccc(-c2[nH]c(Nc3cccc(SCCN4CCOCC4)c3)nc2-c2ccc(CN3CCCCC3)cc2)cc1',
 'Cc1sc(-c2ccc(Br)cc2)nc1CCC(=O)c1ccc(Cl)cc1',
 'CC(C)c1nccn1CC1CC(c2cccc(C(F)(F)F)c2)=NO1',
 'COc1ccc(CCN2C(=O)C(CC(=O)NCc3cccs3)CC(C(=O)N3CCOCC3)=C2C)cc1OC',
 'COc1ccccc1C=C1SC(=S)N(CCC(=O)Nc2ccccc2C)C1=O',
 'COc1ccc(CCC[n+]2cccc3ccccc32)cc1OC',
 'CCOC(=O)C1CCCN(Cc2c[nH]nc2-c2nc(-c3cc(OC)c(OC)c(OC)c3)no2)C1',
 'COc1ccc(CC(=O)N2CCC(Oc3cccnc3)CC2)cc1OC',
 'CC1=CC(C(=O)Nc2cccc(C(=O)N3CCCCC3)c2)=CC(=O)N1C',
 'CN1CCN(c2cc(-c3n[nH]c4ccc(-c5cn[nH]c5)cc34)ncn2)CC1',
 'C

In [51]:
len(list(set(samples))) / len(samples)

1.0

In [25]:
# Create initial input for generation (task token + BOS token)

from tqdm import tqdm

prefix_input_ids = torch.tensor(
    [[generator._tokenizer.task_token_id('lm'), generator._tokenizer.bos_token_id]] * generator._batch_size,
    dtype=torch.long,
    device=generator._device
)

for _ in tqdm(range(0, 64, generator._batch_size), "Generating samples"):
    outputs = model.generate(
        prefix_input_ids=prefix_input_ids,
        num_tokens_to_generate=generator._max_sequence_length - len(prefix_input_ids[0]), 
        eos_token_id=generator._tokenizer.eos_token_id,
        pad_token_id=generator._tokenizer.pad_token_id,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        use_cache=generator._use_cache
    )
    _samples = generator._tokenizer.decode(outputs)
    samples.extend(_samples)

Generating samples: 100%|██████████| 1/1 [00:00<00:00,  1.32it/s]


In [27]:
samples

['C',
 'C',
 'N',
 'c',
 '1',
 'n',
 'c',
 '(',
 'N',
 'C',
 'C',
 ')',
 'n',
 '2',
 'c',
 '(',
 'S',
 'C',
 'C',
 '(',
 '=',
 'O',
 ')',
 'N',
 '(',
 'C',
 '(',
 'C',
 ')',
 'C',
 ')',
 'C',
 '(',
 'C',
 ')',
 'C',
 ')',
 'n',
 'n',
 'c',
 '2',
 'n',
 '1',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 '(',
 '=',
 'O',
 ')',
 'O',
 'C',
 '1',
 'O',
 'C',
 '(',
 '=',
 'O',
 ')',
 'C',
 '2',
 'C',
 'C',
 'C',
 'N',
 '2',
 'C',
 '1',
 '(',
 'O',
 ')',
 'C',
 '(',
 'C',
 'O',
 ')',
 'O',
 'C',
 '(',
 '=',
 'O',
 ')',
 'C',
 'I',
 'N',
 'c',
 '1',
 'c',
 'c',
 'c',
 '(',
 '-',
 'n',
 '2',
 'c',
 'n',
 'c',
 '3',
 'c',
 '2',
 'n',
 'c',
 'n',
 '2',
 'c',
 'n',
 'n',
 'c',
 '3',
 '2',
 ')',
 'c',
 'c',
 '1',
 'C',
 'N',
 '(',
 'C',
 'c',
 '1',
 'c',
 'n',
 'n',
 '(',
 'C',
 ')',
 'c',
 '1',
 ')',
 'C',
 '(',
 '=',
 'O',
 ')',
 'c',
 '1',
 'c',
 'c',
 'c',
 'c',
 '(',
 'O',
 'C',
 'c',
 '2',
 'c',
 'c',
 'c',
 '3',
 'c',
 'c',
 'c',
 'c',
 'c',
 '3',
 'c'

## MolGPT

In [3]:
# Paths

MODEL_NAME = 'molgpt'

MODEL_CONFIG_PATH = f'configs/models/{MODEL_NAME}/config.json'
MODEL_CHECKPOINT_PATH = f'/lustre/groups/aih/hyformer/results/distribution_learning/guacamol/molgpt/guacamol_nocond.pt'

In [23]:
# Load dataset

from hyformer.models.baselines.molgpt import SmilesDataset

dataset = SmilesDataset(['C'])

# Load model
tokenizer = None

# Load model
model_config = ModelConfig.from_config_filepath(MODEL_CONFIG_PATH)
model = AutoModel.from_config(model_config)
model.load_pretrained(filepath=MODEL_CHECKPOINT_PATH)
model._model.to('cuda')




Filtering data: 100%|██████████| 1/1 [00:00<00:00, 1608.25it/s]


Removed key: prop_nn.weight from checkpoint; not found in model or size mismatch.
Removed key: prop_nn.bias from checkpoint; not found in model or size mismatch.


GPT(
  (tok_emb): Embedding(94, 256)
  (type_emb): Embedding(2, 256)
  (drop): Dropout(p=0.1, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): CausalSelfAttention(
        (key): Linear(in_features=256, out_features=256, bias=True)
        (query): Linear(in_features=256, out_features=256, bias=True)
        (value): Linear(in_features=256, out_features=256, bias=True)
        (attn_drop): Dropout(p=0.1, inplace=False)
        (resid_drop): Dropout(p=0.1, inplace=False)
        (proj): Linear(in_features=256, out_features=256, bias=True)
      )
      (mlp): Sequential(
        (0): Linear(in_features=256, out_features=1024, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=1024, out_features=256, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
    )
    (1): Block(
      (ln1): LayerNorm((

In [24]:

batch_size = 2
x = torch.tensor([dataset.stoi[s] for s in dataset.regex.findall('C')], dtype=torch.long)[None,...].repeat(batch_size, 1).to('cuda')

In [25]:
y = model.sample(x=x, steps=model._config.block_size, temperature=0.7, sample=True, top_k=None, prop = None, scaffold = None)



ValueError: too many values to unpack (expected 3)

In [None]:

for gen_mol in y:
        completion = ''.join([itos[int(i)] for i in gen_mol])
        completion = completion.replace('<', '')
        # gen_smiles.append(completion)
        mol = get_mol(completion)
        if mol:
                molecules.append(mol)