# Jointformer Examples

This notebook shows how to use Jointformer as a SMILES encoder and as a SMILES generator, and how to train the model with the default Trainer.

In [14]:
# Imports

import os
import torch

from jointformer.configs.tokenizer import TokenizerConfig
from jointformer.configs.model import ModelConfig

from jointformer.utils.tokenizers.auto import AutoTokenizer
from jointformer.models.auto import AutoModel


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Set working directory of the project
#
# The checkout location is machine-specific. Set the JOINTFORMER_REPOSITORY_DIR
# environment variable to point at your own clone; when it is unset, the
# original author's path is used so existing runs behave exactly as before.

REPOSITORY_DIR = os.environ.get(
    'JOINTFORMER_REPOSITORY_DIR',
    '/home/adamizdebski/projects/jointformer',
)
os.chdir(REPOSITORY_DIR)

In [18]:
# Configs
#
# Every location below is machine-specific. Each one can be overridden through
# the environment variable named next to it; when a variable is unset, the
# original author's path is used, so the default values are unchanged.

# Root of the repository checkout; the config paths are derived from it
# instead of repeating the same absolute prefix.
_REPOSITORY_ROOT = os.environ.get(
    'JOINTFORMER_REPOSITORY_DIR',
    '/home/adamizdebski/projects/jointformer',
)

DATA_DIR = os.environ.get(
    'JOINTFORMER_DATA_DIR',
    '/home/adamizdebski/files/data',
)
OUTPUT_DIR = os.environ.get(
    'JOINTFORMER_OUTPUT_DIR',
    '/home/adamizdebski/files/jointformer/results/pretrain',
)

PATH_TO_TOKENIZER_CONFIG = os.path.join(
    _REPOSITORY_ROOT, 'configs', 'tokenizers', 'smiles_separate_task_token'
)
PATH_TO_MODEL_CONFIG = os.path.join(
    _REPOSITORY_ROOT, 'configs', 'models', 'jointformer_separate_task_token'
)

# NOTE(review): this is a bare filename, so it is presumably resolved against
# the current working directory when the checkpoint is loaded -- confirm.
PRETRAINED_JOINTFORMER_FILENAME = 'ckpt.pt'


In [None]:
# Init params and tokenizer

# Use the first CUDA device when torch reports one is available; fall back to
# the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# The tokenizer is built in two steps: read its config file, then instantiate
# the tokenizer from that config.
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)
tokenizer = AutoTokenizer.from_config(tokenizer_config)

In [None]:
# Init Jointformer
#
# Read the model config file, build the model from it, then load the
# pretrained checkpoint into that model.

model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(model_config)
# NOTE(review): PRETRAINED_JOINTFORMER_FILENAME is a bare filename ('ckpt.pt'),
# so it presumably resolves against the current working directory -- confirm
# the checkpoint actually lives there.
model.load_pretrained(PRETRAINED_JOINTFORMER_FILENAME)

## Jointformer as a SMILES Encoder

In [26]:
# Init SMILES encoder: a wrapper around the model for encoding molecules.
# The wrapper receives the tokenizer, the target device and a batch size.

encoding_batch_size = 16  # presumably the number of SMILES encoded per forward pass -- confirm
smiles_encoder = model.to_smiles_encoder(tokenizer=tokenizer, device=device, batch_size=encoding_batch_size)

In [27]:
# Encode SMILES
#
# Eight example molecules, given as SMILES strings, are passed to the encoder
# in a single call.

smiles = [
    "COC(=O)c1cc(C(=CCCCC(=O)SC)c2cc(Cl)c(OC)c(C(=O)OC)c2)cc(Cl)c1OC",
    "Cc1cc(COc2ccc(NC(=O)C3CN(C)CCC3C(=O)NO)cc2)c2ccccc2n1",
    "CCOC(=O)c1ccc(O)c(-n2cc3c(c2-c2ccccc2)c(=O)n(C)c(=O)n3C)c1",
    "COc1cc(OC)c2c(-c3cccc(-c4ccc(C#N)cc4)c3)cc(=O)oc2c1",
    "COc1cc2[nH]c(C(=O)Nc3ccc(F)cc3)cc2c(OC)c1OC",
    "COc1cc(F)cc(-c2ccc(C(CC(=O)O)NC(=O)C3CCCN3S(=O)(=O)c3cc(Cl)cc(Cl)c3)cc2)c1",
    "COc1cc(-c2cc(OC)c(-n3c(=O)ccc4cc(S(=O)(=O)Nc5cccnn5)ccc43)cc2F)ccc1Cl",
    "CCCCC1(C)CC(CO)C(CCCC)(OC)OO1"
]

# Per the original author's note, encode() handles device placement and
# switches the model to eval mode internally, so no manual setup is needed.
# NOTE(review): the type/shape of the returned encoding is not shown here.
encoding = smiles_encoder.encode(smiles)  # setting the device and model.eval() under the hood

Encoding samples: 100%|██████████| 1/1 [00:00<00:00,  5.02it/s]


## Jointformer as a SMILES Generator

In [28]:
# Sampling settings handed to the generator wrapper below.
batch_size = 8     # presumably the number of molecules sampled per batch -- confirm
temperature = 0.8  # presumably the softmax sampling temperature -- confirm
top_k = 10         # presumably restricts sampling to the k most likely tokens -- confirm

# Wrap the model as a generator, passing the tokenizer, the sampling settings
# and the target device.
generator = model.to_guacamole_generator(tokenizer=tokenizer, batch_size=batch_size, temperature=temperature, top_k=top_k, device=device)

In [29]:
# Sampling is stochastic, so fix the seed right before generating: this makes
# the displayed molecules reproducible under Restart & Run All and when this
# cell is re-run on its own.
# NOTE(review): assumes the generator draws from torch's global RNG -- confirm.
torch.manual_seed(0)

# Request 16 samples; the list of generated SMILES is shown as the cell output.
generator.generate(number_samples=16)

Generating samples: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]


['C1=C2C=CC=CC2=C(C(NC2=CC=CC(C(F)(F)F)=C2)=O)C=C1OC',
 'CC1=C(C(=O)NCCN2CCOCC2)C(C)=C(C=C2C(=O)NC3=CC=C(C4=CN(C5CCN(C)CC5)N=C4)C=C23)N1',
 'CC1=CC=C(C2=CC=C3C(=C2)C(NC2=CC=C(C(=O)NCCCC(=O)NO)C=C2)=C2C(=N3)C=CC=C2)C=C1',
 'C1=CC(C#N)=CC2=C1N(C1=NC=NC(NC3=CC(C(NC)=O)=CC=C3)=N1)CC2',
 'C1(S(=O)(=O)N2CCN(C(=O)C3=CC=CC(C)=C3)CC2)=CC(C(=O)NC2=CC=CC=C2)=CC=C1',
 'C1C2C(C)(C)C(=NO)CCC2(C)C2C(C)(C3(C)C(CC2)C2C(C(=C)C)CCC2(C(O)=O)CC3)C1',
 'C1(C2=CC=CS2)=CC(C2=CC=CC=C2)=NCCN1',
 'CCC(C)C(NC(=O)C(CC1=CC=CC=C1)NCP(=O)(O)CCCN)C(=O)NC(CC1=CC=CC=C1)C(=O)O',
 'CC(NC1=NC=NC2=C1N=CN2C1CN(CCC#N)CC(CO)O1)C1=CC=CC=C1',
 'COC1=CC=C(CC(=O)NC2=CC=C(C3=CC=C(C4=CSC(CC(=O)O)=N4)O3)C=C2)C=C1OC',
 'CC1=CC=C(CSC2=NN=C(C3=NC=CN=C3)N2C)C=C1',
 'C1=CC=C(C2=CC(C)=CC(C3=CC=CC=C3)=[N+]2C2=CC=C(C)C=C2)C=C1C',
 'C1(=O)C=CC2=C(C(C3=CC(S(=O)(NCCC)=O)=CC=C3OC)=CN=C2OC)O1',
 'O=C(O)C1=NC(C2=NC(C3=CC=C(F)C=C3)=NO2)=CS1',
 'C1=CC(NC(=O)C(CC(C)C)NC(=O)C(CSC2=CC=C(C(=O)O)C=C2)NC(=O)C)=CC=C1F',
 'CCC1=CC=C(CCC2(C)C(=O)NC3=CC=C(Cl)

In [41]:
from tdc.single_pred import ADME

In [42]:
# Single source of truth for dataset naming: each row pairs the TDC dataset
# identifier with the short MOOD name used for it in this notebook.
_TDC_MOOD_PAIRS = (
    ("BBB_Martins", "BBB"),
    ("CYP2C9_Veith", "CYP2C9"),
    ("Caco2_Wang", "Caco-2"),
    ("Clearance_Hepatocyte_AZ", "Clearance"),
    ("DILI", "DILI"),
    ("HIA_Hou", "HIA"),
    ("Half_Life_Obach", "HalfLife"),
    ("Lipophilicity_AstraZeneca", "Lipophilicity"),
    ("PPBR_AZ", "PPBR"),
    ("Pgp_Broccatelli", "Pgp"),
    ("hERG", "hERG"),
)

# Forward lookup: TDC identifier -> MOOD name.
TDC_TO_MOOD = dict(_TDC_MOOD_PAIRS)

# MOOD names, ordered by size (ordering as stated by the original author).
MOOD_DATASETS = [
    "DILI", "HIA", "hERG", "HalfLife", "Caco-2", "Clearance",
    "Pgp", "PPBR", "BBB", "Lipophilicity", "CYP2C9",
]

# Reverse lookup: MOOD name -> TDC identifier.
MOOD_TO_TDC = {mood_name: tdc_name for tdc_name, mood_name in _TDC_MOOD_PAIRS}

In [46]:
# Load the Lipophilicity ADME dataset: the MOOD name is translated to its TDC
# identifier and DATA_DIR is passed as the dataset path.
# NOTE(review): the recorded run of this cell stopped after "Loading..." with
# "AttributeError: 'tuple' object has no attribute 'tb_frame'". The root cause
# is not visible here -- re-run on a fresh kernel and check that the download
# completes before relying on `dataset`.
dataset = ADME(name=MOOD_TO_TDC['Lipophilicity'], path=DATA_DIR)

Downloading...
0.00iB [00:00, ?iB/s]
Loading...


AttributeError: 'tuple' object has no attribute 'tb_frame'