In [4]:
#@title Select the task type you want to try out, then run this cell

#@markdown The model we are going to use is a multitask model. That means that
#@markdown it can make predictions on 3 different tasks!

#@markdown First, select the task type you would like to try.
task_type = "product" #@param ["product","reactants","reagents"] {type:"string"}
# Prefix prepended to the input sequence for each task type.
task2prefix = {
    'product': "Product:",
    'reactants': "Reactants:",
    'reagents': "Reagents:",
}
#@markdown Then, fill your input sequence in:

#@markdown You can find examples [here](https://gist.github.com/HelloJocelynLu/7f10386e1678c4335914aaac37113c78):

input_seq = "CN1CCNCC1.OCCCBr.Cc1ccccc1>>" #@param {type:"string"}
import sys
# sys.path.append('/usr/local/lib/python3.7/site-packages/')
from rdkit import Chem

# Validate the raw input before it is handed to the model. `mols` is assigned in
# every branch so that later cells can inspect it regardless of the task type.
# Chem.MolFromSmiles returns None when a SMILES string cannot be parsed, so the
# checks below compare against None explicitly.
if task_type == "product":
  # The input must contain exactly two '>' characters, i.e. three fields.
  mols = input_seq.split('>')
  if len(mols) != 3:
    raise ValueError("Make sure you provided reactants and reagents with two '>'s")
  for mol in mols:
    # Blank fields are allowed; only non-blank strings are checked.
    if mol and Chem.MolFromSmiles(mol) is None:
      raise ValueError(f"Molecule smiles {mol} is invalid!")
elif task_type == "reactants":
  # The whole input is treated as a single SMILES string.
  mols = [input_seq]
  # Reject a blank input explicitly, as the reagents branch below does.
  if not input_seq or Chem.MolFromSmiles(input_seq) is None:
    raise ValueError(f"Molecule smiles {input_seq} is invalid!")
else:
  # The input must consist of exactly two non-blank parts separated by '>>'.
  mols = input_seq.split('>>')
  if len(mols) != 2:
    raise ValueError("Make sure your input has exactly two parts separated by '>>'")
  for mol in mols:
    if not mol or Chem.MolFromSmiles(mol) is None:
      raise ValueError(f"Molecule smiles {mol} is invalid! Make sure you have included two '>'s")

#@markdown ---
#@markdown ### Advanced settings
#@markdown What beam size should we use in prediction?
beam_size = "10" #@param ["1", "3", "5", "10"]
beam_size = int(beam_size)  # form values are strings; convert for generate()
#@markdown The number of sequences to return
num_seq = "5" #@param ["1", "3", "5", "10"]
num_seq = int(num_seq)

# Both values are later passed to model.generate (num_beams / num_return_sequences).
# Equal values are accepted; only num_seq > beam_size is rejected.
if num_seq > beam_size:
  raise ValueError("num_seq should not be larger than beam_size!")

# Prepend the task prefix. input_seq is re-assigned from the form at the top of
# this cell on every run, so re-running the cell does not stack prefixes.
input_seq = task2prefix[task_type]+input_seq
model_path = "model/"
print("You are good to go!")

You are good to go!


In [5]:
# Sanity check: show the split molecule strings, then the prefixed model input,
# each on its own line.
print(mols, input_seq, sep="\n")

['CN1CCNCC1.OCCCBr.Cc1ccccc1', '', '']
Product:CN1CCNCC1.OCCCBr.Cc1ccccc1>>


In [6]:
#@title Make predictions
import os

from transformers import T5ForConditionalGeneration
from t5chem import SimpleTokenizer

# Load the weights and the vocabulary from the same directory, so that changing
# `model_path` cannot leave the tokenizer pointing at a different vocabulary.
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = SimpleTokenizer(vocab_file=os.path.join(model_path, 'vocab.pt'))

# Encode the prefixed input sequence as a PyTorch tensor of token ids.
inputs = tokenizer.encode(input_seq, return_tensors='pt')

# Beam search with `beam_size` beams, returning the `num_seq` best sequences.
outputs = model.generate(
    input_ids=inputs,
    max_length=300,
    early_stopping=True,
    num_beams=beam_size,
    num_return_sequences=num_seq,
)

print("Model input:", input_seq)
# Number the predictions from 1 in the order generate() returned them.
for rank, pred in enumerate(outputs, start=1):
  print(f"Prediction {rank}:", tokenizer.decode(pred, skip_special_tokens=True))

Model input: Product:CN1CCNCC1.OCCCBr.Cc1ccccc1>>
Prediction 1: CC1CCCC2(C)c3ccccc3C(=O)N12
Prediction 2: CC1(C)C(=O)N(C2CCCCC2)c2ccccc21
Prediction 3: CC1CCCC2(C)c3ccccc3C(=O)N2C1
Prediction 4: CC1(C)CCCC(C)(C)N1OCCCC1
Prediction 5: CCOC(=O)C1(C(=O)OCC)C(c2ccccc2)C2ccccc21


In [6]:
# Display the encoded model input (`inputs` is the result of tokenizer.encode
# on input_seq in the prediction cell).
inputs

tensor([[85,  5, 13,  9,  5,  5, 13,  5,  5,  9, 27, 10,  5,  5,  5, 28, 30, 27,
          5,  6,  9,  6,  6,  6,  6,  6,  9, 37, 37]])

# Instructions <a name="Instructions"></a>
**Quick start**
1. Select a task type to test
2. Input a sequence to test on
3. (Optionally) Select the beam size and the number of returned predictions (the number of predictions cannot exceed the beam size)
4. Run all

**Bugs**
- If you encounter any bugs, please report the issue to https://github.com/HelloJocelynLu/t5chem/issues
