In [5]:
#@title Select the task type you want to try out, then run this cell

task_type = "product" #@param ["product","reactants","reagents","regression","classification","pretrain","mixed"] {type:"string"}

# Task types without a directly matching test task, mapped to the notice that
# is shown when one of them is selected.
_no_direct_test_notice = {
  "mixed": "You are selecting 'mixed'! Which means that the model will be trained on a mixed dataset (product, reactants, reagents), and you will need to choose the task type for testing",
  "pretrain": "You are selecting 'pretrain'! T5Chem was pretrained in a self-supervised manner and will not have corresponding test task.",
}

# has_test is True for every task type except "mixed" and "pretrain"; the
# test-selection cell reads it to decide whether the test task must equal
# the training task.
has_test = task_type not in _no_direct_test_notice
if not has_test:
  print(_no_direct_test_notice[task_type])

#@markdown ---
#@markdown ### Advanced settings
#@markdown Where should the trained model be saved?
model_dir = 'model_test/' #@param {type:"string"}
#@markdown The number of training epochs:
num_epoch = 1 #@param {type:"integer"}

In [8]:
#@title Model Training
# Runs `t5chem train` on data/sample/<task_type>/, passing models/pretrain/simple/
# as --pretrain and writing the result to model_dir.
# "$task_type", "$model_dir" and "$num_epoch" are expanded by IPython from the
# Python variables set in the task-selection cell, so run that cell first.
!t5chem train --data_dir data/sample/"$task_type"/ --output_dir "$model_dir" --task_type "$task_type" --pretrain models/pretrain/simple/ --num_epoch "$num_epoch"

Namespace(batch_size=32, command=<function train at 0x7fcb6be0f950>, data_dir='data/sample/product/', init_lr=0.0005, log_step=5000, num_classes=None, num_epoch=1, output_dir='model_test/', pretrain='models/pretrain/simple/', random_seed=8570, task_type='product', tokenizer='', vocab='')
***** Running training *****
  Num examples = 10000
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 313
100%|████████████████████████████████████████▊| 312/313 [00:44<00:00,  6.72it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


{'train_runtime': 44.6098, 'train_samples_per_second': 224.166, 'train_steps_per_second': 7.016, 'train_loss': 0.2960994342645517, 'epoch': 1.0}
100%|█████████████████████████████████████████| 313/313 [00:44<00:00,  7.02it/s]
Namespace(batch_size=32, command=<function train at 0

In [19]:
#@title Select test task

test_type = "product" #@param ["product","reactants","reagents"] {type:"string"}
# `prefix` is passed to `t5chem predict --prefix` by the testing cell.
# In most cases, the model can read prefix from a trained model, but when mixed
# training was used (1 model, multiple tasks!), you will need to specify prefix
# to distinguish between different tasks.
task2prefix = {
    'product': "Product:",
    'reactants': "Reactants:",
    'reagents': "Reagents:",
}

# Always define `prefix`, so the testing cell never sees an undefined name
# (fresh kernel) or a stale value left over from a previous run of this cell.
# An empty prefix is meant to leave the choice to the trained model, per the
# note above -- NOTE(review): confirm against the t5chem CLI.
prefix = ''
if has_test and task_type != test_type:
  print(f"WARNING, you only trained the model for {task_type}, and only the same task would be available")
  test_type = task_type
  # Keep prefix in sync with the corrected test_type. "regression" and
  # "classification" have no entry in task2prefix, hence .get with '' fallback.
  prefix = task2prefix.get(test_type, '')
elif task_type == "pretrain":
  print("Sorry, but there is no test task for pretrain.")
else:
  # Reached for "mixed" training, or when the selected test task already
  # matches the single training task.
  prefix = task2prefix[test_type]

#@markdown ---
#@markdown ### Advanced settings
#@markdown Number of batch size for prediction (If the program cannot run 
#@markdown , try smaller batch size)
batch_size =  32#@param {type:"integer"}
#@markdown The beam size for decoding:
beam_size =  5#@param {type:"integer"}
#@markdown The number of returned sequences:
num_seq =  1#@param {type:"integer"}

# Fail early in the notebook when more sequences are requested than beams.
if num_seq > beam_size:
  raise ValueError("num_seq should be <= beam_size!")



In [20]:
#@title Model Testing
# Mind the GPU memory: if there is not enough, this may fail to run
# (try a smaller batch_size in the test-selection cell).
# "$test_type", "$batch_size", "$prefix", "$beam_size" and "$num_seq" come from the
# test-selection cell and "$model_dir" from the task-selection cell; run both first.
!t5chem predict --data_dir data/sample/"$test_type"/ --model_dir "$model_dir" --batch_size "$batch_size" --prefix "$prefix" --num_beams "$beam_size" --num_preds "$num_seq"

prediction: 100%|███████████████████████████████| 32/32 [03:06<00:00,  5.83s/it]
Top-1: 10.4% || Invalid 8.90%


# Instructions <a name="Instructions"></a>
**Quick start**
1. Select a task type in training
2. Select a test task type in testing (if you used multi-task training)
3. Run all

**Trained model**

1. Your trained model weights will be available in the `model_dir`
2. The predictions can be checked under `model_dir`, named `predictions.csv`

**Bugs**
- If you encounter any bugs, please report the issue to https://github.com/HelloJocelynLu/t5chem/issues
