# Hyformer Training

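This notebook shows how to set up Hyformer for training: it loads the dataset, tokenizer, model, and trainer configs, builds a `Trainer`, evaluates the model on the test split, and inspects the model and tokenizer outputs.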

In [41]:
# Imports

import os

from hyformer.configs.dataset import DatasetConfig
from hyformer.configs.tokenizer import TokenizerConfig
from hyformer.configs.model import ModelConfig
from hyformer.configs.trainer import TrainerConfig

from hyformer.utils.datasets.auto import AutoDataset
from hyformer.utils.tokenizers.auto import AutoTokenizer
from hyformer.models.auto import AutoModel
from hyformer.trainers.trainer import Trainer

from hyformer.utils.runtime import set_seed

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# Set working directory of the project.
# The checkout location can be overridden through the HYFORMER_REPOSITORY_DIR
# environment variable; when it is unset, the original path is used.

REPOSITORY_DIR = os.environ.get('HYFORMER_REPOSITORY_DIR', '/home/adamizdebski/projects/hyformer')
os.chdir(REPOSITORY_DIR)

In [43]:
# Configs
# Every location is derived from REPOSITORY_DIR (defined in the working-directory
# cell) so the repository root is written down in exactly one place.

# The trailing '' component keeps the trailing path separator on DATA_DIR.
DATA_DIR = os.path.join(REPOSITORY_DIR, 'data', '')
OUTPUT_DIR = os.path.join(REPOSITORY_DIR, 'results', 'finetune')

CONFIGS_DIR = os.path.join(REPOSITORY_DIR, 'configs')
PATH_TO_DATASET_CONFIG = os.path.join(CONFIGS_DIR, 'datasets', 'hi', 'drd2', 'config.json')
PATH_TO_TOKENIZER_CONFIG = os.path.join(CONFIGS_DIR, 'tokenizers', 'smiles_separate_task_token', 'config.json')
PATH_TO_MODEL_CONFIG = os.path.join(CONFIGS_DIR, 'models', 'hyformer_v2_prediction', 'config.json')
PATH_TO_TRAINER_CONFIG = os.path.join(CONFIGS_DIR, 'trainers', 'test', 'config.json')

In [44]:
# Seed the run up front so results can be reproduced.
SEED = 1337
set_seed(SEED)

In [45]:
# Test Dataset
# Load the dataset and tokenizer configs, then build the three splits and the tokenizer.

dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)

# One dataset object per split, all read from the same root directory.
train_dataset, val_dataset, test_dataset = [
    AutoDataset.from_config(dataset_config, root=DATA_DIR, split=split_name)
    for split_name in ('train', 'val', 'test')
]

tokenizer = AutoTokenizer.from_config(tokenizer_config)

In [46]:
# Init Hyformer
# The prediction head is configured from the dataset config (task type and
# number of prediction tasks).

model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(
    model_config,
    downstream_task=dataset_config.prediction_task_type,
    num_prediction_tasks=dataset_config.num_prediction_tasks,
    hidden_dim=256,
)
# Uncomment to load pretrained weights from a checkpoint file:
# model.load_pretrained('ckpt.pt')

number of parameters: 6.72M


In [47]:

# Build the trainer from its config file, the model, the three dataset splits
# and the tokenizer created in the cells above.
trainer_config = TrainerConfig.from_config_file(PATH_TO_TRAINER_CONFIG)
trainer = Trainer(
    # what to train and how
    config=trainer_config,
    model=model,
    tokenizer=tokenizer,
    # data splits
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    # metric used by the test step, taken from the dataset config
    test_metric=dataset_config.test_metric,
    # runtime settings
    device='cuda:0',
    seed=1337,
)

using fused AdamW: False


In [48]:
# Build the trainer's data loaders explicitly through its private helper.
# NOTE(review): presumably this is what creates `trainer.train_loader`, which a
# later cell reads — confirm against the Trainer implementation.
trainer._init_data_loaders()

In [55]:
# Run the trainer's test step; the cell displays the value it returns
# (the recorded output reports the metric name followed by a score).
trainer.test()

Testing model with metric prc_auc...


0.6584794214696377

In [54]:
# Show the test metric configured for this dataset (the same value is passed to the Trainer).
dataset_config.test_metric

'prc_auc'

In [11]:
import torch

# Smoke test: run one training batch through the model together with a random
# context tensor.
DEVICE = 'cuda'
CONTEXT_DIM = 512  # second dimension of the random context tensor

trainer._init_data_loaders()

batch = next(iter(trainer.train_loader))
# Keep the object returned by `.to(...)` rather than relying on an in-place
# move; this matches how the tokenizer output is moved elsewhere in the notebook.
batch = batch.to(DEVICE)
model.to(DEVICE)
batch['task'] = 'mlm'  # presumably selects the masked-language-modelling task — TODO confirm

# Sampled on the CPU and then moved, exactly as before, so the drawn values
# still come from the CPU random generator.
context = torch.randn(trainer_config.batch_size, CONTEXT_DIM).to(DEVICE)
outputs = model(**batch, context=context)

tensor([[ 1.7938e-01,  6.2100e-01,  4.1589e-01, -8.7854e-01, -5.4776e-01,
         -2.6689e-02,  1.8406e-01, -1.3317e-01,  3.2801e-02, -7.4903e-01,
          6.2528e-02,  7.4672e-01, -3.8328e-01, -9.9888e-01,  8.3748e-02,
         -5.1927e-01,  4.2134e-01, -1.6152e-01, -2.1093e-01, -2.2474e-01,
         -1.6463e-01,  4.1958e-01,  6.0405e-01, -1.6488e-01,  6.1693e-01,
          2.1958e-02, -5.8056e-01,  2.6150e-02, -7.0434e-01,  9.4427e-01,
         -6.8019e-01, -3.3226e-01, -3.7340e-01, -3.2278e-01,  4.2227e-01,
         -8.4276e-01, -3.5122e-01,  4.8739e-01,  5.8796e-01, -5.6082e-01,
         -8.7024e-01, -3.6497e-01, -1.2079e-01, -4.1559e-01,  3.4476e-01,
         -1.3307e-01,  3.8978e-01, -2.1380e-01,  7.2153e-01,  5.5799e-01,
          1.3965e-01,  3.2980e-01, -5.0698e-01, -7.1141e-03,  5.0818e-01,
          4.8350e-01, -1.0274e-01, -8.4063e-01,  3.3236e-01, -1.4932e-01,
          4.2687e-01,  3.7131e-01,  5.5743e-02, -6.2024e-01, -4.2108e-01,
         -6.0014e-01,  1.2281e-01, -1.

In [22]:
# Tokenize the first two training samples for the 'generation' task, move the
# encoded batch to the GPU and compute the model loss on it.
first_two_samples = [train_dataset[0], train_dataset[1]]
inputs = tokenizer(first_two_samples, task='generation').to('cuda')
model_outputs = model.get_loss(**inputs)

In [28]:
# List the fields contained in the tokenized batch.
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'task', 'input_labels', 'properties'])

In [24]:
# Tokenize a single SMILES string for the 'generation' task and display the
# resulting encoding (input ids, attention mask, ...).
smiles = 'ClC[C]'
tokenizer(smiles, task='generation')

{'input_ids': tensor([[605, 603, 453,  55,   2, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602]]),
 'attention_mask': tensor([[ True,  True,  True,  True,  True, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, 

In [29]:
# Show the CLS token of the wrapped tokenizer object.
tokenizer.tokenizer.cls_token

'[BOS]'

In [37]:
# Show the full special-token mapping of the wrapped tokenizer object.
tokenizer.tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[EOS]',
 'pad_token': '[PAD]',
 'cls_token': '[BOS]',
 'mask_token': '[MASK]',
 'additional_special_tokens': ['[GEN]', '[PRED]', '[REC]']}