# Jointformer Training

This notebook shows how to train Jointformer. 

In [14]:
# Imports

import os

from jointformer.configs.dataset import DatasetConfig
from jointformer.configs.tokenizer import TokenizerConfig
from jointformer.configs.model import ModelConfig
from jointformer.configs.trainer import TrainerConfig

from jointformer.utils.datasets.auto import AutoDataset
from jointformer.utils.tokenizers.auto import AutoTokenizer
from jointformer.models.auto import AutoModel
from jointformer.trainers.trainer import Trainer

# Load IPython's autoreload extension and switch it to mode 2.
# Re-running this cell in a live kernel only prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Set working directory of the project
#
# The repository root defaults to the original checkout location. Set the
# JOINTFORMER_REPOSITORY_DIR environment variable before starting the kernel
# to change the working directory without editing this cell.

REPOSITORY_DIR = os.environ.get(
    'JOINTFORMER_REPOSITORY_DIR',
    '/home/adamizdebski/projects/jointformer',
)
os.chdir(REPOSITORY_DIR)

In [16]:
# Configs
#
# Every location is derived from REPOSITORY_DIR (set in the working-directory
# cell) so the checkout path is written down in exactly one place.

# The trailing '' component keeps the trailing slash on the data directory.
DATA_DIR = os.path.join(REPOSITORY_DIR, 'data', '')
OUTPUT_DIR = os.path.join(REPOSITORY_DIR, 'results', 'finetune')

CONFIGS_DIR = os.path.join(REPOSITORY_DIR, 'configs')
PATH_TO_DATASET_CONFIG = os.path.join(
    CONFIGS_DIR, 'datasets', 'molecule_net', 'scaffold', 'lipo', 'config.json'
)
PATH_TO_TOKENIZER_CONFIG = os.path.join(
    CONFIGS_DIR, 'tokenizers', 'smiles_bpe', 'config.json'
)
PATH_TO_MODEL_CONFIG = os.path.join(
    CONFIGS_DIR, 'models', 'jointformer_test', 'config.json'
)
PATH_TO_TRAINER_CONFIG = os.path.join(
    CONFIGS_DIR, 'trainers', 'test', 'config.json'
)

In [17]:
# Dataset and tokenizer setup: read both config files, then build the
# train / validation splits and the tokenizer from them.

dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)

# Both splits share the same config and data root; only `split` differs.
train_dataset, val_dataset = (
    AutoDataset.from_config(dataset_config, root=DATA_DIR, split=split_name)
    for split_name in ('train', 'val')
)

tokenizer = AutoTokenizer.from_config(tokenizer_config)

In [18]:
# Build the Jointformer model from its config file. The downstream task type
# and the number of tasks are taken from the dataset config.

model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(
    model_config,
    downstream_task=dataset_config.task_type,
    num_tasks=dataset_config.num_tasks,
    hidden_dim=256,
)
# Left disabled: presumably restores weights from a checkpoint file.
# model.load_pretrained('ckpt.pt')

In [19]:
# Display the loaded model configuration as the cell output.
model_config

ModelConfig({'model_name': 'JointformerWithPrefix', 'embedding_dim': 32, 'embedding_hidden_dim': 128, 'num_heads': 2, 'num_local_heads': 2, 'head_dim': 16, 'num_layers': 2, 'bias': False, 'attention_dropout': 0.0, 'feed_forward_dropout': 0.0, 'prediction_dropout': None, 'layer_norm_eps': 1e-05, 'vocab_size': 608, 'max_seq_len': 128, 'prediction_task_type': 'regression', 'num_prediction_tasks': 1, 'num_physchem_tasks': 200, 'pretrained_filepath': None, 'predictor_hidden_size': None, 'predictor_dropout': None, 'predictor_num_heads': None, 'prediction_hidden_dim': 256, 'set_separate_task_tokens': None, 'flash_attention': True, 'dropout': None, 'lambda_hparam': None, 'pooler_dropout': None})

In [31]:

# Read the trainer settings, then hand the model, both dataset splits and the
# tokenizer to a Trainer, passing 'cuda:0' as the device.
trainer_config = TrainerConfig.from_config_file(PATH_TO_TRAINER_CONFIG)
trainer = Trainer(config=trainer_config, model=model,
                  train_dataset=train_dataset, val_dataset=val_dataset,
                  tokenizer=tokenizer, device='cuda:0')

In [32]:
# Start the training run.
trainer.train()

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [21]:
# Sanity check: push one training batch through the model's forward pass.

# Private Trainer helper; train_loader is read from the trainer right after it.
trainer._init_data_loaders()

batch = next(iter(trainer.train_loader))
# Keep the object returned by .to(), as the generation cell does for `inputs`,
# so the batch is on the GPU whether or not .to() moves it in place.
batch = batch.to('cuda')
model.to('cuda')
batch['task'] = 'physchem'
outputs = model(**batch)

In [22]:
# Tokenize the first two training examples for the generation task, move the
# encoded batch to the GPU and compute the model loss on it.
inputs = tokenizer(
    [train_dataset[i] for i in (0, 1)],
    task='generation',
).to('cuda')
model_outputs = model.get_loss(**inputs)

In [28]:
# List the fields present in the tokenized generation batch.
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'task', 'input_labels', 'properties'])

In [24]:
# Tokenize a single hand-written SMILES string for the generation task and
# display the resulting encoding.
smiles = 'ClC[C]'
tokenizer(smiles, task='generation')

{'input_ids': tensor([[605, 603, 453,  55,   2, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602, 602,
          602, 602]]),
 'attention_mask': tensor([[ True,  True,  True,  True,  True, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, 

In [29]:
# Show the token string the wrapped tokenizer uses as its CLS token.
tokenizer.tokenizer.cls_token

'[BOS]'

In [37]:
# Show the wrapped tokenizer's mapping from special-token roles to token strings.
tokenizer.tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[EOS]',
 'pad_token': '[PAD]',
 'cls_token': '[BOS]',
 'mask_token': '[MASK]',
 'additional_special_tokens': ['[GEN]', '[PRED]', '[REC]']}