# Add separate task token

The goal of this notebook is to add a separate task token, so that `cls_embeddings` and `lm_embeddings` are disjoint.

In [2]:
# Imports

import os

from jointformer.configs.dataset import DatasetConfig
from jointformer.configs.tokenizer import TokenizerConfig
from jointformer.configs.model import ModelConfig
from jointformer.configs.trainer import TrainerConfig

from jointformer.utils.datasets.auto import AutoDataset
from jointformer.utils.tokenizers.auto import AutoTokenizer
from jointformer.models.auto import AutoModel
from jointformer.trainers.trainer import Trainer

%load_ext autoreload
%autoreload 2

In [6]:
# Configs
#
# REPOSITORY_DIR is the single source of truth for the checkout location: every config
# path below is derived from it, so pointing the notebook at another checkout means
# editing exactly one line.

REPOSITORY_DIR = '/home/adamizdebski/projects/jointformer'
DATA_DIR = '/home/adamizdebski/files/data'
OUTPUT_DIR = '/home/adamizdebski/files/jointformer/results/chemberta2/moleculenet'

# Folder that holds the dataset / tokenizer / model / trainer configuration trees.
CONFIGS_DIR = f'{REPOSITORY_DIR}/configs'

PATH_TO_DATASET_CONFIG   = f'{CONFIGS_DIR}/datasets/guacamol/qed'
PATH_TO_TOKENIZER_CONFIG = f'{CONFIGS_DIR}/tokenizers/smiles'
PATH_TO_MODEL_CONFIG = f'{CONFIGS_DIR}/models/jointformer_test'
PATH_TO_TRAINER_CONFIG = f'{CONFIGS_DIR}/trainers/test'

In [7]:
# Switch the process working directory to REPOSITORY_DIR; every later cell runs from there.
os.chdir(REPOSITORY_DIR)

In [8]:
# Test Dataset

# Parse the dataset and tokenizer configuration files.
dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)


def _load_split(split):
    """Build the dataset for one split ('train', 'val' or 'test') from DATA_DIR."""
    return AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split=split)


# One dataset object per split, all created from the same dataset config.
train_dataset = _load_split('train')
val_dataset = _load_split('val')
test_dataset = _load_split('test')

# Tokenizer built from its own config; used by the cells below.
tokenizer = AutoTokenizer.from_config(tokenizer_config)

In [18]:
# First field of the first training example; used below as the SMILES string.
smiles = train_dataset[0][0]
# Baseline encoding (no prefix token) produced by the project tokenizer with task='generation'.
inputs = tokenizer(smiles, task='generation')

In [19]:
# Display the encoding computed in the previous cell.
inputs

{'input_ids': tensor([[593, 592,  16,  16,  17,  37,  18,  17,  16,  18,  16,  16, 591,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0]]),
 'attention_mask': tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, 

1

In [21]:
# Define and register the prefix token before its first use, so this cell also works on
# a fresh kernel run top to bottom (otherwise `prefix_token` is undefined at this point).
# Registering it as an additional special token is what lets the next cell encode
# '[PREFIX]' as a single id.
prefix_token = '[PREFIX]'
tokenizer.tokenizer.add_special_tokens({'additional_special_tokens': [prefix_token]})

# Prepend the prefix token to the SMILES string, separated by a single space.
smiles_with_prefix = prefix_token + " " + smiles
smiles_with_prefix

'[PREFIX] CC(Br)(C)CC'

In [22]:
# Encode the prefixed string with the same task='generation' call used for the plain
# SMILES, so the two encodings can be compared; the bare name displays the result.
inputs = tokenizer(smiles_with_prefix, task='generation')
inputs

{'input_ids': tensor([[593, 592, 595,  16,  16,  17,  37,  18,  17,  16,  18,  16,  16, 591,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0]]),
 'attention_mask': tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, 

In [28]:
# Register '[PREFIX]' as an additional special token on the inner tokenizer object
# (`tokenizer.tokenizer`).
prefix_token = '[PREFIX]'
tokenizer.tokenizer.add_special_tokens({'additional_special_tokens': [prefix_token]})
# Call the inner tokenizer directly with a text pair: first segment ' ', second segment the SMILES.
# NOTE(review): prefix_token is registered above but the call passes text=' ', not
# prefix_token — confirm whether text=prefix_token was intended.
inputs = tokenizer.tokenizer(text=' ', text_pair=smiles)
inputs

{'input_ids': [592, 591, 16, 16, 17, 37, 18, 17, 16, 18, 16, 16, 591], 'token_type_ids': [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [30]:
inputs = tokenizer.tokenizer(
    text=' ', text_pair=smiles, truncation=True, padding='max_length', max_length=20,
    return_special_tokens_mask=True, return_token_type_ids=False, return_tensors='pt'
    )
inputs

{'input_ids': tensor([[592, 591,  16,  16,  17,  37,  18,  17,  16,  18,  16,  16, 591,   0,
           0,   0,   0,   0,   0,   0]]), 'special_tokens_mask': tensor([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}

In [None]:

    # Convert tokens to input IDs
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Add the [CLS] or [BOS] token at the beginning if applicable
    if tokenizer.tokenizer.cls_token_id is not None:
        input_ids.insert(0, tokenizer.cls_token_id)
    elif tokenizer.bos_token_id is not None:
        input_ids.insert(0, tokenizer.bos_token_id)

    return input_ids

In [None]:
# pad input ids with the task-specific token



In [None]:

# pad attention mask with True, as we want to attend to the task-specific token

# pad labels with -100, as we want to ignore the task-specific token in the loss calculation

# Test Model forward pass