In [2]:
# Imports

import os

from hyformer.configs.dataset import DatasetConfig
from hyformer.configs.tokenizer import TokenizerConfig
from hyformer.configs.model import ModelConfig
from hyformer.configs.trainer import TrainerConfig

from hyformer.utils.datasets.auto import AutoDataset
from hyformer.utils.tokenizers.auto import AutoTokenizer
from hyformer.models.auto import AutoModel
from hyformer.trainers.trainer import Trainer

from hyformer.utils.runtime import set_seed

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Set working directory of the project.
# The config paths used later in this notebook are relative, so they are
# resolved against this directory. The checkout location is machine-specific:
# it can be overridden with the HYFORMER_REPOSITORY_DIR environment variable,
# and the previously hard-coded path is kept as the fallback.

REPOSITORY_DIR = os.environ.get(
    'HYFORMER_REPOSITORY_DIR',
    '/home/aih/adam.izdebski/project/jointformer-interface/jointformer',
)
os.chdir(REPOSITORY_DIR)

In [3]:
# Fix the random seed so repeated runs of the notebook are comparable

SEED = 1337
set_seed(SEED)


In [23]:
# Configs

# Storage roots: DATA_DIR is passed as `root` to the dataset loaders below.
DATA_DIR = "/lustre/groups/aih/jointformer/icml25/data"
OUTPUT_DIR = "/lustre/groups/aih/jointformer/icml25/results"

# One JSON config file per component; the paths are relative to the
# current working directory.
PATH_TO_DATASET_CONFIG = "configs/datasets/guacamol/config.json"
PATH_TO_TOKENIZER_CONFIG = "configs/tokenizers/regex_smiles/config.json"
PATH_TO_MODEL_CONFIG = "configs/models/hyformer/config.json"
PATH_TO_TRAINER_CONFIG = "configs/trainers/pretrain/config.json"

In [24]:
# Load configs
# Each config class builds its config object from the matching file path
# defined in the "Configs" cell (paths are relative to the working directory).

dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)
model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
trainer_config = TrainerConfig.from_config_file(PATH_TO_TRAINER_CONFIG)


In [25]:
# Load datasets
# One dataset object per split ('train', 'val', 'test'), all built from the
# same dataset config and rooted at DATA_DIR.

train_dataset = AutoDataset.from_config(dataset_config, root=DATA_DIR, split='train')
val_dataset = AutoDataset.from_config(dataset_config, root=DATA_DIR, split='val')
test_dataset = AutoDataset.from_config(dataset_config, root=DATA_DIR, split='test')


In [26]:
# Load tokenizer
# NOTE(review): the saved output of this cell is a ValueError ("Tokenizer
# SMILESRegexTokenizer not available. Available options: 'SMILESTokenizer',
# 'HFTokenizer'"), so the tokenizer type named in the config loaded from
# PATH_TO_TOKENIZER_CONFIG is presumably not registered with AutoTokenizer —
# confirm the config (or register the tokenizer) before relying on this cell.

tokenizer = AutoTokenizer.from_config(tokenizer_config)

ValueError: Tokenizer SMILESRegexTokenizer not available. Available options: 'SMILESTokenizer', 'HFTokenizer'

In [21]:
# Display the parsed tokenizer config to check which tokenizer type it selects
tokenizer_config

TokenizerConfig(path_to_vocabulary='data/vocabularies/deepchem.txt', tokenizer_type='SMILESTokenizer')

In [42]:
# Peek at the first training example
train_dataset[0]

{'data': 'CCC(C)(C)Br', 'target': None}

In [62]:
# Tokenize the 'data' field of the first training example.
# task='lm' is presumably the language-modelling task — TODO confirm.
# NOTE(review): this needs a `tokenizer` object, but the saved output of the
# "Load tokenizer" cell is an error; the output below presumably comes from an
# earlier kernel state.
tokenizer(train_dataset[0]['data'], task='lm')

{'input_ids': tensor([591,  16,  16,  16,  17,  16,  18,  17,  16,  18,  37,  11]),
 'attention_mask': tensor([True, True, True, True, True, True, True, True, True, True, True, True]),
 'special_tokens_mask': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [None]:
# Requirements - transformers, tokenizers
# Right now, the Smiles Tokenizer uses an existing vocab file from rxnfp that is fairly comprehensive and from the USPTO dataset.
# The vocab may be expanded in the near future

import collections
import os
import re
import pkg_resources
from typing import List
from transformers import BertTokenizer
from logging import getLogger

logger = getLogger(__name__)
"""
SMI_REGEX_PATTERN: str
    SMILES regex pattern for tokenization. Designed by Schwaller et al. [1].

References

.. [1]  Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
        ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
        1572-1583 DOI: 10.1021/acscentsci.9b00576

"""

# SMILES token regex. The pattern is assembled from adjacent raw-string
# literals so that it contains no newline: a triple-quoted string broken across
# two source lines embeds a literal "\n" in front of "#", and since the pattern
# is not compiled with re.VERBOSE a bare "#" (triple bond) would then never
# match and would be silently dropped from the token list.
SMI_REGEX_PATTERN = (
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|"
    r"#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
)

# add vocab_file dict
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def get_default_tokenizer():
  """Build a SmilesTokenizer from the vocabulary file shipped with deepchem.

  Returns
  -------
  SmilesTokenizer
      Tokenizer constructed from the ``feat/tests/vocab.txt`` resource of the
      installed ``deepchem`` package.
  """
  package_name, resource_name = "deepchem", "feat/tests/vocab.txt"
  return SmilesTokenizer(
      pkg_resources.resource_filename(package_name, resource_name))


class SmilesTokenizer(BertTokenizer):
  """
    SMILES tokenizer built on Huggingface's BertTokenizer.

    The class reuses the BertTokenizer machinery (vocabulary handling, special
    tokens, encoding helpers) but splits text into tokens with the basic SMILES
    tokenizer stored in ``self.basic_tokenizer`` (see ``_tokenize``).

    Please see https://github.com/huggingface/transformers
    and https://github.com/rxn4chemistry/rxnfp for more details.

    Examples
    --------
    >>> from deepchem.feat.smiles_tokenizer import SmilesTokenizer
    >>> current_dir = os.path.dirname(os.path.realpath(__file__))
    >>> vocab_path = os.path.join(current_dir, 'tests/data', 'vocab.txt')
    >>> tokenizer = SmilesTokenizer(vocab_path)
    >>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
    [12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]


    References
    ----------
    .. [1]  Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
            Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
            Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

    Notes
    -----
    This class requires huggingface's transformers and tokenizers libraries to be installed.
    """
  vocab_files_names = VOCAB_FILES_NAMES

  def __init__(
      self,
      vocab_file: str = '',
      **kwargs):
    """Constructs a SmilesTokenizer.

        Parameters
        ----------
        vocab_file: str
            Path to a SMILES character per line vocabulary file.
        **kwargs
            Forwarded unchanged to ``BertTokenizer.__init__``; special tokens
            (unk/sep/pad/cls/mask) are not redeclared here and can be
            overridden through these keyword arguments.

        Raises
        ------
        ValueError
            If ``vocab_file`` is not an existing file.
        """

    # Validate the path before the base class tries to read it, so a missing
    # file is always reported with this class's message.
    if not os.path.isfile(vocab_file):
      raise ValueError(
          "Can't find a vocab file at path '{}'.".format(vocab_file))

    super().__init__(vocab_file, **kwargs)

    # Recent transformers releases expose the length limit as
    # `model_max_length` and no longer provide `max_len`; fall back to
    # `max_len` only when the newer attribute is missing.
    max_len = getattr(self, "model_max_length", None)
    if max_len is None:
      max_len = self.max_len
    # take into account special tokens in max length
    self.max_len_single_sentence = max_len - 2
    self.max_len_sentences_pair = max_len - 3

    # NOTE(review): `load_vocab` is not defined or imported in the notebook
    # cells shown here; it must be in scope before this class is instantiated.
    self.vocab = load_vocab(vocab_file)
    unused_indices = [
        i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")
    ]
    # max() raises ValueError on an empty list; a vocabulary without any
    # "[unused..." entries is recorded as None instead of failing.
    self.highest_unused_index = max(unused_indices, default=None)
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])
    # NOTE(review): `BasicSmilesTokenizer` is not defined in the notebook cells
    # shown here either; confirm where it is expected to come from.
    self.basic_tokenizer = BasicSmilesTokenizer()
    self.init_kwargs["max_len"] = max_len

  @property
  def vocab_size(self):
    """Number of entries in the vocabulary."""
    return len(self.vocab)

  @property
  def vocab_list(self):
    """Vocabulary tokens as a list, in vocabulary order."""
    return list(self.vocab.keys())

  def _tokenize(self, text: str):
    """
        Tokenize a string into a list of tokens.

        Parameters
        ----------
        text: str
            Input string sequence to be tokenized.

        Returns
        -------
        list
            Tokens produced by ``self.basic_tokenizer.tokenize``.
        """

    return list(self.basic_tokenizer.tokenize(text))

  def _convert_token_to_id(self, token):
    """
        Converts a token (str/unicode) in an id using the vocab.

        Tokens missing from the vocabulary map to the id of ``self.unk_token``.

        Parameters
        ----------
        token: str
            String token from a larger sequence to be converted to a numerical id.
        """

    return self.vocab.get(token, self.vocab.get(self.unk_token))

  def _convert_id_to_token(self, index):
    """
        Converts an index (integer) in a token (string/unicode) using the vocab.

        Unknown indices map to ``self.unk_token``.

        Parameters
        ----------
        index: int
            Integer index to be converted back to a string-based token as part of a larger sequence.
        """

    return self.ids_to_tokens.get(index, self.unk_token)

  def convert_tokens_to_string(self, tokens: List[str]):
    """ Converts a sequence of tokens (string) in a single string.

        Tokens are joined with single spaces, every " ##" is removed and the
        result is stripped of surrounding whitespace.

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        Returns
        -------
        out_string: str
            Single string from combined tokens.
        """

    out_string: str = " ".join(tokens).replace(" ##", "").strip()
    return out_string

  def add_special_tokens_ids_single_sequence(self, token_ids: List[int]):
    """
        Adds special tokens to the a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------

        token_ids: list[int]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.

        Returns
        -------
        list[int]
            ``token_ids`` wrapped in the CLS and SEP token ids.
        """

    return [self.cls_token_id] + token_ids + [self.sep_token_id]




 
  



In [11]:
# Single-line SMILES token regex. It only has a single `>` alternative, so a
# `>>` reaction arrow is emitted as two separate `>` tokens.
SMI_REGEX_PATTERN = (
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|"
    r"#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
)



In [21]:

import re
from typing import List


class RegexSmilesTokenizer:
  """ Run basic SMILES tokenization using a regex pattern developed by Schwaller et al. [1].

  The tokenizer only splits a SMILES string into string tokens; it does not map
  tokens to ids.

  References
  ----------
  [1]  Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
        ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
        1572-1583 DOI: 10.1021/acscentsci.9b00576
  """

  def __init__(self,
               regex_pattern: str = SMI_REGEX_PATTERN,
               cls_token: str = "[CLS]",
               sep_token: str = "[SEP]"):
    """ Constructs a RegexSmilesTokenizer.

        Parameters
        ----------
        regex_pattern: str
            SMILES token regex; it is compiled once and reused by `tokenize`.
        cls_token: str
            Token placed in front of a sequence by
            `add_special_tokens_single_sequence`.
        sep_token: str
            Token placed after a sequence by
            `add_special_tokens_single_sequence`.
    """
    self.regex_pattern = regex_pattern
    self.regex = re.compile(self.regex_pattern)
    # These attributes are read by add_special_tokens_single_sequence; without
    # them that method fails with AttributeError.
    self.cls_token = cls_token
    self.sep_token = sep_token

  def tokenize(self, text):
    """ Basic Tokenization of a SMILES.

        Parameters
        ----------
        text: str
            SMILES string to split.

        Returns
        -------
        list
            All non-overlapping matches of the regex in `text`, in order.
            Characters that match no alternative of the pattern are skipped.
    """
    return self.regex.findall(text)

  def add_special_tokens_single_sequence(self, tokens: List[str]):
    """ Wrap a token list as [cls_token] + tokens + [sep_token].

        Parameters
        ----------
        tokens: List[str]
            Tokens of a single sequence.

        Returns
        -------
        List[str]
            New list with the special tokens added; `tokens` is not modified.
    """
    return [self.cls_token] + tokens + [self.sep_token]




In [22]:
# Smoke-test the regex tokenizer on a sample SMILES string.
# NOTE(review): this rebinds `tokenizer`, which the "Load tokenizer" cell
# assigns from AutoTokenizer.from_config — confirm the reuse is intended.
tokenizer = RegexSmilesTokenizer()
tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O")

['C',
 'C',
 '(',
 '=',
 'O',
 ')',
 'O',
 'C',
 '1',
 '=',
 'C',
 'C',
 '=',
 'C',
 'C',
 '=',
 'C',
 '1',
 'C',
 '(',
 '=',
 'O',
 ')',
 'O']