# Tokenizer

In [1]:
import sys

sys.path.append('..')

from genomix.utils import SPECIAL_TOKENS

In [2]:
# Two demo DNA sequences used as raw tokenizer inputs in the cells below.
input_sequence = [
    'CACCCTAAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAAACCCT',
    'ACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTAACCCTAACCCTAACCC',
]

from datasets import Dataset, DatasetDict

def gen_demo_data():
    """Yield three demo records, each a dict with a ``sequence`` string and ``chr`` set to "1"."""
    demo_sequences = (
        "GACCCTAAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAAACCCT",
        "ACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTAACCCTAACCCTAACCC",
        "ACCCTCACCCTCAGGCTCACCCTCACCCTCACCCTCACCCTCACCCTAACCCTAACCCTAACCC",
    )
    for seq in demo_sequences:
        # A fresh dict per record, with the keys in the order "sequence", "chr".
        yield {"sequence": seq, "chr": "1"}

# Build two datasets from the same demo generator.
ds1 = Dataset.from_generator(gen_demo_data)
ds2 = Dataset.from_generator(gen_demo_data)

# Group them into a DatasetDict under the "train" and "validation" split names.
ds = DatasetDict({"train": ds1, "validation": ds2})

# Display the resulting splits.
ds

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sequence', 'chr'],
        num_rows: 3
    })
    validation: Dataset({
        features: ['sequence', 'chr'],
        num_rows: 3
    })
})

In [11]:
import numpy as np
# Index the train split with a NumPy integer array (rows 0 and 1) and read the
# "sequence" column of the selected rows.
ds["train"][np.array([0,1])]['sequence']

['GACCCTAAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAAACCCT',
 'ACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTAACCCTAACCCTAACCC']

## BPE tokenizer

### BioSeqBPETokenizer

slow version of BPE tokenizer

In [3]:
from genomix.tokenizers import BioSeqBPETokenizer

# Keyword arguments forwarded to `BioSeqBPETokenizer.from_pretrained` below.
tokenizer_kwargs = {
        "bos_token": SPECIAL_TOKENS.BOS.value,
        "eos_token": SPECIAL_TOKENS.EOS.value,
        "unk_token": SPECIAL_TOKENS.UNK.value,
        "mask_token": SPECIAL_TOKENS.MASK.value,
        "padding_side": "left", # pad on the left because the model predicts the next token
        "add_bos_token": False,
        "add_eos_token": False,
        "add_prefix_space": False, 
        "do_lower_case": False,
        # "model_max_length": 6  # intentionally unset: pass `max_length` at tokenization time, not in __init__
    }

# Use the reduced test vocabulary for this tokenizer.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
vocab_dir = '/home/share/huadjyin/home/baiyong01/projects/genomix/tmp'
# NOTE: the variable is spelled `tokenier` (sic) throughout this notebook.
tokenier = BioSeqBPETokenizer.from_pretrained(
    vocab_dir, 
    local_files_only=True, 
    **tokenizer_kwargs)


<jemalloc>: Unsupported system page size


In [6]:
# Show the vocabulary size reported by the loaded tokenizer.
tokenier.vocab_size
# Tokens can be added to the vocabulary by passing a list of strings, e.g.:
# tokenier.add_tokens(['ADDED'])


59

In [8]:
# The tokenization calls below use `padding="max_length"`, so `pad_token` must be
# set explicitly; otherwise the following error is raised:
# *************************************************************************
# ValueError: Asking to pad but the tokenizer does not have a padding token.
# Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)`
# or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

# Reuse the EOS token as the padding token.
tokenier.pad_token = SPECIAL_TOKENS.EOS.value

# Show the id the pad token resolves to.
tokenier.pad_token_id

2

In [9]:
# Encode the first demo sequence, truncating/padding to 6 tokens and asking for
# the overflowing tokens with an overlap (stride) of 2.
tokenizations = tokenier(
    input_sequence[0],
    # add_special_tokens is not passed here, so its default applies
    max_length=6,  # the output is truncated to this length
    truncation=True,
    padding="max_length",
    stride=2,
    return_overflowing_tokens=True,
)

In [10]:
# Display the encoding produced by the previous cell.
tokenizations

{'overflowing_tokens': [23, 5, 21, 13, 13, 21, 37, 21, 37, 21, 37, 21, 37, 21, 37, 21, 13, 13, 21, 23, 5, 8], 'num_truncated_tokens': 20, 'input_ids': [5, 23, 5, 21, 23, 5], 'attention_mask': [1, 1, 1, 1, 1, 1]}

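**NOTE**: from the output above, we can see that the tokenizer returns ONLY ONE `input_ids` sequence.

This is because we use the SLOW version of the tokenizer, so `return_overflowing_tokens` does NOT split the input into multiple `input_ids` chunks; the overflow is only returned as a single flat `overflowing_tokens` list.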

see https://github.com/huggingface/transformers/issues/23001 for details.

In [11]:
# Same call as before, but with `add_special_tokens=True` passed explicitly.
tokenizations = tokenier(
    input_sequence[0],
    add_special_tokens=True,
    max_length=6,  # the output is truncated to this length
    truncation=True,
    padding="max_length",
    stride=2,
    return_overflowing_tokens=True,
)
tokenizations

{'overflowing_tokens': [23, 5, 21, 13, 13, 21, 37, 21, 37, 21, 37, 21, 37, 21, 37, 21, 13, 13, 21, 23, 5, 8], 'num_truncated_tokens': 20, 'input_ids': [5, 23, 5, 21, 23, 5], 'attention_mask': [1, 1, 1, 1, 1, 1]}

As we can see, the output is the same as the result above, regardless of whether we set `add_special_tokens=True` or not.

This is because we initialized the tokenizer with `"add_bos_token": False` and `"add_eos_token": False`.

**NOTE**
if `"add_bos_token": True` and `"add_eos_token": True`, then:
* if `add_special_tokens=True` (*the default value in `transformers`*), then the output will include `BOS` and `EOS`;
* if `add_special_tokens=False`, then the output will NOT include `BOS` and `EOS`.


```python
tokenizer_kwargs = {
        "bos_token": SPECIAL_TOKENS.BOS.value,
        "eos_token": SPECIAL_TOKENS.EOS.value,
        "unk_token": SPECIAL_TOKENS.UNK.value,
        "mask_token": SPECIAL_TOKENS.MASK.value,
        "padding_side": "left", # as a prediction next token model, the padding is done on the left
        "add_bos_token": False,
        "add_eos_token": False,
        "add_prefix_space": False, 
        "do_lower_case": False,
        # "model_max_length": 6  # This should be set when do tokenization, not in __init__
    }
```

In [13]:
# Re-initialize the slow BPE tokenizer, this time with `add_bos_token=True`
# and `add_eos_token=True`.

tokenizer_kwargs = {
        "bos_token": SPECIAL_TOKENS.BOS.value,
        "eos_token": SPECIAL_TOKENS.EOS.value,
        "unk_token": SPECIAL_TOKENS.UNK.value,
        "mask_token": SPECIAL_TOKENS.MASK.value,
        "padding_side": "left", # pad on the left because the model predicts the next token
        "add_bos_token": True,
        "add_eos_token": True,
        "add_prefix_space": False, 
        "do_lower_case": False,
        # "model_max_length": 6  # intentionally unset: pass `max_length` at tokenization time, not in __init__
    }

# Use the reduced test vocabulary for this tokenizer.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
vocab_dir = '/home/share/huadjyin/home/baiyong01/projects/genomix/tmp'
tokenier = BioSeqBPETokenizer.from_pretrained(
    vocab_dir, 
    local_files_only=True, 
    **tokenizer_kwargs)

# Reuse the EOS token as the padding token (needed for `padding="max_length"`).
tokenier.pad_token = SPECIAL_TOKENS.EOS.value

In [14]:
# Encode with `add_special_tokens=False` on the tokenizer that was re-created
# with `add_bos_token=True` / `add_eos_token=True`.
tokenizations = tokenier(
    input_sequence[0],
    add_special_tokens=False,
    max_length=6,  # the output is truncated to this length
    truncation=True,
    padding="max_length",
    stride=2,
    return_overflowing_tokens=True,
)
tokenizations

{'overflowing_tokens': [23, 5, 21, 13, 13, 21, 37, 21, 37, 21, 37, 21, 37, 21, 37, 21, 13, 13, 21, 23, 5, 8], 'num_truncated_tokens': 20, 'input_ids': [5, 23, 5, 21, 23, 5], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [15]:
# Encode again, this time with `add_special_tokens=True`.
tokenizations = tokenier(
    input_sequence[0],
    add_special_tokens=True,
    max_length=6,  # the output is truncated to this length
    truncation=True,
    padding="max_length",
    stride=2,
    return_overflowing_tokens=True,
)
tokenizations

{'overflowing_tokens': [5, 21, 23, 5, 21, 13, 13, 21, 37, 21, 37, 21, 37, 21, 37, 21, 37, 21, 13, 13, 21, 23, 5, 8], 'num_truncated_tokens': 22, 'input_ids': [0, 5, 23, 5, 21, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}

Now, we have special tokens.

### BioSeqBPETokenizerFast

fast version of BPE tokenizer

In [16]:
from genomix.tokenizers import BioSeqBPETokenizerFast

# Keyword arguments forwarded to `BioSeqBPETokenizerFast.from_pretrained` below
# (BOS and EOS insertion both enabled).
tokenizer_kwargs = {
        "bos_token": SPECIAL_TOKENS.BOS.value,
        "eos_token": SPECIAL_TOKENS.EOS.value,
        "unk_token": SPECIAL_TOKENS.UNK.value,
        "mask_token": SPECIAL_TOKENS.MASK.value,
        "padding_side": "left", # pad on the left because the model predicts the next token
        "add_bos_token": True,
        "add_eos_token": True,
        "add_prefix_space": False, 
        "do_lower_case": False,
        # "model_max_length": 6  # intentionally unset: pass `max_length` at tokenization time, not in __init__
    }

# Use the reduced test vocabulary for this tokenizer.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
vocab_dir = '/home/share/huadjyin/home/baiyong01/projects/genomix/tmp'
tokenier = BioSeqBPETokenizerFast.from_pretrained(
    vocab_dir, 
    local_files_only=True, 
    **tokenizer_kwargs)

In [17]:
# Reuse the EOS token as the padding token (needed for `padding="max_length"` below).
tokenier.pad_token = SPECIAL_TOKENS.EOS.value

In [20]:
# Encode the first demo sequence with special tokens enabled, requesting the
# overflowing tokens with an overlap (stride) of 2.
tokenizations = tokenier(
    input_sequence[0],
    add_special_tokens=True,
    max_length=6,  # the output is truncated to this length
    truncation=True,
    padding="max_length",
    stride=2,
    return_overflowing_tokens=True,
)
tokenizations

{'input_ids': [[0, 5, 23, 5, 21, 2], [0, 5, 21, 23, 5, 2], [0, 23, 5, 21, 13, 2], [0, 21, 13, 13, 21, 2], [0, 13, 21, 37, 21, 2], [0, 37, 21, 37, 21, 2], [0, 37, 21, 37, 21, 2], [0, 37, 21, 37, 21, 2], [0, 37, 21, 37, 21, 2], [0, 37, 21, 13, 13, 2], [0, 13, 13, 21, 23, 2], [0, 21, 23, 5, 8, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'overflow_to_sample_mapping': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [21]:
# Same encoding call, but with special tokens disabled.
tokenizations = tokenier(
    input_sequence[0],
    add_special_tokens=False,
    max_length=6,  # the output is truncated to this length
    truncation=True,
    padding="max_length",
    stride=2,
    return_overflowing_tokens=True,
)
tokenizations

{'input_ids': [[5, 23, 5, 21, 23, 5], [23, 5, 21, 13, 13, 21], [13, 21, 37, 21, 37, 21], [37, 21, 37, 21, 37, 21], [37, 21, 37, 21, 13, 13], [13, 13, 21, 23, 5, 8]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'overflow_to_sample_mapping': [0, 0, 0, 0, 0, 0]}

### Datasets test

**NOTE**

When using pyarrow version 16.1.0, the following error is raised:

`<jemalloc>`: Unsupported system page size

see: https://github.com/apache/arrow/issues/11134

This leads to the errors shown below when creating a dataset:

```python
from datasets import Dataset, DatasetDict

def gen_demo_data():
    yield {"sequence": "GACCCTAAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAAACCCT", "chr": "1"}
    yield {"sequence": "ACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTAACCCTAACCCTAACCC", "chr": "1"}

ds1 = Dataset.from_generator(gen_demo_data)

```

ArrowMemoryError: malloc of size 256 failed.

OSError: [Errno 39] Directory not empty: '~/.cache/huggingface/datasets/generator/default-1d7aac6694688fc4/0.0.0.incomplete'

*************************************************************************

**SOLVED**: Upgrading pyarrow from 16.1.0 to 18.0.0 solves the problem.


In [1]:

from datasets import Dataset, DatasetDict

def gen_demo_data():
    """Yield two demo records, each a dict with a ``sequence`` string and ``chr`` set to "1"."""
    demo_sequences = (
        "GACCCTAAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAAACCCT",
        "ACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTAACCCTAACCCTAACCC",
    )
    for seq in demo_sequences:
        # A fresh dict per record, with the keys in the order "sequence", "chr".
        yield {"sequence": seq, "chr": "1"}

# Build two datasets from the same demo generator.
ds1 = Dataset.from_generator(gen_demo_data)
ds2 = Dataset.from_generator(gen_demo_data)

# Group them into a DatasetDict under the "train" and "validation" split names.
ds = DatasetDict({"train": ds1, "validation": ds2})

# Display the resulting splits.
ds


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sequence', 'chr'],
        num_rows: 2
    })
    validation: Dataset({
        features: ['sequence', 'chr'],
        num_rows: 2
    })
})

In [14]:
from genomix.tokenizers import BioSeqBPETokenizerFast

# Keyword arguments forwarded to `BioSeqBPETokenizerFast.from_pretrained` below.
# Here only BOS insertion is enabled (`add_eos_token` is False).
tokenizer_kwargs = {
        "bos_token": SPECIAL_TOKENS.BOS.value,
        "eos_token": SPECIAL_TOKENS.EOS.value,
        "unk_token": SPECIAL_TOKENS.UNK.value,
        "mask_token": SPECIAL_TOKENS.MASK.value,
        "padding_side": "left", # pad on the left because the model predicts the next token
        "add_bos_token": True,
        "add_eos_token": False,
        "add_prefix_space": False, 
        "do_lower_case": False,
        # "model_max_length": 6  # intentionally unset: pass `max_length` at tokenization time, not in __init__
    }

# Use the reduced test vocabulary for this tokenizer.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
vocab_dir = '/home/share/huadjyin/home/baiyong01/projects/genomix/tmp'
tokenier = BioSeqBPETokenizerFast.from_pretrained(
    vocab_dir, 
    local_files_only=True, 
    **tokenizer_kwargs)

# Reuse the EOS token as the padding token.
tokenier.pad_token = SPECIAL_TOKENS.EOS.value


In [15]:
# Tokenize the first sequence of the train split, requesting overflowing
# chunks of at most 6 tokens with an overlap (stride) of 2.
tokenized_ds = tokenier(
    ds['train']['sequence'][0],
    max_length=6,
    truncation=True,
    padding=True,
    return_overflowing_tokens=True,
    stride=2,
    add_special_tokens=True,
)
tokenized_ds

{'input_ids': [[0, 6, 23, 5, 21, 23], [0, 21, 23, 5, 21, 13], [0, 21, 13, 13, 21, 37], [0, 21, 37, 21, 37, 21], [0, 37, 21, 37, 21, 37], [0, 21, 37, 21, 37, 21], [0, 37, 21, 13, 13, 21], [0, 13, 21, 23, 5, 8]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'overflow_to_sample_mapping': [0, 0, 0, 0, 0, 0, 0, 0]}

## Unigram tokenizer

In [7]:
from genomix.tokenizers import BioSeqUnigramTokenizerFast

# Explicit tokenizer settings. NOTE: they are defined here but deliberately
# NOT passed to `from_pretrained` in this cell (see the commented-out line below).
tokenizer_kwargs = {
        "bos_token": SPECIAL_TOKENS.BOS.value,
        "eos_token": SPECIAL_TOKENS.EOS.value,
        "unk_token": SPECIAL_TOKENS.UNK.value,
        "mask_token": SPECIAL_TOKENS.MASK.value,
        "padding_side": "left", # pad on the left because the model predicts the next token
        "add_bos_token": True,
        "add_eos_token": True,
        "add_prefix_space": False, 
        "do_lower_case": False,
        # "model_max_length": 6  # intentionally unset: pass `max_length` at tokenization time, not in __init__
    }

# Use the reduced test vocabulary for this tokenizer.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
vocab_dir = '/home/share/huadjyin/home/baiyong01/projects/genomix/tmp'
tokenier = BioSeqUnigramTokenizerFast.from_pretrained(
    vocab_dir, 
    local_files_only=True, 
    # **tokenizer_kwargs  # left out on purpose: this cell tests the default settings
    )

# Reuse the EOS token as the padding token.
tokenier.pad_token = SPECIAL_TOKENS.EOS.value

In [8]:
# Inspect the special tokens of the tokenizer that was loaded without `tokenizer_kwargs`.
tokenier.bos_token, tokenier.eos_token, tokenier.unk_token, tokenier.mask_token

('<BOS>', '<EOS>', '<UNK>', None)

The result above shows that, with the default settings, `tokenier.mask_token` is `None`, even though `unigram.json` contains the `<MASK>` token.

Therefore, we have to pass the tokenizer parameters explicitly.

In [8]:
# Explicit tokenizer settings, this time actually passed to `from_pretrained`.
tokenizer_kwargs = {
        "bos_token": SPECIAL_TOKENS.BOS.value,
        "eos_token": SPECIAL_TOKENS.EOS.value,
        "unk_token": SPECIAL_TOKENS.UNK.value,
        "mask_token": SPECIAL_TOKENS.MASK.value,
        "padding_side": "left", # pad on the left because the model predicts the next token
        "add_bos_token": True,
        "add_eos_token": True,
        "add_prefix_space": False, 
        "do_lower_case": False,
        # "model_max_length": 6  # intentionally unset: pass `max_length` at tokenization time, not in __init__
    }

# Use the reduced test vocabulary for this tokenizer.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
vocab_dir = '/home/share/huadjyin/home/baiyong01/projects/genomix/tmp'
tokenier = BioSeqUnigramTokenizerFast.from_pretrained(
    vocab_dir, 
    local_files_only=True, 
    **tokenizer_kwargs  # pass the explicit settings (not the defaults)
    )

# Reuse the EOS token as the padding token.
tokenier.pad_token = SPECIAL_TOKENS.EOS.value

In [9]:
# Inspect the special tokens again, now that `tokenizer_kwargs` was passed explicitly.
tokenier.bos_token, tokenier.eos_token, tokenier.unk_token, tokenier.mask_token

('<BOS>', '<EOS>', '<UNK>', '<MASK>')

## SPM tokenizer

In [15]:
from genomix.tokenizers import BioSeqSPMTokenizerFast, BioSeqSPMTokenizer

# Keyword arguments forwarded to `BioSeqSPMTokenizer.from_pretrained` below.
tokenizer_kwargs = {
        "bos_token": SPECIAL_TOKENS.BOS.value,
        "eos_token": SPECIAL_TOKENS.EOS.value,
        "unk_token": SPECIAL_TOKENS.UNK.value,
        "mask_token": SPECIAL_TOKENS.MASK.value,
        "padding_side": "left", # pad on the left because the model predicts the next token
        "add_bos_token": True,
        "add_eos_token": True,
        "add_prefix_space": False, 
        "do_lower_case": False,
        # "model_max_length": 6  # intentionally unset: pass `max_length` at tokenization time, not in __init__
    }

# Use the reduced test vocabulary for this tokenizer.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
vocab_dir = '/home/share/huadjyin/home/baiyong01/projects/genomix/tmp'
tokenier = BioSeqSPMTokenizer.from_pretrained(
    vocab_dir, 
    local_files_only=True, 
    **tokenizer_kwargs  # pass the explicit settings (not the defaults)
    )

# Reuse the EOS token as the padding token.
tokenier.pad_token = SPECIAL_TOKENS.EOS.value

vocab_size: <sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x4004b4967030> >


In [8]:
# Inspect the special tokens (including the pad token) of the currently loaded tokenizer.
tokenier.bos_token, tokenier.eos_token, tokenier.unk_token, tokenier.mask_token, tokenier.pad_token

('<BOS>', '<EOS>', '<UNK>', '<MASK>', '<EOS>')

In [10]:
# Inspect the ids of the same special tokens; the pad id equals the EOS id because
# `pad_token` was set to the EOS token.
tokenier.bos_token_id, tokenier.eos_token_id, tokenier.unk_token_id, tokenier.mask_token_id, tokenier.pad_token_id

(0, 2, 1, 53, 2)

In [11]:
from genomix.tokenizers import BioSeqSPMTokenizerFast, BioSeqSPMTokenizer

# Keyword arguments forwarded to `BioSeqSPMTokenizerFast.from_pretrained` below.
tokenizer_kwargs = {
        "bos_token": SPECIAL_TOKENS.BOS.value,
        "eos_token": SPECIAL_TOKENS.EOS.value,
        "unk_token": SPECIAL_TOKENS.UNK.value,
        "mask_token": SPECIAL_TOKENS.MASK.value,
        "padding_side": "left", # pad on the left because the model predicts the next token
        "add_bos_token": True,
        "add_eos_token": True,
        "add_prefix_space": False, 
        "do_lower_case": False,
        # "model_max_length": 6  # intentionally unset: pass `max_length` at tokenization time, not in __init__
    }

# Use the reduced test vocabulary for this tokenizer.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
vocab_dir = '/home/share/huadjyin/home/baiyong01/projects/genomix/tmp'
tokenier = BioSeqSPMTokenizerFast.from_pretrained(
    vocab_dir, 
    local_files_only=True, 
    **tokenizer_kwargs  # pass the explicit settings (not the defaults)
    )

# Reuse the EOS token as the padding token.
tokenier.pad_token = SPECIAL_TOKENS.EOS.value

vocab_size: <sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x4002a3e1d440> >


In [19]:
# Encode the first demo sequence with the SPM tokenizer currently bound to `tokenier`.
# NOTE(review): the recorded output has a single `input_ids` list plus a flat
# `overflowing_tokens` list, and the execution counts are out of order - this cell
# was presumably last run with the slow `BioSeqSPMTokenizer`; confirm by re-running.
tokenizations = tokenier(
    input_sequence[0],
    add_special_tokens=True,
    max_length=6,  # the output is truncated to this length
    truncation=True,
    padding="max_length",
    stride=2,
    return_overflowing_tokens=True,
)
tokenizations

{'overflowing_tokens': [17, 3, 38, 17, 16, 16, 14, 3, 16, 14, 3, 16, 14, 3, 16, 14, 3, 16, 14, 3, 38, 17, 16, 38, 17, 3, 38], 'num_truncated_tokens': 25, 'input_ids': [0, 47, 38, 17, 3, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [20]:
# Decode the `input_ids` of the encoding above back into text.
tokenier.decode(tokenizations['input_ids'])

'<BOS> CAC CCT AA AC <EOS>'