# check tokenizers

In [1]:
import sys
sys.path.append("/workspace")

In [3]:
from pathlib import Path
import os
from preprocessing.text_tokenizer import TextTokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import CharDelimiterSplit
from tokenizers.processors import BertProcessing
# https://github.com/huggingface/transformers/issues/7234#issuecomment-720092292
from transformers import PreTrainedTokenizerFast
from transformers import RobertaTokenizerFast

In [5]:
dataset_path = Path('/workspace/notebooks/data')

## roberta-base 

In [9]:
# os.chdir(dataset_path)
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", max_len=512)
tokenizer.save_pretrained(dataset_path/'roberta-base')

('/workspace/notebooks/data/roberta-base/tokenizer_config.json',
 '/workspace/notebooks/data/roberta-base/special_tokens_map.json',
 '/workspace/notebooks/data/roberta-base/vocab.json',
 '/workspace/notebooks/data/roberta-base/merges.txt',
 '/workspace/notebooks/data/roberta-base/added_tokens.json')

In [None]:
# EsperBERTo

In [None]:
# load text tokenizer

In [10]:
# 2. Create a tokenizer
# load our tokenizer
text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(dataset_path/'pan_tadeusz'/'vocab.json')

In [None]:
# create tokenizer 1

In [11]:
tokenizer1_path = dataset_path / 'tokenizer1'

In [14]:
# Create transformers compatible tokenizer
tokenizer1 = Tokenizer(WordLevel(text_tokenizer.vocab))
tokenizer1.pre_tokenizer = CharDelimiterSplit(' ')
# tokenizer1.model.unk_token = '<unk>'

tokenizer1_path.mkdir(parents=True, exist_ok=True)
tokenizer1.save(str(tokenizer1_path/"tokenizer1.json"))

In [15]:
tokenizer1

<tokenizers.Tokenizer at 0x55660d3754f0>

In [16]:
# create tokenizer 2

In [None]:
# Re-create as roberta compatible tokenizer

# tokenizer2 = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer1_path/"tokenizer1.json"))
tokenizer2 = RobertaTokenizerFast(vocab_file=str(tokenizer1_path/"tokenizer1.json"), merges_file=None)
tokenizer2._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer2._tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer2._tokenizer.token_to_id("<s>")),
)
tokenizer2._tokenizer.enable_truncation(max_length=128)  # 512
tokenizer2.mask_token = "<mask>"
tokenizer2.pad_token = "<pad>"

In [18]:
tokenizer2

PreTrainedTokenizerFast(name_or_path='', vocab_size=6473, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'pad_token': '<pad>', 'mask_token': '<mask>'})

In [None]:
tokenizer2.save_pretrained('tokenizer2')

In [25]:
# load my-roberta

In [29]:
tokenizer = RobertaTokenizerFast.from_pretrained(dataset_path/"my-roberta", max_len=512)

In [30]:
tokenizer

PreTrainedTokenizerFast(name_or_path='/workspace/notebooks/data/my-roberta', vocab_size=6473, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [31]:
tokenizer.save_pretrained(dataset_path/'my-roberta2')

('/workspace/notebooks/data/my-roberta2/tokenizer_config.json',
 '/workspace/notebooks/data/my-roberta2/special_tokens_map.json',
 '/workspace/notebooks/data/my-roberta2/vocab.json',
 '/workspace/notebooks/data/my-roberta2/merges.txt',
 '/workspace/notebooks/data/my-roberta2/added_tokens.json')