In [2]:
from transformers import AutoTokenizer
import spacy
from nltk import wordpunct_tokenize

In [3]:
# Two-line sample message used throughout the notebook. The first line ends
# with a space before the newline; it is spelled out explicitly here so an
# editor that strips trailing whitespace cannot silently change the value.
text = (
    "Thank you Michele, this helps a lot! Thank you for taking the time \n"
    "to explain this to us."
)
print(text)

Thank you Michele, this helps a lot! Thank you for taking the time 
to explain this to us.


In [4]:
# Pre-split the sample text with NLTK's wordpunct_tokenize. As the printed
# list shows, punctuation (',', '!', '.') comes out as separate tokens and the
# line break inside `text` does not produce a token of its own.
tokens = wordpunct_tokenize(text)
print(tokens)

['Thank', 'you', 'Michele', ',', 'this', 'helps', 'a', 'lot', '!', 'Thank', 'you', 'for', 'taking', 'the', 'time', 'to', 'explain', 'this', 'to', 'us', '.']


In [14]:
# Load the same fast tokenizer twice from one checkpoint. The two instances
# differ only in `add_prefix_space`, which is what the following cells compare.
MODEL_NAME = "roberta-large"
tokenizer1 = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer2 = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    add_prefix_space=True,
)
# Display the special-token mapping of the default-configured tokenizer.
tokenizer1.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [18]:
# Encode the space-joined words with each tokenizer and show ids, attention
# mask and the corresponding token strings. In the recorded output the two
# runs differ only in the first word ('Thank' vs 'ĠThank').
concat_text = " ".join(tokens)  # same input for both tokenizers
for tokenizer in (tokenizer1, tokenizer2):
    output = tokenizer(concat_text)
    input_ids = output["input_ids"]
    print(input_ids)
    print(output["attention_mask"])
    print(tokenizer.convert_ids_to_tokens(input_ids))
    print()

[0, 13987, 47, 21656, 2156, 42, 2607, 10, 319, 27785, 3837, 47, 13, 602, 5, 86, 7, 3922, 42, 7, 201, 479, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['<s>', 'Thank', 'Ġyou', 'ĠMichele', 'Ġ,', 'Ġthis', 'Ġhelps', 'Ġa', 'Ġlot', 'Ġ!', 'ĠThank', 'Ġyou', 'Ġfor', 'Ġtaking', 'Ġthe', 'Ġtime', 'Ġto', 'Ġexplain', 'Ġthis', 'Ġto', 'Ġus', 'Ġ.', '</s>']

[0, 3837, 47, 21656, 2156, 42, 2607, 10, 319, 27785, 3837, 47, 13, 602, 5, 86, 7, 3922, 42, 7, 201, 479, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['<s>', 'ĠThank', 'Ġyou', 'ĠMichele', 'Ġ,', 'Ġthis', 'Ġhelps', 'Ġa', 'Ġlot', 'Ġ!', 'ĠThank', 'Ġyou', 'Ġfor', 'Ġtaking', 'Ġthe', 'Ġtime', 'Ġto', 'Ġexplain', 'Ġthis', 'Ġto', 'Ġus', 'Ġ.', '</s>']



In [20]:
# Tokenize every pre-split word on its own and flatten the pieces. In the
# recorded output, the tokenizer without `add_prefix_space` breaks some words
# apart (e.g. 'M', 'iche', 'le'), while the one with it keeps them whole and
# marks each with a leading 'Ġ'.
for tokenizer in (tokenizer1, tokenizer2):
    pieces_per_word = [tokenizer.tokenize(word) for word in tokens]
    subtokens = [piece for pieces in pieces_per_word for piece in pieces]
    subtoken_ids = [
        piece_id
        for pieces in pieces_per_word
        for piece_id in tokenizer.convert_tokens_to_ids(pieces)
    ]
    print(subtokens)
    print(subtoken_ids)
    print()

['Thank', 'you', 'M', 'iche', 'le', ',', 'this', 'helps', 'a', 'lot', '!', 'Thank', 'you', 'for', 'taking', 'the', 'time', 'to', 'expl', 'ain', 'this', 'to', 'us', '.']
[13987, 6968, 448, 11529, 459, 6, 9226, 44590, 102, 17508, 328, 13987, 6968, 1990, 16883, 627, 958, 560, 23242, 1851, 9226, 560, 687, 4]

['ĠThank', 'Ġyou', 'ĠMichele', 'Ġ,', 'Ġthis', 'Ġhelps', 'Ġa', 'Ġlot', 'Ġ!', 'ĠThank', 'Ġyou', 'Ġfor', 'Ġtaking', 'Ġthe', 'Ġtime', 'Ġto', 'Ġexplain', 'Ġthis', 'Ġto', 'Ġus', 'Ġ.']
[3837, 47, 21656, 2156, 42, 2607, 10, 319, 27785, 3837, 47, 13, 602, 5, 86, 7, 3922, 42, 7, 201, 479]



In [21]:
# Length limit the tokenizer reports for a single sequence (510 in the output).
# NOTE(review): presumably the 512 model maximum minus the special tokens
# wrapped around one sequence -- confirm against the transformers docs.
tokenizer2.max_len_single_sentence

510

In [22]:
# Length limit the tokenizer reports for a pair of sequences (508 in the output).
# NOTE(review): presumably the 512 model maximum minus the special tokens
# added when encoding a pair -- confirm against the transformers docs.
tokenizer2.max_len_sentences_pair

508

In [23]:
# Mapping from checkpoint name to maximum input size exposed by the tokenizer;
# the recorded output lists 512 for every RoBERTa checkpoint.
# `tokenizer2` is referenced explicitly instead of the `tokenizer` loop variable
# left behind by an earlier cell, so this cell does not rely on leaked loop
# state and stays consistent with the two cells above.
tokenizer2.max_model_input_sizes

{'roberta-base': 512,
 'roberta-large': 512,
 'roberta-large-mnli': 512,
 'distilroberta-base': 512,
 'roberta-base-openai-detector': 512,
 'roberta-large-openai-detector': 512}