In [16]:
from rich import print as rprint
from rich.traceback import install
install()

<bound method InteractiveShell.excepthook of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x7f1ee6950bb0>>

## Train a basic tokenizer from scratch

ref: https://huggingface.co/docs/tokenizers/python/latest/quicktour.html#build-a-tokenizer-from-scratch

In [156]:
dir(decoders)

['BPEDecoder',
 'ByteLevel',
 'Decoder',
 'Metaspace',
 'WordPiece',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'decoders']

In [13]:
from tokenizers import Tokenizer
from tokenizers.decoders import BPEDecoder
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import NFD
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer

In [147]:
DATA_DIR = './data/wikitext-103-raw'

In [164]:
# ---- Step 1: Tokenizer ----
# instantiate a tokenizer with BPE model
tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

# ---- Step 1.5: Normalizer ----
tokenizer.normalizer = NFD()

# ---- Step 2 & 3: Trainer and pre-tokenizer ----
# default vocab_size is 30_000, min_frequency is 0. Define special tokens for post-process. These special tokens will be assigned id starting from 0.
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=50000)
# The tokenizer itself might not know things like space and punctuation, i.e. it just regards the text as a sequence of characters.
tokenizer.pre_tokenizer = Whitespace()

# ---- Step 4: Train ----
# call the `train` method with a list of files.
files = [f"{DATA_DIR}/wiki.{split}.raw" for split in ["test", "train", "valid"]]
%time tokenizer.train(files, trainer)

# ---- Step 5: Post-processor
# add post_processor to add special tokens
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",  # `:1` for segment id
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# ---- Step 6: Other configurations and saving ----
tokenizer.decoder = BPEDecoder()  # costum decoder
tokenizer.enable_truncation(max_length=256)
tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
tokenizer.save(f"{DATA_DIR}/tokenizer-wiki.json")

CPU times: user 18min 13s, sys: 2min 19s, total: 20min 33s
Wall time: 55.2 s


In [227]:
tokenizer = Tokenizer.from_file(f'{DATA_DIR}/tokenizer-wiki.json')

In [165]:
dir(tokenizer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getnewargs__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'add_special_tokens',
 'add_tokens',
 'decode',
 'decode_batch',
 'decoder',
 'enable_padding',
 'enable_truncation',
 'encode',
 'encode_batch',
 'from_buffer',
 'from_file',
 'from_str',
 'get_vocab',
 'get_vocab_size',
 'id_to_token',
 'model',
 'no_padding',
 'no_truncation',
 'normalizer',
 'num_special_tokens_to_add',
 'padding',
 'post_process',
 'post_processor',
 'pre_tokenizer',
 'save',
 'to_str',
 'token_to_id',
 'train',
 'train_from_iterator',
 'truncation']

### Try to use it

encode a single sentence

In [166]:
# Encode one sentence and print the Encoding object followed by its main views.
encoding = tokenizer.encode("Hello, y'all!, How are you 😁 ?")
rprint(encoding)
rprint(encoding.tokens)
rprint(encoding.ids)
rprint(encoding.type_ids)  # presumably the segment ids set via `:1` in the post-processor template — TODO confirm
rprint(encoding.attention_mask)

In [167]:
"Hello, y'all!, How are you 😁 ?"[encoding.offsets[11][0]: encoding.offsets[11][1]]

'😁'

In [170]:
# Special tokens will be automatically removed.
tokenizer.decode([1, 26696, 16, 93, 11, 4520, 5, 16, 7388, 4535, 5642, 0, 35, 2])  # skip_special_tokens default to True


"Hello,y'all!,Howareyou?"

encode a pair of sentences

In [9]:
encoding = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
rprint(encoding)
rprint(encoding.tokens)
rprint(encoding.ids)
rprint(encoding.type_ids)
rprint(encoding.attention_mask)

In [233]:
tokenizer.decode([1, 26696, 16, 93, 11, 4520, 5, 2, 7388, 4535, 5642, 0, 35, 2])

"Hello,y'all!Howareyou?"

`encode_batch`

In [10]:
encoding = tokenizer.encode_batch(["Hello, y'all!", "How are you these days 😁 ?"])  # a flat list means a list of single sentence
rprint(encoding)
rprint(encoding[0].tokens)
rprint(encoding[0].ids)
rprint(encoding[0].type_ids)
rprint(encoding[0].attention_mask)

In [25]:
encoding = tokenizer.encode_batch(
    [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
)  # a list of two-element list means pair encoding
rprint(encoding)
rprint(encoding[0].tokens)
rprint(encoding[0].ids)
rprint(encoding[0].type_ids)
rprint(encoding[0].attention_mask)
rprint(encoding[1].ids)

In [27]:
tokenizer.decode_batch([[1, 26696, 16, 93, 11, 4520, 5, 2, 7388, 4535, 5642, 0, 35, 2, 3, 3], [1, 26696, 4453, 5642, 6023, 5, 2, 45, 11, 81, 9908, 16, 23360, 5642, 5, 2]])

["Hello , y ' all ! How are you ?",
 "Hello to you too ! I ' m fine , thank you !"]

In [41]:
dir(tokenizer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getnewargs__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'add_special_tokens',
 'add_tokens',
 'decode',
 'decode_batch',
 'decoder',
 'enable_padding',
 'enable_truncation',
 'encode',
 'encode_batch',
 'from_buffer',
 'from_file',
 'from_str',
 'get_vocab',
 'get_vocab_size',
 'id_to_token',
 'model',
 'no_padding',
 'no_truncation',
 'normalizer',
 'num_special_tokens_to_add',
 'padding',
 'post_process',
 'post_processor',
 'pre_tokenizer',
 'save',
 'to_str',
 'token_to_id',
 'train',
 'train_from_iterator',
 'truncation']

In [43]:
tokenizer.get_vocab_size()

30000

Change `post_processor`

In [37]:
# Same template shape as the earlier post-processor, but the placeholder names are spelled
# `<CLS>` / `<SEP>`. Each name used in the template is paired with an id in `special_tokens`;
# those ids are still looked up from the vocabulary entries `[CLS]` / `[SEP]`.
tokenizer.post_processor = TemplateProcessing(
    single="<CLS> $A <SEP>",
    pair="<CLS> $A <SEP> $B:1 <SEP>:1",  # `:1` for segment id
    special_tokens=[
        ("<CLS>", tokenizer.token_to_id("[CLS]")),
        ("<SEP>", tokenizer.token_to_id("[SEP]")),
    ],
)  # NOTE(review): the template names appear to be independent of the vocabulary strings — TODO confirm.

## T5 tokenizer

In [100]:
from transformers import T5Tokenizer, T5TokenizerFast
import pandas as pd 

In [96]:
MODEL = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL)

In [102]:
# Load each tokenizer flavour once and compare the attribute names they expose.
slow_attrs = set(dir(T5Tokenizer.from_pretrained(MODEL)))
fast_attrs = set(dir(T5TokenizerFast.from_pretrained(MODEL)))
rprint(slow_attrs - fast_attrs)  # attributes only on the slow (Python) tokenizer
rprint(fast_attrs - slow_attrs)  # attributes only on the fast tokenizer

In [97]:
[attr for attr in dir(tokenizer) if attr.endswith('_token') or attr.endswith('_tokens')]

['_add_tokens',
 '_additional_special_tokens',
 '_bos_token',
 '_cls_token',
 '_convert_id_to_token',
 '_eos_token',
 '_mask_token',
 '_pad_token',
 '_sep_token',
 '_unk_token',
 'add_special_tokens',
 'add_tokens',
 'additional_special_tokens',
 'all_special_tokens',
 'bos_token',
 'build_inputs_with_special_tokens',
 'cls_token',
 'convert_ids_to_tokens',
 'eos_token',
 'mask_token',
 'pad_token',
 'sanitize_special_tokens',
 'sep_token',
 'unique_no_split_tokens',
 'unk_token']

In [31]:
df = pd.read_csv('~/wusuowei/data/kaggle/news_summary/news_summary_processed.csv')

In [44]:
rprint(hasattr(tokenizer, 'post_processor'))
rprint(hasattr(tokenizer, 'enable_truncation'))
rprint(hasattr(tokenizer, 'enable_padding'))

In [52]:
text = df.summary[0]
rprint(text)

In [50]:
tokenizer.pad_token, tokenizer.pad_token_id

('<pad>', 0)

In [54]:
tokenizer(
    text,
    max_length=100,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt",
)

{'input_ids': tensor([[   37,  6863,    13,  3545, 30110,   878,   348,    11,  2043,    76,
            65,     3,    52, 17943,    26,   165,   455,    24,   263,    34,
         29701,    21,   887,    12,  6177,     3,  9782, 10193,    12,    70,
          5069,  6976,    30,     8,  5333,    13,  2922,   157,     7,  6111,
           232,  2618,    30,  1660,  4306,    37,  3602,    47,  5241,    12,
         14510,     8,  1357,   441,   997,   716,    13,    19, 17180,     8,
         15646,   227,    34,  1204,  5731,   157,    45,  1652,    11,    47,
             3,     7,    40,   265,  2726,    30,   569,   783,     5,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1

In [56]:
tokenizer.encode(
    text,
    max_length=100,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt",
)

tensor([[   37,  6863,    13,  3545, 30110,   878,   348,    11,  2043,    76,
            65,     3,    52, 17943,    26,   165,   455,    24,   263,    34,
         29701,    21,   887,    12,  6177,     3,  9782, 10193,    12,    70,
          5069,  6976,    30,     8,  5333,    13,  2922,   157,     7,  6111,
           232,  2618,    30,  1660,  4306,    37,  3602,    47,  5241,    12,
         14510,     8,  1357,   441,   997,   716,    13,    19, 17180,     8,
         15646,   227,    34,  1204,  5731,   157,    45,  1652,    11,    47,
             3,     7,    40,   265,  2726,    30,   569,   783,     5,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [66]:
encoding = tokenizer.encode_plus(
    text,
    max_length=100,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt",
)['input_ids']

In [68]:
tokenizer.decode(encoding[0], skip_special_tokens=True)

'The Administration of Union Territory Daman and Diu has revoked its order that made it compulsory for women to tie rakhis to their male colleagues on the occasion of Rakshabandhan on August 7. The administration was forced to withdraw the decision within 24 hours of issuing the circular after it received flak from employees and was slammed on social media.'

In [70]:
tokenizer.decode(encoding[0][-1])  # input can also be a single token id.

'<pad>'

In [72]:
tokenizer.all_special_ids, tokenizer.all_special_tokens

([1,
  2,
  0,
  32099,
  32098,
  32097,
  32096,
  32095,
  32094,
  32093,
  32092,
  32091,
  32090,
  32089,
  32088,
  32087,
  32086,
  32085,
  32084,
  32083,
  32082,
  32081,
  32080,
  32079,
  32078,
  32077,
  32076,
  32075,
  32074,
  32073,
  32072,
  32071,
  32070,
  32069,
  32068,
  32067,
  32066,
  32065,
  32064,
  32063,
  32062,
  32061,
  32060,
  32059,
  32058,
  32057,
  32056,
  32055,
  32054,
  32053,
  32052,
  32051,
  32050,
  32049,
  32048,
  32047,
  32046,
  32045,
  32044,
  32043,
  32042,
  32041,
  32040,
  32039,
  32038,
  32037,
  32036,
  32035,
  32034,
  32033,
  32032,
  32031,
  32030,
  32029,
  32028,
  32027,
  32026,
  32025,
  32024,
  32023,
  32022,
  32021,
  32020,
  32019,
  32018,
  32017,
  32016,
  32015,
  32014,
  32013,
  32012,
  32011,
  32010,
  32009,
  32008,
  32007,
  32006,
  32005,
  32004,
  32003,
  32002,
  32001,
  32000],
 ['</s>',
  '<unk>',
  '<pad>',
  '<extra_id_0>',
  '<extra_id_1>',
  '<extra_id_2>'

In [74]:
tokenizer.mask_token

Using mask_token, but it is not set yet.


In [78]:
tokenizer.tokenize("Hello, y'all!, How are you these days 😁 ?")

['▁Hello',
 ',',
 '▁',
 'y',
 "'",
 'all',
 '!',
 ',',
 '▁How',
 '▁are',
 '▁you',
 '▁these',
 '▁days',
 '▁',
 '😁',
 '▁',
 '?']

In [81]:
tokenizer.decode(tokenizer.encode("Hello, y'all!, How are you these days 😁 ?"))

"Hello, y'all!, How are you these days <unk>?</s>"

## Another pre-trained tokenizer

In [86]:
from transformers import AutoTokenizer

In [103]:
MODEL = 'Helsinki-NLP/opus-mt-en-ro'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [88]:
tokenizer.all_special_ids, tokenizer.all_special_tokens

([0, 1, 59542], ['</s>', '<unk>', '<pad>'])

In [91]:
tokenizer.mask_token, tokenizer.bos_token, tokenizer.cls_token, tokenizer.sep_token

Using mask_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using sep_token, but it is not set yet.


(None, None, None, None)

In [94]:
[attr for attr in dir(tokenizer) if attr.endswith('_token') or attr.endswith('_tokens')]

['_add_tokens',
 '_additional_special_tokens',
 '_bos_token',
 '_cls_token',
 '_convert_id_to_token',
 '_eos_token',
 '_mask_token',
 '_pad_token',
 '_sep_token',
 '_unk_token',
 'add_special_tokens',
 'add_tokens',
 'additional_special_tokens',
 'all_special_tokens',
 'bos_token',
 'build_inputs_with_special_tokens',
 'cls_token',
 'convert_ids_to_tokens',
 'eos_token',
 'mask_token',
 'pad_token',
 'sanitize_special_tokens',
 'sep_token',
 'unique_no_split_tokens',
 'unk_token']

In [109]:
tokenizer.encode(
    text,
    max_length=100,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt",
)

tensor([[   68, 16472,    14,   523, 46324,   111,   506,    18,  1249,   180,
           174, 34290,   350,   712,    30,   462,    33, 21588,    37,  1577,
            13, 11094,    15, 15217, 15574,    13,   239,  7640, 15995,    45,
             4, 10871,    14,  1510,  3792, 21121,  1065,  2802,    45,  3364,
          4163,    68,  3319,    65,  7883,    13, 14556,     4,  1505,   808,
          1164,  1351,    14, 14843,     4, 26485,   437,    33,  2548, 39356,
           108,  7600,    18,    65,    94,  8963,  6450,    45,  1060,  3155,
             2,     0, 59542, 59542, 59542, 59542, 59542, 59542, 59542, 59542,
         59542, 59542, 59542, 59542, 59542, 59542, 59542, 59542, 59542, 59542,
         59542, 59542, 59542, 59542, 59542, 59542, 59542, 59542, 59542, 59542]])

## BERT tokenizer from scratch

In [8]:
from tokenizers import Tokenizer
from tokenizers.normalizers import Lowercase, NFD, Sequence, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.processors import TemplateProcessing
from tokenizers.decoders import WordPiece as WordPieceDecoder

In [9]:
tokenizer = Tokenizer(WordPiece(unk_token='[UNK]'))
tokenizer.get_vocab_size()

0

In [10]:
# Build and train a WordPiece tokenizer: NFD -> lowercase -> strip accents for normalization,
# the Whitespace pre-tokenizer for splitting, and a [CLS]/[SEP] template as post-processor.
tokenizer = Tokenizer(WordPiece(unk_token='[UNK]'))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# NOTE(review): 30522 presumably mirrors the vocabulary size of the released BERT models — TODO confirm.
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=30522,
)
files = [f"./data/wikitext-103-raw/wiki.{split}.raw" for split in ("test", "train", "valid")]
tokenizer.train(files, trainer)

# The ids only exist after training, so they are resolved here.
cls_id = tokenizer.token_to_id('[CLS]')
sep_id = tokenizer.token_to_id('[SEP]')
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

In [11]:
tokenizer.encode("Hello, y'all! How are you 😁 ?")

Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [17]:
# Round-trip two samples through encode/decode with the current decoder,
# then again after plugging in a WordPiece decoder, to compare the output.
samples = (
    "Welcome to the 🤗 Tokenizers library.",
    "Hello, y'all! How are you 😁 ?",
)
for sample in samples:
    rprint(tokenizer.decode(tokenizer.encode(sample).ids))
tokenizer.decoder = WordPieceDecoder()
for sample in samples:
    rprint(tokenizer.decode(tokenizer.encode(sample).ids))

## Tokenizer for language modeling

In [194]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [281]:
tokenizer = ByteLevelBPETokenizer()

In [282]:
tokenizer.normalizer, tokenizer.pre_tokenizer, tokenizer.post_processor, tokenizer.decoder

(None,
 <tokenizers.pre_tokenizers.ByteLevel at 0x7f7d5a7d84b0>,
 <tokenizers.processors.ByteLevel at 0x7f7d696fbcf0>,
 <tokenizers.decoders.ByteLevel at 0x7f7d696fb930>)

In [199]:
files = ['./data/oscar_eo/oscar.eo.txt']
# `<s>` and `</s>` have to be in the vocabulary: the post-processor cell that follows looks up
# their ids with `token_to_id`, which cannot resolve tokens that were never added.
# `<pad>`, `<unk>` and `<mask>` are included because the vocabulary is later loaded into a
# RoBERTa tokenizer — presumably it expects them too; confirm against the transformers docs.
# `<|endoftext|>` is kept so the original special token stays available.
tokenizer.train(
    files=files,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|endoftext|>"],
)

In [255]:
# Look up the ids of `</s>` and `<s>` in the trained vocabulary and hand both (token, id)
# pairs to BertProcessing, which becomes the tokenizer's post-processor.
# NOTE(review): the `</s>` pair is passed first and the `<s>` pair second — presumably
# (sep, cls); confirm against the tokenizers docs.
# NOTE(review): the lookups only work if both tokens are in the vocabulary — confirm they
# were passed as special tokens at training time.
tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Truncate encodings to at most 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [256]:
tokenizer.save_model('./data/oscar_eo')

['./data/oscar_eo/vocab.json', './data/oscar_eo/merges.txt']

In [257]:
tokenizer = ByteLevelBPETokenizer(
    "./data/oscar_eo/vocab.json",
    "./data/oscar_eo/merges.txt",
)

In [258]:
tokenizer.post_processor

<tokenizers.processors.ByteLevel at 0x7f7e18552a20>

In [235]:
tokenizer.encode("Mi estas Julien.").tokens

['Mi', 'Ġestas', 'ĠJuli', 'en', '.']

In [236]:
tokenizer.decode(tokenizer.encode("Mi, estas Julien 😁.").ids)

'Mi, estas Julien 😁.'

In [237]:
from transformers import RobertaTokenizerFast

In [238]:
# Re-create the tokenizer in transformers from the files saved under ./data/oscar_eo/.
# `model_max_length` is the documented keyword; `max_len` is only a legacy alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained("./data/oscar_eo/", model_max_length=512)

In [244]:
tokenizer.decode(tokenizer.encode("Mi, estas Julien 😁."), skip_special_tokens=True)

'Mi, estas Julien 😁.'

In [259]:
from transformers import LineByLineTextDataset

## GPT2 tokenizer

In [263]:
import transformers
[i for i in dir(transformers) if i.endswith('Tokenizer')]

['AlbertTokenizer',
 'AutoTokenizer',
 'AutoTokenizer',
 'BartTokenizer',
 'BarthezTokenizer',
 'BasicTokenizer',
 'BertGenerationTokenizer',
 'BertJapaneseTokenizer',
 'BertTokenizer',
 'BertweetTokenizer',
 'BigBirdTokenizer',
 'BlenderbotSmallTokenizer',
 'BlenderbotTokenizer',
 'CTRLTokenizer',
 'CamembertTokenizer',
 'CharacterTokenizer',
 'ConvBertTokenizer',
 'DPRContextEncoderTokenizer',
 'DPRQuestionEncoderTokenizer',
 'DPRReaderTokenizer',
 'DebertaTokenizer',
 'DebertaV2Tokenizer',
 'DistilBertTokenizer',
 'ElectraTokenizer',
 'FSMTTokenizer',
 'FlaubertTokenizer',
 'FunnelTokenizer',
 'GPT2Tokenizer',
 'HerbertTokenizer',
 'LEDTokenizer',
 'LayoutLMTokenizer',
 'LongformerTokenizer',
 'LxmertTokenizer',
 'M2M100Tokenizer',
 'MBart50Tokenizer',
 'MBartTokenizer',
 'MPNetTokenizer',
 'MT5Tokenizer',
 'MarianTokenizer',
 'MecabTokenizer',
 'MobileBertTokenizer',
 'OpenAIGPTTokenizer',
 'PegasusTokenizer',
 'PhobertTokenizer',
 'PreTrainedTokenizer',
 'ProphetNetTokenizer',
 'R

In [1]:
from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast

In [2]:
pretrained_tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

In [7]:
[i for i in dir(pretrained_tokenizer) if i.startswith('all')]

['all_special_ids', 'all_special_tokens', 'all_special_tokens_extended']

In [8]:
pretrained_tokenizer.all_special_ids

[50256]

In [10]:
pretrained_tokenizer(["Hello, y'all!", "How are you these days 😁 ?"])

{'input_ids': [[15496, 11, 331, 6, 439, 0], [2437, 389, 345, 777, 1528, 30325, 223, 5633]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [273]:
pretrained_tokenizer.decode(pretrained_tokenizer.encode("Mi, estas Julien 😁."))

'Mi, estas Julien 😁.'

In [None]:
datasets

In [284]:
from datasets import load_dataset

In [290]:
DATA_DIR = './data/wikitext-103-raw/'
# Map each dataset split name to its raw WikiText file (the validation file is named `valid`).
data_files = {
    split: f'{DATA_DIR}wiki.{stem}.raw'
    for split, stem in (('train', 'train'), ('validation', 'valid'), ('test', 'test'))
}
# Load the files with the plain-text dataset loader.
raw_datasets = load_dataset('text', data_files=data_files)

Using custom data configuration default-2db72cd504d2a6a0
Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/tiankang/.cache/huggingface/datasets/text/default-2db72cd504d2a6a0/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/tiankang/.cache/huggingface/datasets/text/default-2db72cd504d2a6a0/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


In [302]:
raw_datasets['train'][123]['text']

' Except for Æsthetic Club meetings , the Tower Building remained largely unoccupied for almost fifty years and suffered significant deterioration . The Æsthetic Club provided much @-@ needed financial support during the period and even paid the electric bill during the Great Depression . The Æsthetic Club is still headquartered in the Tower Building . '

In [322]:
def tokenize_function(examples):
    """Tokenize the ``text`` column of ``examples`` with ``pretrained_tokenizer``.

    Inputs longer than 512 tokens are truncated to 512.
    """
    texts = examples['text']
    return pretrained_tokenizer(texts, truncation=True, max_length=512)
block_size = 512  # length of every chunk produced by `group_texts`


def group_texts(examples):
    """Concatenate a batch of tokenized texts and re-cut it into fixed-size chunks.

    Args:
        examples: mapping from column name to a list of per-example lists.
            It must contain an ``input_ids`` column.

    Returns:
        A dict with the same columns, each a list of chunks of exactly
        ``block_size`` items, plus a ``labels`` column that is a shallow copy
        of the chunked ``input_ids``. The tail shorter than ``block_size`` is dropped.
    """
    # Flatten each column in a single pass. `sum(lists, [])` re-copies the
    # accumulated list on every addition, which is quadratic in the batch size.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    # Every column is assumed to have the same flattened length, so the first one is measured.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model supported it.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of `block_size`.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [313]:
raw_datasets['train'][123]['text']

' Except for Æsthetic Club meetings , the Tower Building remained largely unoccupied for almost fifty years and suffered significant deterioration . The Æsthetic Club provided much @-@ needed financial support during the period and even paid the electric bill during the Great Depression . The Æsthetic Club is still headquartered in the Tower Building . '

In [316]:
tokenize_function(raw_datasets['train'][123])

{'input_ids': [18181, 329, 6184, 228, 301, 6587, 6289, 8292, 837, 262, 8765, 11819, 6150, 5688, 555, 28756, 329, 2048, 15334, 812, 290, 6989, 2383, 38495, 764, 383, 6184, 228, 301, 6587, 6289, 2810, 881, 2488, 12, 31, 2622, 3176, 1104, 1141, 262, 2278, 290, 772, 3432, 262, 5186, 2855, 1141, 262, 3878, 22483, 764, 383, 6184, 228, 301, 6587, 6289, 318, 991, 48583, 287, 262, 8765, 11819, 764, 220], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [310]:
# Run `tokenize_function` over `raw_datasets` in batches, using 8 worker processes.
# The raw `text` column is removed from the result.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=8,
    remove_columns=['text'],
)



























In [327]:
# Run `group_texts` over the tokenized data in batches, using 8 worker processes.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=8,
)



























In [332]:
lm_datasets['train']

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 228151
})

In [334]:
pretrained_tokenizer.tokenize('Except for Æsthetic Club meetings , the Tower Building remained largely unoccupied for almost fifty years and suffered significant deterioration . The Æsthetic Club provided much @-@ needed financial support during the period and even paid the electric bill during the Great Depression . The Æsthetic Club is still headquartered in the Tower Building .')

['Except',
 'Ġfor',
 'ĠÃ',
 'Ĩ',
 'st',
 'hetic',
 'ĠClub',
 'Ġmeetings',
 'Ġ,',
 'Ġthe',
 'ĠTower',
 'ĠBuilding',
 'Ġremained',
 'Ġlargely',
 'Ġun',
 'occupied',
 'Ġfor',
 'Ġalmost',
 'Ġfifty',
 'Ġyears',
 'Ġand',
 'Ġsuffered',
 'Ġsignificant',
 'Ġdeterioration',
 'Ġ.',
 'ĠThe',
 'ĠÃ',
 'Ĩ',
 'st',
 'hetic',
 'ĠClub',
 'Ġprovided',
 'Ġmuch',
 'Ġ@',
 '-',
 '@',
 'Ġneeded',
 'Ġfinancial',
 'Ġsupport',
 'Ġduring',
 'Ġthe',
 'Ġperiod',
 'Ġand',
 'Ġeven',
 'Ġpaid',
 'Ġthe',
 'Ġelectric',
 'Ġbill',
 'Ġduring',
 'Ġthe',
 'ĠGreat',
 'ĠDepression',
 'Ġ.',
 'ĠThe',
 'ĠÃ',
 'Ĩ',
 'st',
 'hetic',
 'ĠClub',
 'Ġis',
 'Ġstill',
 'Ġheadquartered',
 'Ġin',
 'Ġthe',
 'ĠTower',
 'ĠBuilding',
 'Ġ.']