In [26]:
### Huggingface dataset and tokenizer imports
from datasets import Dataset
from transformers import PreTrainedTokenizerFast

### xVal imports
from src.utils import make_tokenizer, preprocess, analyze

from tokenizers import (
    decoders,
    models,
    processors,
    Tokenizer,
    pre_tokenizers,
)

from tqdm import tqdm

In [11]:
# Folder that holds the raw text splits.
data_dir = './dataset'

# Load the raw training split; the resulting dataset has a single 'text' column.
ds = Dataset.from_text(f'{data_dir}/multi_train')
ds

Dataset({
    features: ['text'],
    num_rows: 1043272
})

In [12]:
entry = 0  # index of the dataset row inspected in the cells below
# Select the row first and then the field: ds[entry] reads a single example,
# whereas ds['text'] addresses the whole column (which may be materialised in
# full, depending on the installed `datasets` version) before indexing into it.
ds[entry]['text']

'3 times 7 is 21'

In [13]:
# Build the tokenizer by hand. The BPE model is created with an empty vocabulary
# and no merge rules; tokens are registered only through the add_* calls below.
special_tokens=["[END]", "[MASK]", "[PAD]", "[NUM]"]
full_vocab = {}
vocab_words = ['times', 'is']
tokenizer = Tokenizer(models.BPE(vocab=full_vocab, merges=[]))
# Order matters: the special tokens are registered before the words. The outputs
# further down show the resulting ids: '[NUM]' -> 3, 'times' -> 4, 'is' -> 5.
tokenizer.add_special_tokens(special_tokens)
tokenizer.add_tokens(vocab_words)
# Byte-level pre-tokenizer / post-processor / decoder, with no prefix space added.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()
# Write the definition to disk; the next cell reloads it from this file.
tokenizer.save('./multi_tokenizer.json')

In [14]:
# Reload the saved tokenizer JSON through the transformers fast-tokenizer wrapper.
# NOTE: this rebinds `tokenizer`, which until now held the raw tokenizers.Tokenizer.
# "[END]" is registered as both the BOS and the EOS token.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="./multi_tokenizer.json",
    bos_token="[END]", # beginning of sentence
    eos_token="[END]", # end of sentence
    mask_token="[MASK]", # mask token
    pad_token="[PAD]", # pad token
)

In [15]:
# Raw text of the inspected row (row-first access avoids touching the whole
# 'text' column, and `entry` keeps this in step with the other inspection cells).
print(ds[entry]['text'])

3 times 7 is 21


In [16]:
# Run the number-normalisation step on the inspected row with sigfigs=3.
# Row-first access (ds[entry]['text']) reads one example instead of addressing
# the full 'text' column.
x = preprocess.convert_num_string(ds[entry]['text'], sigfigs=3)
print(x)

+3.00e+0times+7.00e+0is+2.10e+1


In [17]:
# Tokenize the inspected row directly (outside of Dataset.map) to preview the
# fields produced for a single example. Row-first access reads just that example.
tokenized_x = preprocess.tokenize_fnc(ds[entry]['text'], tokenizer)
print('input_ids:', tokenized_x['input_ids'])
print('numbers:', tokenized_x['numbers'])

input_ids: [3, 4, 3, 5, 3]
numbers: [ 3  1  7  1 21]


In [18]:
print("\nStarting tokenization...")

# map() passes one example at a time (batched=False), so the tokenizer is bound
# into the callable through a closure.
tokenize_lambda = lambda example: preprocess.tokenize_fnc(example, tokenizer)

# 30 worker processes; the raw 'text' column is dropped from the result and
# cached results are not reused, so the mapping is always recomputed.
tokenized_ds = ds.map(
    function=tokenize_lambda,
    remove_columns=["text"],
    batched=False,
    num_proc=30,
    load_from_cache_file=False,
)


Starting tokenization...


Map (num_proc=30): 100%|██████████| 1043272/1043272 [01:09<00:00, 15101.47 examples/s]


In [19]:
# Look the id up directly in the vocabulary. encode("[NUM]")[0] runs the whole
# encoding pipeline and only returns the right id as long as no post-processor
# prepends a special token to the sequence.
num_token = tokenizer.convert_tokens_to_ids("[NUM]")
print(f"[NUM] --> token #{num_token}")

[NUM] --> token #3


In [20]:
# Show the columns and row count of the tokenized training set.
tokenized_ds

Dataset({
    features: ['input_ids', 'numbers', 'len'],
    num_rows: 1043272
})

In [21]:
# Token ids stored for the inspected row of the mapped dataset.
x_token_ids = tokenized_ds[entry]['input_ids']
x_token_ids

[3, 4, 3, 5, 3]

In [22]:
# 'numbers' column for the same row. For this example the output shows the
# operand value at each [NUM] position and 1 at the other positions.
x_num = tokenized_ds[entry]['numbers']
x_num

[3, 1, 7, 1, 21]

In [23]:
# Decode each id on its own to see which token it stands for.
[tokenizer.decode(token_id) for token_id in x_token_ids]

['[NUM]', 'times', '[NUM]', 'is', '[NUM]']

In [24]:
# Persist the tokenized training set inside the data folder.
tokenized_ds.save_to_disk(f'{data_dir}/multi_train_tokenized')

Saving the dataset (1/1 shards): 100%|██████████| 1043272/1043272 [00:00<00:00, 1827863.04 examples/s]


In [25]:
def tokenize_ds(ds, tokenizer, num_proc=30):
    """Tokenize every row of ``ds`` with ``preprocess.tokenize_fnc``.

    Args:
        ds: Dataset with a ``text`` column; that column is removed from the
            result. Must support ``len()`` and ``map()``.
        tokenizer: Tokenizer passed as the second argument to
            ``preprocess.tokenize_fnc`` for every row.
        num_proc: Maximum number of worker processes. Defaults to 30, the
            value that used to be hard-coded. It is capped at the number of
            rows (and never drops below 1) so that tiny splits do not request
            more workers than they have examples. ``None`` is forwarded
            unchanged.

    Returns:
        The value returned by ``ds.map`` (the tokenized dataset).
    """

    def tokenize_row(row):
        # Called by map() with one example at a time (batched=False).
        return preprocess.tokenize_fnc(row, tokenizer)

    if num_proc is None:
        workers = None
    else:
        workers = max(1, min(num_proc, len(ds)))

    return ds.map(
        tokenize_row,
        batched=False,
        num_proc=workers,
        remove_columns=["text"],
        # Always recompute instead of reusing a previously cached result.
        load_from_cache_file=False,
    )

In [27]:
num_digit = 5
digits = list(range(1, num_digit + 1))

# Tokenize every held-out file named multi_<split>_<a>_by_<b> with b <= a and
# save the result next to it with a '_tokenized' suffix.
# NOTE: the loop rebinds the notebook-level `ds` and `tokenized_ds`; after this
# cell they refer to the last split processed, not to the training set.
for split in ('val', 'test'):
    for a_num_digit in digits:
        # One progress bar per value of a, covering b = 1..a.
        for b_num_digit in tqdm(digits[:a_num_digit]):
            name = f'multi_{split}_{a_num_digit}_by_{b_num_digit}'
            raw_path = f'{data_dir}/{name}'
            ds = Dataset.from_text(raw_path)
            tokenized_ds = tokenize_ds(ds, tokenizer)
            tokenized_ds.save_to_disk(f'{raw_path}_tokenized')

Generating train split: 8 examples [00:00, 1524.79 examples/s]
num_proc must be <= 8. Reducing num_proc to 8 for dataset of size 8.
Map (num_proc=8): 100%|██████████| 8/8 [00:00<00:00, 58.84 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8/8 [00:00<00:00, 1417.17 examples/s]
100%|██████████| 1/1 [00:00<00:00,  2.56it/s]
Generating train split: 81 examples [00:00, 26455.27 examples/s]
Map (num_proc=30): 100%|██████████| 81/81 [00:00<00:00, 270.24 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 81/81 [00:00<00:00, 12978.52 examples/s]
Generating train split: 810 examples [00:00, 310206.92 examples/s]
Map (num_proc=30): 100%|██████████| 810/810 [00:00<00:00, 2300.25 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 810/810 [00:00<00:00, 61019.56 examples/s]
100%|██████████| 2/2 [00:01<00:00,  1.18it/s]
Generating train split: 810 examples [00:00, 236669.19 examples/s]
Map (num_proc=30): 100%|██████████| 810/810 [00:00<00:00, 2300.14 examples/s]
Sav

In [28]:
# Reload the tokenized training set that was saved to disk earlier.
train_data_path = f'{data_dir}/multi_train_tokenized'
train_tokenized_ds = Dataset.load_from_disk(train_data_path)
train_tokenized_ds

Dataset({
    features: ['input_ids', 'numbers', 'len'],
    num_rows: 1043272
})