In [1]:
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer

In [2]:
# read next fast speed
dataset = load_from_disk("../processed_datadir/wikitext-103-story-train")

In [3]:
print(dataset.features)
print(len(dataset))

{'text': Value(dtype='string', id=None)}
29525


In [4]:
from transformers import CanineTokenizer,ByT5Tokenizer
tokenizer = ByT5Tokenizer.from_pretrained("../tokenizer_save/byt5-tokenizer")

In [5]:
import torch
example = tokenizer.encode("Hello, y'all! How are you 😁 ?")
string = " 😁 "
if not isinstance(string, bytes):
    string = str.encode(string)
print(torch.tensor([x + 2 for x in string]))
print(example)

tensor([ 34, 242, 161, 154, 131,  34])
[75, 104, 111, 111, 114, 47, 35, 124, 42, 100, 111, 111, 36, 35, 75, 114, 122, 35, 100, 117, 104, 35, 124, 114, 120, 35, 243, 162, 155, 132, 35, 66, 1]


In [6]:
tokenizer.model_max_length = 2048

In [7]:
print(tokenizer.decode(example))

Hello, y'all! How are you 😁?</s>


In [8]:
import multiprocessing

num_proc = multiprocessing.cpu_count() - 4
print(num_proc)
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

def group_texts(examples):
    # print(len(examples["text"]))
    tokenized_inputs = tokenizer(examples["text"], truncation=True,   # 在这就不带文本了
        max_length=tokenizer.model_max_length, return_overflowing_tokens=True,return_length=True)
    input_ids_list = []
    # token_type_ids_list = []
    attention_mask_list = []


    
    for length, input_ids in zip(tokenized_inputs["length"], tokenized_inputs["input_ids"]):
        if length == tokenizer.model_max_length:
            input_ids_list.append(input_ids)
        else:
            input_ids_list.append(input_ids + [tokenizer.pad_token_id] * (tokenizer.model_max_length - length))
    # for length, token_type_ids in zip(tokenized_inputs["length"], tokenized_inputs["token_type_ids"]):
    #     if length == tokenizer.model_max_length:
    #         token_type_ids_list.append(token_type_ids)
    #     else:
    #         token_type_ids_list.append(token_type_ids + [0] * (tokenizer.model_max_length - length))
    for length, attention_mask in zip(tokenized_inputs["length"], tokenized_inputs["attention_mask"]):
        if length == tokenizer.model_max_length:
            attention_mask_list.append(attention_mask)
        else:
            attention_mask_list.append(attention_mask + [0] * (tokenizer.model_max_length - length))
    print("success")
    # return {"input_ids": input_ids_list, "token_type_ids": token_type_ids_list, "attention_mask": attention_mask_list}
    return {"input_ids": input_ids_list, "attention_mask": attention_mask_list}

# preprocess dataset
tokenized_datasets = dataset.map(group_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
tokenized_datasets.features

12
The max length for the tokenizer is: 2048


Map (num_proc=12):   0%|          | 0/29525 [00:00<?, ? examples/s]

success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success


{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
for i in range(0, len(tokenized_datasets)):
    # print(tokenized_datasets[i])
    for k in tokenized_datasets[i]:
        # print(k)
        # print(len(tokenized_datasets[i][k]))
        if len(tokenized_datasets[i][k]) != tokenizer.model_max_length:
            print("error")
            

In [10]:
# Split preprocessed dataset into train, validation, and test sets
splits = tokenized_datasets.train_test_split(test_size=0.03)
preprocessed_splits = DatasetDict({
    "train": splits["train"],
    "validation": splits["test"],
    "test": load_from_disk("../processed_datadir/wikitext-103-story-test/")
})

In [11]:
preprocessed_splits.save_to_disk("../processed_datadir/wikitext-103-story-chartoken-bert-2048/")

Flattening the indices:   0%|          | 0/28639 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/28639 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/886 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/886 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/62 [00:00<?, ? examples/s]

In [25]:
print(len(preprocessed_splits["train"]))

28639
