In [1]:
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer

# Load the WikiText-103 training split.
# The previous version always called load_dataset() and then immediately overwrote the
# result with load_from_disk(), so the first load was wasted and the cell failed whenever
# the local copy did not exist. Prefer the local copy and only fall back to the Hub
# (saving a local copy for the next run) when it is missing.
import os

RAW_TRAIN_DIR = "../processed_datadir/wikitext-103-raw-train"

if os.path.isdir(RAW_TRAIN_DIR):
    dataset = load_from_disk(RAW_TRAIN_DIR)
else:
    dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
    dataset.save_to_disk(RAW_TRAIN_DIR)

Found cached dataset wikitext (/home/songx_lab/cse12012530/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [None]:
# Save the raw split to disk so that later runs can load it quickly
# dataset.save_to_disk("wikitext-103-raw-train")

In [2]:
from transformers import BertTokenizerFast
# Load the pretrained uncased BERT fast tokenizer and report its vocabulary size.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
print(tokenizer.vocab_size)

# Show the default maximum input length, then override it with the length used for
# chunking in this notebook (change this to your own model's max input length).
print("Tokenizer max length: ", tokenizer.model_max_length)
tokenizer.model_max_length = 128
print("Tokenizer max length after change: ", tokenizer.model_max_length)

# Note: some tokenizers define a pad_token (used to pad a sequence up to max_length),
# while others, such as the GPT-2 tokenizer, do not.

# Persist the configured tokenizer so it can be reloaded directly next time; the
# directory name encodes the model name and the chosen max length.
tokenizer_save_dir = f"../tokenizer_save/tokenizer-{tokenizer.name_or_path}-{tokenizer.model_max_length}"
tokenizer.save_pretrained(tokenizer_save_dir)

30522
Tokenizer max length:  512
Tokenizer max length after change:  128


('../tokenizer_save/tokenizer-bert-base-uncased-128/tokenizer_config.json',
 '../tokenizer_save/tokenizer-bert-base-uncased-128/special_tokens_map.json',
 '../tokenizer_save/tokenizer-bert-base-uncased-128/vocab.txt',
 '../tokenizer_save/tokenizer-bert-base-uncased-128/added_tokens.json',
 '../tokenizer_save/tokenizer-bert-base-uncased-128/tokenizer.json')

In [None]:
# Inspect the tokenizer configuration (special tokens, padding/truncation side, max length).
# NOTE(review): the recorded output below reports model_max_length=256 although the setup
# cell assigns 128 -- the output looks like it comes from a different run; re-execute to confirm.
print(tokenizer)

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [None]:
# Show the vocabulary id of the tokenizer's mask token.
print(tokenizer.mask_token_id)

103


In [None]:
# Peek at the first ten raw rows of the dataset.
for row_idx in range(10):
    print(dataset[row_idx])

In [8]:
# Tokenize a single row (index 3) to inspect what the tokenizer returns when truncation,
# overflow chunks and per-chunk lengths are requested.
# The raw text is not carried along in the result.
tokenized_inputs = tokenizer(
    dataset[3]["text"],
    truncation=True,
    max_length=tokenizer.model_max_length,
    return_overflowing_tokens=True,
    return_length=True,
)

In [None]:
# Display the tokenizer output produced for the single sample tokenized above.
print(tokenized_inputs)

In [10]:
import multiprocessing

# Use as many worker processes as multiprocessing reports CPUs; `num_proc` is passed
# to the Dataset.map calls in this notebook.
num_proc = multiprocessing.cpu_count()
# Echo the max length that the chunking functions read from the tokenizer.
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

def group_texts(examples):
    """Tokenize a batch of raw texts and keep only the full-length chunks.

    The batch is tokenized with truncation to ``tokenizer.model_max_length`` and with
    overflowing chunks and per-chunk lengths requested. Every chunk whose reported
    length differs from ``tokenizer.model_max_length`` is discarded.

    Note: a later cell defines another function with this same name; once that cell
    has run, it shadows this one.

    Args:
        examples: A batch mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        A dict with ``"input_ids"``, ``"token_type_ids"`` and ``"attention_mask"``
        lists, each containing only the entries of the full-length chunks.
    """
    max_length = tokenizer.model_max_length
    # The raw text is not carried along here; only token-level fields are returned.
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    # Decide once which chunks are full-length, then apply the same mask to every
    # column instead of re-scanning the lengths separately for each column.
    keep = [length == max_length for length in tokenized_inputs["length"]]
    return {
        column: [values for values, kept in zip(tokenized_inputs[column], keep) if kept]
        for column in ("input_ids", "token_type_ids", "attention_mask")
    }

# Tokenize the whole dataset in parallel batches; the raw "text" column is dropped so
# only the columns returned by group_texts remain.
tokenized_datasets = dataset.map(
    group_texts,
    batched=True,
    remove_columns=["text"],
    num_proc=num_proc,
)
# Show the resulting column schema as the cell output.
tokenized_datasets.features



The max length for the tokenizer is: 128


Map (num_proc=16):   0%|          | 0/1801350 [00:00<?, ? examples/s]

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [11]:
# Number of rows (chunks) in the tokenized dataset.
print(len(tokenized_datasets))

501627


In [12]:
# Number of rows in the raw dataset, for comparison with the tokenized row count.
print(len(dataset))

1801350


In [None]:
# Peek at the first ten tokenized rows.
for row_idx in range(10):
    print(tokenized_datasets[row_idx])

In [7]:
from itertools import chain

# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples):
    """Concatenate every column of a batch and re-split it into fixed-size chunks.

    Each column's sequences are flattened into one long list, which is then cut into
    consecutive chunks of exactly ``tokenizer.model_max_length`` items. The trailing
    remainder that does not fill a whole chunk is always dropped, so a batch holding
    fewer than ``model_max_length`` items in total yields no chunks instead of one
    short, ragged chunk. Padding could be used instead of dropping if the model
    supports it; customize this part to your needs.

    Note: this definition shadows the earlier function of the same name.

    Args:
        examples: A batch mapping of column name to a list of sequences. The first
            column's concatenated length is used to size the chunks.

    Returns:
        A dict with the same keys, each mapping to a list of chunks of length
        ``tokenizer.model_max_length``; an empty dict for a batch without columns.
    """
    block_size = tokenizer.model_max_length
    # Concatenate all texts.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    if not concatenated_examples:
        # Nothing to chunk; avoid indexing into an empty key list.
        return {}
    total_length = len(next(iter(concatenated_examples.values())))
    # Round down to a whole number of blocks (this is 0 when the batch is too short).
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

# Re-chunk the tokenized rows with group_texts, then shuffle with a fixed seed.
# NOTE: this rebinds `tokenized_datasets` to a value derived from itself, so running
# the cell a second time processes the already processed data again.
tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=num_proc,
).shuffle(seed=34)

# Token count, assuming every row holds exactly model_max_length tokens.
print(f"the dataset contains in total {len(tokenized_datasets)*tokenizer.model_max_length} tokens")
# (An earlier note recorded 3417216000 tokens here; the recorded output of this cell differs.)


Map (num_proc=16):   0%|          | 0/1801350 [00:00<?, ? examples/s]

the dataset contains in total 117058048 tokens


In [None]:
# Spot-check the first ten shuffled chunks, separated by a ruler line.
for sample_idx in range(10):
    # To read the text instead of ids, print
    # tokenizer.decode(tokenized_datasets[sample_idx]["input_ids"]) here.
    print(tokenized_datasets[sample_idx])
    print("---------------------------------------------------------")

In [3]:
# from datasets import load_dataset, DatasetDict, load_from_disk
# from transformers import AutoTokenizer

# tokenized_datasets = load_from_disk("wikitext-103-raw-train-bert-half-processed")

In [10]:
# Hold out 5% of the chunks for validation. The split is seeded (same seed as the
# shuffle above) so that re-running the notebook reproduces the same train/validation
# partition; without a seed every run produced a different split.
splits = tokenized_datasets.train_test_split(test_size=0.05, shuffle=True, seed=34)

In [11]:
# Bundle the two partitions under the names "train" and "validation"
# (the held-out "test" part of `splits` becomes "validation").
# The raw WikiText test split is intentionally left out here (entry commented out).
# NOTE(review): later cells index preprocessed_splits["test"]; with the entry commented
# out that key is not created by this cell -- confirm how it is meant to be populated.
preprocessed_splits = DatasetDict({
    "train": splits["train"],
    "validation": splits["test"],
    # "test": load_dataset("wikitext", "wikitext-103-raw-v1", split="test")
})

In [12]:
# Persist the train/validation splits (no test split yet) for later training runs.
# NOTE(review): the directory name says "512" while tokenizer.model_max_length is set
# to 128 earlier in this notebook -- confirm the intended name before relying on it.
preprocessed_splits.save_to_disk("../wikitext-103-bert-512-without-test")

Flattening the indices:   0%|          | 0/217197 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/217197 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/11432 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11432 [00:00<?, ? examples/s]

### Next, we construct our own WikiText test data
The main difference from the train and validation splits is that training and evaluation rely only on the cross-entropy loss and do not look at the actual text, whereas the test data must also keep the overflowing tokens (the part of each text beyond the truncated input) for GPT-2.
 

In [None]:
def preprocess_function(example):
    """Tokenize the ``"text"`` field, padded and truncated to the tokenizer max length.

    The tokenizer is asked to pad to ``max_length``, truncate, and also return the
    overflowing tokens. The raw text is not carried along in the result.

    NOTE(review): later cells read an ``"overflowing_tokens"`` field from this output;
    whether that field is present depends on the tokenizer in use -- confirm.

    Args:
        example: Mapping with a ``"text"`` entry (a batch when used with batched map).

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    encoded = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=tokenizer.model_max_length,
        return_overflowing_tokens=True,
    )
    return encoded
# Apply preprocessing to the test split.
# `preprocessed_splits` is built without a "test" entry (it is commented out where the
# DatasetDict is created), so indexing it directly raised KeyError on a fresh run.
# Load the raw WikiText-103 test split on first use before tokenizing it.
if "test" not in preprocessed_splits:
    preprocessed_splits["test"] = load_dataset("wikitext", "wikitext-103-raw-v1", split="test")
preprocessed_splits["test"] = preprocessed_splits["test"].map(
    preprocess_function,
    batched=True,
    remove_columns=["text"],
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

In [None]:

def filter_function(example):
    """Return True when the example has more than 20 and fewer than 300 overflowing tokens."""
    overflow_count = len(example['overflowing_tokens'])
    return 20 < overflow_count < 300

# Keep only the test examples accepted by filter_function.
preprocessed_splits["test"] = preprocessed_splits["test"].filter(
    filter_function,
    num_proc=4,
)

Filter (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

In [None]:
# Number of test examples remaining after filtering.
print(len(preprocessed_splits["test"]))

134


In [None]:
def preprocess_function_truncate(example, max_over_length=128):
    """Force ``example["overflowing_tokens"]`` to exactly ``max_over_length`` items.

    Longer lists are cut to the first ``max_over_length`` items; shorter ones are
    right-padded. The pad value is the tokenizer's EOS token id, falling back to its
    pad token id when no EOS token is defined. The BERT tokenizer printed earlier in
    this notebook lists no EOS token, so the previous version padded with ``None``.

    Args:
        example: Mapping with an ``"overflowing_tokens"`` list; it is updated in place.
        max_over_length: Target length of the overflowing-token list.

    Returns:
        The same ``example`` object with ``"overflowing_tokens"`` replaced.
    """
    pad_value = tokenizer.eos_token_id
    if pad_value is None:
        # No EOS token available: use the padding token id instead of None.
        pad_value = tokenizer.pad_token_id

    overflow = example["overflowing_tokens"][:max_over_length]
    example["overflowing_tokens"] = overflow + [pad_value] * (max_over_length - len(overflow))
    return example

# Normalize the overflowing-token lists of the test split, one example at a time.
preprocessed_splits["test"] = preprocessed_splits["test"].map(
    preprocess_function_truncate,
    batched=False,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/134 [00:00<?, ? examples/s]

In [None]:
# Drop tokenizer bookkeeping columns from the test split when they are present.
# This replaces two bare `except: pass` blocks, which also hid unrelated errors
# (including KeyboardInterrupt); the cleanup stays best-effort for columns or a
# split that do not exist.
if "test" in preprocessed_splits:
    _unused_columns = [
        column
        for column in ("overflowing_tokens_ids", "num_truncated_tokens")
        if column in preprocessed_splits["test"].column_names
    ]
    if _unused_columns:
        preprocessed_splits["test"] = preprocessed_splits["test"].remove_columns(_unused_columns)

In [None]:
# Persist all splits, including the processed test split.
# NOTE(review): the directory name mentions "gpt2-256", while the tokenizer loaded in
# this notebook is bert-base-uncased with model_max_length set to 128 -- confirm the
# intended name before relying on it.
preprocessed_splits.save_to_disk("../wikitext-103-preprocessed-ws-notext-gpt2-256-wtest")

Flattening the indices:   0%|          | 0/422993 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/422993 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/47000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/47000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/134 [00:00<?, ? examples/s]