In [2]:
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer

# Load the WikiText-103 training split into `dataset`, which later cells read
# (the raw-row preview and the tokenizing `dataset.map(...)` call).
# A local on-disk copy is preferred because it loads much faster; when it does
# not exist yet, the split is downloaded once and cached at the same path.
RAW_TRAIN_PATH = "../wikitext-103-raw-train"

try:
    dataset = load_from_disk(RAW_TRAIN_PATH)
except FileNotFoundError:
    # load_from_disk raises FileNotFoundError when the directory is missing
    # or is not a saved dataset.
    dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
    dataset.save_to_disk(RAW_TRAIN_PATH)

In [3]:
# Reload the splits that a previous run of this notebook saved to this path
# with save_to_disk, so the preprocessing cells do not have to be re-run.
preprocessed_splits = load_from_disk("../wikitext-103-bert-512-without-test")

In [4]:
# Show the column names and row count of the training split.
print(preprocessed_splits["train"])

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 217197
})


In [None]:
# Save the raw dataset to disk so the next run can load it quickly
# dataset.save_to_disk("wikitext-103-raw-train")

In [1]:
from transformers import BertTokenizerFast
# Load the pretrained uncased BERT fast tokenizer and report its vocabulary size.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
print(tokenizer.vocab_size)

# Pin the maximum sequence length; change 512 to your own model's max input length.
print("Tokenizer max length: ", tokenizer.model_max_length)
tokenizer.model_max_length = 512
print("Tokenizer max length after change: ", tokenizer.model_max_length)

# Note: this BERT tokenizer already defines a [PAD] token for padding up to
# max_length; some tokenizers (e.g. GPT-2's) do not and would need one added.

# Save the configured tokenizer so it can be reloaded directly next time.
# The cell output is the tuple of files written by save_pretrained.
tokenizer.save_pretrained(
    f"../tokenizer_save/tokenizer-{tokenizer.name_or_path}-{tokenizer.model_max_length}"
)

30522
Tokenizer max length:  512
Tokenizer max length after change:  512


('../tokenizer_save/tokenizer-bert-base-uncased-512/tokenizer_config.json',
 '../tokenizer_save/tokenizer-bert-base-uncased-512/special_tokens_map.json',
 '../tokenizer_save/tokenizer-bert-base-uncased-512/vocab.txt',
 '../tokenizer_save/tokenizer-bert-base-uncased-512/added_tokens.json',
 '../tokenizer_save/tokenizer-bert-base-uncased-512/tokenizer.json')

In [None]:
# Show the tokenizer's configuration (max length, padding side, special tokens).
print(tokenizer)

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [None]:
# Vocabulary id of the tokenizer's mask token.
print(tokenizer.mask_token_id)

103


In [3]:
# Peek at the first ten raw records to see what the text column looks like.
for row_idx in range(10):
    raw_row = dataset[row_idx]
    print(raw_row)

{'text': ''}
{'text': ' = Valkyria Chronicles III = \n'}
{'text': ''}
{'text': ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n'}
{'text': " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adju

In [4]:
import multiprocessing

# Number of worker processes passed to the dataset.map calls: one per CPU
# reported by multiprocessing.
num_proc = multiprocessing.cpu_count()
# Report the tokenizer's configured maximum sequence length.
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

def group_texts(examples):
    """Tokenize a batch of raw text.

    Despite its name, this version only tokenizes; it does not group or chunk.
    No truncation or padding arguments are passed to the tokenizer here.

    Args:
        examples: batch with a "text" entry holding the strings to tokenize.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for that text.
    """
    return tokenizer(examples["text"])

# Tokenize every raw record in parallel and drop the original "text" column.
tokenized_datasets = dataset.map(
    group_texts,
    batched=True,
    remove_columns=["text"],
    num_proc=num_proc,
)
# Display the resulting column schema as the cell output.
tokenized_datasets.features



The max length for the tokenizer is: 512


Map (num_proc=16):   0%|          | 0/1801350 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (606 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (667 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (627 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [6]:
# Peek at the first ten tokenized records (one per original text row).
for row_idx in range(10):
    tokenized_row = tokenized_datasets[row_idx]
    print(tokenized_row)

{'input_ids': [101, 102], 'token_type_ids': [0, 0], 'attention_mask': [1, 1]}
{'input_ids': [101, 1027, 11748, 4801, 4360, 11906, 3523, 1027, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 102], 'token_type_ids': [0, 0], 'attention_mask': [1, 1]}
{'input_ids': [101, 12411, 5558, 2053, 11748, 4801, 4360, 1017, 1024, 4895, 2890, 27108, 5732, 11906, 1006, 2887, 1024, 1856, 1806, 1671, 30222, 30218, 30259, 30227, 30255, 30258, 30219, 2509, 1010, 5507, 1012, 11748, 4801, 4360, 1997, 1996, 11686, 1017, 1007, 1010, 4141, 3615, 2000, 2004, 11748, 4801, 4360, 11906, 3523, 2648, 2900, 1010, 2003, 1037, 8608, 2535, 1030, 1011, 1030, 2652, 2678, 2208, 2764, 2011, 16562, 1998, 2865, 1012, 4432, 2005, 1996, 9160, 12109, 1012, 2207, 1999, 2254, 2249, 1999, 2900, 1010, 2009, 2003, 1996, 2353, 2208, 1999, 1996, 11748, 4801, 4360, 2186, 1012, 15440, 1996, 2168, 10077, 1997, 8608, 1998, 2613, 1030, 1011, 1030, 2051, 11247, 2004, 204

In [7]:
from itertools import chain

# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples, max_length=None):
    """Concatenate a batch of token lists and cut it into fixed-size chunks.

    Every column of the batch is flattened into one long list and then split
    into consecutive chunks of exactly ``max_length`` items. The trailing
    remainder is dropped, so a batch holding fewer than ``max_length`` items
    yields no chunks at all. This guarantees every produced row has the same
    length, which the token-count print after the map call relies on.

    Args:
        examples: mapping of column name -> list of per-example lists. All
            columns are expected to have the same total length; the first
            column's length is used for every column.
        max_length: chunk size. Defaults to ``tokenizer.model_max_length``
            from the notebook-level tokenizer.

    Returns:
        dict with the same keys as ``examples``; each value is a list of
        chunks, each of length ``max_length``.

    Raises:
        ValueError: if ``max_length`` is not a positive integer.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    if not examples:
        return {}

    # Concatenate all texts, column by column.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(next(iter(concatenated_examples.values())))

    # Drop the remainder unconditionally. Guarding this with
    # `total_length >= max_length` would leave one short chunk behind.
    total_length = (total_length // max_length) * max_length

    # Split by chunks of max_length.
    return {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }

# Regroup the tokenized rows into chunks via group_texts, in parallel.
tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=num_proc,
)
# Shuffle the chunks with a fixed seed.
tokenized_datasets = tokenized_datasets.shuffle(seed=34)

# Token count estimate: number of rows times the tokenizer's max length.
# A recorded run of this cell reported 117058048 tokens.
print(f"the dataset contains in total {len(tokenized_datasets)*tokenizer.model_max_length} tokens")


Map (num_proc=16):   0%|          | 0/1801350 [00:00<?, ? examples/s]

the dataset contains in total 117058048 tokens


In [8]:
# Inspect the first ten shuffled chunks, each followed by a separator line.
# To read a chunk as text instead, print tokenizer.decode(chunk["input_ids"]).
for chunk_idx in range(10):
    chunk = tokenized_datasets[chunk_idx]
    print(chunk)
    print("---------------------------------------------------------")

{'input_ids': [102, 101, 2096, 8026, 14887, 1005, 1055, 2303, 2001, 2579, 2011, 1057, 1012, 1055, 1012, 2749, 1010, 1996, 4230, 1997, 1996, 2176, 2500, 2730, 1999, 1996, 8118, 2020, 2187, 2369, 2012, 1996, 7328, 1998, 2101, 2579, 2046, 9889, 9968, 1012, 102, 101, 102, 101, 1027, 1027, 1027, 1027, 10236, 1030, 1011, 1030, 2039, 1027, 1027, 1027, 1027, 102, 101, 102, 101, 1996, 8118, 2001, 3832, 2000, 2202, 2871, 2781, 1012, 2035, 2409, 1010, 1996, 2051, 2090, 1996, 2136, 1005, 1055, 4443, 1999, 1998, 6164, 2013, 1996, 7328, 2001, 4229, 2781, 1012, 2429, 2000, 1996, 3378, 2811, 1010, 1996, 6101, 2001, 2949, 1999, 1996, 2034, 2321, 2781, 1012, 102, 101, 2051, 1999, 1996, 7328, 2001, 2985, 4288, 12534, 1010, 1000, 3048, 5362, 2083, 1996, 7328, 1010, 2282, 2000, 2282, 1010, 2723, 2000, 2723, 1000, 12329, 1996, 2308, 1998, 2336, 1010, 8430, 1000, 4255, 2358, 11823, 2229, 1998, 19820, 5555, 6155, 1000, 2164, 1037, 6270, 2341, 1010, 1998, 6575, 1996, 7328, 2005, 2592, 1012, 1057, 1012, 1055, 1

In [3]:
# from datasets import load_dataset, DatasetDict, load_from_disk
# from transformers import AutoTokenizer

# tokenized_datasets = load_from_disk("wikitext-103-raw-train-bert-half-processed")

In [10]:
# Hold out 5% of the chunks. The seed is fixed (same value as the shuffle
# above) so the train/held-out membership is reproducible across runs.
splits = tokenized_datasets.train_test_split(test_size=0.05, shuffle=True, seed=34)

In [11]:
# Bundle the two partitions under their final names: the held-out "test"
# partition from train_test_split is stored as "validation".
preprocessed_splits = DatasetDict(
    {
        "train": splits["train"],
        "validation": splits["test"],
        # No "test" entry is added here; the raw test split would come from:
        # "test": load_dataset("wikitext", "wikitext-103-raw-v1", split="test")
    }
)

In [12]:
# Persist the train/validation DatasetDict so later runs can reload it with
# load_from_disk instead of repeating the preprocessing.
preprocessed_splits.save_to_disk("../wikitext-103-bert-512-without-test")

Flattening the indices:   0%|          | 0/217197 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/217197 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/11432 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11432 [00:00<?, ? examples/s]

### Then we can construct our own WikiText test data
The biggest difference is that the train and validation splits are only ever scored with cross-entropy loss and do not need the real text, whereas the test split should keep the overflowing tokens (the tokens beyond the maximum input length) for GPT-2.
 

In [None]:
def preprocess_function(example):
    """Tokenize a batch of text to a fixed length and request the overflow.

    The text is padded to and truncated at ``tokenizer.model_max_length``,
    and the tokenizer is asked to return the overflowing tokens as well.
    The raw text itself is not carried along in the result.

    Args:
        example: batch with a "text" entry.

    Returns:
        The tokenizer's output for that text.
    """
    encoded = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=tokenizer.model_max_length,
        return_overflowing_tokens=True,
    )
    # An earlier draft also decoded the overflow and the prompt back into
    # "overflow_text" / "prompt" string columns; that code was commented out.
    return encoded
# Apply preprocessing to the test split.
# The DatasetDict built earlier in this notebook only holds "train" and
# "validation", so load the raw WikiText-103 test split first when it is
# missing; otherwise the lookup below raises KeyError.
if "test" not in preprocessed_splits:
    preprocessed_splits["test"] = load_dataset("wikitext", "wikitext-103-raw-v1", split="test")

preprocessed_splits["test"] = preprocessed_splits["test"].map(
    preprocess_function,
    batched=True,
    remove_columns=["text"],
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

In [None]:

def filter_function(example):
    """Return True when the example has more than 20 and fewer than 300 overflowing tokens."""
    overflow_count = len(example['overflowing_tokens'])
    return 20 < overflow_count < 300

# Keep only the test examples accepted by filter_function (4 worker processes).
preprocessed_splits["test"] = preprocessed_splits["test"].filter(filter_function,num_proc=4)

Filter (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

In [None]:
# Number of examples currently in the test split.
print(len(preprocessed_splits["test"]))

134


In [None]:
def preprocess_function_truncate(example, max_over_length=128, pad_value=None):
    """Force ``example["overflowing_tokens"]`` to exactly ``max_over_length`` items.

    Longer lists are cut to their first ``max_over_length`` items; shorter
    lists are right-padded with ``pad_value``.

    Args:
        example: single example (not a batch) with an "overflowing_tokens" list.
            It is updated in place and also returned.
        max_over_length: target length of the overflow list.
        pad_value: token id used for padding. When None, the notebook-level
            tokenizer's ``eos_token_id`` is used, falling back to its
            ``pad_token_id``. The fallback matters for tokenizers with no EOS
            token, where padding with ``eos_token_id`` would insert None.

    Returns:
        The same ``example`` with the fixed-length overflow list.

    Raises:
        ValueError: if no pad value is given and the tokenizer defines
            neither an EOS nor a PAD token id.
    """
    if pad_value is None:
        pad_value = tokenizer.eos_token_id
        if pad_value is None:
            pad_value = tokenizer.pad_token_id
        if pad_value is None:
            raise ValueError(
                "tokenizer defines neither eos_token_id nor pad_token_id; pass pad_value explicitly"
            )

    # Slicing handles the "too long" case; the padding count is zero when the
    # list is already at the target length.
    overflow = list(example["overflowing_tokens"][:max_over_length])
    overflow += [pad_value] * (max_over_length - len(overflow))
    example["overflowing_tokens"] = overflow
    return example

# Apply the truncate/pad step one example at a time (batched=False) with
# 4 worker processes.
preprocessed_splits["test"] = preprocessed_splits["test"].map(preprocess_function_truncate, batched=False, num_proc=4)

Map (num_proc=4):   0%|          | 0/134 [00:00<?, ? examples/s]

In [None]:
# Drop helper columns from the test split if they are present.
# Checking column_names first replaces the bare `except: pass`, which would
# also hide unrelated errors.
unused_columns = [
    column
    for column in ("overflowing_tokens_ids", "num_truncated_tokens")
    if column in preprocessed_splits["test"].column_names
]
if unused_columns:
    preprocessed_splits["test"] = preprocessed_splits["test"].remove_columns(unused_columns)

In [None]:
# Save every split, including the processed test split, to disk.
# NOTE(review): the directory name says "gpt2-256", but the tokenizer loaded in
# this notebook is bert-base-uncased with max length 512 — confirm the path.
preprocessed_splits.save_to_disk("../wikitext-103-preprocessed-ws-notext-gpt2-256-wtest")

Flattening the indices:   0%|          | 0/422993 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/422993 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/47000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/47000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/134 [00:00<?, ? examples/s]