In [1]:
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer

# Load the WikiText-103 training split.
# Prefer the local copy on disk (fast). If it is missing, download the split
# once and save it to the same location so that later runs load it from disk.
RAW_TRAIN_DIR = "../wikitext-103-raw-train"

try:
    dataset = load_from_disk(RAW_TRAIN_DIR)
except FileNotFoundError:
    # No saved copy at RAW_TRAIN_DIR yet: fetch the split and cache it.
    dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
    dataset.save_to_disk(RAW_TRAIN_DIR)

In [None]:
# Save a local copy of the dataset so that the next load is fast
# dataset.save_to_disk("wikitext-103-raw-train")

In [2]:
from transformers import GPT2Tokenizer
# Load the pretrained GPT-2 tokenizer (swap in your own model name here).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
print(tokenizer.vocab_size)

# Shrink the maximum input length (adjust to your own model's input length).
print("Tokenizer max length: ", tokenizer.model_max_length)
tokenizer.model_max_length = 256
print("Tokenizer max length after change: ", tokenizer.model_max_length)

# Padding to max_length needs a pad token; the GPT-2 tokenizer does not define
# one, so reuse the end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Save the configured tokenizer so it can be reloaded directly next time.
tokenizer_save_dir = f"./tokenizer_save/tokenizer-{tokenizer.name_or_path}-{tokenizer.model_max_length}"
tokenizer.save_pretrained(tokenizer_save_dir)

Using pad_token, but it is not set yet.


50257
Tokenizer max length:  1024
Tokenizer max length after change:  256


('./tokenizer_save/tokenizer-gpt2-256/tokenizer_config.json',
 './tokenizer_save/tokenizer-gpt2-256/special_tokens_map.json',
 './tokenizer_save/tokenizer-gpt2-256/vocab.json',
 './tokenizer_save/tokenizer-gpt2-256/merges.txt',
 './tokenizer_save/tokenizer-gpt2-256/added_tokens.json')

In [3]:
# make your own filter function
def filter_function(example, min_words=100):
    """Decide whether a row has enough text to be kept.

    Args:
        example: A single dataset row; its ``"text"`` value is split on
            whitespace to count words.
        min_words: Minimum number of whitespace-separated words a row needs
            in order to be kept. Defaults to 100, the threshold that was
            previously hard-coded.

    Returns:
        True if the text contains at least ``min_words`` words, else False.
    """
    return len(example["text"].split()) >= min_words

# Drop the rows rejected by filter_function, using 4 worker processes.
dataset_without_short = dataset.filter(
    filter_function,
    num_proc=4,
)

Filter:   0%|          | 0/1801350 [00:00<?, ? examples/s]

In [4]:
# Tokenize and encode text to a fixed length
def preprocess_function(examples):
    """Tokenize ``examples["text"]`` with the notebook's ``tokenizer``.

    Sequences are padded to, and truncated at, ``tokenizer.model_max_length``.

    Args:
        examples: Mapping with a ``"text"`` entry to tokenize.

    Returns:
        The tokenizer's output for the given text.
    """
    fixed_length = tokenizer.model_max_length
    return tokenizer(
        examples["text"],
        max_length=fixed_length,
        truncation=True,
        padding="max_length",
    )
    

# Apply preprocessing to dataset
preprocessed_dataset_without_short = dataset_without_short.map(preprocess_function, batched=True, num_proc=4)

# Split preprocessed dataset into train, validation, and test sets.
# A fixed seed makes the shuffled 90/10 train/validation split reproducible
# across kernel restarts; without it every run produces a different split.
SPLIT_SEED = 42
splits = preprocessed_dataset_without_short.train_test_split(test_size=0.1, shuffle=True, seed=SPLIT_SEED)
preprocessed_splits = DatasetDict({
    "train": splits["train"],
    # The held-out 10% of the training data serves as the validation set.
    "validation": splits["test"],
    # The official WikiText-103 test split, still raw (untokenized) at this point.
    "test": load_dataset("wikitext", "wikitext-103-raw-v1", split="test")
})

Map (num_proc=4):   0%|          | 0/469993 [00:00<?, ? examples/s]

Found cached dataset wikitext (/home/songx_lab/cse12012530/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


### Next, we construct our own WikiText test data
The main difference from the train and validation splits is that those are only ever scored with cross-entropy loss, so the actual text does not matter there; the test split, however, must also keep the overflowing tokens (the tokens cut off by truncation) for GPT-2.
 

In [5]:
def preprocess_function(example):
    """Tokenize ``example["text"]`` and also return the overflowing tokens.

    Same fixed-length padding/truncation as for the training data, but with
    ``return_overflowing_tokens=True`` so the tokenizer output also includes
    the overflowing tokens. The raw text is not carried along from here on.

    Args:
        example: Mapping with a ``"text"`` entry to tokenize.

    Returns:
        The tokenizer's output for the given text.
    """
    # Note: decoded versions (e.g. tokenizer.decode on "overflowing_tokens" or
    # "input_ids") could be attached here if the text itself is needed later.
    encoded = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=tokenizer.model_max_length,
        return_overflowing_tokens=True,
    )
    return encoded
# Tokenize the test split in batches and drop its raw "text" column.
raw_test_split = preprocessed_splits["test"]
preprocessed_splits["test"] = raw_test_split.map(
    preprocess_function,
    batched=True,
    remove_columns=["text"],
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

In [6]:

def filter_function(example):
    """Keep rows whose overflow length is strictly between 20 and 300 tokens.

    Args:
        example: A single dataset row with an ``"overflowing_tokens"`` sequence.

    Returns:
        True if ``20 < len(example["overflowing_tokens"]) < 300``, else False.
    """
    overflow_count = len(example['overflowing_tokens'])
    return 20 < overflow_count < 300

# Keep only the test rows accepted by filter_function (4 worker processes).
tokenized_test_split = preprocessed_splits["test"]
preprocessed_splits["test"] = tokenized_test_split.filter(
    filter_function,
    num_proc=4,
)

Filter (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

In [7]:
# Report how many test rows survived the overflow-length filter.
filtered_test_split = preprocessed_splits["test"]
print(len(filtered_test_split))

134


In [8]:
def preprocess_function_truncate(example, max_over_length=128):
    """Force ``example["overflowing_tokens"]`` to exactly ``max_over_length`` items.

    Longer sequences are cut to their first ``max_over_length`` tokens; shorter
    ones are right-padded with ``tokenizer.eos_token_id``.

    Args:
        example: A single dataset row with an ``"overflowing_tokens"`` list.
        max_over_length: Target length of the overflow sequence.

    Returns:
        The same ``example`` mapping with ``"overflowing_tokens"`` replaced.
    """
    pad_id = tokenizer.eos_token_id
    # Slicing is a no-op for lists that are already short enough.
    fixed_tokens = example["overflowing_tokens"][:max_over_length]
    # A non-positive multiplier yields an empty list, so nothing is appended
    # when the sequence was truncated or already has the target length.
    missing = max_over_length - len(fixed_tokens)
    example["overflowing_tokens"] = fixed_tokens + [pad_id] * missing
    return example

# Normalize the overflow length of every test row, one row at a time.
filtered_test_rows = preprocessed_splits["test"]
preprocessed_splits["test"] = filtered_test_rows.map(
    preprocess_function_truncate,
    batched=False,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/134 [00:00<?, ? examples/s]

In [9]:
# Drop helper columns from the test split when they are present.
# Checking column_names first replaces the former bare `except: pass` blocks,
# which silently swallowed every error (not just "column does not exist").
unwanted_columns = ["overflowing_tokens_ids", "num_truncated_tokens"]
present_columns = [
    column
    for column in unwanted_columns
    if column in preprocessed_splits["test"].column_names
]
if present_columns:
    preprocessed_splits["test"] = preprocessed_splits["test"].remove_columns(present_columns)

In [10]:
# Write the train / validation / test splits to a single directory on disk.
output_dir = "../wikitext-103-preprocessed-ws-notext-gpt2-256-wtest"
preprocessed_splits.save_to_disk(output_dir)

Flattening the indices:   0%|          | 0/422993 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/422993 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/47000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/47000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/134 [00:00<?, ? examples/s]