# 预训练数据

以 GPT2 进行讲解：

1. 预训练数据来源和清洗
2. 预训练数据示例
3. 预训练数据封装

## 1. 预训练数据来源

Common Crawl 是常见的网络爬虫语料数据，OpenAI 爬取高质量数据，形成数据集 **WebText**

1. Reddit 少于 3 个 Karma 的帖子
2. 启发式的连接感兴趣、有教育意义和仅仅只是娱乐的特子
3. 45 million links
4. WebText 子集 Dragnet, Newspaper
5. 截止2017, 收集 8 million documents 再过滤或去重 得到 40GB 数据
6. 去除 Wiki 数据（可能wiki将作为 evaluate 数据集测评 PPL）

数据集目录：`https://github.com/openai/gpt-2-output-dataset`

演示数据集目录：`https://openaipublic.blob.core.windows.net/gpt-2/output-dataset/v1/small-117M.test.jsonl`

In [1]:
! mkdir output
! wget -O ./output/data.jsonl https://openaipublic.blob.core.windows.net/gpt-2/output-dataset/v1/small-117M.test.jsonl ./output/data.jsonl

mkdir: output: File exists
--2025-10-11 17:29:02--  https://openaipublic.blob.core.windows.net/gpt-2/output-dataset/v1/small-117M.test.jsonl
Resolving openaipublic.blob.core.windows.net (openaipublic.blob.core.windows.net)... 20.60.244.1
connected. to openaipublic.blob.core.windows.net (openaipublic.blob.core.windows.net)|20.60.244.1|:443... 
HTTP request sent, awaiting response... 200 OK
Length: 15582673 (15M) [application/octet-stream]
Saving to: ‘./output/data.jsonl’


2025-10-11 17:34:27 (47.0 KB/s) - ‘./output/data.jsonl’ saved [15582673/15582673]

Prepended http:// to './output/data.jsonl'
--2025-10-11 17:34:27--  http://./output/data.jsonl
Resolving . (.)... failed: nodename nor servname provided, or not known.
wget: unable to resolve host address ‘.’
FINISHED --2025-10-11 17:34:27--
Total wall clock time: 5m 26s
Downloaded: 1 files, 15M in 5m 24s (47.0 KB/s)


In [2]:
import json

def load_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line)['text'])
    return data

# 使用示例
data = load_jsonl('./output/data.jsonl')

In [3]:
print(len(data))
print(data[0])

5000
The Technology Report empowers or enlightens. This column does not necessarily reflect the opinion of the company that issued the report.


# 数据拼接

In [4]:
data_total = '\n'.join(data)
print(len(data_total))

15075691


## tokenizer

In [5]:
class SimplestTokenizer:
    def __init__(self, text):
        tokens = list(text)
        self.vocab = {}
        self.vocab_reverse = {}
        idx = 0
        for i in tokens:
            if i not in self.vocab:
                self.vocab[i] = idx
                self.vocab_reverse[idx] = i
                idx += 1
    def encode(self, text, return_pt = False):
        tokens = list(text)
        token_ids = [ self.vocab[token] for token in tokens]
        if return_pt:
            token_ids = torch.tensor(token_ids, dtype = torch.long).unsqueeze(0)
        return token_ids
    def decode(self, ids):
        token_list = [self.vocab_reverse[idx] for idx in ids]
        text = ''.join(token_list)
        return text
        

print(len(tokenizer.vocab))
tokenizer = SimplestTokenizer(data_total)
print(tokenizer.encode('hello world'))
my_data = tokenizer.encode(data_total)

NameError: name 'tokenizer' is not defined

## 数据集封装

In [None]:
data_block_size = 512
data_block = [  my_data[i: i+data_block_size ]  for i in range(0, len(my_data), data_block_size)]

print(len(data_block))
print(len(data_block[-1])) # 最后一个数据舍去或填充成 data_block_size 
print(data_block[0])

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class PretrainedLanguageModelDataset(Dataset):
    def __init__(self, data, max_len = 512, pad_token_id = 0):
        self.data = data
        self.max_len = 512
        self.pad_token_id = 0
        # self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        if len(item) < self.max_len:
            item = item + (self.max_len-len(item)) * [self.pad_token_id]
        return torch.tensor(item, dtype = torch.long)
        
dataset = PretrainedLanguageModelDataset(data_block)
dataloader = DataLoader(dataset, 
                        batch_size=2, 
                        shuffle=False, 
                        collate_fn = None) # dataset getitem 根据 index 取数据，批量数据建议用 collate_fn 进行封装

for i, batch in enumerate(dataloader):
    print(batch)
    break

## 批量数据加载

In [None]:
from typing import Dict
class PaddingCollateFunction:
    def __init__(self, pad_token_id: int):
        self.pad_token_id = pad_token_id

    def __call__(self, batch) -> Dict:
        batch = paddding_collate_fn(batch, self.pad_token_id)
        return batch


def paddding_collate_fn(batch_data, pad_token_id=None):

    input_lens = []
    label_lens = []
    bs = len(batch_data)
    for data in batch_data:
        input_lens.append(data.shape[0])
    max_input_len = torch.max(torch.tensor(input_lens, dtype=torch.long))
    input_ids = torch.ones(
        bs, max_input_len, dtype=torch.long) * pad_token_id
    attention_masks = torch.zeros(bs, max_input_len, dtype=torch.long)
    labels = torch.ones(bs, max_input_len, dtype=torch.long) * pad_token_id

    

    for i in range(bs):
        input_ids[i, :input_lens[i]] = batch_data[i][:input_lens[i]]
        attention_masks[i, :input_lens[i]] = 1

    labels[:, 0:max_input_len-1] = input_ids[:, 1:max_input_len]


    return {
        'input_ids': input_ids,
        'attention_masks': attention_masks,
        'labels': labels,
    }


collate_fn = PaddingCollateFunction(pad_token_id=0)
dataloader = DataLoader(dataset, 
                    batch_size=2, 
                    shuffle=False, 
                    collate_fn = collate_fn) 

for i, batch in enumerate(dataloader):
    print(batch)

    # training

    # loss

    # evaluation
    
    break  