In [1]:
# Copied from: https://github.com/timinar/BabyLlama

In [2]:
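# Download the two datasets: https://osf.io/5mk3x, https://osf.io/m48ed
# Extract both archives into the `data` folder under the current directory.
# The resulting `data` directory layout is:
# data/
#   |--train_10M/
#   |--dev/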

# Basic data cleaning and tokenization

## Cleaning

Some simple, regex-based cleaning is performed on train and dev datasets, e.g. to remove HTML tags from Wikipedia articles, non-verbal cues from subtitles, or even to correct I’s that were incorrectly recognized as l’s in OCR’ed uppercase text.

In [3]:
from pathlib import Path
from mrclean import *
import os

In [4]:
# Folder that contains the raw split sub-folders listed in DATA_SPLITS.
DATA_ROOT = Path("./data")
# Names of the sub-folders of DATA_ROOT that get cleaned.
DATA_SPLITS = ['train_10M', 'dev']
# Legacy parameter: it is still passed to every cleanup function,
# but it does not affect cleaning.
SEQ_LENGTH = 128

# Dispatch table: dataset name (the file stem, e.g. 'gutenberg' for
# 'gutenberg.train') -> cleanup function. The functions are expected to come
# from the `from mrclean import *` line in the imports cell.
# NOTE(review): 'childes' is routed to cleanup_children_stories rather than
# cleanup_aochildes - confirm this mapping is intended.
CLEANUP_FUNCTIONS = dict(
    aochildes=cleanup_aochildes,
    bnc_spoken=cleanup_bnc_spoken,
    cbt=cleanup_cbt,
    childes=cleanup_children_stories,
    gutenberg=cleanup_gutenberg,
    open_subtitles=cleanup_open_subtitles,
    qed=cleanup_qed,
    simple_wiki=cleanup_simple_wikipedia,
    switchboard=cleanup_switchboard,
    wikipedia=cleanup_wikipedia,
)


In [5]:
# Clean every raw split and write the result to a sibling "<split>_clean" folder.
for split in DATA_SPLITS:
    INPUT_DIR = DATA_ROOT / split
    OUTPUT_DIR = DATA_ROOT / f'{split}_clean'

    # Fail early, naming the offending path, before any output folder is created.
    if not INPUT_DIR.is_dir():
        raise FileNotFoundError(f"Input directory not found: {INPUT_DIR}")

    # Despite the name, this holds the files of the *current* split
    # ('.train' or '.dev'). Sorted so the processing and log order does not
    # depend on the filesystem's directory listing order.
    train_files = sorted(
        f for f in INPUT_DIR.iterdir()
        if f.is_file() and f.suffix in ['.train', '.dev']
    )

    # Validate up front so an unexpected dataset does not abort the run
    # halfway through a split with a context-free KeyError.
    unknown = [f.name for f in train_files if f.stem not in CLEANUP_FUNCTIONS]
    if unknown:
        raise KeyError(
            f"No cleanup function registered for {unknown} in {INPUT_DIR}; "
            f"known datasets: {sorted(CLEANUP_FUNCTIONS)}"
        )

    OUTPUT_DIR.mkdir(exist_ok=True)

    for file in train_files:
        text = file.read_text(encoding='utf-8')
        # The file stem (e.g. 'gutenberg') selects the dataset-specific cleaner.
        cleaned_text = CLEANUP_FUNCTIONS[file.stem](text, SEQ_LENGTH)
        (OUTPUT_DIR / file.name).write_text(cleaned_text, encoding='utf-8')
        print(f"🧹 Cleaned '{file.name}' (size {len(text)} -> {len(cleaned_text)}) in {split}")


🧹 Cleaned 'childes.train' (size 15482927 -> 15482927) in train_10M
🧹 Cleaned 'simple_wiki.train' (size 8411630 -> 8387062) in train_10M
🧹 Cleaned 'bnc_spoken.train' (size 4883879 -> 4851676) in train_10M
🧹 Cleaned 'gutenberg.train' (size 13910986 -> 13910986) in train_10M
🧹 Cleaned 'switchboard.train' (size 719322 -> 719322) in train_10M
🧹 Cleaned 'open_subtitles.train' (size 10806305 -> 10804026) in train_10M
🧹 Cleaned 'switchboard.dev' (size 724013 -> 724013) in dev
🧹 Cleaned 'simple_wiki.dev' (size 8149513 -> 8128239) in dev
🧹 Cleaned 'gutenberg.dev' (size 15490473 -> 15490473) in dev
🧹 Cleaned 'bnc_spoken.dev' (size 6538139 -> 6503778) in dev
🧹 Cleaned 'open_subtitles.dev' (size 11016133 -> 11014854) in dev
🧹 Cleaned 'childes.dev' (size 14638378 -> 14638378) in dev


## Training a tokenizer

In [6]:
from pathlib import Path
from tokenizers import (Tokenizer, decoders, models, pre_tokenizers,
                        processors, trainers)
from tokenizers.normalizers import NFKC

In [7]:
# We train the tokenizer on the train data only
data_dir = Path("./data/train_10M_clean/")

# Collect the cleaned '.train' files. The suffix filter already excludes
# stray files such as '.DS_Store'. Sorting makes the list handed to the
# trainer deterministic, since glob order depends on the filesystem.
paths = sorted(
    str(f) for f in data_dir.glob("*")
    if f.is_file() and f.suffix == ".train"
)

print(len(paths))
assert len(paths) > 0, 'No data files found'

6


In [8]:
# Build an untrained BPE tokenizer and attach its processing stages.
# The stages are assigned in the order text flows through them; each
# assignment is independent of the others.
tokenizer = Tokenizer(models.BPE())

# 1. Unicode NFKC normalization of the raw text.
tokenizer.normalizer = NFKC()
# 2. Byte-level pre-tokenization (add_prefix_space=True).
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
# 3. Byte-level post-processing of the encoded output (trim_offsets=True).
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
# 4. Byte-level decoder used when turning ids back into text.
tokenizer.decoder = decoders.ByteLevel()

In [9]:
# Configure the BPE trainer and fit the tokenizer on the files listed in `paths`.
# NOTE(review): no `initial_alphabet` is passed although the pre-tokenizer is
# byte-level - confirm that bytes unseen in the training data need no coverage.
trainer = trainers.BpeTrainer(
    vocab_size=16000,
    min_frequency=2,
    special_tokens=["<pad>", "<s>", "</s>"],
)
tokenizer.train(files=paths, trainer=trainer)






In [10]:
# Where the trained tokenizer is serialized. Kept as a plain string because
# it is passed on as a string when the tokenizer is reloaded.
tokenizer_path = "./models/gpt-clean-16000.json"

# Create the target folder derived from the path itself, so changing
# `tokenizer_path` cannot leave the save pointing at a missing directory.
# pathlib is used because this section's import cell provides `Path`.
Path(tokenizer_path).parent.mkdir(parents=True, exist_ok=True)
tokenizer.save(tokenizer_path, pretty=True)

## Testing the tokenizer

In [11]:

# Reload the tokenizer from the file written above and round-trip a sample
# sentence through encode/decode to eyeball the result.
tokenizer = Tokenizer.from_file(str(tokenizer_path))

# Alternative sample containing non-ASCII characters; swap it in to inspect
# how such text is tokenized:
# text = 'Shiro Okada (岡田志郎, "Okada Shirō", June 9, 1949; Hirakata, Osaka {age 71} - ) is a Japanese guitarist who participate in the Group Sound band, the Ox. His nickname was Shiro (シロー) and his real name is Shiro Okamoto (岡田史郎).'
text = "The quick brown fox jumps over the lazy dog."

# Run both directions first, then report.
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded.ids)

print(f"Encoded String: {encoded.tokens}")
print(f"Encoded IDs: {encoded.ids}")
print(f"Decoded String: {decoded}")


Encoded String: ['ĠThe', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
Encoded IDs: [300, 1782, 3264, 5710, 15959, 539, 188, 11551, 1467, 16]
Decoded String:  The quick brown fox jumps over the lazy dog.
