In [None]:
import gzip

with gzip.open("enwiki-latest-all-titles-in-ns0.gz", "rt", encoding="utf-8") as f:
    text = f.read()

with open("wiki_data.txt", "w", encoding="utf-8") as f:
    f.write(text)

BadGzipFile: Not a gzipped file (b'BZ')

In [5]:
import re

def clean_text(text):
    # 去掉 Wiki 标记，如 [1], [2], ...
    text = re.sub(r"\[\d+\]", "", text)
    # 去掉特殊字符
    text = re.sub(r"[^\w\s]", "", text)
    # 统一小写
    text = text.lower()
    return text

with open("wiki_data.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

cleaned_text = clean_text(raw_text)

with open("wiki_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print("清理后的数据大小:", len(cleaned_text) / (1024 * 1024), "MB")


清理后的数据大小: 368.9857807159424 MB


In [None]:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="wiki_cleaned.txt", 
    model_prefix="wiki_bpe",
    vocab_size=50000,  # 词表大小
    character_coverage=1.0,  # 适用于纯英文
    model_type="bpe"  # 训练 BPE 模型
)

# 加载训练好的 BPE Tokenizer
sp = spm.SentencePieceProcessor(model_file="wiki_bpe.model")

# 测试编码 & 解码
encoded = sp.encode("Artificial intelligence is evolving rapidly.", out_type=str)
print("编码:", encoded)
decoded = sp.decode(encoded)
print("解码:", decoded)

In [None]:
sp = spm.SentencePieceProcessor(model_file="wiki_bpe.model")

# 测试分词
sentence = "Machine learning is amazing!"
tokens = sp.encode(sentence, out_type=str)
print("Tokenized:", tokens)

# 反向转换
original_text = sp.decode(tokens)
print("Decoded:", original_text)
