In [15]:
from datasets import load_dataset

# Load Wikipedia dataset
wiki_dataset = load_dataset(
    "wikimedia/wikipedia", 
    "20231101.en",
    streaming=True
)


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [16]:
first_example = next(iter(wiki_dataset["train"]))
print(f"Keys: {first_example.keys()}")
print(first_example)

Keys: dict_keys(['id', 'url', 'title', 'text'])
{'id': '12', 'url': 'https://en.wikipedia.org/wiki/Anarchism', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).\n\nHumans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist ideas are found all throughout history, modern 

In [17]:
from tqdm.auto import tqdm
import re

text_data = []
file_count = 0
processed_samples = 0
for sample in tqdm(wiki_dataset['train']):
    paragraphs = [p.strip() for p in sample['text'].split('\n') if p.strip()]
    for paragraph in paragraphs:
        paragraph = paragraph.lower()

        if paragraph in ['see also', 'references', 'notes', 'explanatory notes',
                         'citations', 'general and cited sources', 'primary sources',
                         'secondary sources', 'tertiary sources', 'further reading',
                         'external links', 'bibliography']:
            break
        
        # 标准化空白字符
        paragraph = ' '.join(paragraph.split())
        
        # 处理URL和邮箱
        paragraph = re.sub(r'https?://\S+|www\.\S+', '[URL]', paragraph)
        paragraph = re.sub(r'\S+@\S+', '[EMAIL]', paragraph)

        if len(paragraph.split(' ')) < 20:
            continue
        
        #print(paragraph)
        text_data.append(paragraph)
        if len(text_data) >= 6_000:
            # once we git the 6K mark, save to file
            with open(f'wiki_files/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
                fp.write('\n'.join(text_data))
            text_data = []
            file_count += 1
    processed_samples += 1
    if file_count >= 1000:    # 1000 files is 880k sample, like 1/8 of total wiki en
        print(f"processed samples: {processed_samples}")
        break


0it [00:00, ?it/s]

processed samples: 889552


In [20]:
# Dump data for book corpus
# Load Wikipedia dataset
book_dataset = load_dataset(
    "lucadiliello/bookcorpusopen",
    streaming=True
)

README.md:   0%|          | 0.00/400 [00:00<?, ?B/s]

In [22]:
first_example = next(iter(book_dataset["train"]))
print(f"Keys: {first_example.keys()}")
print(first_example)

Keys: dict_keys(['text', 'title'])


In [24]:
len(first_example['text'])

1136214

In [32]:
from tqdm.auto import tqdm
import re

text_data = []
file_count = 0
processed_samples = 0
for sample in tqdm(book_dataset['train']):
    for paragraph in sample['text'].split('\n'):
        paragraph = paragraph.strip().lower()
        if len(paragraph) == 0:
            continue
        if paragraph[-1] not in ['.', '!', '?', '"', "'", ')', ']', '}']:
            continue
        for wrong, correct in {
            ' .': '.',      # 空格 + 句号
            ' ,': ',',      # 空格 + 逗号
            ' !': '!',      # 空格 + 感叹号
            ' ?': '?',      # 空格 + 问号
            ' ;': ';',      # 空格 + 分号
            ' :': ':',      # 空格 + 冒号
        }.items():
            paragraph = paragraph.replace(wrong, correct)
    
        # 处理URL和邮箱
        paragraph = re.sub(r'https?://\S+|www\.\S+', '[URL]', paragraph)
        paragraph = re.sub(r'\S+@\S+', '[EMAIL]', paragraph)

        if len(paragraph.split(' ')) < 10:
            continue
        
        #print(paragraph)
        text_data.append(paragraph)
        if len(text_data) >= 6_000:
            # once we git the 6K mark, save to file
            with open(f'book_files/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
                fp.write('\n'.join(text_data))
            text_data = []
            file_count += 1
    processed_samples += 1
    if file_count >= 1000:        # 1000 files uses 4.4k books, it is 1/4 of 17.9k books
        print(f"processed samples: {processed_samples}")
        break

0it [00:00, ?it/s]

processed samples: 4424


In [None]:
### End of text processing, begin model related

In [1]:
# Model definition
import torch
from transformers import BertConfig, BertForMaskedLM

device = torch.device('cuda')

# 加载预训练模型的配置（不加载权重）
config = BertConfig.from_pretrained("bert-base-uncased")
#print(config)

# fp32 by default
model = BertForMaskedLM(config)
model.to(device);

# If continue from previous epoch
#model.load_state_dict(torch.load('bert-trained/bert_ckpt_5.pth'))

In [2]:
# Load tokenizer
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer)

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [3]:
import random
import torch
from torch.optim import AdamW
from tqdm.auto import tqdm

# Used in data loading
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key, tensor in self.encodings.items()}


# activate training mode
model.train()
optim = AdamW(model.parameters(), lr=5e-5)
scaler = torch.cuda.amp.GradScaler()

epochs = 2
steps = 0
for epoch in range(epochs):
    # Iterate data files
    for file_id in range(0, 100):   #1000):
        lines = []
        file_path = f"book_files/text_{file_id}.txt"
        with open(file_path, 'r', encoding='utf-8') as f:
            lines.extend(f.read().strip().split('\n'))
        file_path = f"wiki_files/text_{file_id}.txt"
        with open(file_path, 'r', encoding='utf-8') as f:
            lines.extend(f.read().strip().split('\n'))
        file_path = f"c4_files/text_{file_id}.txt"
        with open(file_path, 'r', encoding='utf-8') as f:
            lines.extend(f.read().strip().split('\n'))
        random.shuffle(lines)
    
        # tokenize and Mask raw inputs
        tokenized_lines = tokenizer(lines, max_length=256, padding='max_length', truncation=True, return_tensors='pt')
        input_ids = tokenized_lines['input_ids']
        attention_mask = tokenized_lines['attention_mask']
    
        # mask random x% where token is not special tokens, id >= 999
        rand = torch.rand(input_ids.shape)
        mask_arr = (rand < 0.15) * (input_ids >= 999)

        # 80% mask, 10% unchange, 10% random other token
        mask_prob = torch.rand(input_ids.shape)
        masked_input_ids = input_ids.clone()
        masked_input_ids[(mask_arr) & (mask_prob < 0.8)] = tokenizer.mask_token_id
        # 10% random other token
        random_tokens = torch.randint(
            low=999, high=tokenizer.vocab_size, size=input_ids.shape
        )
        masked_input_ids[(mask_arr) & (mask_prob >= 0.8) & (mask_prob < 0.9)] = \
            random_tokens[(mask_arr) & (mask_prob >= 0.8) & (mask_prob < 0.9)]
        #print(masked_input_ids)
        
        labels = input_ids.clone()
        labels[~mask_arr] = -100              # non padding, not choose as mask
        labels[attention_mask == 0] = -100    # padding
        #print(labels)
        
        # Define data loader
        encodings = {'input_ids': masked_input_ids, 'attention_mask': attention_mask, 'labels': labels} 
        custom_ds = CustomDataset(encodings) 
        loader = torch.utils.data.DataLoader(custom_ds, batch_size=16, shuffle=True) 
        
        # Finished create data loader
        # setup loop with TQDM and dataloader
        loop = tqdm(loader, leave=True)
        for batch in loop:
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # process
            # mixed precision automatically
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                outputs = model(input_ids, attention_mask=attention_mask,
                                labels=labels)
            # extract loss
            loss = outputs.loss

            # Original method
            #loss.backward()
            #optim.step()
            # Use scaler
            scaler.scale(loss).backward()
            scaler.step(optim)
            scaler.update()

            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}, File {file_id}')
            loop.set_postfix(loss=loss.item())
        if file_id % 10 == 0:
            torch.save({
                'model': model.state_dict(),
                'optimizer': optim.state_dict(),
            }, f"bert-trained/checkpoint-epoch{epoch}-file{file_id}.pt")

            
    
        

  scaler = torch.cuda.amp.GradScaler()


  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

KeyboardInterrupt: 

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

  0%|          | 0/1125 [00:00<?, ?it/s]

In [None]:
print(TARGET)
torch.save(model.state_dict(), f"bert-manual-trained/bert_ckpt_{TARGET}.pth")

In [42]:
# Evaluate with fill mask 
from transformers import pipeline
import torch

# 创建fill-mask pipeline
fill_mask = pipeline(
    'fill-mask',
    model=model,  # 你的训练好的模型
    tokenizer=tokenizer,  # 对应的tokenizer
    device=0 if torch.cuda.is_available() else -1,  # GPU/CPU
    top_k=5  # 显示前5个预测结果
)

# 测试句子（包含[MASK] token）
test_sentences = [
    "In the [MASK], I went to the school.",
    "I love to eat [MASK] for breakfast.",
    "The weather today is very [MASK].",
    "[MASK] is a good man.",
    "She is a [MASK] doctor."
]

# 逐个预测
for sentence in test_sentences:
    print(f"\n输入: {sentence}")
    results = fill_mask(sentence)
    for i, result in enumerate(results):
        print(f"  {i+1}. {result['token_str']:15} (score: {result['score']:.4f})")

Device set to use cuda:0



输入: In the [MASK], I went to the school.
  1. ,               (score: 0.0480)
  2. i               (score: 0.0424)
  3. the             (score: 0.0403)
  4. .               (score: 0.0371)
  5. "               (score: 0.0288)

输入: I love to eat [MASK] for breakfast.
  1. ,               (score: 0.0452)
  2. i               (score: 0.0442)
  3. the             (score: 0.0442)
  4. .               (score: 0.0399)
  5. "               (score: 0.0269)

输入: The weather today is very [MASK].
  1. the             (score: 0.0414)
  2. ,               (score: 0.0409)
  3. i               (score: 0.0406)
  4. .               (score: 0.0373)
  5. "               (score: 0.0310)

输入: [MASK] is a good man.
  1. "               (score: 0.0488)
  2. i               (score: 0.0411)
  3. the             (score: 0.0386)
  4. ,               (score: 0.0376)
  5. .               (score: 0.0350)

输入: She is a [MASK] doctor.
  1. ,               (score: 0.0497)
  2. i               (score: 0.0404)
  3. the

In [None]:
# appendix
# New way of training, using DataCollectorForLanguageModeling
# This works, but the high level libs, not good for understanding the details
from transformers import (
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset

mini_dataset = Dataset.from_dict({"text": lines})
tokenized_datasets = mini_dataset.map(
    lambda x : tokenizer(
        x["text"], max_length=512, padding="max_length", truncation=True), #, return_tensors='pt'),
    batched=True,
    remove_columns=["text"])
# Split dataset
train_test_split = tokenized_datasets.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Data collator for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
    mask_replace_prob=0.8,
    random_replace_prob=0.1,
    pad_to_multiple_of=8,
    return_tensors='pt',
)
# Training arguments
training_args = TrainingArguments(
    output_dir="./bert-trained",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    #evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# start training
trainer.train()