In [1]:
from transformers import BertModel, BertTokenizer

In [2]:
checkpoint = "bert-base-chinese"

bert_model = BertModel.from_pretrained(checkpoint)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

In [3]:
sentence = "我是大帅哥"

In [17]:
t = tokenizer(sentence, return_tensors="pt")
t

{'input_ids': tensor([[ 101, 2769, 3221, 1920, 2358, 1520,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [21]:
res = bert_model(t["input_ids"])

In [1]:
from typing import *

import torch
import tqdm
from config import *
from torch.utils.data import Dataset


class CSCDataset(Dataset):
    def __init__(
        self,
        path: Union[str, List[str]],  # path 数据集路径
    ):
        # assert len(data) == len(label)
        self.path = path
        self.x = []
        self.y = []

        self.data_processor()

    def data_processor(self):
        if isinstance(self.path, list):
            self.handle_sighan()
        elif isinstance(self.path, str):
            pass

    def handle_sighan(self):
        xpath, ypath = self.path
        num_lines = 0
        with open(xpath, "r", encoding="utf-8") as f:
            for line in tqdm.tqdm(f, desc="preprocessing sighan dataset"):
                line = line.strip()
                num_lines += 1
                self.x.append(line)

        with open(ypath, "r", encoding="utf-8") as f:
            for line in tqdm.tqdm(
                f, desc="preprocessing sighan dataset", total=num_lines
            ):
                line = line.strip()
                self.y.append(line)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [2]:
train_dataset = CSCDataset(path=[SIGHAN_train_dir_corr, SIGHAN_train_dir_err])

preprocessing sighan dataset: 700it [00:00, 21563.44it/s]
preprocessing sighan dataset: 100%|██████████████████████████████████████████████| 700/700 [00:00<00:00, 955733.33it/s]


In [3]:
from torch.utils.data import DataLoader

train_data_loader = DataLoader(
    train_dataset, batch_size=batch_size, num_workers=0, shuffle=True
)

In [4]:
bert_model.config.hidden_size

NameError: name 'bert_model' is not defined

In [5]:
import numpy as np
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    BertForSequenceClassification,
    BertTokenizer,
    get_scheduler,
)

In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese", num_labels=tokenizer.vocab_size
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from torch.utils.data import DataLoader

train_dataset = CSCDataset(path=[SIGHAN_train_dir_err, SIGHAN_train_dir_corr])
train_data_loader = DataLoader(train_dataset, batch_size=4, num_workers=0, shuffle=True)

preprocessing sighan dataset: 700it [00:00, 1358636.19it/s]
preprocessing sighan dataset: 100%|██████████████████████████████████████████████| 700/700 [00:00<00:00, 672934.40it/s]


In [8]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
optimizer = AdamW(model.parameters(), lr=5e-5)



In [10]:
# 学习率调度器
num_epochs = 3
num_training_steps = num_epochs * len(train_data_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# 训练循环
model.train()
for epoch in range(num_epochs):
    for x, y in train_data_loader:
        input_ids, token_type_ids, attention_mask = tokenizer(
            x,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128,
        ).values()
        labels = tokenizer(
            y,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128,
        )["input_ids"]
        outputs = model(
            input_ids=input_ids.to(device).squeeze(),
            attention_mask=attention_mask.to(device).squeeze(),
            labels=labels.to(device).squeeze(),
        )
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # 验证步骤可以在这里添加
    print(f"Epoch {epoch+1}/{num_epochs} completed")

print("Training complete.")

In [None]:
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AdamW, BertForMaskedLM, BertTokenizer, get_scheduler


# 创建模拟的CSC数据集
# def create_mock_csc_dataset():
#     return {
#         "train": [
#             {"original_text": "这是一个测试句子", "corrected_text": "这是一个测试句子"},
#             {"original_text": "这是个测试句子", "corrected_text": "这是一个测试句子"},
#             {"original_text": "这是一只猫", "corrected_text": "这是一只猫"},
#             {"original_text": "我爱北京天安们", "corrected_text": "我爱北京天安门"},
#         ],
#         "validation": [
#             {
#                 "original_text": "这是一个简单的例子",
#                 "corrected_text": "这是一个简单的例子",
#             },
#             {"original_text": "他去了北景公园", "corrected_text": "他去了北海公园"},
#         ],
#     }
def create_mock_csc_dataset():
    ret = {"train": [], "validation": []}
    for x, y in train_data_loader:
        for i, v in enumerate(x):
            ret["train"].append({"original_text": v, "corrected_text": y[i]})
            ret["validation"].append({"original_text": v, "corrected_text": y[i]})
    return ret


dataset = create_mock_csc_dataset()

# 初始化分词器
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")


# 定义tokenize函数
def tokenize_function(examples):
    inputs = tokenizer(
        examples["original_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    labels = tokenizer(
        examples["corrected_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )["input_ids"]
    return {
        "input_ids": inputs["input_ids"].squeeze(),
        "attention_mask": inputs["attention_mask"].squeeze(),
        "labels": labels.squeeze(),
    }


# 将数据集转换为Hugging Face的Dataset对象
train_dataset = Dataset.from_list(dataset["train"])
val_dataset = Dataset.from_list(dataset["validation"])

# 对数据集进行tokenize处理
train_tokenized_datasets = train_dataset.map(tokenize_function, batched=True)
val_tokenized_datasets = val_dataset.map(tokenize_function, batched=True)

# 设置格式为pytorch tensor
train_tokenized_datasets.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
val_tokenized_datasets.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)

# 创建DataLoader
train_dataloader = DataLoader(train_tokenized_datasets, shuffle=True, batch_size=4)
eval_dataloader = DataLoader(val_tokenized_datasets, batch_size=4)

# 加载预训练的BERT模型
model = BertForMaskedLM.from_pretrained("bert-base-chinese")

# 检查设备类型
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# 优化器设置
optimizer = AdamW(model.parameters(), lr=5e-5)

# 学习率调度器
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# 训练循环
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # 验证步骤可以在这里添加
    print(f"Epoch {epoch+1}/{num_epochs} completed")

print("Training complete.")


# 示例预测
def predict(text):
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, padding="max_length", max_length=128
    ).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_token_id = logits.argmax(dim=-1)[0]
    corrected_text = tokenizer.decode(predicted_token_id, skip_special_tokens=True)
    return corrected_text


# 测试预测
test_texts = ["我爱北京天安们", "这是一只猫"]
for text in test_texts:
    print(f"Original: {text}")
    print(f"Corrected: {predict(text)}\n")

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

Epoch 1/3 completed


In [None]:
predict("我愛北京天们")

In [None]:
t = _

In [None]:
"".join(t.split())

In [None]:
def get_error_pos(x, y):
    pos = []
    for i, v in enumerate(x):
        if v != y[i]:
            print(v, y[i])
            pos.append(i)
    return pos

In [None]:
def score(x, y, pos):
    err, correct, tot = 0, 0, 0
    er_pos = ""
    for i, v in enumerate(x):
        if i in pos:
            tot += 1
            if v == y[i]:
                correct += 1
        elif v != y[i]:
            err += 1
            er_pos += y[i]
    return err, correct, tot, er_pos

In [13]:
te, tc, tt

NameError: name 'te' is not defined

In [None]:
te, tc, tt = 0, 0, 0

for x, y in train_data_loader:
    for i, v in enumerate(x):
        cy = y[i]
        cur_pos = get_error_pos(v, cy)
        px = predict(v)
        npx = "".join(px.split())
        if len(npx) == len(cy):
            e, c, t, er_pos = score(npx, cy, cur_pos)
            te += e
            tc += c
            tt += t
            if e != 0:
                print("ERROR", er_pos)
            if c != t:
                print("not true")
        else:
            print("notice")

        print(i, v, npx, cy, cur_pos)