In [14]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from datasets import load_dataset
import transformers
# 加载预训练的BERT模型和分词器
model = AutoModelForTokenClassification.from_pretrained("google-bert/bert-base-chinese")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")


ds = load_dataset('doushabao4766/msra_ner_k_V3')

print(ds)
print(ds['train'][0])
print(transformers.__version__)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 45001
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 3443
    })
})
{'id': '0', 'tokens': ['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'knowledge': ''}
4.51.3


In [3]:
!pip install -q evaluate
!pip install -q  seqeval

from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer
import numpy as np
from datasets import ClassLabel
import evaluate
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from datasets import load_dataset


ds = load_dataset('doushabao4766/msra_ner_k_V3')
label_list = ds["train"].features["ner_tags"].feature.names  # ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# 加载预训练的BERT模型和分词器
# model = AutoModelForTokenClassification.from_pretrained("google-bert/bert-base-chinese")
model = AutoModelForTokenClassification.from_pretrained(
    "google-bert/bert-base-chinese",
    num_labels=7,      # 设置为 7
    id2label=id2label,
    label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")



def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    all_labels = []
    for i, word_ids in enumerate(tokenized_inputs.word_ids(batch_index=i) for i in range(len(examples["tokens"]))):
        labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                labels.append(-100)
            elif word_idx != previous_word_idx:
                labels.append(examples["ner_tags"][i][word_idx])
            else:
                labels.append(examples["ner_tags"][i][word_idx])
            previous_word_idx = word_idx
        all_labels.append(labels)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# 应用预处理
tokenized_datasets = ds.map(tokenize_and_align_labels, batched=True)

# 训练参数
training_args = TrainingArguments(
    output_dir="./ner_model",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"  # 不使用wandb等
)

# 数据收集器
data_collator = DataCollatorForTokenClassification(tokenizer)

# 评估指标
metric = evaluate.load("seqeval")  # 加载序列标注指标

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 训练
trainer.train()

# 保存模型
trainer.save_model("bert-msra-ner")

# 推理示例
def predict(example_text):
    tokens = list(example_text)
    inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    predicted_labels = [id2label[p.item()] for p in predictions[0]]
    return list(zip(tokens, predicted_labels))

# 示例
test_text = "李雷在北京上学"
print(predict(test_text))

README.md:   0%|          | 0.00/697 [00:00<?, ?B/s]

(…)-00000-of-00001-42717a92413393f9.parquet:   0%|          | 0.00/13.9M [00:00<?, ?B/s]

(…)-00000-of-00001-8899cab5fdab45bc.parquet:   0%|          | 0.00/946k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45001 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3443 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

Map:   0%|          | 0/45001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3443 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

  trainer = Trainer(


Step,Training Loss
50,0.3923
100,0.0681
150,0.0565
200,0.0462
250,0.0417
300,0.0399
350,0.0414
400,0.0394
450,0.0356
500,0.0321




RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [12]:
def ner_inference(text: str):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    words = list(text)

    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True
    )
    word_ids = inputs.word_ids()  # 先拿 word_ids
    inputs = {k: v.to(device) for k, v in inputs.items()}  # 转设备

    with torch.no_grad():
        outputs = model(**inputs)
    pred_ids = outputs.logits.argmax(dim=-1)[0].tolist()

    entities = []
    cur_entity = []
    cur_type = None

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue

        label = id2label[pred_ids[idx]]
        prefix, ent_type = (label.split("-", 1) + [None])[:2]

        if prefix == "B":
            if cur_entity:
                entities.append({"entity": cur_type, "content": "".join(cur_entity)})
            cur_entity = [words[word_idx]]
            cur_type = ent_type

        elif prefix == "I" and cur_type == ent_type:
            cur_entity.append(words[word_idx])

        else:
            if cur_entity:
                entities.append({"entity": cur_type, "content": "".join(cur_entity)})
                cur_entity = []
                cur_type = None

    if cur_entity:
        entities.append({"entity": cur_type, "content": "".join(cur_entity)})

    return entities

# —— 测试一下 —— 
text = "双方确定了今后发展中美关系的指导方针。"
print(ner_inference(text))

[{'entity': 'LOC', 'content': '中'}, {'entity': 'LOC', 'content': '美'}]
