## step1 导入相关包

In [1]:
# pip install datasets
# pip install --upgrade datasets

In [2]:
import evaluate
from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForTokenClassification,TrainingArguments, Trainer,DataCollatorForTokenClassification

## Step2 加载数据集

In [3]:

ner_datasets = load_dataset("lansinuote/peoples-daily-ner", cache_dir="./data", trust_remote_code=True)
ner_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4637
    })
})

In [4]:
ner_datasets['train'][0]

{'id': '0',
 'tokens': ['海',
  '钓',
  '比',
  '赛',
  '地',
  '点',
  '在',
  '厦',
  '门',
  '与',
  '金',
  '门',
  '之',
  '间',
  '的',
  '海',
  '域',
  '。'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]}

In [5]:
ner_datasets['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}

In [6]:
ner_datasets['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

## Step3数据集预处理

In [7]:
tokenizer=AutoTokenizer.from_pretrained('hfl/chinese-macbert-base')

In [8]:
ner_datasets['train'][0]

{'id': '0',
 'tokens': ['海',
  '钓',
  '比',
  '赛',
  '地',
  '点',
  '在',
  '厦',
  '门',
  '与',
  '金',
  '门',
  '之',
  '间',
  '的',
  '海',
  '域',
  '。'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]}

In [9]:

tokenizer(ner_datasets['train'][0]["tokens"])

{'input_ids': [[101, 3862, 102], [101, 7157, 102], [101, 3683, 102], [101, 6612, 102], [101, 1765, 102], [101, 4157, 102], [101, 1762, 102], [101, 1336, 102], [101, 7305, 102], [101, 680, 102], [101, 7032, 102], [101, 7305, 102], [101, 722, 102], [101, 7313, 102], [101, 4638, 102], [101, 3862, 102], [101, 1818, 102], [101, 511, 102]], 'token_type_ids': [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]]}

In [10]:
tokenizer(ner_datasets['train'][0]["tokens"],is_split_into_words=True) # 将字分成词，实际上就是将字的token放在一起，可以根据上一个模块一起理解

{'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
res=tokenizer('intersecting word')
res

{'input_ids': [101, 10673, 8755, 10862, 8221, 8681, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [12]:
res.word_ids()##查看是从哪个词里面分出来的

[None, 0, 0, 0, 0, 1, None]

In [13]:
ner_datasets['train'][0]["ner_tags"]

[0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]

In [14]:
label_list=ner_datasets['train'].features['ner_tags'].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [15]:
def process_function(examples):
    tokenized_examples = tokenizer(examples["tokens"],max_length=128,truncation=True,is_split_into_words=True)
    # print(tokenized_examples)
    labels=[]
    # 遍历数据将label放在返回数据中
    for i,label in enumerate(examples["ner_tags"]):
        # print(i,label)
        word_ids = tokenized_examples.word_ids(batch_index=i)
        label_ids = []
        for word_idx in word_ids:
            # 如果word_idx为None，则说明该token是padding，label_ids中添加-100，
            # -100在交叉熵计算时候会默认跳过计算的值 ！！！！！！
            if word_idx is None:
               label_ids.append(-100)
            else:
               label_ids.append(label[word_idx])
        labels.append(label_ids)
    tokenized_examples['labels']=labels
    return tokenized_examples

In [16]:
tokenizer_datasets=ner_datasets.map(process_function,batched=True)
tokenizer_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4637
    })
})

In [17]:
print(tokenizer_datasets['train'][0])

{'id': '0', 'tokens': ['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0], 'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0, -100]}


## Step4 创建模型

In [18]:
model=AutoModelForTokenClassification.from_pretrained('hfl/chinese-macbert-base',num_labels=len(label_list))
model

Some weights of BertForTokenClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

## Step5 创建评估模型

In [19]:
seqeval=evaluate.load("seqeval")
seqeval

EvaluationModule(name: "seqeval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of sha

### 创建评估模型

In [20]:
import numpy as np
def eval_metrics(preds):
    predictioins ,labels=preds
    predictioins = np.argmax(predictioins, axis=-1)
    # 预测结果都是类似token的数字，需要对其进行转换
    # 真实预测值
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100] 
        for prediction, label in zip(predictioins, labels)
    ]
    # 真实标签
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100] 
        for prediction, label in zip(predictioins, labels)
    ]
    result=seqeval.compute(predictions=true_predictions, references=true_labels, mode="strict",scheme="IOB2")
    return {
        "f1":result["overall_f1"]
    }

## Step6 设置训练参数

In [21]:
args=TrainingArguments(
    output_dir='./models_for_ner',     # 输出目录
    per_device_eval_batch_size=128,      # 每块GPU上的评估批次大小
    per_device_train_batch_size=64,      # 每块GPU上的训练批次大小
    eval_strategy='epoch',             # 评估策略，这里是每个epoch后进行一次评估
    save_strategy='epoch',             # 保存策略，这里是每个epoch后进行一次保存
    num_train_epochs=3,                # 训练的epoch数
    metric_for_best_model='f1',       # 最好的模型指标
    load_best_model_at_end=True,       # 在结束训练后加载最佳模型
    logging_steps=50,                   # 每50步记录一次日志  
    
)

## Step7 创建训练器

In [22]:
trainer = Trainer(
    args=args,    # 训练器的参数
    model=model,  # 模型
    train_dataset=tokenizer_datasets['train'],  # 训练数据集
    eval_dataset=tokenizer_datasets['validation'],  # 评估数据集
    compute_metrics=eval_metrics,  # 评估指标的计算函数
    data_collator=DataCollatorForTokenClassification(tokenizer),  # 数据整理器，用于将数据整理为模型可以接受的格式
)

## Step8 开始训练

In [23]:
trainer.train()#RuntimeError: CUDA error: device-side assert triggered 数据标签创建错误

  0%|          | 0/981 [00:00<?, ?it/s]

{'loss': 0.232, 'grad_norm': 0.735589861869812, 'learning_rate': 4.745158002038736e-05, 'epoch': 0.15}
{'loss': 0.0428, 'grad_norm': 0.5820013284683228, 'learning_rate': 4.490316004077472e-05, 'epoch': 0.31}
{'loss': 0.0385, 'grad_norm': 0.4263501763343811, 'learning_rate': 4.235474006116208e-05, 'epoch': 0.46}
{'loss': 0.0335, 'grad_norm': 0.5451405644416809, 'learning_rate': 3.980632008154944e-05, 'epoch': 0.61}
{'loss': 0.0324, 'grad_norm': 0.5822671055793762, 'learning_rate': 3.72579001019368e-05, 'epoch': 0.76}
{'loss': 0.0276, 'grad_norm': 0.9135494828224182, 'learning_rate': 3.4709480122324164e-05, 'epoch': 0.92}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.02073419652879238, 'eval_f1': 0.9477581242287124, 'eval_runtime': 10.3099, 'eval_samples_per_second': 224.93, 'eval_steps_per_second': 1.843, 'epoch': 1.0}
{'loss': 0.0272, 'grad_norm': 0.9369726777076721, 'learning_rate': 3.2161060142711516e-05, 'epoch': 1.07}
{'loss': 0.0154, 'grad_norm': 0.3175949156284332, 'learning_rate': 2.9612640163098882e-05, 'epoch': 1.22}
{'loss': 0.0152, 'grad_norm': 0.43263593316078186, 'learning_rate': 2.7064220183486238e-05, 'epoch': 1.38}
{'loss': 0.0161, 'grad_norm': 0.42302441596984863, 'learning_rate': 2.45158002038736e-05, 'epoch': 1.53}
{'loss': 0.0132, 'grad_norm': 0.11086752265691757, 'learning_rate': 2.196738022426096e-05, 'epoch': 1.68}
{'loss': 0.0136, 'grad_norm': 0.2637183964252472, 'learning_rate': 1.9418960244648318e-05, 'epoch': 1.83}
{'loss': 0.0132, 'grad_norm': 0.4318681061267853, 'learning_rate': 1.6870540265035677e-05, 'epoch': 1.99}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.01698417402803898, 'eval_f1': 0.9527063969382177, 'eval_runtime': 10.1749, 'eval_samples_per_second': 227.913, 'eval_steps_per_second': 1.867, 'epoch': 2.0}
{'loss': 0.0084, 'grad_norm': 0.3358180522918701, 'learning_rate': 1.4322120285423038e-05, 'epoch': 2.14}
{'loss': 0.0064, 'grad_norm': 0.515368640422821, 'learning_rate': 1.1773700305810397e-05, 'epoch': 2.29}
{'loss': 0.0073, 'grad_norm': 0.15612316131591797, 'learning_rate': 9.225280326197758e-06, 'epoch': 2.45}
{'loss': 0.0067, 'grad_norm': 0.32551348209381104, 'learning_rate': 6.676860346585118e-06, 'epoch': 2.6}
{'loss': 0.0063, 'grad_norm': 0.18511822819709778, 'learning_rate': 4.128440366972477e-06, 'epoch': 2.75}
{'loss': 0.0066, 'grad_norm': 0.41867774724960327, 'learning_rate': 1.580020387359837e-06, 'epoch': 2.91}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.01698402129113674, 'eval_f1': 0.9549500205395044, 'eval_runtime': 9.8836, 'eval_samples_per_second': 234.631, 'eval_steps_per_second': 1.922, 'epoch': 3.0}
{'train_runtime': 729.382, 'train_samples_per_second': 85.819, 'train_steps_per_second': 1.345, 'train_loss': 0.028849077334097768, 'epoch': 3.0}


TrainOutput(global_step=981, training_loss=0.028849077334097768, metrics={'train_runtime': 729.382, 'train_samples_per_second': 85.819, 'train_steps_per_second': 1.345, 'total_flos': 3940951205762142.0, 'train_loss': 0.028849077334097768, 'epoch': 3.0})

In [24]:
trainer.evaluate(eval_dataset=tokenizer_datasets['test'])

  0%|          | 0/37 [00:00<?, ?it/s]

FailedPreconditionError: ./models_for_ner\runs\Jan20_11-51-05_周立庆 is not a directory

## Step9 模型预测

In [26]:
from transformers import pipeline
ner = pipeline("token-classification", model=model, tokenizer=tokenizer)

In [27]:
ner('小明在北京上班')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'LABEL_1',
  'score': 0.9391015,
  'index': 1,
  'word': '小',
  'start': 0,
  'end': 1},
 {'entity': 'LABEL_2',
  'score': 0.91279423,
  'index': 2,
  'word': '明',
  'start': 1,
  'end': 2},
 {'entity': 'LABEL_0',
  'score': 0.99953055,
  'index': 3,
  'word': '在',
  'start': 2,
  'end': 3},
 {'entity': 'LABEL_5',
  'score': 0.9989299,
  'index': 4,
  'word': '北',
  'start': 3,
  'end': 4},
 {'entity': 'LABEL_6',
  'score': 0.9989794,
  'index': 5,
  'word': '京',
  'start': 4,
  'end': 5},
 {'entity': 'LABEL_0',
  'score': 0.99963856,
  'index': 6,
  'word': '上',
  'start': 5,
  'end': 6},
 {'entity': 'LABEL_0',
  'score': 0.9995515,
  'index': 7,
  'word': '班',
  'start': 6,
  'end': 7}]

In [30]:
model.config    # 当前显示的不算是正确结果，他将每个字都分开了，需要修改配置，才能得到正确结果

BertConfig {
  "_name_or_path": "hfl/chinese-macbert-base",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_to

In [31]:
# 


[{'entity': 'LABEL_1',
  'score': 0.9391015,
  'index': 1,
  'word': '小',
  'start': 0,
  'end': 1},
 {'entity': 'LABEL_2',
  'score': 0.91279423,
  'index': 2,
  'word': '明',
  'start': 1,
  'end': 2},
 {'entity': 'LABEL_0',
  'score': 0.99953055,
  'index': 3,
  'word': '在',
  'start': 2,
  'end': 3},
 {'entity': 'LABEL_5',
  'score': 0.9989299,
  'index': 4,
  'word': '北',
  'start': 3,
  'end': 4},
 {'entity': 'LABEL_6',
  'score': 0.9989794,
  'index': 5,
  'word': '京',
  'start': 4,
  'end': 5},
 {'entity': 'LABEL_0',
  'score': 0.99963856,
  'index': 6,
  'word': '上',
  'start': 5,
  'end': 6},
 {'entity': 'LABEL_0',
  'score': 0.9995515,
  'index': 7,
  'word': '班',
  'start': 6,
  'end': 7}]

In [34]:
model.config.id2label  ={idx:label  for idx ,label in enumerate(label_list)}
model.config


BertConfig {
  "_name_or_path": "hfl/chinese-macbert-base",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  

In [33]:
ner = pipeline("token-classification", model=model, tokenizer=tokenizer)# h还是一个子一个字的
ner('小明在北京上班')

[{'entity': 'B-PER',
  'score': 0.9391015,
  'index': 1,
  'word': '小',
  'start': 0,
  'end': 1},
 {'entity': 'I-PER',
  'score': 0.91279423,
  'index': 2,
  'word': '明',
  'start': 1,
  'end': 2},
 {'entity': 'B-LOC',
  'score': 0.9989299,
  'index': 4,
  'word': '北',
  'start': 3,
  'end': 4},
 {'entity': 'I-LOC',
  'score': 0.9989794,
  'index': 5,
  'word': '京',
  'start': 4,
  'end': 5}]

In [35]:
ner = pipeline("token-classification", model=model, tokenizer=tokenizer,aggregation_strategy="simple")# aggregation_strategy聚合策略，具体看官网
ner('小明在北京上班')

[{'entity_group': 'PER',
  'score': 0.9259479,
  'word': '小 明',
  'start': 0,
  'end': 2},
 {'entity_group': 'LOC',
  'score': 0.99895465,
  'word': '北 京',
  'start': 3,
  'end': 5}]