### 1.导入依赖包

In [41]:
# 导入相关依赖包
from transformers import AutoModelForTokenClassification, AutoTokenizer,DataCollatorForTokenClassification, AutoModel
from transformers import TrainingArguments, Trainer
import torch
import evaluate  # pip install evaluate
import seqeval   # pip install seqeval
from datasets import load_dataset
import numpy as np

### 2.初始化预训练模型

In [10]:
# 初始化模型使用 模型名称为：google-bert/bert-base-chinese 最终预测类别为21个对应最后输出层神经元个数
model = AutoModelForTokenClassification.from_pretrained("google-bert/bert-base-chinese", num_labels=21)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# 查看模型结构
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [12]:
# 使用AutoModel 加载预训练模型对对比 AutoModelForTokenClassification 加在模型的区别, 使用AutoModel 下载预训练模型只能加载最原始的bert模型结构，在模型最后的输出后面没有
# 用于做分类的线性层可供使用
model1 = AutoModel.from_pretrained("google-bert/bert-base-chinese", num_labels=21)
model1

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

### 3.初始化分词器


In [13]:
# 使用AutoTokenizer 加载分词器
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [14]:
# 查看分词器结构
tokenizer

BertTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [15]:
# 分词器测试
input_data = tokenizer(["今天是个好天气", "直接指向我们要加载的模型名或存盘⽬录"], return_tensors = 'pt', padding=True, truncation=True, max_length=512)
input_data

{'input_ids': tensor([[ 101,  791, 1921, 3221,  702, 1962, 1921, 3698,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 4684, 2970, 2900, 1403, 2769,  812, 6206, 1217, 6770, 4638, 3563,
         1798, 1399, 2772, 2100, 4669,  100, 2497,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [16]:
# 模型测试
tokenizer1 = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")
message= "命名实体识别"
label = torch.tensor([0,1,0,2,5,4])

model_input = tokenizer1([message], return_tensors='pt')
result = model(**model_input)
print(model_input)

print(result.loss)
print(result.logits)

{'input_ids': tensor([[ 101, 1462, 1399, 2141,  860, 6399, 1166,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
None
tensor([[[-2.4484e-01,  1.7518e-02, -3.2333e-01,  4.1861e-01,  5.3770e-01,
           6.0092e-02,  1.2795e-01,  8.0413e-03,  3.6765e-01, -1.8829e-01,
          -4.0653e-01,  3.6950e-01,  5.0703e-01,  4.7109e-01, -2.4483e-02,
           7.9691e-01,  2.0414e-01,  1.7849e-01,  7.4815e-01, -2.1226e-01,
           6.9242e-01],
         [ 3.5343e-01,  4.4996e-01, -3.3618e-01,  5.8534e-01,  5.7982e-01,
          -1.0071e-01,  4.0454e-01,  5.7865e-02,  2.7064e-01,  1.9377e-01,
           1.8542e-01, -1.0717e-02,  1.5382e-01,  2.1916e-01,  4.4539e-01,
           8.2503e-01, -1.5035e-01,  5.3618e-01,  3.0218e-01, -1.0195e-01,
           5.8157e-01],
         [-6.2226e-03,  1.5144e-01, -6.5257e-01,  1.2281e+00,  3.3460e-01,
           2.2465e-01,  7.8376e-01,  5.7116e-01,  6.1610e-01,  2.5007e-01,
          -1.67

### 加载数据集

In [17]:
#加载dataset
ds = load_dataset('nlhappy/CLUE-NER')
ds

README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

(…)-00000-of-00001-a33d0e4276aef9b4.parquet:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

(…)-00000-of-00001-07f476b71c5edde6.parquet:   0%|          | 0.00/178k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10748 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1343 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'ents'],
        num_rows: 10748
    })
    validation: Dataset({
        features: ['text', 'ents'],
        num_rows: 1343
    })
})

In [18]:
# 查看数据结构
ds['train'][0]

{'text': '浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，',
 'ents': [{'indices': [9, 10, 11],
   'is_continuous': True,
   'label': 'name',
   'text': '叶老桂'},
  {'indices': [0, 1, 2, 3],
   'is_continuous': True,
   'label': 'company',
   'text': '浙商银行'}]}

In [19]:
ds['validation'][0]

{'text': '彭小军认为，国内银行现在走的是台湾的发卡模式，先通过跑马圈地再在圈的地里面选择客户，',
 'ents': [{'indices': [15, 16],
   'is_continuous': True,
   'label': 'address',
   'text': '台湾'},
  {'indices': [0, 1, 2],
   'is_continuous': True,
   'label': 'name',
   'text': '彭小军'}]}

In [20]:
# 通过代码获取数据中标签的种类
# 对于训练数据集
train_data = ds['train']
labels_set = set()
for item in train_data:
    # 遍历训练数据集
    info_list = item['ents']
    for i in range(len(info_list)):
       # 取出标注数据
       labels_set.add(info_list[i]["label"])
print(labels_set)

{'movie', 'address', 'scene', 'position', 'book', 'name', 'game', 'company', 'organization', 'government'}


In [21]:
# 通过代码获取数据中标签的种类
# 对于验证数据集
train_data = ds['validation']
labels_set = set()
for item in train_data:
    # 遍历训练数据集
    info_list = item['ents']
    for i in range(len(info_list)):
       # 取出标注数据
       labels_set.add(info_list[i]["label"])
print(labels_set)

{'movie', 'scene', 'address', 'position', 'book', 'name', 'game', 'company', 'organization', 'government'}


### 类别映射词典准备

In [23]:
# 组装类别词典列表（使用序列标注，三位标注法）(最后tags 中标签的数量即为模型最终输出线性层输出的数量，用于最终的分类)
entitys = ['O'] + list(labels_set)
print(entitys)

tags = []
# 添加标签前缀
for item in entitys:
    if item != 'O':
        # 其余标签都添加 B- 标识命名实体开始标签
        tags.append("B-" + item.upper())
        # 添加 I- 标识命名实体本身
        tags.append("I-" + item.upper())
    else:
        # 非命名实体的用“O” 标识
        tags.append(item)
print(tags)

# 给实体类别添加索引值并组装为字典
entity_index = {entity:i for i, entity in enumerate(entitys)}
print(entity_index)

['O', 'movie', 'scene', 'address', 'position', 'book', 'name', 'game', 'company', 'organization', 'government']
['O', 'B-MOVIE', 'I-MOVIE', 'B-SCENE', 'I-SCENE', 'B-ADDRESS', 'I-ADDRESS', 'B-POSITION', 'I-POSITION', 'B-BOOK', 'I-BOOK', 'B-NAME', 'I-NAME', 'B-GAME', 'I-GAME', 'B-COMPANY', 'I-COMPANY', 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-GOVERNMENT', 'I-GOVERNMENT']
{'O': 0, 'movie': 1, 'scene': 2, 'address': 3, 'position': 4, 'book': 5, 'name': 6, 'game': 7, 'company': 8, 'organization': 9, 'government': 10}


In [24]:
# 定义数据集数据-标签映射函数，处理数据集数据
def entity_tags_precess_map(item):
    # 先取出数据中文本
    text = item['text']
    # 根据文本长度初始化整个文本标记的的初始值
    # 均初始化为非命名实体标记字符
    text_tags = len(text) * [0]
    # 遍历文本标记信息
    for tag_info in item['ents']:
        # 获取命名实体类别标签
        label = tag_info['label']
        # 获取命名实体在实体类别字典中的索引
        label_index = entity_index[label]
        # 获取当前数据中当前命名实体在 tags list 中的标签
        # 开始位置
        start = label_index * 2 - 1
        # 实体本身
        inside = label_index * 2
        # 命名实体开始位置
        tag_text_index_list = tag_info['indices']
        text_tags[tag_text_index_list[0]] = start
        # 获取命名实体剩余的部分
        for tag_index in tag_text_index_list[1:]:
            # 替换 text_tags 中命名实体对应的标签值
            text_tags[tag_index] = inside
    # 此映射函数需要再原来的dataset 字典中添加一个文本序列标注的list
    return {'ent_tag':text_tags}

# 使用自定义的回调函数处理数据集
ds1 = ds.map(entity_tags_precess_map)

Map:   0%|          | 0/10748 [00:00<?, ? examples/s]

Map:   0%|          | 0/1343 [00:00<?, ? examples/s]

In [25]:
# 查看处理过的数据集发现字典中多了一个 “ent_tag” 的属性
ds1

DatasetDict({
    train: Dataset({
        features: ['text', 'ents', 'ent_tag'],
        num_rows: 10748
    })
    validation: Dataset({
        features: ['text', 'ents', 'ent_tag'],
        num_rows: 1343
    })
})

In [26]:
# 继续查询数据详情
ds1['train'][0]['ent_tag']

[15,
 16,
 16,
 16,
 0,
 0,
 0,
 0,
 0,
 11,
 12,
 12,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

#### 中文bert分词在日期时间和英文转换token过程中，出现合并。影响ner标注准确性。

In [27]:
token_index = tokenizer.encode('2000年2月hahah', add_special_tokens=False)
print(token_index)
tokens = tokenizer.decode(token_index)
print(tokens)

[8202, 2399, 123, 3299, 11643, 8778, 8199]
2000 年 2 月 hahah


In [28]:
input_data = tokenizer(['2000年2月add'], add_special_tokens=False, truncation=True)
print(input_data)

input_data.word_ids(0) # 返回批次中指定token对应原始文本的索引映射

{'input_ids': [[8202, 2399, 123, 3299, 10253]], 'token_type_ids': [[0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1]]}


[0, 1, 2, 3, 4]

In [29]:
# 原始文本转换模型需要token_idx,生成和token_idx对齐label
def data_input_proc(item):
    # 输入文本转换模型输入token索引
    input_data = tokenizer(item['text'], truncation=True, add_special_tokens=False, max_length=512)
    adjust_labels = []  # 所有修正后label索引列表
    # 上一步骤生成ner_tag中索引和token对齐
    for k in range(len(input_data['input_ids'])):
        # 每条记录token对应word_ids
        word_ids = input_data.word_ids(k)
        # 批次ner_tag长度和token长度对齐
        tags = item['ent_tag'][k]
        
        adjusted_label_ids = []
        i, prev_wid = -1,-1
        for wid in word_ids:
            if (wid != prev_wid):   #  word_ids [1,1,1,2,3,4,5] -> [0,1,2,3,4,5,6]
                i += 1 # token对应检索位置+1
                prev_wid = wid
            adjusted_label_ids.append(tags[i])
        adjust_labels.append(adjusted_label_ids)                
    # 修正后label添加到input_data
    input_data['labels'] = adjust_labels
    return input_data
    

ds2 = ds1.map(data_input_proc, batched=True)  # batched 每次传入自定义方法样本数量多个

Map:   0%|          | 0/10748 [00:00<?, ? examples/s]

Map:   0%|          | 0/1343 [00:00<?, ? examples/s]

In [30]:
ds2['train'][0]['labels']

[15,
 16,
 16,
 16,
 0,
 0,
 0,
 0,
 0,
 11,
 12,
 12,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [31]:
ds2['train'][0]['ent_tag']

[15,
 16,
 16,
 16,
 0,
 0,
 0,
 0,
 0,
 11,
 12,
 12,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [32]:
# 记录转换为pytorch
ds2.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [33]:
 for item in ds2['train']:
     print(item)
     break

{'input_ids': tensor([3851, 1555, 7213, 6121,  821,  689,  928, 6587, 6956, 1383, 5439, 3424,
        1300, 1894, 1156,  794, 1369,  671,  702, 6235, 2428, 2190,  758, 6887,
        7305, 3546, 6822, 6121,  749, 6237, 6438,  511, 1383, 5439, 3424, 6371,
         711, 8024, 2190, 4680, 1184, 1744, 1079, 1555,  689, 7213, 6121, 5445,
        6241, 8024]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]), 'labels': tensor([15, 16, 16, 16,  0,  0,  0,  0,  0, 11, 12, 12,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])}


### 模型训练

#### TrainingArguments

In [34]:
args = TrainingArguments(
    output_dir="ner_train",  # 模型训练工作目录（tensorboard，临时模型存盘文件，日志）
    num_train_epochs = 3,    # 训练 epoch
    save_safetensors=False,  # 设置False保存文件可以通过torch.load加载
    per_device_train_batch_size=32,  # 训练批次
    per_device_eval_batch_size=32,
    report_to='tensorboard',  # 训练输出记录
    eval_strategy="epoch",
)

#### Model

In [35]:
print(tags)

['O', 'B-MOVIE', 'I-MOVIE', 'B-SCENE', 'I-SCENE', 'B-ADDRESS', 'I-ADDRESS', 'B-POSITION', 'I-POSITION', 'B-BOOK', 'I-BOOK', 'B-NAME', 'I-NAME', 'B-GAME', 'I-GAME', 'B-COMPANY', 'I-COMPANY', 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-GOVERNMENT', 'I-GOVERNMENT']


In [36]:
id2lbl = {i:tag for i, tag in enumerate(tags)}
print(id2lbl)
lbl2id = {tag:i for i, tag in enumerate(tags)}
print(lbl2id)

model = AutoModelForTokenClassification.from_pretrained('google-bert/bert-base-chinese', 
                                                        num_labels=21,
                                                        id2label=id2lbl,
                                                        label2id=lbl2id)
model

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{0: 'O', 1: 'B-MOVIE', 2: 'I-MOVIE', 3: 'B-SCENE', 4: 'I-SCENE', 5: 'B-ADDRESS', 6: 'I-ADDRESS', 7: 'B-POSITION', 8: 'I-POSITION', 9: 'B-BOOK', 10: 'I-BOOK', 11: 'B-NAME', 12: 'I-NAME', 13: 'B-GAME', 14: 'I-GAME', 15: 'B-COMPANY', 16: 'I-COMPANY', 17: 'B-ORGANIZATION', 18: 'I-ORGANIZATION', 19: 'B-GOVERNMENT', 20: 'I-GOVERNMENT'}
{'O': 0, 'B-MOVIE': 1, 'I-MOVIE': 2, 'B-SCENE': 3, 'I-SCENE': 4, 'B-ADDRESS': 5, 'I-ADDRESS': 6, 'B-POSITION': 7, 'I-POSITION': 8, 'B-BOOK': 9, 'I-BOOK': 10, 'B-NAME': 11, 'I-NAME': 12, 'B-GAME': 13, 'I-GAME': 14, 'B-COMPANY': 15, 'I-COMPANY': 16, 'B-ORGANIZATION': 17, 'I-ORGANIZATION': 18, 'B-GOVERNMENT': 19, 'I-GOVERNMENT': 20}


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

#### Trainer

In [43]:
# metric 方法
def compute_metric(result):
    # result 是一个tuple (predicts, labels)
    
    # 获取评估对象
    seqeval = evaluate.load('seqeval')
    predicts,labels = result
    predicts = np.argmax(predicts, axis=2)
    
    # 准备评估数据
    predicts = [[tags[p] for p,l in zip(ps,ls) if l != -100]
                 for ps,ls in zip(predicts,labels)]
    labels = [[tags[l] for p,l in zip(ps,ls) if l != -100]
                 for ps,ls in zip(predicts,labels)]
    results = seqeval.compute(predictions=predicts, references=labels)

    return results

In [44]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

trainer = Trainer(
    model,
    args,
    train_dataset=ds2['train'],
    eval_dataset=ds2['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metric
)

In [45]:
# 开始训练
trainer.train()

Epoch,Training Loss,Validation Loss,Address,Book,Company,Game,Government,Movie,Name,Organization,Position,Scene,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,0.336759,"{'precision': 0.4251968503937008, 'recall': 0.5806451612903226, 'f1': 0.4909090909090909, 'number': 372}","{'precision': 0.5957446808510638, 'recall': 0.5419354838709678, 'f1': 0.5675675675675675, 'number': 155}","{'precision': 0.585, 'recall': 0.582089552238806, 'f1': 0.5835411471321695, 'number': 402}","{'precision': 0.37800687285223367, 'recall': 0.40441176470588236, 'f1': 0.3907637655417407, 'number': 272}","{'precision': 0.5972696245733788, 'recall': 0.7142857142857143, 'f1': 0.650557620817844, 'number': 245}","{'precision': 0.6832298136645962, 'recall': 0.6707317073170732, 'f1': 0.676923076923077, 'number': 164}","{'precision': 0.7707865168539326, 'recall': 0.7236286919831224, 'f1': 0.7464635473340587, 'number': 474}","{'precision': 0.5844504021447721, 'recall': 0.5989010989010989, 'f1': 0.5915875169606511, 'number': 364}","{'precision': 0.6846153846153846, 'recall': 0.6312056737588653, 'f1': 0.6568265682656826, 'number': 423}","{'precision': 0.6091954022988506, 'recall': 0.4840182648401826, 'f1': 0.5394402035623409, 'number': 219}",0.586587,0.602913,0.594638,0.900386
2,No log,0.307914,"{'precision': 0.4859335038363171, 'recall': 0.510752688172043, 'f1': 0.4980340760157274, 'number': 372}","{'precision': 0.5786163522012578, 'recall': 0.5935483870967742, 'f1': 0.5859872611464967, 'number': 155}","{'precision': 0.6177884615384616, 'recall': 0.6393034825870647, 'f1': 0.6283618581907091, 'number': 402}","{'precision': 0.41114982578397213, 'recall': 0.4338235294117647, 'f1': 0.42218246869409665, 'number': 272}","{'precision': 0.6485507246376812, 'recall': 0.7306122448979592, 'f1': 0.6871401151631479, 'number': 245}","{'precision': 0.708029197080292, 'recall': 0.5914634146341463, 'f1': 0.6445182724252492, 'number': 164}","{'precision': 0.7291666666666666, 'recall': 0.7383966244725738, 'f1': 0.7337526205450734, 'number': 474}","{'precision': 0.5894206549118388, 'recall': 0.6428571428571429, 'f1': 0.6149802890932983, 'number': 364}","{'precision': 0.6908212560386473, 'recall': 0.6761229314420804, 'f1': 0.6833930704898447, 'number': 423}","{'precision': 0.5605381165919282, 'recall': 0.5707762557077626, 'f1': 0.5656108597285068, 'number': 219}",0.606289,0.623948,0.614992,0.908035
3,0.189500,0.320696,"{'precision': 0.47858942065491183, 'recall': 0.510752688172043, 'f1': 0.494148244473342, 'number': 372}","{'precision': 0.5515151515151515, 'recall': 0.5870967741935483, 'f1': 0.5687499999999999, 'number': 155}","{'precision': 0.5995260663507109, 'recall': 0.6293532338308457, 'f1': 0.6140776699029127, 'number': 402}","{'precision': 0.45, 'recall': 0.4632352941176471, 'f1': 0.45652173913043476, 'number': 272}","{'precision': 0.6321428571428571, 'recall': 0.7224489795918367, 'f1': 0.6742857142857143, 'number': 245}","{'precision': 0.6838709677419355, 'recall': 0.6463414634146342, 'f1': 0.664576802507837, 'number': 164}","{'precision': 0.7151639344262295, 'recall': 0.7362869198312236, 'f1': 0.7255717255717256, 'number': 474}","{'precision': 0.5788177339901478, 'recall': 0.6456043956043956, 'f1': 0.6103896103896104, 'number': 364}","{'precision': 0.6893424036281179, 'recall': 0.7186761229314421, 'f1': 0.7037037037037037, 'number': 423}","{'precision': 0.5502183406113537, 'recall': 0.5753424657534246, 'f1': 0.5625, 'number': 219}",0.599755,0.633333,0.616087,0.909766


Trainer is attempting to log a value of "{'precision': 0.4251968503937008, 'recall': 0.5806451612903226, 'f1': 0.4909090909090909, 'number': 372}" of type <class 'dict'> for key "eval/ADDRESS" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.5957446808510638, 'recall': 0.5419354838709678, 'f1': 0.5675675675675675, 'number': 155}" of type <class 'dict'> for key "eval/BOOK" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.585, 'recall': 0.582089552238806, 'f1': 0.5835411471321695, 'number': 402}" of type <class 'dict'> for key "eval/COMPANY" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.37800687285223367, 'recall': 0.40441176470588236, 'f1'

TrainOutput(global_step=504, training_loss=0.1891273157227607, metrics={'train_runtime': 211.1088, 'train_samples_per_second': 152.736, 'train_steps_per_second': 2.387, 'total_flos': 820469833815600.0, 'train_loss': 0.1891273157227607, 'epoch': 3.0})

#### 模型推理

In [46]:
result = trainer.predict(ds2['validation'])

In [47]:
result

PredictionOutput(predictions=array([[[ 1.16269815e+00,  3.01405728e-01, -3.46002847e-01, ...,
         -2.03423882e+00, -8.17227066e-01, -1.54515016e+00],
        [ 1.93018639e+00, -1.13778794e+00,  5.87029994e-01, ...,
         -3.47192168e-01, -1.24619222e+00, -4.27877307e-01],
        [ 2.18126941e+00, -1.36515689e+00,  5.32690763e-01, ...,
         -4.56831336e-01, -1.27404177e+00, -4.77858573e-01],
        ...,
        [ 1.14714851e+01, -1.34443128e+00, -8.45166862e-01, ...,
         -9.28908408e-01, -1.24595869e+00, -1.49832118e+00],
        [ 1.15317984e+01, -1.41182327e+00, -9.25910592e-01, ...,
         -8.41526985e-01, -1.17196071e+00, -1.39698303e+00],
        [ 1.13740597e+01, -1.31516242e+00, -1.07594752e+00, ...,
         -9.01089251e-01, -1.13462996e+00, -1.51564324e+00]],

       [[ 3.79712415e+00, -7.27360606e-01, -1.17418075e+00, ...,
         -1.40646279e+00, -1.42037654e+00, -2.13151813e+00],
        [ 4.21313572e+00, -2.13888931e+00, -2.38957942e-01, ...,
         

In [49]:
print(ds1['validation'][13]['text'])
print(ds2['validation'][13]['labels'])
print(result.label_ids[13])

价格高昂的大钻和翡翠消费为何如此火？通灵珠宝总裁沈东军认为，这与原料稀缺有直接关系。“
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        15, 16, 16, 16,  7,  8, 11, 12, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0])
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0   15   16   16   16    7    8   11   12   12    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 -100 -100 -100 -100 -100 -100 -100]


In [50]:
[tags[p] for p,l in zip(result.label_ids[13],ds2['validation'][13]['labels'])]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-COMPANY',
 'I-COMPANY',
 'I-COMPANY',
 'I-COMPANY',
 'B-POSITION',
 'I-POSITION',
 'B-NAME',
 'I-NAME',
 'I-NAME',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [None]:
# trainer1 = Trainer(
#     model,
#     args,
#     train_dataset=ds_orignal['train'],
#     eval_dataset=ds_orignal['test'],
#     data_collator=data_collator,
#     compute_metrics=compute_metric
# )