In [59]:
from transformers import AutoModelForTokenClassification,AutoTokenizer,TrainingArguments,Trainer,DataCollatorForTokenClassification
import torch
from datasets import load_dataset
import evaluate  # pip install evaluate
import seqeval   # pip install seqeval
from datasets import Dataset
import numpy as np

In [60]:
model = AutoModelForTokenClassification.from_pretrained('google-bert/bert-base-chinese',num_labels=7)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

In [62]:
# message = "命名实体识别"
# label = torch.tensor([0,1,0,2,5,4])

# model_input = tokenizer([message],return_tensors='pt')
# print(model_input)
# result = model(**model_input)
# print(result)

In [63]:
# token_index = tokenizer.encode('2000年2月add', add_special_tokens=False)
# print(token_index)
# tokens = tokenizer.decode(token_index)
# print(tokens)

# input_data = tokenizer(['2000年2月add','testing and',], add_special_tokens=False, truncation=True)
# print(input_data)
# input_data.word_ids(1)

In [64]:
# 加载dataset
ds = load_dataset('nlhappy/CLUE-NER')
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'ents'],
        num_rows: 10748
    })
    validation: Dataset({
        features: ['text', 'ents'],
        num_rows: 1343
    })
})

In [65]:
for data in ds['train']:
    print(data)
    break

{'text': '浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，', 'ents': [{'indices': [9, 10, 11], 'is_continuous': True, 'label': 'name', 'text': '叶老桂'}, {'indices': [0, 1, 2, 3], 'is_continuous': True, 'label': 'company', 'text': '浙商银行'}]}


In [66]:
classication_set = set()
for data in ds['train']:
    ents = data['ents']
    for ent in ents:
        label = ent['label']
        classication_set.add(label)
print(classication_set)

{'company', 'organization', 'government', 'position', 'movie', 'game', 'address', 'book', 'name', 'scene'}


In [67]:
other_label = ['0']
label_list = other_label + list(classication_set)
label_list

label_indx = {label:idx for idx,label in enumerate(label_list)}
print(label_indx)

{'0': 0, 'company': 1, 'organization': 2, 'government': 3, 'position': 4, 'movie': 5, 'game': 6, 'address': 7, 'book': 8, 'name': 9, 'scene': 10}


In [68]:
tags = other_label
for label in classication_set:
    tags.append('B-' + label.upper())
    tags.append('I-' + label.upper())
print(tags)

['0', 'B-COMPANY', 'I-COMPANY', 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-GOVERNMENT', 'I-GOVERNMENT', 'B-POSITION', 'I-POSITION', 'B-MOVIE', 'I-MOVIE', 'B-GAME', 'I-GAME', 'B-ADDRESS', 'I-ADDRESS', 'B-BOOK', 'I-BOOK', 'B-NAME', 'I-NAME', 'B-SCENE', 'I-SCENE']


In [69]:
def ent_indices_convert(item):
    ents = item['ents']
    convert_indices = [0] * len(item['text'])
    for ent in ents:
        indices = ent['indices']
        label = ent['label']
        convert_indices[indices[0]] = label_indx[label] * 2 - 1
        for org_idx in indices[1:]:
            convert_indices[org_idx] = label_indx[label] * 2
    return {'converted_indices':convert_indices}
    

first_proc_ds = ds.map(ent_indices_convert)
print(first_proc_ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'ents', 'converted_indices'],
        num_rows: 10748
    })
    validation: Dataset({
        features: ['text', 'ents', 'converted_indices'],
        num_rows: 1343
    })
})


In [70]:
# 训练集
for row in first_proc_ds['train']:
    print(row['text'])
    print(row['converted_indices'])
    
    break

浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，
[1, 2, 2, 2, 0, 0, 0, 0, 0, 17, 18, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [71]:
def data_input_proc(item):
    input_data = tokenizer(item['text'],truncation=True,add_special_tokens=False,max_length=512)
    adjust_labels = []
    for sample_index in range(len(input_data['input_ids'])):
        s_input_idx =input_data.word_ids(sample_index)
        # 拿到BI转化的序列
        convert_indices = item['converted_indices'][sample_index]
        sample_list = []
        i,pre_idx = -1,-1
        for idx in s_input_idx:
            # [0, 0, 1] 上一索引与当前索引不一致
            if idx != pre_idx:
                i += 1
                pre_idx = idx
            sample_list.append(convert_indices[i])
        adjust_labels.append(sample_list)
    input_data['labels'] = adjust_labels
    return input_data


enc_label_map = first_proc_ds.map(data_input_proc,batched=True)

Map:   0%|          | 0/10748 [00:00<?, ? examples/s]

In [72]:
# 记录转换为pytorch
enc_label_map.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
for item in enc_label_map['train']:
    print(item)
    break

{'input_ids': tensor([3851, 1555, 7213, 6121,  821,  689,  928, 6587, 6956, 1383, 5439, 3424,
        1300, 1894, 1156,  794, 1369,  671,  702, 6235, 2428, 2190,  758, 6887,
        7305, 3546, 6822, 6121,  749, 6237, 6438,  511, 1383, 5439, 3424, 6371,
         711, 8024, 2190, 4680, 1184, 1744, 1079, 1555,  689, 7213, 6121, 5445,
        6241, 8024]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]), 'labels': tensor([ 1,  2,  2,  2,  0,  0,  0,  0,  0, 17, 18, 18,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])}


In [73]:
args = TrainingArguments(
    output_dir="ner_train",
    num_train_epochs = 3,
    save_safetensors = False,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size=32,
    report_to='tensorboard',  # 训练输出记录
    eval_strategy="epoch",
)

In [74]:
id2lbl = {i:tag for i, tag in enumerate(tags)}
lbl2id = {tag:i for i, tag in enumerate(tags)}

ner_model = AutoModelForTokenClassification.from_pretrained('google-bert/bert-base-chinese',
                                                           num_labels=21,
                                                           id2label=id2lbl,
                                                           label2id=lbl2id)
ner_model

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [75]:
def compute_metric(result):
    # 这是 Trainer 的约定，result 一定是一个 (predictions, labels) 的元组，所以可以直接解包。
    
    # 获取评估对象
    seqeval = evaluate.load('seqeval')
    predicts,labels = result
    predicts = np.argmax(prdicts,axis=-1)

    predicts = [[tags[p] for p,l in zip(ps,ls) if l != 100]
        for ps,ls in zip(predicts,labels)]
    labels = [[tags[l] for p,l in zip(ps,ls) if l != 100]
        for ps,ls in zip(predicts,labels)]
    results = seqeval.compute(predictions=predicts, references=labels)
    return results
    

In [76]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer,padding=True)

trainer = Trainer(
    model=ner_model,
    args=args,
    train_dataset=enc_label_map['train'],
    eval_dataset=enc_label_map['validation'],
    data_collator=data_collator
)

In [77]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.383259
2,No log,0.326066
3,0.400900,0.32143




TrainOutput(global_step=504, training_loss=0.3997058844755566, metrics={'train_runtime': 209.2042, 'train_samples_per_second': 154.127, 'train_steps_per_second': 2.409, 'total_flos': 820469833815600.0, 'train_loss': 0.3997058844755566, 'epoch': 3.0})

In [97]:
result = trainer.predict(enc_label_map['validation'])
result

PredictionOutput(predictions=array([[[ 1.13471544e+00,  6.38880134e-01, -1.69897747e+00, ...,
          1.86063623e+00,  3.46176744e-01, -1.66656399e+00],
        [ 1.95383072e+00, -9.62811172e-01,  4.07076478e-01, ...,
          8.42767239e+00, -1.56653976e+00,  1.61329672e-01],
        [ 2.09611177e+00, -1.07191491e+00,  4.76606131e-01, ...,
          8.38572407e+00, -1.67243266e+00,  2.24312007e-01],
        ...,
        [ 1.00073423e+01, -3.70441943e-01, -1.45898744e-01, ...,
          1.79699197e-01, -1.46042418e+00, -1.08878112e+00],
        [ 1.01190090e+01, -3.18227559e-01, -2.52026111e-01, ...,
         -9.04206336e-02, -1.32528567e+00, -1.02481127e+00],
        [ 9.38657188e+00,  1.00098252e+00, -2.83375293e-01, ...,
         -8.11910868e-01, -1.22550249e+00, -2.01015973e+00]],

       [[ 3.95751214e+00,  1.52460325e+00, -1.45915794e+00, ...,
         -6.17582262e-01,  7.94928849e-01, -1.75260198e+00],
        [ 3.96355271e+00, -1.77755272e+00,  7.45102644e-01, ...,
         

In [98]:
sample_index =0
print(first_proc_ds['validation'][sample_index]['text'])
print(enc_label_map['validation'][sample_index]['labels'])
print(result.label_ids[sample_index])

彭小军认为，国内银行现在走的是台湾的发卡模式，先通过跑马圈地再在圈的地里面选择客户，
tensor([17, 18, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13, 14,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0])
[  17   18   18    0    0    0    0    0    0    0    0    0    0    0
    0   13   14    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
 -100 -100 -100 -100 -100 -100 -100 -100]


In [80]:
[(p,l) for p,l in zip(result.label_ids[10],enc_label_map['validation'][10]['labels'])]

[(0, tensor(0)),
 (0, tensor(0)),
 (5, tensor(5)),
 (6, tensor(6)),
 (6, tensor(6)),
 (6, tensor(6)),
 (6, tensor(6)),
 (6, tensor(6)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (7, tensor(7)),
 (8, tensor(8)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0)),
 (0, tensor(0))]

In [119]:
# 示例：原始文本数据
raw_data = {"text": ["腾讯和阿里一起研发AI"]}
test_dataset = Dataset.from_dict(raw_data)

# 使用训练时的 tokenizer 处理数据
tokenized_test = test_dataset.map(
    lambda x: tokenizer(x["text"], truncation=True, padding=True),
    batched=True
)

# 对数据集批量预测
result = trainer.predict(tokenized_test)
predicted_classes = np.argmax(result.predictions, axis=-1)

predicted_labels = [[tags[idx] for idx in sample] for sample in predicted_classes]
entitys = []
for i, (text, labels) in enumerate(zip(raw_data["text"], predicted_labels)):
    print(labels)
    for token,label in zip(list(text),labels):
        pass

    

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

['0', 'B-COMPANY', 'I-COMPANY', '0', 'B-COMPANY', 'I-COMPANY', '0', '0', '0', '0', '0', '0']
