<a href="https://colab.research.google.com/github/yuyu990116/transformers_tutorials/blob/main/P2_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")
import os
os.chdir("/content/drive/MyDrive/nlp")

Mounted at /content/drive


In [None]:
!pip install datasets
!pip install -U accelerate
!pip install -U transformers
from datasets import Dataset,load_dataset
from transformers import AutoModelForTokenClassification,AutoTokenizer,Trainer,TrainingArguments,pipeline,DataCollatorForTokenClassification
datasets=load_dataset("peoples_daily_ner",cache_dir='./')
model=AutoModelForTokenClassification.from_pretrained("hfl/chinese-macbert-base")
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

In [3]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4637
    })
})

In [4]:
datasets["train"][0]

{'id': '0',
 'tokens': ['海',
  '钓',
  '比',
  '赛',
  '地',
  '点',
  '在',
  '厦',
  '门',
  '与',
  '金',
  '门',
  '之',
  '间',
  '的',
  '海',
  '域',
  '。'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]}

In [6]:
datasets["train"].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}

In [11]:
labels = datasets["train"].features["ner_tags"].feature.names
labels
#"B-ORG":组织或公司(organization) "I-ORG":组织或公司  PER：person  LOC:location
#IOB2
#另一个比较常用的是IOBES

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [12]:
tokenizer(datasets["train"][0]["tokens"]) #把一句话拆成了一个字一个字进行了tokenizer

{'input_ids': [[101, 3862, 102], [101, 7157, 102], [101, 3683, 102], [101, 6612, 102], [101, 1765, 102], [101, 4157, 102], [101, 1762, 102], [101, 1336, 102], [101, 7305, 102], [101, 680, 102], [101, 7032, 102], [101, 7305, 102], [101, 722, 102], [101, 7313, 102], [101, 4638, 102], [101, 3862, 102], [101, 1818, 102], [101, 511, 102]], 'token_type_ids': [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]]}

In [13]:
tokenizer(datasets["train"][0]["tokens"], is_split_into_words=True) #对于已经做好tokenize的数据，要指定is_split_into_words=True，才能达到一句话的tokens在同一个列表的效果

{'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
a = tokenizer(datasets["train"][0]["tokens"], is_split_into_words=True)
a.word_ids()
#'tokens': ['海',
  # '钓',
  # '比',
  # '赛',
  # '地',
  # '点',
  # '在',
  # '厦',
  # '门',
  # '与',
  # '金',
  # '门',
  # '之',
  # '间',
  # '的',
  # '海',
  # '域',
  # '。'],

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, None]

In [17]:
def data_process(examples):
  tokenized_examples=tokenizer(examples["tokens"],max_length=32,truncation=True,is_split_into_words=True)
  labels=[]
  for i,tags in enumerate(examples["ner_tags"]):
    label = []
    word_ids=tokenized_examples.word_ids(batch_index=i)
    for word_id in word_ids:
      if word_id is None:
        label.append(-100)
      else:
        label.append(tags[word_id])
    labels.append(label)
  tokenized_examples["labels"]=labels
  return tokenized_examples
tokenized_datasets=datasets.map(data_process,batched=True)
tokenized_datasets

Map:   0%|          | 0/20865 [00:00<?, ? examples/s]

Map:   0%|          | 0/2319 [00:00<?, ? examples/s]

Map:   0%|          | 0/4637 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4637
    })
})

In [None]:
tokenized_datasets["train"][0]

In [19]:
model = AutoModelForTokenClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=len(labels))#num_labels默认是2

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
model.config.num_labels

7

In [None]:
!pip install seqeval

In [None]:
!pip install evaluate
import evaluate
metrics = evaluate.load("seqeval")
metrics

In [33]:
labels

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [39]:
import numpy as np
def eval_process(predictions):
  pred_ids,true_label_ids = predictions
  pred_ids = np.argmax(pred_ids,axis=-1)
  #由于seqeval要的数据类型是 [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]，而模型出来的是根据labels的id，所以要进行转化
  batch_preds=[[labels[pid] for pid,lid in zip(pred_id,true_label_id)if lid != -100]
      for pred_id,true_label_id in zip(pred_ids,true_label_ids)
  ]
  batch_labels=[[labels[lid] for pid,lid in zip(pred_id,true_label_id)if lid != -100]
      for pred_id,true_label_id in zip(pred_ids,true_label_ids)
  ]
  result = metrics.compute(predictions=batch_preds, references=batch_labels, mode="strict", scheme="IOB2")

  return {
      "f1": result["overall_f1"]
  }


In [40]:
args = TrainingArguments(
    output_dir="./model_for_NER",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    logging_steps=50,
    num_train_epochs=1
)

In [41]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=eval_process,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)

In [42]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.0279,0.031217,0.94459


TrainOutput(global_step=1305, training_loss=0.007622719371969672, metrics={'train_runtime': 178.0963, 'train_samples_per_second': 117.156, 'train_steps_per_second': 7.327, 'total_flos': 340757091157362.0, 'train_loss': 0.007622719371969672, 'epoch': 1.0})

In [43]:
trainer.evaluate(eval_dataset=tokenized_datasets["test"])

{'eval_loss': 0.04056788608431816,
 'eval_f1': 0.931415313225058,
 'eval_runtime': 10.5012,
 'eval_samples_per_second': 441.57,
 'eval_steps_per_second': 13.808,
 'epoch': 1.0}

In [44]:
model.config

BertConfig {
  "_name_or_path": "hfl/chinese-macbert-base",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_to

In [46]:
model.config.id2label = {idx: label for idx, label in enumerate(labels)}
model.config.id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}

In [47]:
pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, device=0, aggregation_strategy="simple")

In [48]:
res = pipe("安民医生在北京的北京协和医院上班")
res

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'PER',
  'score': 0.99979347,
  'word': '安 民',
  'start': 0,
  'end': 2},
 {'entity_group': 'LOC',
  'score': 0.9994552,
  'word': '北 京',
  'start': 5,
  'end': 7},
 {'entity_group': 'ORG',
  'score': 0.9990966,
  'word': '北 京 协 和 医 院',
  'start': 8,
  'end': 14}]

In [49]:
ner_result = {}
x = "安民医生在北京的北京协和医院上班"
for r in res:
    if r["entity_group"] not in ner_result:
        ner_result[r["entity_group"]] = []
    ner_result[r["entity_group"]].append(x[r["start"]: r["end"]])

ner_result

{'PER': ['安民'], 'LOC': ['北京'], 'ORG': ['北京协和医院']}