In [None]:
from datasets import load_dataset

# Load the "conll2003" dataset; later cells index the result by split name (e.g. "train").
raw_datasets = load_dataset("conll2003")

In [None]:
# Display the loaded dataset object.
raw_datasets
# ner: named-entity recognition; pos: part-of-speech tagging; chunk: chunking

In [None]:
# Tokens (words) of the first training example.
raw_datasets["train"][0]["tokens"]

In [None]:
# NER tag ids of the first training example (one per word).
raw_datasets["train"][0]["ner_tags"]

In [None]:
# Feature descriptor of the "ner_tags" column; its `.feature.names` is read in the next cell.
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

In [None]:
# Tag names; a tag id from "ner_tags" is used as an index into this sequence.
label_names = ner_feature.feature.names
label_names

In [None]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]

# Print the words of training example 0 with their tag names underneath.
# Each word/tag pair shares one column that is as wide as the longer of the
# two, plus a single separating space.
tag_names = [label_names[label] for label in labels]
column_widths = [max(len(word), len(tag)) + 1 for word, tag in zip(words, tag_names)]
line1 = "".join(word.ljust(width) for word, width in zip(words, column_widths))
line2 = "".join(tag.ljust(width) for tag, width in zip(tag_names, column_widths))
print(line1)
print(line2)

In [None]:
words = raw_datasets["train"][4]["tokens"]
labels = raw_datasets["train"][4]["ner_tags"]

# Print the words of training example 4 with their tag names underneath,
# left-justifying every word/tag pair to a shared column width.
word_cells = []
tag_cells = []
for word, label in zip(words, labels):
    tag = label_names[label]
    # Column width: the longer of word and tag, plus one separating space.
    width = max(len(word), len(tag)) + 1
    word_cells.append(f"{word:<{width}}")
    tag_cells.append(f"{tag:<{width}}")
line1 = "".join(word_cells)
line2 = "".join(tag_cells)
print(line1)
print(line2)


In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The example is already a list of words, hence is_split_into_words=True.
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words = True)
# Show the tokens produced for the example.
inputs.tokens()

In [None]:
# "lamb" is split into two tokens, which causes a mismatch between the inputs and the labels.
# word_ids() is what align_labels_with_tokens consumes to map each token back to its word.
inputs.word_ids()

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per token.

    Args:
        labels: tag id for each word, indexed by word id.
        word_ids: for each token, the id of the word it belongs to, or
            ``None`` for a special token.

    Returns:
        A list with one entry per token: ``-100`` for special tokens, the
        word's tag for the first token of a word, and for continuation tokens
        the word's tag with odd ids bumped by one (B-XXX becomes I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        starts_new_word = word_id != previous_word
        previous_word = word_id

        if word_id is None:
            # Special token: always ignored.
            aligned.append(-100)
            continue

        tag = labels[word_id]
        if not starts_new_word and tag % 2 == 1:
            # Continuation token of a word tagged B-XXX: switch to I-XXX.
            tag += 1
        aligned.append(tag)
    return aligned


In [None]:
# Compare the word-level tags of example 0 with the token-level tags after alignment.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels,word_ids))

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: a batch with a "tokens" column (one word list per example)
            and a "ner_tags" column (one tag-id list per example).

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds, per example, the tags aligned to its tokens.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # Hugging Face models look for a column named "labels" to compute the loss
    # during training; without it the model would not know the right answers.
    encoded["labels"] = [
        align_labels_with_tokens(word_tags, encoded.word_ids(index))
        for index, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Apply the preprocessing to every split and drop the original columns.
tokenized_datasets = raw_datasets.map(
     tokenize_and_align_labels,
     batched=True,
     # with batched=True, map hands the function a list of examples at a time
     remove_columns=raw_datasets["train"].column_names,
)

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer= tokenizer)
# Collate the first two training examples into one batch and inspect its labels.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

In [None]:
# Print the un-collated labels of the same two examples for comparison with the batch.
for i in range(2):
     print(tokenized_datasets["train"][i]["labels"])

In [None]:
import evaluate
# Load the "seqeval" metric used for scoring below.
metric = evaluate.load("seqeval")

In [None]:
# The metric needs lists of labels as strings rather than integers.
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

In [None]:
# Build a fake prediction by changing the label at index 2 to "O", then score it
# against the reference labels.
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Compute overall precision, recall, F1 and accuracy for an evaluation pass.

    Args:
        eval_preds: a ``(logits, labels)`` pair; the predicted class of each
            token is the argmax of ``logits`` over the last axis.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Drop ignored positions (label -100, i.e. special tokens) and convert the
    # remaining ids to tag names.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    true_predictions = []
    for prediction_row, label_row in zip(predictions, labels):
        true_predictions.append(
            [
                label_names[predicted]
                for predicted, tag in zip(prediction_row, label_row)
                if tag != -100
            ]
        )

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short_name: all_metrics[metric_key] for short_name, metric_key in reported}

In [None]:
# Map integer class ids to tag names. The keys are ints (not strings) so that
# the inverse mapping built from this dict maps each tag name to an integer
# class id rather than to a string such as "0".
id2label = dict(enumerate(label_names))
id2label

In [None]:
# Inverse mapping: tag name -> integer class id. int() normalizes the id in
# case id2label is keyed by strings, so the model config never receives string
# class ids.
label2id = {label: int(class_id) for class_id, label in id2label.items()}
label2id

In [None]:
from transformers import AutoModelForTokenClassification

# Define the model: load the checkpoint and pass the id <-> label mappings.
model = AutoModelForTokenClassification.from_pretrained(
     model_checkpoint,
     id2label = id2label,
     label2id = label2id
)

In [None]:
# Check the number of labels recorded in the model config.
model.config.num_labels

In [None]:
# Fine-tune the model

from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub (the training arguments below set push_to_hub=True).
notebook_login()

In [None]:
from transformers import TrainingArguments

# Training configuration: evaluate and save once per epoch for 3 epochs,
# learning rate 2e-5, weight decay 0.01, push to the Hub, reload the best
# checkpoint at the end and keep at most one saved checkpoint.
args = TrainingArguments(
     "bert-finetuned-ner",
     eval_strategy="epoch",
     save_strategy= "epoch",
     learning_rate=2e-5,
     num_train_epochs=3,
     weight_decay=0.01,
     push_to_hub=True,
     load_best_model_at_end=True,
     save_total_limit=1,
)

In [None]:
from transformers import Trainer

# Train on the "train" split and evaluate on "validation" with compute_metrics.
# NOTE(review): newer transformers releases appear to rename `tokenizer=` to
# `processing_class=` — confirm against the installed version.
trainer = Trainer(
     model = model,
     args = args,
     train_dataset= tokenized_datasets["train"],
     eval_dataset=tokenized_datasets["validation"],
     data_collator= data_collator,
     compute_metrics=compute_metrics,
     tokenizer = tokenizer,
)
trainer.train()

In [None]:
from torch.utils.data import DataLoader

# Dataloaders for the custom training loop: batches of 8 built with
# data_collator; only the training data is shuffled.
train_dataloader = DataLoader(
     tokenized_datasets["train"],
     shuffle= True,
     collate_fn=data_collator,
     batch_size=8
)
eval_dataloader = DataLoader(
     tokenized_datasets["validation"],
     collate_fn=data_collator,
     batch_size=8
)

In [None]:
# Re-instantiate the model from model_checkpoint so the custom loop does not
# continue from the instance that was handed to the Trainer above.
model = AutoModelForTokenClassification.from_pretrained(
     model_checkpoint,
     id2label = id2label,
     label2id = label2id,
)

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters with learning rate 2e-5.
optimizer = AdamW(model.parameters(),lr = 2e-5)


In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
# Hand model, optimizer and both dataloaders to the accelerator and rebind the
# names to the prepared objects it returns.
model,optimizer,train_dataloader,eval_dataloader = accelerator.prepare(
     model,optimizer,train_dataloader,eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
# The length is taken after accelerator.prepare, i.e. from the prepared dataloader.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup steps over the total number of training steps.
lr_scheduler = get_scheduler(
     "linear",
     optimizer=optimizer,
     num_warmup_steps=0,
     num_training_steps=num_training_steps
)

In [None]:
def postprocess(predictions, labels):
    """Convert predicted and reference id tensors into lists of tag names.

    Positions whose reference label is -100 are dropped from both outputs.

    Args:
        predictions: tensor of predicted class ids, one row per example.
        labels: tensor of reference class ids, aligned with ``predictions``.

    Returns:
        ``(true_labels, true_predictions)`` — note the order: reference tag
        names first, predicted tag names second.
    """
    prediction_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    true_labels = []
    for label_row in label_rows:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    true_predictions = []
    for prediction_row, label_row in zip(prediction_rows, label_rows):
        kept = [
            label_names[predicted]
            for predicted, tag in zip(prediction_row, label_row)
            if tag != -100
        ]
        true_predictions.append(kept)

    return true_labels, true_predictions

In [None]:
from tqdm.auto import tqdm
import torch
# Custom training loop: for each epoch, train over train_dataloader, then
# evaluate over eval_dataloader and print the overall seqeval scores.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Pad along dim 1 with -100 before gathering; postprocess drops every
        # position whose label is -100, so the padding never reaches the metric.
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess returns (true_labels, true_predictions) in that order.
        # Unpacking them the other way round feeds the references to the metric
        # as predictions, which swaps the reported precision and recall.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    result = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: result[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    accelerator.wait_for_everyone()


In [None]:
from transformers import pipeline

# NOTE(review): this rebinds model_checkpoint (previously "bert-base-cased");
# re-running earlier cells without restarting the kernel would then load this
# checkpoint instead — consider a separate variable name.
model_checkpoint = "yuhuihhh/bert-finetuned-ner"
# Token-classification pipeline over the fine-tuned checkpoint, using the
# "simple" aggregation strategy.
token_classifier = pipeline(
     "token-classification",model=model_checkpoint,aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")