In [None]:
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# 加载数据集
ner_datasets = load_dataset(
    "csv",
    data_files={
        "train": "./ResumeDataset/train.csv",
        "valid": "./ResumeDataset/valid.csv",
        "test": "./ResumeDataset/test.csv"
    }
)
ner_datasets = ner_datasets.filter(lambda x: x["text"] is not None)
ner_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3821
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 463
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 477
    })
})

In [7]:
ner_datasets["train"][0]

{'text': '高 勇 ： 男 ， 中 国 国 籍 ， 无 境 外 居 留 权 ，',
 'label': 'B-NAME,E-NAME,O,O,O,B-CONT,I-CONT,I-CONT,E-CONT,O,O,O,O,O,O,O,O'}

In [6]:
ner_datasets["train"][0] # input_ids与label都应该是等长的列表 但text tokenizer出现input_ids会变为列表 为 label要手动转为列表
# print(type(ner_datasets["train"][0]["label"]))
# ner_datasets["train"]["label"]
# print(type(ner_datasets["train"]["label"]))
# print(ner_datasets["train"][0]["label"])
from collections import Counter
# ner_datasets["train"]["label"] 是一个 list
labels = ner_datasets["train"]["label"]
# 将 list 中的每个 string 拆分成单个标签
all_labels = [label for string in labels for label in string.split(',')]
# 使用 Counter 统计每个标签的出现次数
label_counts = Counter(all_labels)
# 输出结果
for label, count in label_counts.items():
    print(f"{label}: {count}")

B-NAME: 861
E-NAME: 861
O: 45085
B-CONT: 260
I-CONT: 499
E-CONT: 260
B-RACE: 112
E-RACE: 112
B-TITLE: 6308
I-TITLE: 14835
E-TITLE: 6308
B-EDU: 858
I-EDU: 1536
E-EDU: 858
B-ORG: 4610
I-ORG: 33808
E-ORG: 4610
I-NAME: 740
B-PRO: 287
I-PRO: 666
E-PRO: 287
S-RACE: 3
S-NAME: 91
B-LOC: 47
I-LOC: 143
E-LOC: 47
I-RACE: 6
S-ORG: 1


In [7]:
tokenizer = AutoTokenizer.from_pretrained("hfl") 

In [6]:
tokenizer(ner_datasets["train"][0]["text"])   # 对于已经分好词的数据，要指定is_split_into_words参数为True 这里未分好词 // 但是对汉字tokenized使得句子长度加2，input_ids与lable不一样长怎么办

{'input_ids': [101, 2408, 691, 4798, 2168, 2832, 6598, 5052, 4415, 3300, 7361, 1062, 1385, 2809, 6121, 5869, 752, 8039, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
label_list = [
    'O',          # 0: 其他 (45903)
    'B-ORG',      # 1: 组织的开始 (4632)
    'I-ORG',      # 2: 组织的内部 (33837)
    'E-ORG',      # 3: 组织的结束 (4632)
    'B-TITLE',    # 4: 职称的开始 (6307)
    'I-TITLE',    # 5: 职称的内部 (14758)
    'E-TITLE',    # 6: 职称的结束 (6307)
    'B-PRO',      # 7: 产品的开始 (284)
    'I-PRO',      # 8: 产品的内部 (648)
    'E-PRO',      # 9: 产品的结束 (284)
    'B-EDU',      # 10: 教育的开始 (873)
    'I-EDU',      # 11: 教育的内部 (1561)
    'E-EDU',      # 12: 教育的结束 (873)
    'B-NAME',     # 13: 人名的开始 (872)
    'I-NAME',     # 14: 人名的内部 (744)
    'E-NAME',     # 15: 人名的结束 (872)
    'B-RACE',     # 16: 种族的开始 (114)
    'I-RACE',     # 17: 种族的内部 (6)
    'E-RACE',     # 18: 种族的结束 (114)
    'B-LOC',      # 19: 地点的开始 (46)
    'I-LOC',      # 20: 地点的内部 (141)
    'E-LOC',      # 21: 地点的结束 (46)
    'B-CONT',     # 22: 内容的开始 (278)
    'I-CONT',     # 23: 内容的内部 (536)
    'E-CONT',     # 24: 内容的结束 (278)
    'S-NAME',     # 25: 单一人名 (88)
    'S-RACE',     # 26: 单一种族 (3)
    'S-ORG'       # 27: 单一组织 (1)
]

In [11]:
label_to_index = {label: index for index, label in enumerate(label_list)} #看下面的注释
label_to_index["I-PRO"]
label_to_index

{'O': 0,
 'B-ORG': 1,
 'I-ORG': 2,
 'E-ORG': 3,
 'B-TITLE': 4,
 'I-TITLE': 5,
 'E-TITLE': 6,
 'B-PRO': 7,
 'I-PRO': 8,
 'E-PRO': 9,
 'B-EDU': 10,
 'I-EDU': 11,
 'E-EDU': 12,
 'B-NAME': 13,
 'I-NAME': 14,
 'E-NAME': 15,
 'B-RACE': 16,
 'I-RACE': 17,
 'E-RACE': 18,
 'B-LOC': 19,
 'I-LOC': 20,
 'E-LOC': 21,
 'B-CONT': 22,
 'I-CONT': 23,
 'E-CONT': 24,
 'S-NAME': 25,
 'S-RACE': 26,
 'S-ORG': 27}

In [12]:
#前者 延长ner_tags（labels） 我还要把每句的label（因为不是数字）靠label_list转为ner_tags 发现映射一层就好了 因为labels也是由ner_tags映射来的 两次映射而不用转后映射了
#examples是一个batch的字典 examples["label"]是一个batch句子的lable为[batch条[一句的lable(seqlen)]]   labeler是batch中i句的label   examples["ner_tags"]与for label是数字数组 
# 我的数据集中对等的是examples["label"]与labeler是字符数组"O,O,O,O,O,O,O,O,O,O,O,B-ORG,I-ORG,I-ORG,E-ORG,B-TITLE,E-TITLE,O"用字典映射就好了
def process_function(examples): 
    tokenized_exmaples = tokenizer(examples["text"], max_length=128, truncation=True)
    labels = []
    for i, labeler in enumerate(examples["label"]):
        if isinstance(labeler, str):
            labeler = labeler.split(',')
        word_ids = tokenized_exmaples.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100) #cls与sep赋为-100 不会计算交叉熵
            else:
                label_ids.append(label_to_index[labeler[word_id]]) #修改：labeler[word_id]为"I-PRO"这样的 用label_to_index映射
        labels.append(label_ids)
    tokenized_exmaples["labels"] = labels
    return tokenized_exmaples

In [13]:
tokenized_datasets = ner_datasets.map(process_function, batched=True,remove_columns=['text', 'label'])
tokenized_datasets
# ner_datasets.column_names

Map: 100%|██████████| 3821/3821 [00:00<00:00, 11370.49 examples/s]
Map: 100%|██████████| 463/463 [00:00<00:00, 12216.91 examples/s]
Map: 100%|██████████| 477/477 [00:00<00:00, 11122.94 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3821
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 463
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 477
    })
})

In [11]:
# tokenized_datasets["train"][0] 
tokenized_datasets["train"][0]["labels"]
# tokenized_datasets["train"][0]["label"]

[-100, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 5, 5, 6, 0, -100]

In [None]:
# 对于所有的非二分类任务，切记要指定num_labels，否则就会device错误
model = AutoModelForTokenClassification.from_pretrained("hfl", num_labels=len(label_list))
# 对于所有的非二分类任务，切记要指定num_labels，否则就会device错误

Some weights of the model checkpoint at hfl were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpo

In [13]:
model.config.num_labels

28

In [None]:
seqeval = evaluate.load("seqeval_metric.py")
seqeval #不看accuracy 太多0虚高 特别关注Returns:


EvaluationModule(name: "seqeval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of sha

In [None]:
import numpy as np

def eval_metric(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=-1)

    # 将id转换为原始的字符串类型的标签  操作 for item in iterable if condition
    true_predictions = [
        [label_list[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels) 
    ]#拿到预测的标签

    true_labels = [
        [label_list[l] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels) 
    ]#拿到真实的标签

    result = seqeval.compute(predictions=true_predictions, references=true_labels, mode="strict", scheme="IOBES")

    return {
        "f1": result["overall_f1"]
    }

In [None]:
args = TrainingArguments(
    output_dir="models_for_ner",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",     # 评估策略
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    logging_steps=50,
    num_train_epochs=3
)

NameError: name 'TrainingArguments' is not defined

In [19]:
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    compute_metrics=eval_metric,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)

In [20]:
trainer.train()

***** Running training *****
  Num examples = 3821


  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 360
 14%|█▍        | 50/360 [02:16<07:31,  1.46s/it]

{'loss': 0.6061, 'learning_rate': 4.305555555555556e-05, 'epoch': 0.42}


 28%|██▊       | 100/360 [04:36<06:27,  1.49s/it]

{'loss': 0.1286, 'learning_rate': 3.611111111111111e-05, 'epoch': 0.83}


 33%|███▎      | 120/360 [05:27<05:14,  1.31s/it]***** Running Evaluation *****
  Num examples = 463
  Batch size = 32
                                                 
 33%|███▎      | 120/360 [05:38<05:14,  1.31s/it]Saving model checkpoint to models_for_ner\checkpoint-120
Configuration saved in models_for_ner\checkpoint-120\config.json


{'eval_loss': 0.07801701873540878, 'eval_f1': 0.9562860047441547, 'eval_runtime': 10.5668, 'eval_samples_per_second': 43.817, 'eval_steps_per_second': 1.42, 'epoch': 1.0}


Model weights saved in models_for_ner\checkpoint-120\pytorch_model.bin
tokenizer config file saved in models_for_ner\checkpoint-120\tokenizer_config.json
Special tokens file saved in models_for_ner\checkpoint-120\special_tokens_map.json
 42%|████▏     | 150/360 [07:04<05:18,  1.52s/it]

{'loss': 0.0866, 'learning_rate': 2.916666666666667e-05, 'epoch': 1.25}


 56%|█████▌    | 200/360 [09:14<07:30,  2.82s/it]

{'loss': 0.0671, 'learning_rate': 2.2222222222222223e-05, 'epoch': 1.67}


 67%|██████▋   | 240/360 [10:49<02:13,  1.11s/it]***** Running Evaluation *****
  Num examples = 463
  Batch size = 32
                                                 
 67%|██████▋   | 240/360 [10:59<02:13,  1.11s/it]Saving model checkpoint to models_for_ner\checkpoint-240
Configuration saved in models_for_ner\checkpoint-240\config.json


{'eval_loss': 0.08226735889911652, 'eval_f1': 0.956081081081081, 'eval_runtime': 9.953, 'eval_samples_per_second': 46.519, 'eval_steps_per_second': 1.507, 'epoch': 2.0}


Model weights saved in models_for_ner\checkpoint-240\pytorch_model.bin
tokenizer config file saved in models_for_ner\checkpoint-240\tokenizer_config.json
Special tokens file saved in models_for_ner\checkpoint-240\special_tokens_map.json
 69%|██████▉   | 250/360 [11:38<08:08,  4.44s/it]

{'loss': 0.0554, 'learning_rate': 1.527777777777778e-05, 'epoch': 2.08}


 83%|████████▎ | 300/360 [13:18<03:15,  3.26s/it]

{'loss': 0.039, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}


 97%|█████████▋| 350/360 [15:32<00:39,  3.95s/it]

{'loss': 0.0328, 'learning_rate': 1.388888888888889e-06, 'epoch': 2.92}


100%|██████████| 360/360 [15:47<00:00,  1.16s/it]***** Running Evaluation *****
  Num examples = 463
  Batch size = 32
                                                 
100%|██████████| 360/360 [15:57<00:00,  1.16s/it]Saving model checkpoint to models_for_ner\checkpoint-360
Configuration saved in models_for_ner\checkpoint-360\config.json


{'eval_loss': 0.09164676070213318, 'eval_f1': 0.9603760913364674, 'eval_runtime': 9.7181, 'eval_samples_per_second': 47.643, 'eval_steps_per_second': 1.544, 'epoch': 3.0}


Model weights saved in models_for_ner\checkpoint-360\pytorch_model.bin
tokenizer config file saved in models_for_ner\checkpoint-360\tokenizer_config.json
Special tokens file saved in models_for_ner\checkpoint-360\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from models_for_ner\checkpoint-360 (score: 0.9603760913364674).
100%|██████████| 360/360 [16:00<00:00,  2.67s/it]

{'train_runtime': 960.3143, 'train_samples_per_second': 11.937, 'train_steps_per_second': 0.375, 'train_loss': 0.14189292482203908, 'epoch': 3.0}





TrainOutput(global_step=360, training_loss=0.14189292482203908, metrics={'train_runtime': 960.3143, 'train_samples_per_second': 11.937, 'train_steps_per_second': 0.375, 'train_loss': 0.14189292482203908, 'epoch': 3.0})

In [22]:
trainer.evaluate(eval_dataset=tokenized_datasets["test"])#上面的f1是在valid上做的 

***** Running Evaluation *****
  Num examples = 477
  Batch size = 32


KeyboardInterrupt: 

In [23]:
from transformers import pipeline
# 使用pipeline进行推理，要指定id2label
model.config.id2label = {idx: label for idx, label in enumerate(label_list)}
model.config

BertConfig {
  "_name_or_path": "hfl",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-ORG",
    "2": "I-ORG",
    "3": "E-ORG",
    "4": "B-TITLE",
    "5": "I-TITLE",
    "6": "E-TITLE",
    "7": "B-PRO",
    "8": "I-PRO",
    "9": "E-PRO",
    "10": "B-EDU",
    "11": "I-EDU",
    "12": "E-EDU",
    "13": "B-NAME",
    "14": "I-NAME",
    "15": "E-NAME",
    "16": "B-RACE",
    "17": "I-RACE",
    "18": "E-RACE",
    "19": "B-LOC",
    "20": "I-LOC",
    "21": "E-LOC",
    "22": "B-CONT",
    "23": "I-CONT",
    "24": "E-CONT",
    "25": "S-NAME",
    "26": "S-RACE",
    "27": "S-ORG"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABE

In [24]:
ner_pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, device=0, aggregation_strategy="simple")

In [25]:
res = ner_pipe("李 超 ， 男 ， 成 都 电 力 职 工 大 学 发 电 厂 及 电 力 系 统 专 业 毕 业 ， 高 级 工 程 师 ")
res

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'NAME',
  'score': 0.994986,
  'word': '李',
  'start': 0,
  'end': 1},
 {'entity_group': 'NAME',
  'score': 0.9948091,
  'word': '超',
  'start': 2,
  'end': 3},
 {'entity_group': 'ORG',
  'score': 0.998124,
  'word': '成 都 电 力 职 工 大 学 发 电',
  'start': 10,
  'end': 29},
 {'entity_group': 'ORG',
  'score': 0.9693472,
  'word': '厂',
  'start': 30,
  'end': 31},
 {'entity_group': 'PRO',
  'score': 0.44359675,
  'word': '及',
  'start': 32,
  'end': 33},
 {'entity_group': 'PRO',
  'score': 0.97792614,
  'word': '电 力 系 统 专',
  'start': 34,
  'end': 43},
 {'entity_group': 'PRO',
  'score': 0.97626007,
  'word': '业',
  'start': 44,
  'end': 45},
 {'entity_group': 'TITLE',
  'score': 0.9984763,
  'word': '高 级 工 程',
  'start': 52,
  'end': 59},
 {'entity_group': 'TITLE',
  'score': 0.98319346,
  'word': '师',
  'start': 60,
  'end': 61}]

In [26]:
ner_result = {}
x = "李 超 ， 男 ， 成 都 电 力 职 工 大 学 发 电 厂 及 电 力 系 统 专 业 毕 业 ， 高 级 工 程 师 "
for r in res:
    if r["entity_group"] not in ner_result:
        ner_result[r["entity_group"]] = []
    ner_result[r["entity_group"]].append(x[r["start"]: r["end"]])

ner_result

{'NAME': ['李', '超'],
 'ORG': ['成 都 电 力 职 工 大 学 发 电', '厂'],
 'PRO': ['及', '电 力 系 统 专', '业'],
 'TITLE': ['高 级 工 程', '师']}