In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

1⃣️【第十一周作业】

1. 参考课堂案例，使用指定的数据集，编写代码实现NER模型训练和推理。
   https://huggingface.co/datasets/doushabao4766/msra_ner_k_V3
2. 完成预测结果的实体抽取。
   输入：“双方确定了今后发展中美关系的指导方针。”
   输出：[{"entity":"ORG","content":"中"},{"entity":"ORG","content":"美"}]
3. 整理Dataset、Trainer、TrainingArguments、DataCollator、Evaluate 知识点，总结文档。

In [None]:
!pip install seqeval evaluate

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer,DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import torch
import evaluate  # pip install evaluate
import seqeval   # pip install seqeval
from datasets import load_dataset

In [None]:
# Load the google-bert/bert-base-chinese checkpoint with a token-classification
# head configured for 7 labels. AutoModelForTokenClassification is the
# transformers auto-class used here for named entity recognition (NER).
model = AutoModelForTokenClassification.from_pretrained(
    'google-bert/bert-base-chinese',
    num_labels=7,
)

In [None]:
# Load the tokenizer of the same checkpoint; it turns text into the token ids
# that the model takes as input.
tokenizer = AutoTokenizer.from_pretrained(
    'google-bert/bert-base-chinese',
)

In [None]:
# Show the model's repr as the cell output.
model

In [None]:
# Smoke test: run a single forward pass with dummy labels and inspect the output.
message= "命名实体识别"

# Encode the text with the tokenizer and get PyTorch tensors back
model_input = tokenizer([message], return_tensors='pt')
print(model_input)

# Dummy label ids (values 0-6); presumably one per produced token — TODO confirm
label = torch.tensor([0,1,0,2,5,4,3,6])

# Passing labels makes the model return a loss next to the logits
result = model(labels=label, **model_input)

print(
    f'result.loss={result.loss}',
    f'result.logits={result.logits}',
    f'result.logits.shape={result.logits.shape}',
    sep='\n',
)


In [None]:
# Load the dataset from the Hugging Face Hub and display its splits.
ds = load_dataset(path="doushabao4766/msra_ner_k_V3")
ds

In [None]:
# Peek at the training split: print the first record, its tokens, and how many
# NER tags it carries.
train_data = ds['train']

for row in train_data:
    row_tokens, row_tags = row['tokens'], row['ner_tags']
    print(row)
    print(row_tokens)
    print(len(row_tags))
    break  # only the first record is needed

In [None]:
#  
# Build the label vocabulary: the mapping between tag names and integer ids.
# The entity names are sorted so the resulting ids are identical on every run;
# iterating a set of strings directly gives an order that can change between
# kernels, which would silently reshuffle the label ids.
# NOTE(review): these entity names look like the CLUE-NER label set, while the
# notebook loads "doushabao4766/msra_ner_k_V3" — confirm that `tags` matches the
# ids stored in that dataset's `ner_tags` column.
entities = ['O'] + sorted({'movie', 'name', 'game', 'address', 'position',
                           'company', 'scene', 'book', 'organization', 'government'})

# BIO scheme: 'O' plus a B-/I- pair for every entity type
tags = ['O']

for entity in entities[1:]:
    tags.extend((f'B-{entity.upper()}', f'I-{entity.upper()}'))

entity_index = {entity: i for i, entity in enumerate(entities)}
print(entity_index)
print(tags, len(tags))

In [None]:
# Turn raw tokens into model input ids and build labels aligned with those ids
def data_input_proc(item):
    """Tokenize a batch of pre-split sentences and align the NER tags to the result.

    The dataset tokens are handed to the tokenizer as already-split words
    (``is_split_into_words=True``), so ``word_ids`` reports, for every produced
    token, the index of the dataset token it came from. Each produced token
    then receives the tag of its source token, which keeps labels aligned even
    when one dataset token becomes several word-pieces.

    Args:
        item: batch dict; reads ``item['tokens']`` (one list of tokens per
            example) and ``item['ner_tags']`` (one list of tag ids per example).

    Returns:
        The tokenizer output with an added ``'labels'`` entry that holds one
        list of label ids per example, one id per produced token.
    """
    input_data = tokenizer(
        item['tokens'],
        is_split_into_words=True,
        truncation=True,
        add_special_tokens=False,
        max_length=512,
    )
    adjust_labels = []  # aligned label ids for every example in the batch
    for k, example_tags in enumerate(item['ner_tags']):
        adjust_labels.append([
            # a token without a source word gets -100, the value this notebook
            # filters out when computing metrics
            -100 if wid is None else example_tags[wid]
            for wid in input_data.word_ids(k)
        ])
    # attach the aligned labels to the tokenizer output
    input_data['labels'] = adjust_labels
    return input_data
    
# Run the preprocessing over every split in batches of 1000 examples
# (batched mode passes a dict of lists to the function).
ds_map = ds.map(
    function=data_input_proc,
    batched=True,
    batch_size=1000,
)

In [None]:
# Show the processed dataset's repr as the cell output.
ds_map

In [None]:
# Return records as PyTorch tensors, exposing only the columns listed here.
ds_map.set_format(
    type='torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
)

In [None]:
# Training configuration
args = TrainingArguments(
    # working directory for checkpoints, logs and tensorboard files
    output_dir="ner_train",
    # number of passes over the training split
    num_train_epochs = 3,
    # batch sizes for training and evaluation
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # evaluate once at the end of every epoch
    eval_strategy="epoch",
    # send training logs to tensorboard
    report_to='tensorboard',
    # False: saved weights can be loaded with torch.load
    save_safetensors=False,
)

In [None]:
# Lookup tables between label ids and tag names, passed to the model config.
id2lbl = dict(enumerate(tags))
lbl2id = {tag: i for i, tag in id2lbl.items()}

# Reload the checkpoint with a classification head sized from the tag list, so
# the number of labels cannot drift from `tags`.
model = AutoModelForTokenClassification.from_pretrained(
    'google-bert/bert-base-chinese',
    num_labels=len(tags),
    id2label=id2lbl,
    label2id=lbl2id,
)
model

In [None]:
# Metric function handed to the Trainer
def compute_metric(result):
    """Compute seqeval metrics for a batch of token-classification predictions.

    Args:
        result: pair ``(predicts, labels)``; ``predicts`` holds per-token
            scores with the label dimension on axis 2, ``labels`` holds the
            reference label ids, where ``-100`` marks positions to ignore.

    Returns:
        Whatever ``seqeval.compute`` returns for the decoded tag sequences.
    """
    predicts, labels = result
    # highest-scoring label id for every token
    predicted_ids = np.argmax(predicts, axis=2)

    metric = evaluate.load('seqeval')

    # Decode both sequences in one pass so predictions and references are
    # filtered by the same mask and stay position-aligned.
    predicted_tags, reference_tags = [], []
    for pred_row, label_row in zip(predicted_ids, labels):
        kept = [(tags[p], tags[l]) for p, l in zip(pred_row, label_row) if l != -100]
        predicted_tags.append([pred for pred, _ in kept])
        reference_tags.append([ref for _, ref in kept])

    return metric.compute(predictions=predicted_tags, references=reference_tags)


In [None]:
# Collator that pads each batch for token classification
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
)

# Train on the 'train' split and evaluate on the 'test' split
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=ds_map['train'],
    eval_dataset=ds_map['test'],
    compute_metrics=compute_metric,
)

In [None]:
# Start fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Predict on the 'test' split, the split the Trainer evaluates on and the one
# whose rows this notebook inspects after predicting.
result = trainer.predict(ds_map['test'])

In [None]:
# Inspect test example 10: raw tokens come from the unprocessed dataset `ds`
# (the processed `ds_map` only exposes the tensor columns), aligned label ids
# from `ds_map`, and the label ids returned by predict().
print(ds['test'][10]['tokens'])
print(ds_map['test'][10]['labels'])
print(result.label_ids[10])

In [None]:
 # Predicted tag names for example 10: take the arg-max label id of every token
 # and drop positions whose reference label is the -100 padding value.
 [tags[p] for p, l in zip(np.argmax(result.predictions[10], axis=-1), result.label_ids[10]) if l != -100]

In [None]:
# Reference tag names for example 10, skipping the -100 padding positions.
[tags[l] for l in result.label_ids[10] if l != -100]