In [13]:
import torch
# Report whether PyTorch can see a CUDA device; a later cell moves the model to "cuda".
print(torch.cuda.is_available())

True


In [35]:
# Load the tokenizer and the causal-LM weights straight from the Hub checkpoint.
# NOTE(review): the next cell loads the same checkpoint again and rebinds both
# `model` and `tokenizer`, so this cell looks redundant - confirm before removing.
from transformers import AutoTokenizer, AutoModelForCausalLM

# The tokenizer is built first, then the model, matching the original order.
tokenizer, model = (
    loader.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    for loader in (AutoTokenizer, AutoModelForCausalLM)
)

### 加载模型和分词器

In [36]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU instead
# of raising on a hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the checkpoint with the library's default dtype, then move it to the
# selected device. Later cells read the placement back through `model.device`.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

### 读取ner.txt，切分成句子

In [None]:
# Parse ner.txt: every non-blank line holds two whitespace-separated fields
# (a token and its gold label); blank lines separate sentences.
words = []        # one list of tokens per sentence
sentences = []    # each sentence as a single concatenated string
true_labels = []  # gold labels of all sentences, flattened into one list

with open("ner.txt", "r", encoding="utf-8") as f:
    raw_lines = f.readlines()

tokens, tags = [], []
# The trailing empty sentinel flushes the last sentence through the same branch
# as every other one, even when the file does not end with a blank line.
for raw in raw_lines + [""]:
    fields = raw.strip()
    if fields:
        token, tag = fields.split()
        tokens.append(token)
        tags.append(tag)
        continue
    if tokens:  # blank line closing a non-empty sentence
        words.append(tokens)
        sentences.append("".join(tokens))
        true_labels.extend(tags)
        tokens, tags = [], []

article = ''.join(sentences)
print(f"article length: {len(article)}")
print(f"sentences: {len(sentences)}")
print(f"true label length: {len(true_labels)}")

article length: 13173
sentences: 218
true label length: 13173


### prompt

In [80]:
# Prompt template for the named-entity-recognition task.
def create_prompt(text_chunk):
    """Build the few-shot NER prompt for one piece of classical Chinese text.

    The prompt asks for four lines, one per entity category (PER, LOC, OFI,
    BOOK), each listing the extracted entities separated by '、', and includes
    one worked example before the actual input.

    Args:
        text_chunk: The text to annotate; it is inserted verbatim at the end
            of the prompt.

    Returns:
        The complete prompt string.
    """
    # The format specification must name the same four categories as the worked
    # example below: PER = person names, LOC = place names, OFI = official
    # titles, BOOK = book titles.
    return f"""
            你是一个专业的古文命名实体识别专家。请仔细阅读一下古文内容，在保证理解意思的基础上，帮我做命名实体识别任务，按照如下的提取要求来执行任务：
            
            请按照以下格式输出结果：
            PER:(此处为若干个提取出的人名，之间用'、'隔开，若没有则是空的)
            LOC:(此处为若干个提取出的地点名，之间用'、'隔开，若没有则是空的)
            OFI:(此处为若干个提取出的官职名，之间用'、'隔开，若没有则是空的)
            BOOK:(此处为若干个提取出的书名，之间用'、'隔开，若没有则是空的)

            一个示例如下：
            
            输入文本:
            后高宗知而深歎美之。仪凤四年薨，辍朝三日，使百官以次赴宅哭之，赠开府仪同三司、并州大都督，谥曰恭。宣帝即位，授上柱国。运之为宫正也，数进谏于帝。帝不纳，反疏忌之。时运又与王轨、宇文孝伯等皆为武帝亲待。阿剌怗木儿袭职，授虎符，緫管高丽人户。

            输出:
            PER:高宗、恭、宣帝、运、王轨、宇文孝伯、武帝、阿剌怗木儿
            LOC:并州、高丽
            OFI:开府仪同三司、并州大都督、上柱国、宫正
            BOOK:
            
            现在把以下这段话作为输入，请给出输出结果：
            
            {text_chunk}
 
            """

### 输入每句话，生成prompt，运行得到每句话的预测标签
把运行成功的每句话保存到日志中，若出错下次运行可从日志中恢复

In [None]:
import os

# Journal file: one line of space-separated labels per finished sentence, so an
# interrupted run can resume where it stopped.
journal_file = "journal.txt"

# Entity categories requested by the prompt, in the order in which their labels
# are written into the per-character label sequence.
label_types = ["PER", "LOC", "OFI", "BOOK"]


def _parse_entities(response):
    """Extract the entity lists per category from the model's text response.

    A line counts when, after stripping surrounding whitespace, it starts with
    a category name followed by an ASCII or full-width colon. The remainder is
    split on '、'; surrounding whitespace is removed from each name and empty
    names are dropped. If a category appears on several lines, the last wins.

    Returns:
        dict mapping each entry of `label_types` to a list of entity strings.
    """
    entities = {label_type: [] for label_type in label_types}
    for raw_line in response.split("\n"):
        line = raw_line.strip()
        for label_type in label_types:
            if not line.startswith(label_type):
                continue
            rest = line[len(label_type):].lstrip()
            if rest[:1] not in (":", "："):
                continue
            names = (name.strip() for name in rest[1:].split("、"))
            entities[label_type] = [name for name in names if name]
            break
    return entities


def _tag_sentence(sentence, entities):
    """Turn entity strings into one BIES/O label per character of `sentence`.

    Only the first occurrence of each entity is labelled, and entities that do
    not occur in the sentence are ignored. Categories are applied in
    `label_types` order, so a later entity overwrites overlapping labels
    written by an earlier one.
    """
    pred_label = ["O"] * len(sentence)
    for label_type in label_types:
        for entity in entities[label_type]:
            if not entity:
                continue
            start = sentence.find(entity)
            if start == -1:
                continue  # the returned string does not occur in this sentence
            end = start + len(entity)
            if len(entity) == 1:  # single-character entity
                pred_label[start] = f"S-{label_type}"
            else:  # begin, optional inside run, end
                pred_label[start:end] = (
                    [f"B-{label_type}"]
                    + [f"I-{label_type}"] * (len(entity) - 2)
                    + [f"E-{label_type}"]
                )
    return pred_label


# Labels of all finished sentences, flattened into one list.
pred_labels = []
count = 0

# Make sure the journal exists so it can always be opened for reading.
if not os.path.exists(journal_file):
    open(journal_file, "w", encoding="utf-8").close()

# Restore the labels of the sentences finished in earlier runs.
with open(journal_file, "r", encoding="utf-8") as file:
    for line in file:
        line_list = line.split()
        if not line_list:
            # A blank line is not a finished sentence; counting it would shift
            # every following sentence by one.
            continue
        count += 1
        pred_labels.extend(line_list)

print(f"已从日志读取 {count} 句话的标签.")

for i in range(count, len(sentences)):
    print(f"正在处理第 {i+1} 段...")

    # Build the prompt for this sentence.
    sentence = sentences[i]
    prompt = create_prompt(sentence)

    print("prompt OK")

    # Wrap the prompt in the chat template and tokenize it on the model's device.
    messages = [
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    print("start gen")

    # Generate the answer.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    # Keep only the newly generated tokens by cutting off the prompt tokens.
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print("response OK")

    # Parse the response into entity lists per category.
    entities = _parse_entities(response)

    print("entities OK")

    # Project the entities back onto the sentence as per-character labels.
    pred_label = _tag_sentence(sentence, entities)

    print(f"sentence length: {len(sentence)}, label length: {len(pred_label)}")
    assert len(sentence) == len(pred_label), "LABEL_LENGTH_ERROR"

    pred_labels.extend(pred_label)

    print("label OK")

    # Append to the journal right away so finished work survives a crash.
    with open(journal_file, "a", encoding="utf-8") as file:
        file.write(" ".join(pred_label) + "\n")

    print(f"第 {i+1} 段处理完成，结果已保存到日志.")

已从日志读取 218 句话的标签.


### 计算f1分数

In [89]:
from sklearn.metrics import f1_score

# Macro-averaged F1 between the gold and predicted per-character label lists.
# NOTE(review): the "O" label is part of both lists, so it presumably counts as
# one of the averaged classes - confirm this is the intended metric.
f1 = f1_score(true_labels, pred_labels, average='macro')
print(f"f1_macro: {f1}")

f1_marco: 0.33181592684168515
