In [10]:
import torch
print(torch.cuda.is_available())

True


### 加载模型和分词器

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # torch_dtype="auto",
    # device_map="auto"
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


## 必做1：命名实体识别任务

### 读取ner.txt，切分成句子

In [12]:
# 读取文件内容
# 根据空行拆分段落
words = []
sentences = []
true_labels = []
with open("ner.txt", "r", encoding="utf-8") as f:  # 打开指定的文件
    word = []
    sentence = ""
    true_label = []
    while line := f.readline():  # 按行读取文件
        line = line.strip()  # 去掉行首行尾的空白字符
        if line:  # 如果当前行非空
            single_word, label = line.split()  # 将行中的单词和标签拆分
            word.append(single_word)
            sentence += single_word
            true_label.append(label)
        elif sentence:  # 如果遇到空行，且句子不为空
            words.append(word)
            sentences.append(sentence)
            true_labels.extend(true_label)
            word = []
            sentence = ""
            true_label = []
    if sentence:    # 处理最后一句
        words.append(word)
        sentences.append(sentence)
        true_labels.extend(true_label)
            
print(f"sentences: {len(sentences)}")
print(f"true label length: {len(true_labels)}")

sentences: 218
true label length: 13173


### prompt

In [None]:
# 定义命名实体识别的提示模板
def create_ner_prompt(text_chunk):
    return f"""你是一个专业的古文命名实体识别专家。请仔细阅读一下古文内容，在保证理解意思的基础上，帮我做命名实体识别任务，按照如下的提取要求来执行任务：

请按照以下格式输出结果：
LOC:(此处为若干个提取出的人名，之间用'、'隔开，若没有则是空的)
LOC:(此处为若干个提取出的地点名，之间用'、'隔开，若没有则是空的)
OFI:(此处为若干个提取出的地点名，之间用'、'隔开，若没有则是空的)
BOOK:(此处为若干个提取出的地点名，之间用'、'隔开，若没有则是空的)

一个示例如下：

输入文本:
后高宗知而深歎美之。仪凤四年薨，辍朝三日，使百官以次赴宅哭之，赠开府仪同三司、并州大都督，谥曰恭。宣帝即位，授上柱国。运之为宫正也，数进谏于帝。帝不纳，反疏忌之。时运又与王轨、宇文孝伯等皆为武帝亲待。阿剌怗木儿袭职，授虎符，緫管高丽人户。

输出:
PER:高宗、恭、宣帝、运、王轨、宇文孝伯、武帝、阿剌怗木儿
LOC:并州、高丽
OFI:开府仪同三司、并州大都督、上柱国、宫正
BOOK:

现在把以下这段话作为输入，请给出输出结果：

{text_chunk}

""" 

### 输入每句话，生成prompt，运行得到每句话的预测标签
把运行成功的每句话保存到日志中，若出错下次运行可从日志中恢复

In [14]:
import os

# 日志文件名
ner_journal_file = "ner_journal.txt"

# 已完成的标签列表
pred_labels = []
ner_count = 0

# 检查文件是否存在，如果不存在则创建空文件
if not os.path.exists(ner_journal_file):
    open(ner_journal_file, "w", encoding="utf-8").close()

# 读取已完成的标签
if os.path.exists(ner_journal_file):
    with open(ner_journal_file, "r", encoding="utf-8") as file:
        for line in file:
            line_list = line.strip().split(" ")  # 去掉换行符后按空格分隔
            ner_count += 1
            pred_labels.extend(line_list)
        
print(f"已从日志读取 {ner_count} 句话的标签.")

for i in range(ner_count, len(sentences)):
    print(f"正在处理第 {i+1} 段...")

    # 构建每段的 prompt
    sentence = sentences[i]
    prompt = create_ner_prompt(sentence)
    
    print("prompt OK")
    
    # 准备输入数据
    messages = [
        # {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    
    print("start gen")
    
    # 生成输出
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    # 处理生成的结果
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
    print("response OK")
    
    # 解析并分类实体
    entities = {"PER": [], "LOC": [], "OFI": [], "BOOK": []}

    for line in response.split("\n"):
        if line.startswith("PER:"):
            entities["PER"] = line.replace("PER:", "").strip().split("、")
        elif line.startswith("LOC:"):
            entities["LOC"] = line.replace("LOC:", "").strip().split("、")
        elif line.startswith("OFI:"):
            entities["OFI"] = line.replace("OFI:", "").strip().split("、")
        elif line.startswith("BOOK:"):
            entities["BOOK"] = line.replace("BOOK:", "").strip().split("、")
            
    print("entities OK")
            
    pred_label = ['O'] * len(sentence)
    
    # 定义标签类型
    label_types = ["PER", "LOC", "OFI", "BOOK"]

    # 遍历每种实体类型
    for label_type in label_types:
        for entity in entities[label_type]:
            if not entity:
                continue
            start = sentence.find(entity)
            if start != -1:
                end = start + len(entity)
                if 0 <= start < len(pred_label) and 0 < end <= len(pred_label):
                    if len(entity) == 1:  # 单字实体
                        pred_label[start] = f"S-{label_type}"
                    else:  # 多字实体
                        pred_label[start] = f"B-{label_type}"  # 开始
                        pred_label[end - 1] = f"E-{label_type}"  # 结束
                        if end - start > 2:
                            pred_label[start + 1:end - 1] = [f"I-{label_type}"] * (len(entity) - 2)  # 中间
                    
    print(f"sentence length: {len(sentence)}, label length: {len(pred_label)}")
    assert len(sentence) == len(pred_label), "LABEL_LENGTH_ERROR"
    
    pred_labels.extend(pred_label)
    
    print("label OK") 
    
    # 实时写入日志
    with open(ner_journal_file, "a", encoding="utf-8") as file: # 'a' 模式表示追加写入
        file.write(" ".join(pred_label) + "\n")  # 将 list 转为空格分隔的字符串并换行

    print(f"第 {i+1} 段处理完成，结果已保存到日志.")

已从日志读取 218 句话的标签.


### 计算f1分数

In [15]:
from sklearn.metrics import f1_score

f1 = f1_score(true_labels, pred_labels, average='macro')
print(f"f1_marco: {f1}")

f1_marco: 0.33181592684168515


## 必做2：摘要任务

In [44]:
import json

articles = []
sample_summarys = []
with open("summary.jsonl", "r", encoding="utf-8") as f:  # 打开指定的文件
    for line in f:
        # 解析 JSON 数据
        data = json.loads(line)
        article = data["article"]
        summary = data["summary"]
        temp = ""
        for sentence in article:
            temp += "".join(sentence.split())
        articles.append(temp)
        sample_summarys.append(summary)
           
print(f"articles length: {len(articles)}")
print(f"sample summarys length: {len(sample_summarys)}")

articles length: 2000
sample summarys length: 2000


In [51]:
def create_summary_prompt(text_chunk):
    return f"""你是一个专业的新闻文本摘要专家。请仔细阅读输入的新闻内容，在保证理解意思的基础上，帮我做摘要任务，按照如下的提取要求来执行任务：

输入为一行文本，你在完成对输入文本的阅读以及理解后，请输出一行文字，作为该文本内容的摘要，注意，输出仅有一行，为普通文本，不要有markdown语法

现在把以下这段新闻文本作为输入，请给出输出结果：

{text_chunk}

""" 

In [52]:
import os
import jieba
# from rouge_score import rouge_scorer
from rouge_chinese import Rouge

# 日志文件名
summary_journal_file = "summary_journal.jsonl"

# 已完成的标签列表
pred_summarys = []
summary_count = 0

# 分数记录
rouge_1s = []
rouge_2s = []

# 初始化 ROUGE 计算器
# scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
rouge = Rouge()

# 检查文件是否存在，如果不存在则创建空文件
if not os.path.exists(summary_journal_file):
    open(summary_journal_file, "w", encoding="utf-8").close()

# 读取已完成的标签
if os.path.exists(summary_journal_file):
    with open(summary_journal_file, "r", encoding="utf-8") as file:
        for line in file:
            # 解析 JSON 行
            log_entry = json.loads(line)
            # 获取每一条记录中的内容
            pred_summarys.append(log_entry["pred_summary"])
            rouge_1s.append(log_entry["rouge_1"])
            rouge_2s.append(log_entry["rouge_2"])
            summary_count += 1
        
print(f"已从日志读取 {summary_count} 个新闻的摘要.")

for i in range(summary_count, len(articles)):
    print(f"正在处理第 {i+1} 段...")
    
    # 构建每段的 prompt
    article = articles[i]
    prompt = create_summary_prompt(article)
    
    print("prompt OK")
    
    messages = [
        # {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    print("start gen")
    
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print("response OK")
    
    response = response.replace("\n", "")
    pred_summary = " ".join(jieba.cut(response))
    pred_summarys.append(pred_summary)
    
    print("summary OK")
    
    # 计算 ROUGE 分数
    scores = rouge.get_scores(pred_summary, sample_summarys[i])
    rouge_1 = scores[0]['rouge-1']['r']
    rouge_2 = scores[0]['rouge-2']['r']
    # rouge_scores = scorer.score(sample_summarys[i], pred_summary)
    # rouge_1 = rouge_scores['rouge1'].recall
    # rouge_2 = rouge_scores['rouge2'].recall
    rouge_1s.append(rouge_1)
    rouge_2s.append(rouge_2)

    if (i + 1) % 100 == 0:
        print(f"average: ROUGE-1: {sum(rouge_1s) / (i + 1)}, ROUGE-2: {sum(rouge_2s) / (i + 1)}")
    
    # 实时写入日志
    with open(summary_journal_file, "a", encoding="utf-8") as file: # 'a' 模式表示追加写入
        log_entry = {
            "pred_summary": pred_summary,
            "rouge_1": rouge_1,
            "rouge_2": rouge_2
        }
        json.dump(log_entry, file,ensure_ascii=False)
        file.write("\n")

    print(f"第 {i+1} 段处理完成，结果已保存到日志.")

已从日志读取 0 个新闻的摘要.
正在处理第 1 段...
prompt OK
start gen
response OK
summary OK
第 1 段处理完成，结果已保存到日志.
正在处理第 2 段...
prompt OK
start gen
response OK
summary OK
第 2 段处理完成，结果已保存到日志.
正在处理第 3 段...
prompt OK
start gen
response OK
summary OK
第 3 段处理完成，结果已保存到日志.
正在处理第 4 段...
prompt OK
start gen
response OK
summary OK
第 4 段处理完成，结果已保存到日志.
正在处理第 5 段...
prompt OK
start gen
response OK
summary OK
第 5 段处理完成，结果已保存到日志.
正在处理第 6 段...
prompt OK
start gen
response OK
summary OK
第 6 段处理完成，结果已保存到日志.
正在处理第 7 段...
prompt OK
start gen
response OK
summary OK
第 7 段处理完成，结果已保存到日志.
正在处理第 8 段...
prompt OK
start gen
response OK
summary OK
第 8 段处理完成，结果已保存到日志.
正在处理第 9 段...
prompt OK
start gen
response OK
summary OK
第 9 段处理完成，结果已保存到日志.
正在处理第 10 段...
prompt OK
start gen
response OK
summary OK
第 10 段处理完成，结果已保存到日志.
正在处理第 11 段...
prompt OK
start gen
response OK
summary OK
第 11 段处理完成，结果已保存到日志.
正在处理第 12 段...
prompt OK
start gen
response OK
summary OK
第 12 段处理完成，结果已保存到日志.
正在处理第 13 段...
prompt OK
start gen
response OK
summary OK
第 13 段处理

### 计算ROUGE-1、ROUGE-2平均分数

In [53]:
print(f"average: ROUGE-1: {sum(rouge_1s) / len(articles)}, ROUGE-2: {sum(rouge_2s) / len(articles)}")

average: ROUGE-1: 0.2998226583545211, ROUGE-2: 0.09410141564539408
