In [2]:
from openai import OpenAI
from tqdm import tqdm
import json
import numpy as np
import random
from collections import Counter
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_chinese import Rouge
import jieba
import json
from bert_score import score
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer
from GeneralRequestProcessor import GeneralRequestProcessor

In [3]:
# Connect to the deployed model.
# The client targets the OpenAI-compatible endpoint at http://localhost:8000/v1;
# api_key is the literal placeholder string "EMPTY".
client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:8000/v1",
)

# Tokenizer loaded from a local model directory.
# NOTE(review): `tokenizer` is not referenced by the other cells visible here.
tokenizer = AutoTokenizer.from_pretrained('/gemini/data-2/DeepSeek-R1-Distill-llama-8B/')

## 读取数据

In [4]:
# Helpers for reading and saving JSON data
def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is opened as UTF-8 text. If the file does not exist, or its
    content is not valid JSON, a diagnostic message is printed and ``None``
    is returned. Any other error propagates to the caller.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError) as err:
        # Pick the message that matches the kind of failure.
        if isinstance(err, FileNotFoundError):
            print(f"错误：找不到文件 {file_path}。请检查文件路径是否正确。")
        else:
            print(f"错误：无法解码文件 {file_path} 中的JSON数据。请确保这是一个有效的JSON文件。")
        return None
    return parsed
    
def save_to_json_file(data, file_path):
    """Write ``data`` to ``file_path`` as indented, non-ASCII-escaped JSON.

    This is a best-effort helper: any exception raised while opening or
    writing the file is caught and reported with ``print``. The function
    always returns ``None``.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    try:
        with open(file_path, mode='w', encoding='utf-8') as sink:
            json.dump(data, sink, **dump_options)
        print(f"数据成功保存到 {file_path}")
    except Exception as err:
        print(f"保存文件时发生错误: {err}")

In [5]:
# Load the dataset from its JSON file.
file_path = '/gemini/code/Abstract_extract/ProcessedData/rdfybk/train_data_abstract_mulIns_small.json'  # replace with the path to your own JSON file
train_test_data = read_json_file(file_path)
# read_json_file reports a missing or invalid file by printing a message and
# returning None. Stop here in that case, so that later cells do not fail with
# an unrelated error while indexing a None dataset.
if train_test_data is None:
    raise RuntimeError(f"Failed to load dataset from {file_path}")

## 创建模型进行测试

In [14]:
# Ten alternative Chinese phrasings of the same instruction: write an
# abstract/summary for the given academic full text.
# NOTE(review): this list is not referenced by the other cells visible here.
head_template = [
    "请你根据给出的学术全文本内容，生成关于该文本的摘要。",
    "请根据提供的学术文本内容，生成一个该文本的摘要。",
    "请根据给定的学术全文，撰写一个简要总结。",
    "请根据所提供的学术文章内容，生成一个概述。",
    "你需要阅读以下学术文本，并根据其内容编写摘要",
    "根据给出的学术材料，请你整理出一份摘要。",
    "请将下面的学术文本内容总结成一个简短的摘要。",
    "请从提供的学术文本中提炼出摘要内容。",
    "请根据以下学术文本写一段简短的总结。",
    "请基于下列学术全文，提炼并生成相关摘要。"
]
# Step-by-step prompt: analyze the 8 reference abstracts, analyze the academic
# text, then write the abstract. The requested output has three tagged parts
# (summary of abstract features, text analysis, final abstract).
# The string is not raw, so each "\n" in the output-format line is a real newline.
# NOTE(review): not referenced by the other cells visible here.
head_template_COT = """
下面，我将会给你一份学术文本内容，你需要分析给出的内容，编写关于该学术文本的摘要。除去学术文本，我还会给你其他学术文本的摘要作为参考案例
## 编写摘要要求
- 首先你需要分析我给你的学术文本摘要案例，案例总共8个，你需要分析每一份摘要案例的行文特点和语言特征，作为你生成摘要的支撑。
- 然后你需要分析我所给你的学术文本主要讲述的核心内容，涉及的实验对象等学术信息，并分析你认为的每一个段落的关键语句。
- 最后，你需要根据你的分析，结合你从我所提供的摘要的行文特点和语言特征，编写一份我所给你的学术文本内容的摘要。
## 输出格式
- 你输出数据的格式应严格为：<所给摘要特征总结>xxxxx\n<学术文本内容分析>xxxxx\n<最终摘要>xxxxx
"""

# Variant of head_template_COT with the same three analysis steps, but the
# output-format section asks the model to return only the final abstract.
# NOTE(review): not referenced by the other cells visible here.
head_template_COT2 = """
下面，我将会给你一份学术文本内容，你需要分析给出的内容，编写关于该学术文本的摘要。除去学术文本，我还会给你其他学术文本的摘要作为参考案例
## 编写摘要要求
- 首先你需要分析我给你的学术文本摘要案例，案例总共8个，你需要分析每一份摘要案例的行文特点和语言特征，作为你生成摘要的支撑。
- 然后你需要分析我所给你的学术文本主要讲述的核心内容，涉及的实验对象等学术信息，并分析你认为的每一个段落的关键语句。
- 最后，你需要根据你的分析，结合你从我所提供的摘要的行文特点和语言特征，编写一份我所给你的学术文本内容的摘要。
## 输出格式
- 你的最终输出只需要给出你的摘要即可，不用给出其他内容！
"""

# Prompt header used by make_messages. It targets humanities / social-science
# texts and requests five tagged sections in order: summary of abstract
# features, text analysis (research object, background, per-paragraph
# arguments, relation triples, core thesis), draft abstract, abstract review,
# and final abstract.
# In the output-format line the first four tags have closing tags, while
# <最终摘要> is left unclosed. The string is not raw, so each "\n" there is a
# real newline.
head_template_COT3 = """
下面，我会先给你一份人文社科学术文本，你需要总结该文本摘要。此外，我还会给你8个摘要案例作为编写学术文本的参考，编写中需要注意的点如下：
## 所给摘要特征总结要求
- 你需要分析我给你的学术文本摘要案例，案例总共8个，你需要分析每一份摘要案例的行文特点和语言特征，作为你生成摘要的支撑。
- 你要根据你对案例摘要分析的内容，在<所给摘要特征总结>模块给出人文社科领域学术文本摘要的行文特点和语言特征，为之后修改摘要提供依据。
## 学术文本内容分析要求
- 人文社科领域摘要的重点在于背景描述和本文论点的结合。
- 你需要识别出本文的研究对象，并在<学术文本内容分析>模块中以【研究对象：xxxx】的格式给出。
- 你需要识别出本文的研究背景，并在<学术文本内容分析>模块中以【研究背景：xxxx】的格式给出。
- 你需要以文章段落为单位进行分析,总结每一个段落的核心论述内容，并在<学术文本内容分析>模块中以【论述x：xxxx】的格式给出。
- 你需要分析不同论述之间是并列、递进亦或对比等关系，并在<学术文本内容分析>模块中按照论述关系三元组【论述m，关系，论述n】的格式给出。
- 你要根据总结的论述关系三元组，分析出本文论述的核心论点，并在<学术文本内容分析>模块中以【核心论点：xxxx】的格式给出
## 初步摘要的编写
- 你要围绕<学术文本内容分析>模块中给出的“研究对象”、“研究背景”以及“核心论点”部分内容编写初步摘要，并在<初步摘要>模块给出。
## 摘要审改的编写
- 根据<初步摘要>模块的初步摘要，结合<所给摘要特征总结>模块中人文社科领域学术文本的行文特点和语言特征，以及你得到的8个案例，对初步摘要进行修改。
- 你需要比对初步摘要同我给你的案例摘要在语言表达方式上的区别，并在<摘要审改>模块中以【区别x:xxxxx】形式给出。
- 你需要根据你所罗列的区别信息，在<摘要审改>模块中给出摘要修改的思路。
## 最终摘要的编写
- 根据在<摘要审改>模块中得到的思路，对<初步摘要>模块中的摘要内容进行修改，力求修改后的语言和结构特征同真实摘要一致。
- 注意最终摘要中的组织语句主语、谓语宾语等结构的语言表达习惯要同真实摘要相一致。在<最终摘要>模块中给出你所转写的最终摘要。
## 输出格式
- 输出数据的格式严格为：<所给摘要特征总结>xxxxx</所给摘要特征总结>\n\n<学术文本内容分析>xxxxx</学术文本内容分析>\n\n<初步摘要>xxxxx</初步摘要>\n\n<摘要审改>xxxxx</摘要审改>\n\n<最终摘要>xxxxx
"""

In [15]:
# Build the few-shot (8-shot by default) reference examples
def make_shot(origin_data=None, start=4492, end=4500):
    """Concatenate reference abstracts into one few-shot example string.

    Args:
        origin_data: Sequence of records. For every record in
            ``origin_data[start:end]`` the text at
            ``record['conversations'][1]['value']`` is used as an example.
        start: First index of the example slice. Defaults to 4492, the
            value that was previously hard-coded.
        end: Index one past the last example. Defaults to 4500, the value
            that was previously hard-coded.

    Returns:
        str: The examples joined with no separator. Each one is rendered as
        ``【**案例{i}**：\\n{abstract}\\n】`` with ``i`` counting from 0. The
        result is empty when the slice selects no records.

    Raises:
        TypeError: If ``origin_data`` is ``None``. The default exists only
            for signature compatibility; a dataset is required.
    """
    if origin_data is None:
        raise TypeError("origin_data must be a sequence of records, got None")
    # Build all pieces first and join once, rather than growing a string in a loop.
    return "".join(
        f"【**案例{index}**：\n{single_data['conversations'][1]['value']}\n】"
        for index, single_data in enumerate(origin_data[start:end])
    )

# Read the raw records and turn them into chat requests
def make_messages(origin_data=None):
    """Build one chat request per record and collect the reference abstracts.

    For every record, the user prompt is ``head_template_COT3``, then the
    record's source text (``conversations[0]['value']``), then the shared
    few-shot example string returned by ``make_shot(origin_data)``. The
    system message is an empty string.

    Args:
        origin_data: Sequence of records, each holding a ``'conversations'``
            list whose items 0 and 1 carry the source text and the reference
            abstract under the ``'value'`` key.

    Returns:
        tuple: ``(messages, Real_outputs)``. ``messages[i]`` is the pair
        ``[chat_messages, i]``, so the position of a request stays attached
        to it. ``Real_outputs[i]`` is the reference abstract of record ``i``.
    """
    # The same few-shot examples are appended to every prompt, so build them once.
    shot_8 = make_shot(origin_data)
    messages = []
    Real_outputs = []
    for index, single_data in enumerate(origin_data):
        source_text = single_data['conversations'][0]['value']
        user_prompt = "{}\n-{},以下是供你参考的人文社科领域学术文本摘要案例\n{}".format(
            head_template_COT3, source_text, shot_8
        )
        message = [
            {"role": "system", "content": ""},
            {"role": "user", "content": user_prompt},
        ]
        # Keep the record index next to the request so results can be matched
        # back to their input later.
        messages.append([message, index])
        Real_outputs.append(single_data['conversations'][1]['value'])

    return messages, Real_outputs

def ChatCompletions(message):
    """Send one chat request to the deployed model and return the reply text.

    Args:
        message: Either a chat message list (a list of role/content dicts)
            or a ``[chat_messages, index]`` pair as produced by
            ``make_messages``. For a pair, only ``chat_messages`` is sent.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    chat_messages = message
    # make_messages emits [chat_messages, index] pairs, and the notebook passes
    # those pairs as the processor's data_list. Forwarding a pair verbatim as
    # `messages` would send a malformed request. A plain chat message list has
    # dicts as items, so a list/tuple in first position identifies the pair.
    # NOTE(review): assumes the request processor hands each data_list item to
    # this function unchanged -- confirm against GeneralRequestProcessor.
    if (
        isinstance(message, (list, tuple))
        and len(message) == 2
        and isinstance(message[0], (list, tuple))
    ):
        chat_messages = message[0]

    response = client.chat.completions.create(
        model="MyModel",
        messages=chat_messages,
        stream=False,
        temperature=0.01,
        max_tokens=5000,
    )
    return response.choices[0].message.content

In [16]:
# Build the chat requests and the matching reference abstracts
messages, Real_outputs = make_messages(origin_data=train_test_data)

In [17]:
# Show one request and its reference abstract, separated by a dashed line
print(
    messages[4517],
    '-----------------------------------',
    Real_outputs[4517],
    sep='\n',
)

[[{'role': 'system', 'content': ''}, {'role': 'user', 'content': "\n下面，我会先给你一份人文社科学术文本，你需要总结该文本摘要。此外，我还会给你8个摘要案例作为编写学术文本的参考，编写中需要注意的点如下：\n## 所给摘要特征总结要求\n- 你需要分析我给你的学术文本摘要案例，案例总共8个，你需要分析每一份摘要案例的行文特点和语言特征，作为你生成摘要的支撑。\n- 你要根据你对案例摘要分析的内容，在<所给摘要特征总结>模块给出人文社科领域学术文本摘要的行文特点和语言特征，为之后修改摘要提供依据。\n## 学术文本内容分析要求\n- 人文社科领域摘要的重点在于背景描述和本文论点的结合。\n- 你需要识别出本文的研究对象，并在<学术文本内容分析>模块中以【研究对象：xxxx】的格式给出。\n- 你需要识别出本文的研究背景，并在<学术文本内容分析>模块中以【研究背景：xxxx】的格式给出。\n- 你需要以文章段落为单位进行分析,总结每一个段落的核心论述内容，并在<学术文本内容分析>模块中以【论述x：xxxx】的格式给出。\n- 你需要分析不同论述之间是并列、递进亦或对比等关系，并在<学术文本内容分析>模块中按照论述关系三元组【论述m，关系，论述n】的格式给出。\n- 你要根据总结的论述关系三元组，分析出本文论述的核心论点，并在<学术文本内容分析>模块中以【核心论点：xxxx】的格式给出\n## 初步摘要的编写\n- 你要围绕<学术文本内容分析>模块中给出的“研究对象”、“研究背景”以及“核心论点”部分内容编写初步摘要，并在<初步摘要>模块给出。\n## 摘要审改的编写\n- 根据<初步摘要>模块的初步摘要，结合<所给摘要特征总结>模块中人文社科领域学术文本的行文特点和语言特征，以及你得到的8个案例，对初步摘要进行修改。\n- 你需要比对初步摘要同我给你的案例摘要在语言表达方式上的区别，并在<摘要审改>模块中以【区别x:xxxxx】形式给出。\n- 你需要根据你所罗列的区别信息，在<摘要审改>模块中给出摘要修改的思路。\n## 最终摘要的编写\n- 根据在<摘要审改>模块中得到的思路，对<初步摘要>模块中的摘要内容进行修改，力求修改后的语言和结构特征同真实摘要一致。\n- 注意最终摘要中的组织语句主语、谓语

## 模型推理测试

In [11]:
# Legacy single-threaded inference code (commented out, kept for reference only)
# Outputs = []
# for message in messages[4500:]:
#     response = client.chat.completions.create(
#         model="Qwen2.5-7B-Instruct-Abs-lyl",
#         #model="Qwen2.5-7B-Instruct",
#         messages=message,
#         stream=False,
#         temperature=0.01,
#         max_tokens=5000,
#     )
#     Outputs.append(response)
    
# Model_outputs = [response.choices[0].message.content for response in Outputs]
# save_to_json_file(Model_outputs, '/gemini/code/Abstract_extract/ProcessedData/rdfybk/deepseek_outputs_8B_cot.json')

In [18]:
# Run inference for the records from index 4500 onward through the request processor.
# NOTE(review): GeneralRequestProcessor is a project-local helper not shown in
# this notebook; presumably it calls handle_function on each data_list item
# with up to max_concurrent_requests calls in flight -- confirm.
processor = GeneralRequestProcessor(max_concurrent_requests=100)
# Each data_list item is a [chat_messages, index] pair built by make_messages.
processor.apply(handle_function=ChatCompletions, data_list=messages[4500:])
processor.process_all_requests()
results = processor.get_results()

KeyboardInterrupt: 

2025-04-03 14:53:49,536 - Retrying request to /chat/completions in 0.450954 seconds
2025-04-03 14:53:49,565 - Retrying request to /chat/completions in 0.430377 seconds
2025-04-03 14:53:49,626 - Retrying request to /chat/completions in 0.490744 seconds
2025-04-03 14:53:49,652 - Retrying request to /chat/completions in 0.383928 seconds
2025-04-03 14:53:49,667 - Retrying request to /chat/completions in 0.379928 seconds
2025-04-03 14:53:49,675 - Retrying request to /chat/completions in 0.405705 seconds
2025-04-03 14:53:49,677 - Retrying request to /chat/completions in 0.495244 seconds
2025-04-03 14:53:49,680 - Retrying request to /chat/completions in 0.393019 seconds
2025-04-03 14:53:49,684 - Retrying request to /chat/completions in 0.401559 seconds
2025-04-03 14:53:49,692 - Retrying request to /chat/completions in 0.383256 seconds
2025-04-03 14:53:49,739 - Retrying request to /chat/completions in 0.376275 seconds
2025-04-03 14:53:49,751 - Retrying request to /chat/completions in 0.397918 

2025-04-03 14:53:50,556 - Retrying request to /chat/completions in 0.487036 seconds
2025-04-03 14:53:50,558 - Retrying request to /chat/completions in 0.933692 seconds
2025-04-03 14:53:50,568 - Retrying request to /chat/completions in 0.376779 seconds
2025-04-03 14:53:50,570 - Retrying request to /chat/completions in 0.969382 seconds
2025-04-03 14:53:50,595 - Retrying request to /chat/completions in 0.841973 seconds
2025-04-03 14:53:50,597 - Retrying request to /chat/completions in 0.387228 seconds
2025-04-03 14:53:50,612 - Retrying request to /chat/completions in 0.391819 seconds
2025-04-03 14:53:50,623 - Retrying request to /chat/completions in 0.840093 seconds
2025-04-03 14:53:50,636 - Retrying request to /chat/completions in 0.414001 seconds
2025-04-03 14:53:50,642 - Retrying request to /chat/completions in 0.418551 seconds
2025-04-03 14:53:50,646 - Retrying request to /chat/completions in 0.445453 seconds
2025-04-03 14:53:50,646 - Retrying request to /chat/completions in 0.479404 

Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.


2025-04-03 14:53:50,986 - Retrying request to /chat/completions in 0.394676 seconds
2025-04-03 14:53:50,992 - Retrying request to /chat/completions in 0.445697 seconds
2025-04-03 14:53:50,999 - Processed 7/500 requests. 99 currently processing. 493 remaining. Estimated time left: 42367.32 seconds.
2025-04-03 14:53:51,000 - Retrying request to /chat/completions in 0.437262 seconds
2025-04-03 14:53:51,000 - Retrying request to /chat/completions in 0.411989 seconds
2025-04-03 14:53:51,003 - Processed 8/500 requests. 98 currently processing. 492 remaining. Estimated time left: 36996.43 seconds.
2025-04-03 14:53:51,004 - Retrying request to /chat/completions in 0.459872 seconds
2025-04-03 14:53:51,004 - Retrying request to /chat/completions in 0.902506 seconds
2025-04-03 14:53:51,005 - Retrying request to /chat/completions in 0.882970 seconds
2025-04-03 14:53:51,007 - Retrying request to /chat/completions in 0.878038 seconds
2025-04-03 14:53:51,009 - Processed 9/500 requests. 97 currently p

Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.


2025-04-03 14:53:51,105 - Retrying request to /chat/completions in 0.852244 seconds
2025-04-03 14:53:51,107 - Retrying request to /chat/completions in 0.769739 seconds
2025-04-03 14:53:51,107 - Retrying request to /chat/completions in 0.430150 seconds
2025-04-03 14:53:51,108 - Retrying request to /chat/completions in 0.441818 seconds
2025-04-03 14:53:51,108 - Retrying request to /chat/completions in 0.807042 seconds
2025-04-03 14:53:51,114 - Processed 15/500 requests. 94 currently processing. 485 remaining. Estimated time left: 19454.30 seconds.
2025-04-03 14:53:51,116 - Retrying request to /chat/completions in 0.456826 seconds
2025-04-03 14:53:51,117 - Processed 16/500 requests. 93 currently processing. 484 remaining. Estimated time left: 18200.89 seconds.
2025-04-03 14:53:51,117 - Processed 17/500 requests. 92 currently processing. 483 remaining. Estimated time left: 17094.87 seconds.
2025-04-03 14:53:51,139 - Retrying request to /chat/completions in 0.468900 seconds
2025-04-03 14:53

Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.


2025-04-03 14:53:51,239 - Retrying request to /chat/completions in 0.998256 seconds
2025-04-03 14:53:51,239 - Processed 23/500 requests. 91 currently processing. 477 remaining. Estimated time left: 12480.91 seconds.
2025-04-03 14:53:51,241 - Retrying request to /chat/completions in 0.412781 seconds
2025-04-03 14:53:51,241 - Retrying request to /chat/completions in 0.448373 seconds
2025-04-03 14:53:51,242 - Processed 24/500 requests. 90 currently processing. 476 remaining. Estimated time left: 11935.84 seconds.
2025-04-03 14:53:51,244 - Retrying request to /chat/completions in 0.926213 seconds
2025-04-03 14:53:51,246 - Retrying request to /chat/completions in 0.982785 seconds
2025-04-03 14:53:51,246 - Retrying request to /chat/completions in 0.870318 seconds
2025-04-03 14:53:51,273 - Processed 25/500 requests. 90 currently processing. 475 remaining. Estimated time left: 11434.92 seconds.
2025-04-03 14:53:51,274 - Retrying request to /chat/completions in 0.394819 seconds
2025-04-03 14:53

Connection error.Connection error.

Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.


2025-04-03 14:53:51,374 - Processed 33/500 requests. 86 currently processing. 467 remaining. Estimated time left: 8518.35 seconds.
2025-04-03 14:53:51,375 - Retrying request to /chat/completions in 0.498779 seconds
2025-04-03 14:53:51,375 - Retrying request to /chat/completions in 0.887991 seconds
2025-04-03 14:53:51,410 - Retrying request to /chat/completions in 0.494199 seconds
2025-04-03 14:53:51,410 - Processed 34/500 requests. 87 currently processing. 466 remaining. Estimated time left: 8250.60 seconds.
2025-04-03 14:53:51,411 - Retrying request to /chat/completions in 0.447696 seconds
2025-04-03 14:53:51,411 - Retrying request to /chat/completions in 0.887232 seconds
2025-04-03 14:53:51,435 - Retrying request to /chat/completions in 0.985344 seconds
2025-04-03 14:53:51,436 - Retrying request to /chat/completions in 0.433611 seconds
2025-04-03 14:53:51,436 - Retrying request to /chat/completions in 0.957536 seconds
2025-04-03 14:53:51,438 - Retrying request to /chat/completions in

In [None]:
# Put the model replies back into input order and save them.
TEST_START = 4500  # first dataset index submitted for inference (messages[4500:])

# Size the list by the number of submitted requests, not by len(results). If any
# request is missing from `results`, sizing by len(results) would make the
# position computed below fall outside the list.
model_output = [""] * len(messages[TEST_START:])
for result in results:
    # NOTE(review): result['input'] is presumably the submitted
    # [chat_messages, index] pair, so its last item is the dataset index -- confirm.
    model_output[int(result['input'][-1]) - TEST_START] = result['output']
Model_outputs = model_output
save_to_json_file(Model_outputs, '/gemini/code/Abstract_extract/ProcessedData/rdfybk/Model_outputs_8B_cot_plus.json')