In [None]:
import re
import json
from loguru import logger
from tqdm import tqdm
import openai

# Shared API client used by process_datas. It targets an endpoint on
# localhost port 30000; the API key is a placeholder string.
# NOTE(review): presumably a locally hosted OpenAI-compatible inference server
# that does not validate the key - confirm before pointing this elsewhere.
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

In [None]:
def get_prompt(problem, question, options):
    """Assemble the single-choice logic-reasoning prompt for one question.

    Args:
        problem: Background text of the problem; placed under the
            "### 题目:" heading.
        question: Text of the specific question; placed under the
            "### 问题:" heading.
        options: Sequence of option texts. Each one is labelled by position
            with the matching letter of 'ABCDEFG', so passing more than
            seven options raises IndexError.

    Returns:
        The full prompt string: a fixed Chinese instruction header, the
        problem, the question, one "<letter>. <option>" line per option, and
        a closing instruction asking for step-by-step reasoning with the
        final answer wrapped in a boxed{} marker.
    """
    labelled_options = []
    for position, option_text in enumerate(options):
        labelled_options.append(f"{'ABCDEFG'[position]}. {option_text}")
    option_block = '\n'.join(labelled_options)

    # Fixed instruction header (adjacent literals concatenate into one string).
    header = (
        "你是一个逻辑推理专家，擅长解决逻辑推理问题。"
        "以下是一个逻辑推理的题目，形式为单项选择题。"
        "所有的问题都是（close-world assumption）闭世界假设，即未观测事实都为假。"
        "题目如下："
    )
    closing = "请通过逐步推理来解答问题，答案只有1个，并把最终答案放置于\\boxed{}中"

    # Every section is separated from the next by exactly one newline.
    sections = [
        header,
        "### 题目:",
        f"{problem}",
        "### 问题:",
        f"{question}",
        option_block,
        closing,
    ]
    return '\n'.join(sections)


In [None]:
def process_datas(datas):
    """Ask the model about every unanswered question and store parsed answers.

    For each record, one prompt is built per question that does not yet have
    a valid answer (a single letter from 'ABCDEFG'); all prompts of a record
    are sent in one batched completion request. Each completion is then
    scanned for a boxed{X} marker, and when exactly one valid letter is found
    it is written to that question's 'answer' key.

    Args:
        datas: List of records. Each record is read via the keys 'problem',
            'questions' and 'id'; each question via 'question', 'options'
            and (optionally) 'answer'.

    Returns:
        The same ``datas`` list; records are updated in place.
    """
    valid_answers = 'ABCDEFG'
    answer_pattern = re.compile(r"boxed\{([A-Z])\}", re.S)

    # One (record, question index, completion text) tuple per completion.
    # A list is used so that identical completion texts cannot overwrite
    # each other, which a dict keyed by text would do.
    completions = []
    for data in tqdm(datas, desc="Submitting tasks", total=len(datas)):
        problem = data['problem']
        prompts = []
        # question_ids[k] is the index in data['questions'] of prompts[k].
        question_ids = []
        for question_id, question in enumerate(data['questions']):
            current = question.get('answer')
            # Require exactly one letter: a plain `in 'ABCDEFG'` is a
            # substring test and would also accept '' or 'AB'.
            if isinstance(current, str) and len(current) == 1 and current in valid_answers:
                continue

            prompts.append(get_prompt(problem,
                                      question['question'],
                                      question['options'],
                                      ))
            question_ids.append(question_id)

        if not prompts:
            continue
        response = client.completions.create(
            model="default",
            prompt=prompts,
            temperature=0,
            max_tokens=512
        )

        for choice in response.choices:
            # choice.index is the position within the submitted batch, not
            # within data['questions']; map it back, otherwise answers land
            # on the wrong question whenever some questions were skipped.
            completions.append((data, question_ids[choice.index], choice.text))

    for data, question_id, text in tqdm(completions, total=len(completions), desc="Processing tasks"):
        try:
            found = answer_pattern.findall(text)
            if len(found) != 1 or found[0] not in valid_answers:
                # No unambiguous answer: report the record id and leave the
                # question unanswered.
                print(data["id"])
                continue

            data['questions'][question_id]['answer'] = found[0]
        except Exception as e:
            logger.error(f"Failed to process text: {data}. Error: {e}")

    return datas

In [None]:
def all_has_answer(test_data):
    """Report whether every question of every item has a valid answer.

    An answer is valid when it is a string consisting of exactly one letter
    from 'ABCDEFG'. A missing 'answer' key, a non-string value, an empty
    string, or a multi-letter string all count as unanswered.

    Args:
        test_data: Iterable of items, each with a 'questions' list of dicts.

    Returns:
        True if all questions carry a valid answer (also for empty input),
        otherwise False.
    """
    for item in test_data:
        for question in item['questions']:
            answer = question.get('answer')
            # `answer in 'ABCDEFG'` alone is a substring test, so '' and
            # 'AB' would pass; require a single-character string as well.
            if not (isinstance(answer, str) and len(answer) == 1 and answer in 'ABCDEFG'):
                return False
    return True

In [None]:
def main(ifn):
    """Load a JSON Lines file and run answering rounds until done.

    Each non-blank line of ``ifn`` is parsed as one JSON record. The records
    are then passed to ``process_datas`` repeatedly until ``all_has_answer``
    reports that nothing is left, or five rounds have been run.

    Args:
        ifn: Path of the input JSON Lines file.

    Returns:
        The list of records after the last round. Questions may still be
        unanswered if the round limit was reached.
    """
    max_rounds = 5
    problems = []
    # Read the data line by line. The encoding is explicit so the file is
    # decoded the same way regardless of the platform default.
    with open(ifn, encoding='utf-8') as reader:
        for line in reader:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            problems.append(json.loads(line))

    rounds = 0
    while not all_has_answer(problems) and rounds < max_rounds:
        problems = process_datas(problems)
        rounds += 1
    print("All tasks finished!")
    return problems

In [None]:
# Run the pipeline on the round-1 test file. The result is kept in the
# module-level name `return_list`, which the output-writing code below reads.
if __name__ == '__main__':
    return_list = main('round1_test_data.jsonl')

In [None]:
# Order the records by the integer formed from the last three characters of
# their id, then write them out as JSON Lines (one record per line).
sorted_data = sorted(return_list, key=lambda x: int(str(x['id'])[-3:]))
# ensure_ascii=False emits non-ASCII characters verbatim, so the file encoding
# is set explicitly; a platform-default encoding may be unable to encode them.
with open('upload-pipeline.jsonl', 'w', encoding='utf-8') as writer:
    for sample in sorted_data:
        writer.write(json.dumps(sample, ensure_ascii=False))
        writer.write('\n')