In [1]:
# -*- coding: utf-8 -*-
import argparse
import copy
import json
import os
import pickle
import random
import re
import sys
from collections import defaultdict, deque
from typing import List
import re
from typing import Any, List
import ipydagred3
import networkx as nx
import numpy as np
import openai
from tqdm import tqdm
from DROP_3_utils import *

# Optional local HTTP(S) proxy for API access; uncomment if required.
#os.environ["http_proxy"] = "http://localhost:7890"
#os.environ["https_proxy"] = "http://localhost:7890"
# Helper from DROP_3_utils; presumably selects the API credentials by index -- TODO confirm.
setOpenAi(keyid = 4)

# Model used by every askChatGPT call in this notebook.
# Previously this name was assigned three times in a row and only the last
# value took effect; the other candidates were "gpt-4-turbo" and "gpt-4o-mini".
GPT_MODEL = "gpt-3.5-turbo"

# Load the question dataset.
file_path = 'drop_100.json'
with open(file_path, 'r', encoding='utf-8') as file:
    dataset = json.load(file)

In [2]:
# Display the loaded records.
# NOTE(review): this echoes the entire dataset into the cell output; a slice would be lighter.
dataset

[{'key': 'nfl_2201',
  'passage': "To start the season, the Lions traveled south to Tampa, Florida to take on the Tampa Bay Buccaneers. The Lions scored first in the first quarter with a 23-yard field goal by Jason Hanson. The Buccaneers tied it up with a 38-yard field goal by Connor Barth, then took the lead when Aqib Talib intercepted a pass from Matthew Stafford and ran it in 28 yards. The Lions responded with a 28-yard field goal. In the second quarter, Detroit took the lead with a 36-yard touchdown catch by Calvin Johnson, and later added more points when Tony Scheffler caught an 11-yard TD pass. Tampa Bay responded with a 31-yard field goal just before halftime. The second half was relatively quiet, with each team only scoring one touchdown. First, Detroit's Calvin Johnson caught a 1-yard pass in the third quarter. The game's final points came when Mike Williams of Tampa Bay caught a 5-yard pass.  The Lions won their regular season opener for the first time since 2007",
  'questi

In [3]:
# Choose one dataset record and pull out the three fields used by the cells below.
question_id = 8

question, passage, gold_answer = (
    dataset[question_id][field] for field in ('question', 'passage', 'answer')
)

In [4]:
# Show the question text of the selected record.
question

"How many points were the Cowboy's trailing at halftime?"

In [5]:
# Show the reference answer of the selected record.
gold_answer

'11'

In [13]:
def decompose_sql(question, type):
    """Ask the chat model to split a question into numbered sub-problems.

    Args:
        question: The question text to decompose.
        type: Text interpolated into the prompt as the context of the
            question (the notebook passes the passage here). The name
            shadows the builtin ``type`` but is kept for caller compatibility.

    Returns:
        Whatever ``askChatGPT`` returns for the single-message conversation;
        callers in this notebook treat it as the reply string.
    """
    # One-shot prompt: a worked example followed by the required answer format.
    prompt_for_decompose = f"""
    
    I will now give you a arithemic reasoning problem. The context of the question {type}. Please help me translate this math problem into a series of step-by-step sub-problems. Let's think step by step, and the total number of steps shouldn't be very tedious. 

    Examples:
    Q: Since the 1970s, U.S. governments have negotiated managed-trade agreements, such as the North American Free Trade Agreement in the 1990s, the Dominican Republic-Central America Free Trade Agreement in 2006, and a number of bilateral agreements. In Europe, six countries formed the European Coal and Steel Community in 1951 which became the European Economic Community in 1958. Two core objectives of the EEC were the development of a common market, subsequently renamed the single market, and establishing a customs union between its member states. How many years did the European Coal and Steel Community exist?
    A:
    To solve the question "How many years did the European Coal and Steel Community exist?", we need to solve the following problems step by step:
    1. Find the year the European Coal and Steel Community was established.
    2. Find the year the European Economic Community was established.
    3. Subtract the year the European Coal and Steel Community was established from the year the European Economic Community was established.
    

Now the command is {question}, please decompose it into a series of easy-to-solve steps like the examples.
Answer Format: (Please write each broken-down question step on a separate line, starting with a number.)
To solve the question "xxx", we need to solve the following problems step by step:
1. sub-question 1
2. sub-question 2
3. sub-question 3
...
"""

    # The whole prompt travels as a single user turn.
    messages = [{"role": "user", "content": prompt_for_decompose}]
    return askChatGPT(messages, model=GPT_MODEL, temperature=1)


def convert_steps_to_format(decom_commands):
    """Parse the model's numbered decomposition into a list of sub-problems.

    Args:
        decom_commands: Reply text expected to contain the sentence
            "... we need to solve the following problems step by step:"
            followed by one numbered sub-problem per line.

    Returns:
        A tuple ``(subtasks, steps_dict)`` where ``subtasks`` is the list of
        sub-problem strings with their leading numbering removed and
        ``steps_dict`` maps the 0-based position to the same strings.
    """
    marker = "we need to solve the following problems step by step:"
    marker_pos = decom_commands.find(marker)
    if marker_pos == -1:
        # str.find returns -1 when the marker is absent; slicing from
        # -1 + len(marker) would silently cut into the reply, so parse the
        # whole text instead.
        subtasks_text = decom_commands
    else:
        subtasks_text = decom_commands[marker_pos + len(marker):]

    subtasks = []
    for line in subtasks_text.splitlines():
        # Strip only a leading "1. " / "2) " style prefix, so a sentence that
        # merely contains ". " is not truncated.
        task = re.sub(r'^\s*\d+\s*[.)]\s*', '', line).strip()
        if task:  # blank separator lines must not become empty sub-problems
            subtasks.append(task)

    steps_dict = dict(enumerate(subtasks))
    return subtasks, steps_dict

# Decompose the selected question (the passage is passed as the context
# argument) and parse the reply into an ordered list of sub-problems.
decompose_steps = decompose_sql(question, passage)
steps, steps_dict = convert_steps_to_format(decompose_steps)
num_steps = len(steps)
# steps_dict holds the same sub-problems keyed by their index.
print(steps, steps_dict, sep='\n')

total tokens: 634
['Find the total points scored by the Jaguars in the first half.', 'Find the total points scored by the Cowboys in the first half.', "Subtract the Cowboys' total points from the Jaguars' total points."]
{0: 'Find the total points scored by the Jaguars in the first half.', 1: 'Find the total points scored by the Cowboys in the first half.', 2: "Subtract the Cowboys' total points from the Jaguars' total points."}


In [14]:
# Number of sub-problems produced by the decomposition.
print(len(steps))

3


In [15]:
# Tree-of-Thoughts (ToT) search over the decomposed sub-problems.
N = 3  # proposals sampled for every surviving path at each sub-problem
M = 2  # best-scoring paths kept after each pruning step

CONFIDENCE_PROMPT = "Please provide a confidence rating for the accuracy of this solution, on a scale from 1 to 5. Only output the number."


def _parse_confidence(raw_score, default=1):
    """Extract the 1-5 rating from the model's reply.

    The model does not always obey "Only output the number" (e.g. "4." or
    "Confidence: 4"), which made a plain int() call crash the whole search.
    Falls back to `default` (the lowest rating) when no rating is found.
    """
    match = re.search(r'(?<!\d)[1-5](?!\d)', str(raw_score))
    if match is None:
        print('warning: could not parse a confidence rating from', repr(raw_score))
        return default
    return int(match.group())


def _propose_and_score(messages, n_proposals, verbose=False):
    """Sample `n_proposals` answers for `messages` and have the model rate each.

    Returns a list of (score, answer) tuples in sampling order.
    """
    proposals = []
    for _ in range(n_proposals):
        result = askChatGPT(messages, model=GPT_MODEL, temperature=1)
        if verbose:
            print("THE RESULT IS: ", result)
        # Self-evaluation: replay the exchange and ask for a confidence rating.
        eval_Q = messages + [
            {'role':'assistant', 'content':result},
            {'role':'user', 'content':CONFIDENCE_PROMPT},
        ]
        score = _parse_confidence(askChatGPT(eval_Q, model=GPT_MODEL, temperature=1))
        proposals.append((score, result))
    return proposals


# Each entry is (cumulative score, [answer to sub-problem 0, 1, ...]).
# Seeding with one empty path lets the first sub-problem share the loop below.
solution = [(0, [])]

for i in range(num_steps):
    subtask = steps[i]
    print("THE SUBTASK IS: ", subtask)
    sys_q = f"""There is a arthemic reasoning problem. I need you to solve it and give an answer.
    
Here is the context of the problem:\n{passage} 

Here is the problem:\n{question}

I have broken this problem down into a series of smaller problems. I will assign you sub-problems one by one, and provide the results of the previous sub-problems as a reference for your reasoning.
Please solve the problem and respond according to mathematical logic.
"""

    subask = f"""\nThe sub-problem to solve now is xxx: {subtask}
Based on the information above, please provide a concise and clear answer"""

    temp_solution = []
    # Iterate over the paths that actually survived pruning; indexing with
    # range(M) would raise IndexError when fewer than M paths exist.
    for path_score, path_answers in solution:
        if path_answers:
            # Later sub-problems: prepend the answers collected on this path.
            answersSoFar = f"""\nSo far, the answers to the preceding sub-problems are as follows: The format is Sub-problem-Id: xxx; Sub-problem: xxx; Answer: xxx."""
            for index, value in enumerate(path_answers):
                answersSoFar += f"""\nSub-problem-Id: {index}; Sub-problem: {steps[index]}; Answer: {value}."""
            query = answersSoFar+subask
        else:
            # First sub-problem: nothing to report yet.
            query = subask
        Q = [{'role':'system', 'content':sys_q},
             {'role':'user', 'content':query},]
        # As before, proposals are echoed only for the first sub-problem.
        for score, result in _propose_and_score(Q, N, verbose=(i == 0)):
            # The path score is the running sum of per-step confidence ratings.
            temp_solution.append((path_score+score, path_answers+[result]))

    # temp_solution now holds (number of surviving paths) * N candidates.
    # Highest cumulative confidence first: the previous ascending sort kept
    # the M *worst* paths and put the worst one at solution[0].
    solution = sorted(temp_solution, key=lambda x: x[0], reverse=True)
    solution = solution[:M]  # prune down to the M best paths

print(len(solution))
printSeq(solution)
# The final answer is obtained with one additional query (see the next cell).


THE SUBTASK IS:  Find the total points scored by the Jaguars in the first half.
total tokens: 430
THE RESULT IS:  To find the total points scored by the Jaguars in the first half, we need to add up the points they scored in the first and second quarters:

10-yard TD pass to WR Mike Sims-Walker = 6 points
42-yard TD pass to TE Marcedes Lewis = 6 points

Therefore, the total points scored by the Jaguars in the first half is 6 (from the first quarter) + 6 (from the second quarter) = 12 points.
total tokens: 466
total tokens: 427
THE RESULT IS:  To find the total points scored by the Jaguars in the first half, we sum up the points they scored in the first and second quarters.

Jaguars points in the first quarter: 7 (from the TD pass to Mike Sims-Walker)
Jaguars points in the second quarter: 7 (from the TD pass to Marcedes Lewis)

Total points scored by the Jaguars in the first half: 7 + 7 = 14 points.
total tokens: 463
total tokens: 351
THE RESULT IS:  The total points scored by the Jaguar

In [16]:
# Summarise the first path kept in `solution` into a final answer with one
# extra query, then let the model grade that answer against the gold answer.
# (Alternatively every kept path could be queried and the best answer chosen.)

user_q = f"""There is a math_problem:\n{question}

Given the context of the problem:\n{passage}

I have broken this math problem down into a series of smaller problems and each sub-problem is solved.
The sub-problems and their corresponding answers are as follows. (Format: Sub-problem-Id: xxx; Sub-problem: xxx; Answer: xxx.)"""

for index, value in enumerate(solution[0][1]):  # only the first path is summarised
    user_q += f"""\nSub-problem-Id: {index}; Sub-problem: {steps[index]}; Answer: {value}."""

user_q += """\n\nNow that all the sub-problems have been solved, so what is the final answer?
Please give the final answer without any additional explanation or clarification."""

# Start a fresh conversation that actually contains `user_q`. Previously the
# request was appended to the `Q` left over from the search loop, so `user_q`
# was never sent, the model never saw the last sub-problem's answer, and
# re-running the cell kept growing `Q`.
Q = [{'role':'user', 'content':user_q}]
finalResult = askChatGPT(Q, model=GPT_MODEL, temperature=1)

# Let the language model judge whether the final answer is correct.
judgeAnswer = {'role':'user', 'content':f"""Here is a math problem with a standard answer and a student's solution. Please help me determine if the student's solution is correct. If the numerical answer are the same, should return true.
Problem: {question}

Standard answer: {gold_answer}

Answer: {finalResult}

If the student's answer is correct, just output True; otherwise, just output False.
No explanation is required.
"""}

Q_judge = [judgeAnswer]
ifcorrect = askChatGPT(Q_judge, model=GPT_MODEL, temperature=1)  # expected to be "True" or "False"

# Compare case-insensitively so replies such as "true" are still recognised.
verdict = str(ifcorrect).lower()
if 'true' in verdict:
    print('correct')
    success = True  # the run finished without interruption, so it counts as completed
elif 'false' in verdict:
    print('error')
    success = True  # the run finished without interruption, so it counts as completed
else:
    # Previously `success` stayed undefined (or stale) when the judge replied
    # with anything other than True/False.
    print('warning: unexpected judge reply:', repr(ifcorrect))
    success = False

total tokens: 676
total tokens: 146
error


In [18]:
# Inspect the per-sub-problem answers stored on the first path in `solution`.
solution[0][1]

['To find the total points scored by the Jaguars in the first half, we need to add up the points they scored in the first and second quarters:\n\n10-yard TD pass to WR Mike Sims-Walker = 6 points\n42-yard TD pass to TE Marcedes Lewis = 6 points\n\nTherefore, the total points scored by the Jaguars in the first half is 6 (from the first quarter) + 6 (from the second quarter) = 12 points.',
 'Sub-problem-Id: 1; Sub-problem: Find the total points scored by the Cowboys in the first half.; Answer: To find the total points scored by the Cowboys in the first half, we need to sum up the points they scored in the first and second quarters:\n\n34-yard field goal by Kicker David Buehler = 3 points\n\nTherefore, the total points scored by the Cowboys in the first half is 3 points.\n\nThe next sub-problem to solve is: Find the difference in points between the Cowboys and Jaguars at halftime.',
 "Sub-problem: Subtract the Cowboys' total points from the Jaguars' total points.\nCowboys' total points in

In [19]:
# Compare the model's final answer with the reference answer by eye.
print(finalResult)
print(gold_answer)

Sub-problem-Id: 2; Sub-problem: Subtract the Cowboys' total points from the Jaguars' total points.; Answer: 12 (Jaguars' total points) - 3 (Cowboys' total points) = 9 points. 

Thus, the Cowboys were trailing by 9 points at halftime.
11
