In [10]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import json
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.chains import LLMChain
from dotenv.main import load_dotenv


class ChatModelWorker:
    """Pair a ``ChatOpenAI`` model with an output parser and build ``LLMChain`` objects.

    The OpenAI API key is read from a text file each time a worker is
    constructed.
    """

    def __init__(self, output_parser, temperature=0, model='gpt-4', api_key_path='api_key.txt'):
        """Create the chat model.

        Args:
            output_parser: Parser whose ``get_format_instructions()`` output is
                injected into prompts as ``{format_instructions}``.
            temperature: Sampling temperature forwarded to ``ChatOpenAI``.
            model: Model name forwarded to ``ChatOpenAI`` as ``model_name``.
            api_key_path: Path of the file holding the OpenAI API key.

        Raises:
            OSError: If the key file cannot be opened.
        """
        with open(api_key_path, 'r') as f:
            # Key files usually end with a newline; surrounding whitespace is
            # not part of the credential and must not be sent along with it.
            apikey = f.read().strip()
        self.chat_model = ChatOpenAI(openai_api_key=apikey, model_name=model, temperature=temperature)
        self.output_parser = output_parser

    def prompt_temps(self, sys_temp, human_temp, format_instructions):
        """Build a two-message chat prompt (system + human) from template strings.

        ``format_instructions`` is bound as a partial variable so templates may
        reference ``{format_instructions}`` without callers supplying it at
        run time.
        """
        sys_msg_prompt = SystemMessagePromptTemplate.from_template(sys_temp)
        human_msg_prompt = HumanMessagePromptTemplate.from_template(human_temp)
        chat_prompt = ChatPromptTemplate(partial_variables={"format_instructions": format_instructions},
                                         messages=[sys_msg_prompt, human_msg_prompt])
        return chat_prompt

    def chain_generator(self, template, human_template):
        """Return an ``LLMChain`` combining this worker's model with the given templates.

        Args:
            template: System-message template string.
            human_template: Human-message template string.
        """
        format_instructions = self.output_parser.get_format_instructions()
        chain = LLMChain(
            llm=self.chat_model,
            prompt=self.prompt_temps(template, human_template, format_instructions)
        )
        return chain


def output_repraser(input_string):
    """Parse a model reply that wraps JSON in a Markdown code fence.

    Accepts a JSON payload optionally surrounded by backtick fences and an
    optional ``json`` language tag, and returns the decoded Python value.

    The fence is removed as a prefix/suffix. ``str.strip`` with a string
    argument removes a *set of characters*, which would also eat leading or
    trailing ``j``/``s``/``o``/``n`` characters of the payload itself (for
    example the payload ``null``).

    Args:
        input_string: Raw text returned by the model.

    Returns:
        The value produced by ``json.loads`` on the unfenced payload.

    Raises:
        json.JSONDecodeError: If the remaining text is not valid JSON.
    """
    text = input_string.strip()

    # Step 1: Drop the opening fence and the optional language tag.
    text = text.lstrip('`')
    if text[:4].lower() == 'json':
        text = text[4:]

    # Drop the closing fence, then any whitespace left around the payload.
    json_str = text.rstrip('`').strip()

    # Step 2: Parse the JSON string into a dictionary
    return json.loads(json_str)

#gpt-4-0125-preview,gpt-3.5-turbo-0125
def cot_agent(subject, question, temp=0, model_name='gpt-3.5-turbo-0125', max_retries=3):
    """Ask the chat model for a step-by-step answer to ``question``.

    The prompt requests three structured fields: "Chain of Thought",
    "Number of Steps Proposed" and "Final Answer".

    Args:
        subject: Domain inserted into the system prompt.
        question: Question text sent as the human message.
        temp: Sampling temperature for the model.
        model_name: OpenAI model name.
        max_retries: Maximum number of attempts before giving up (at least one
            attempt is always made).

    Returns:
        The value returned by ``chain.run``; it is not parsed here.

    Raises:
        Exception: The error from the last attempt when every attempt fails.
    """
    system_prompt = (
        "You are a professional specialized in {subject}. You need to help me answer the given question."
        "Notice that you need to solve the question step by step and as detailed as possible. Do not jump to the answer directly."
        "If it is a math question, please provide me the  detailed calculation in your steps, not just say the method!!!"
        "Your intermediate steps and thoughts are critical!. Also, maximum 10 steps allowed"
        "\n{format_instructions}")
    human_prompt = "{question}"

    response_schemas = [
        ResponseSchema(name="Chain of Thought",
                       description="Provide step by step analysis. For instance, you should follow the pattern 'step 1:... \nstep 2:...'"),
        ResponseSchema(name="Number of Steps Proposed",
                       description="return a simple integer output which indicates the number of steps you proposed in the Chain of Thought"),
        ResponseSchema(name="Final Answer",
                       description="Give me your final answer, if have options provided, just give me the option index. Follow"
                                   "The format [final_answer or correct_option_index]")
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Single bounded retry loop: a successful attempt returns immediately, so
    # no redundant model call is issued after a recovery, and KeyboardInterrupt
    # is no longer swallowed.
    last_error = None
    for _ in range(max(1, max_retries)):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject,
                             question=question)
        except Exception as e:
            print("Error:", e)
            last_error = e
    raise last_error


def ngram_checker_agent(subject, question, current_step, cot, temp=0, model_name='gpt-4-0125-preview',
                        max_retries=3):
    """Ask the chat model to verify the current reasoning step against the previous ones.

    The prompt requests five structured fields: "Steps", "Verification",
    "Corrected Step", "Step Correctness" and "Logic Consistency".

    Args:
        subject: Domain inserted into the system prompt.
        question: Original question, inserted into the system prompt.
        current_step: Index of the step under review.
        cot: The reasoning steps sent as the human message.
        temp: Sampling temperature for the model.
        model_name: OpenAI model name.
        max_retries: Maximum number of attempts before giving up (at least one
            attempt is always made).

    Returns:
        The value returned by ``chain.run``; it is not parsed here.

    Raises:
        Exception: The error from the last attempt when every attempt fails.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify my steps when I solve the question.
        I am currently at step #{current_step}.

        Is the current step logically or computationally correct in condition of all previous steps?. Notice that you should follow my throught process
        and determine the correctness using my approach.Also, if it involves equation computation, any format of equation should be considered as CORRECT
        as long as it holds. If my equation is slightly different than yours but if the result in correct computational result. It should be considered as CORRECT. 

        If it is correct, then verify that if my current step make the previous step hold. In other words, 
        check the logic consistency of step n in conditional of <step k to step n-1> where n is my current step and k is the first step number in the provided cot steps.
        provided thourght process. In the other words, is my current step supported by previous n-1 steps? It is important that for each analysis, ignore 
        steps other than the current step and the previous steps! In addition, for your reference, the question is given as {question}. 

        At step 1, since we have no step 0, instead the correctness and consistency should reflect if I correctly understood the answer.
        \n{format_instructions}
''')
    human_prompt = "Here is my complete thought process {cot}"

    response_schemas = [
        ResponseSchema(name="Steps",
                       description='''
                       States step indices on the current provided in the cot as [indices]
                       '''),
        ResponseSchema(name="Verification",
                       description='''
                        Help me verify the correctness and the logic consistency  of the current step, 
                       and tell me the reason. If they are due to the current step, say [Caused by step n], if due to previous
                       step, clearly indicate which step cause the inconsistency or incorrectness by saying [Caused by step a, b ...]
                       REASON is important. !!! If at Step 1, since 
                        we have no step 0, verify if I correctly understood the answer
                        '''),
        ResponseSchema(name="Corrected Step",
                       description='''
                        According to your analysis of correctness and consistency, help me revise the current step so that
                        it becomes correct and consistent.
                    '''),
        ResponseSchema(name="Step Correctness",
                       description='''
                       say [YES] if the logic is correct and the question is well-understood, otherwise [NO] .!!! If at Step 1, since 
                        we have no step 0, instead the correctness should reflect if I correctly understood the answer.
                        '''),
        ResponseSchema(name="Logic Consistency",
                       description='''
                       say [YES] if consistent, otherwise [NO].!!! If at Step 1, since 
                        we have no step 0, instead say [N/A]
                        ''')
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The previous version passed an undefined name ``final_answer`` to
    # ``chain.run``, so every call ended in NameError. Neither prompt template
    # references ``{final_answer}``, so the argument is simply dropped.
    last_error = None
    for _ in range(max(1, max_retries)):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject,
                             current_step=current_step, cot=cot, question=question)
        except Exception as e:
            print("Error:", e)
            last_error = e
    raise last_error


def variable_agent(subject, question, cot,current_step,previous_variables ,temp=0, model_name='gpt-4-0125-preview',
                   max_retries=3):
    """Ask the chat model to extract and update the key variables of the current step.

    The prompt requests five structured fields: "Current step",
    "Previous variable", "Current Variable", "Update Summarization" and
    "Used Formula".

    Args:
        subject: Domain inserted into the system prompt.
        question: Passed to the chain; neither template in this function
            references ``{question}``.
        cot: The reasoning steps, inserted into the human message.
        current_step: Index of the step under analysis.
        previous_variables: Variable list produced by an earlier call,
            inserted into the human message.
        temp: Sampling temperature for the model.
        model_name: OpenAI model name.
        max_retries: Maximum number of attempts before giving up (at least one
            attempt is always made).

    Returns:
        The value returned by ``chain.run``; it is not parsed here.

    Raises:
        Exception: The error from the last attempt when every attempt fails.
    """
    system_prompt = (
        '''
        You are a professional specialized in {subject}. You need to help me extract key variables when I solve the question.
        Your task is to extract the key variables from step analysis. Update the existing variable when necessary. My current
        step index is at {current_step}.
        \n{format_instructions}''')
    human_prompt = (
        '''Here is my complete thought process {cot} and here is the extract variable list from your buddy agent
           {previous_variables}. If the buddy agent did not give you anything, it implies that you are the first agent
           to analyze the step and define the initial variable based on the question.              
                    ''')

    response_schemas = [
        ResponseSchema(name="Current step",
                       description="State what is your current step: [current_step]"),
        ResponseSchema(name="Previous variable",
                       description='''
                       State what are the previous step that your buddy gave to you. If you are the first agent,
                       simple return [N/A] otherwise, return [previous variables]
                       '''),
        ResponseSchema(name="Current Variable",
                       description='''
                       based on your analysis of the current step and the previous variables (if applied), update the 
                       existing variable list, if necessary, you can define new variable. Your answer should look like:
                       [[variable1] = [value2] ---> [definition of the variable1],\n
                       [variable2] = [value2] ---> [definition of the variable1],\n...
                       ]
                       '''),
        ResponseSchema(name="Update Summarization",
                       description='''
                           You should clearly state what variable did you update and create by following the template:
                           [updated variables: [variable 1]: [old value] -> [new value]\n...
                           new variables: [new variable 1], [new variable 2]\n...
                           ]
                           '''),
        ResponseSchema(name="Used Formula",
                       description='''
                               You should state the formula use in the current step in terms of 
                                variables, following the template:
                               [Formula]: [formula 1], [formula 2] ...
                               If there is no formula, just return [Formula]: N/A
                               ''')
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Single bounded retry loop: a successful attempt returns immediately, so
    # no redundant model call is issued after a recovery, and KeyboardInterrupt
    # is no longer swallowed.
    last_error = None
    for _ in range(max(1, max_retries)):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject, cot=cot, current_step=current_step,
                             previous_variables=previous_variables,
                             question=question)
        except Exception as e:
            print("Error:", e)
            last_error = e
    raise last_error

def ngram_checker_agent2(subject, question, current_step, cot,extracted_var, temp=0, model_name='gpt-4-0125-preview',
                         max_retries=3):
    """Ask the chat model to verify the current step using extracted variables and formulas.

    The prompt requests five structured fields: "Verification",
    "Corrected Step", "Step Correctness", "Logic Consistency" and
    "Dependency".

    Args:
        subject: Domain inserted into the system prompt.
        question: Original question, inserted into the system prompt.
        current_step: Index of the step under review.
        cot: The reasoning steps, inserted into the human message.
        extracted_var: Variables and formulas, inserted into the human message.
        temp: Sampling temperature for the model.
        model_name: OpenAI model name.
        max_retries: Maximum number of attempts before giving up (at least one
            attempt is always made).

    Returns:
        The value returned by ``chain.run``; it is not parsed here.

    Raises:
        Exception: The error from the last attempt when every attempt fails.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify my steps when I solve the question.
        I am currently at step #{current_step}.

        Is the current step  computationally correct in condition of provided variables and formulas?. 
        To justify the correctness, please refer to the extracted variable and formula I provided to you.  
        YOU MUST NOTICE THAT MATHEMATICAL EQUIVALENT STATEMENTS ARE THE SAME EVEN THOUGH THEY HAVE DIFFERENT EXPRESSIONS!
        THEY ARE BOTH CORRECT!

        If it is correct, then verify that if my current step make the previous step hold. In other words, 
        check the logic consistency of step n in conditional of <step k to step n-1> where n is my current step and k is the first step number in the provided cot steps.
        provided thought process. In the other words, is my current step supported by previous n-1 steps? It is important that for each analysis, ignore 
        steps other than the current step and the previous steps! In addition, for your reference, the question is given as {question}. 

        At step 1, since we have no step 0, instead the correctness and consistency should reflect if I correctly understood the answer.
        \n{format_instructions}
''')
    human_prompt = "Here is my complete thought process {cot}, you can find all the variable and formula you need on {extracted_var}"

    response_schemas = [

        ResponseSchema(name="Verification",
                       description='''
                        Help me verify the correctness and the logic consistency  of the current step, 
                       and tell me the reason. 
                       REASON is important. The reasoning step should cite the variable and formula you use!!! If at Step 1, since 
                        we have no step 0, verify if I correctly understood the answer
                        '''),
        ResponseSchema(name="Corrected Step",
                       description='''
                        According to your analysis of correctness and consistency, help me revise the current step so that
                        it becomes correct and consistent.
                    '''),
        ResponseSchema(name="Step Correctness",
                       description='''
                       say [YES] if the logic and computation are correct and the question is well-understood, otherwise [NO] .!!! If at Step 1, since 
                        we have no step 0, instead the correctness should reflect if I correctly understood the answer.
                        Is the current step  computationally correct in condition of provided variables and formulas?. 
                        To justify the correctness, please refer to the extracted variable and formula I provided to you.  
                        '''),
        ResponseSchema(name="Logic Consistency",
                       description='''
                       say [YES] if consistent, otherwise [NO].!!! If at Step 1, since 
                        we have no step 0, instead say [N/A]
                        '''),
        ResponseSchema(name="Dependency",
                       description='''
                           Find which previous steps led to the incorrectness or inconsistency . The whole idea is to 
                           discuss that if the current step is incorrect or inconsistent, where did the error chain start
                           from. What previous steps are the root cause of the error. Follow the template:
                           [[Incorrectness] <- [Incorrect Previous Steps]\n
                           [Inconsistency] <- [Inconsistent Previous Steps]]
                           If no incorrectness or inconsistency, simply return [N/A]
                           '''),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Single bounded retry loop: a successful attempt returns immediately, so
    # no redundant model call is issued after a recovery, and KeyboardInterrupt
    # is no longer swallowed.
    last_error = None
    for _ in range(max(1, max_retries)):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject, extracted_var=extracted_var,
                             current_step=current_step, cot=cot, question=question)
        except Exception as e:
            print("Error:", e)
            last_error = e
    raise last_error


def root_checker_agent(subject, question, current_step, cot, temp=0, model_name='gpt-4-0125-preview',
                       max_retries=3):
    """Ask the chat model to check the current step for factuality and faithfulness errors.

    The prompt requests five structured fields: "Verification",
    "Step Hallucination", "Type of Hallucination", "Correction" and
    "Dependency".

    Args:
        subject: Domain inserted into the system prompt.
        question: Original question, inserted into the human message.
        current_step: Index of the step under review.
        cot: The reasoning steps, inserted into the human message.
        temp: Sampling temperature for the model.
        model_name: OpenAI model name.
        max_retries: Maximum number of attempts before giving up (at least one
            attempt is always made).

    Returns:
        The value returned by ``chain.run``; it is not parsed here.

    Raises:
        Exception: The error from the last attempt when every attempt fails.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify my steps when I solve the question.
        I am currently at step #{current_step}.
        
        Before you perform the task, I want you to keep in mind several definitions for my possible mistakes. 
        1. Factuality： This type of error emphasizes the discrepancy between generated content and verifable real-word facts, including
        factual inconsistency or fabrication. In mathematics for instance, it may represents the computational error.
        
        2. Faithfulness: This type of error refers to the divergence of my step analysis from the original question or 
        previous steps, as well as self-consistency within my steps. In mathematics for instance, it may represents that
        I understood the question wrongly or my proposed step is inconsistent with my previous step. 
        
        Based on my current step response, question, previous steps, and my error definitions, help me verify if any of 
        the mistakes (factuality or faithfulness) occur on my analysis. Notice that skipping step should not be considered
        as error as long as the calculation is correct! For instance, 2x+2 should be the same as 2+2x. Also
        2x+2+3 should be the same as 2x+5

    
        At step 1, since we have no step 0, instead the factuality and faithfulness check
         should reflect if I correctly understood the answer.
        \n{format_instructions}
''')
    human_prompt = "Here is my complete thought process {cot} and this is the original question {question}"

    response_schemas = [

        ResponseSchema(name="Verification",
                       description='''
                        Help me verify the factuality and the faithfulness  of the current step, 
                       and tell me the reason. 
                       REASON is important. The reasoning step should cite the variable and formula you use!!! If at Step 1, since 
                        we have no step 0, verify if I correctly understood the answer
                        '''),

        ResponseSchema(name="Step Hallucination",
                       description='''
                       say [YES] if the current step logic and computation are NOT factual or faithful 
                       based on the question and my previous steps, otherwise [NO] .!!! If at Step 1, since 
                        we have no step 0, check for the factuality and faithfulness of the current step only. 
                        '''),
        ResponseSchema(name="Type of Hallucination",
                       description='''
                       Identify if the step violated factuality or faithfulness or both. Return [None] if my current step
                       was correct.
                        '''),
        ResponseSchema(name="Correction",
                       description='''
                               If you think Step Hallucination is Yes, help me generate a corrected version of the current
                               step instead. Notice that do not simply identify the error here, instead
                               you should directly give me the correct version with calculation (if applicable)
                               Follow the format:
                               Step n : [Corrected version]                               
                               '''),
        ResponseSchema(name="Dependency",
                       description='''
                           Find which previous steps led to the unfactual or unfaithful . The whole idea is to 
                           discuss that if the current step is unfactual or unfaithful, where did the error chain start
                           from. What previous steps are the root cause of the error. Follow the template:
                           [[Unfactual] <- [Unfactual Previous Steps Indices]\n
                           [Unfaithful] <- [Unfaithful Previous Steps Indices]]
                           If it is caused by misunderstanding of question, then the dependency should be [Original Question]
                           If no unfactual or unfaithful, simply return [N/A]
                           '''),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Single bounded retry loop: a successful attempt returns immediately, so
    # no redundant model call is issued after a recovery, and KeyboardInterrupt
    # is no longer swallowed.
    last_error = None
    for _ in range(max(1, max_retries)):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject,
                             current_step=current_step, cot=cot, question=question)
        except Exception as e:
            print("Error:", e)
            last_error = e
    raise last_error


def debate_agent(subject, question, current_step, cot, response ,temp=0, model_name='gpt-4-0125-preview',
                 max_retries=3):
    """Ask the chat model to challenge corrections that other agents proposed for the current step.

    The prompt requests three structured fields: "Justification",
    "Agreement" and "Correction".

    Args:
        subject: Domain inserted into the system prompt.
        question: Original question, inserted into the human message.
        current_step: Index of the step under review.
        cot: The reasoning steps, inserted into the human message.
        response: The other agents' response, inserted into the human message.
        temp: Sampling temperature for the model.
        model_name: OpenAI model name.
        max_retries: Maximum number of attempts before giving up (at least one
            attempt is always made).

    Returns:
        The value returned by ``chain.run``; it is not parsed here.

    Raises:
        Exception: The error from the last attempt when every attempt fails.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify my steps when I solve the question.
        I am currently at step #{current_step}.

        Before you perform the task, I want you to keep in mind several definitions for my possible mistakes. 
        1. Factuality： This type of error emphasizes the discrepancy between generated content and verifable real-word facts, including
        factual inconsistency or fabrication. In mathematics for instance, it may represents the computational error.

        2. Faithfulness: This type of error refers to the divergence of my step analysis from the original question or 
        previous steps, as well as self-consistency within my steps. In mathematics for instance, it may represents that
        I understood the question wrongly or my proposed step is inconsistent with my previous step. 

        Other agents had helped me identify the error I made in the current step. You goal is to debate with the other
        agents and justify if their corrections were correct based on my question, thought process. Please use Critical
        Thinking. Please be harsh and picky since your verification is important!
        \n{format_instructions}
''')
    human_prompt = ("Here is my complete thought process {cot} and this is the original question {question}. The full"
                    "response from the other agents were given as {response}")

    response_schemas = [

        ResponseSchema(name="Justification",
                       description='''
                        Give me the reason that whether you think the other agents' correction to my step was correct.
                        '''),

        ResponseSchema(name="Agreement",
                       description='''
                       say [YES] if you agree with the other agents corrections to my current step analysis. Otherwise,
                       say [NO]
                        '''),

        ResponseSchema(name="Correction",
                       description='''
                               Help me generate a  version of the current
                               step that you think is correct
                               . Notice that do not simply identify the error here, instead
                               you should directly give me the correct version with calculation (if applicable)
                               
                               Step n : [Corrected version]                               
                               '''),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The previous retry branch referenced an undefined name ``response1`` and
    # therefore raised NameError instead of retrying. Every attempt now goes
    # through the same call with the ``response`` argument.
    last_error = None
    for _ in range(max(1, max_retries)):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject,
                             current_step=current_step, cot=cot, question=question, response=response)
        except Exception as e:
            print("Error:", e)
            last_error = e
    raise last_error


def judge_agent(subject, question, cots, model_name='gpt-4-0125-preview', max_retries=None):
    """Ask the chat model which of three chains of thought is the most logically sound.

    Args:
        subject: Domain inserted into the system prompt.
        question: Original question, inserted into the system prompt.
        cots: Indexable collection whose first three items are the chains of
            thought to compare.
        model_name: OpenAI model name.
        max_retries: Maximum number of attempts. ``None`` (the default) keeps
            retrying until an attempt succeeds.

    Returns:
        The "Selected COT" value parsed from the model's JSON reply.

    Raises:
        IndexError: If ``cots`` has fewer than three items.
        Exception: The error from the last attempt once ``max_retries``
            attempts have failed.
    """
    # Define the system and human prompts
    system_prompt = (
        '''
        You are a professional specialized in {subject}. Given three different thought process (COTs) 
        below for the question "{question}". Please analyze these COTs and provide your assessment 
        on which one is the most logically sound.
        \n{format_instructions}
        ''')
    human_prompt = "Here is the first COT: \n COT 1: {cot1}\n Here is the Second COT: \n COT 2: {cot2}\n Here is the Third COT:\n COT 3: {cot3} \n\nBased on your expertise, please select the COT that you believe is the most logically correct.\n"

    # Define response schemas
    response_schemas = [
        ResponseSchema(name="Selected COT",
                       description='''
                        Indicates the most logically correct Chain of Thought (COT) selected by the expert.
                        Please provide the index of the most correct COT, (output 1 if the first chain of thought most makes sense,
                        2 if the second chain of thought most makes sense, and 3 if the third chain of thought most makes sense.
                        If none of the chain makes sense, simply output None)
                        '''
                      )
    ]

    # Initialize a structured output parser
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Pick the three candidates to compare
    cot1, cot2, cot3 = cots[0], cots[1], cots[2]

    attempt = 0
    while True:
        attempt += 1
        try:
            # Initialize a ChatModelWorker
            worker = ChatModelWorker(output_parser=output_parser, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            # Run the chain
            output = chain.run(subject=subject, cot1=cot1, cot2=cot2, cot3=cot3, question=question)
            # Parsing is inside the try so a malformed reply is retried too
            return output_repraser(output)['Selected COT']
        except Exception as e:
            print("Error:", e)
            # Stop only when the caller asked for a bound
            if max_retries is not None and attempt >= max_retries:
                raise
def check_consistency(options):
    """Return True when all answers in ``options`` agree with each other.

    If every answer starts with an option letter (A-E, case-insensitive), the
    answers agree when they share the same first letter. Otherwise they agree
    only when they are all identical.

    Surrounding whitespace is ignored in both comparisons, and answers are
    converted with ``str`` first so numeric answers can be compared. An empty
    answer never counts as an option letter, and an empty ``options`` yields
    False.

    Args:
        options: Iterable of answers to compare.

    Returns:
        bool: True if the answers are consistent, False otherwise.
    """
    valid_options = {'A', 'B', 'C', 'D', 'E'}

    # Normalise once so the letter check and the comparison see the same text.
    cleaned = [str(option).strip() for option in options]
    if not cleaned:
        return False

    # Check if all options start with valid letters
    if all(text and text[0].upper() in valid_options for text in cleaned):
        # Compare if their first letters are the same
        return len({text[0].upper() for text in cleaned}) == 1

    # Check if the options are entirely identical
    return len(set(cleaned)) == 1


## Math Solving (Self_Check DB)

In [4]:
# Directory (relative path) that holds the preprocessed CSV files read below.
PREPROCESSED_FP = '../data/preprocessed'

In [5]:
import pandas as pd
import os

In [6]:
# Load the Self_Check and MMLU test question tables from the preprocessed directory.
df_self_check = pd.read_csv(os.path.join(PREPROCESSED_FP,'Self_Check.csv'))
df_mmlu = pd.read_csv(os.path.join(PREPROCESSED_FP,'MMLU_test.csv'))

In [7]:
# Notebook cell: display the Self_Check table.
df_self_check

Unnamed: 0,Name,Category,Question,Correct Answer
0,Self_Check_challenge_test,Challenging Math,there are 1000 buildings in a street . a sign ...,c
1,Self_Check_challenge_test,Challenging Math,a man bought 20 shares of rs . 50 at 5 discoun...,c
2,Self_Check_challenge_test,Challenging Math,"? % of 360 = 108 The options are: a ) 30 , b )...",a
3,Self_Check_challenge_test,Challenging Math,a corporation double its annual bonus to 100 o...,a
4,Self_Check_challenge_test,Challenging Math,a and b together do a work in 20 days . b and ...,b
...,...,...,...,...
47388,Self_Check_train_socratic,Math,"Very early this morning, Elise left home in a ...",5
47389,Self_Check_train_socratic,Math,Josh is saving up for a box of cookies. To rai...,3
47390,Self_Check_train_socratic,Math,Colin can skip at six times the speed that Bra...,4
47391,Self_Check_train_socratic,Math,"Janet, a third grade teacher, is picking up th...",308


In [9]:
# Notebook cell: display the MMLU table.
df_mmlu

Unnamed: 0,Name,Category,Question,Correct Answer
0,MMLU_test,high_school_government_and_politics,Which of the following best describes the bala...,D
1,MMLU_test,high_school_government_and_politics,Which of the following statements does NOT acc...,B
2,MMLU_test,high_school_government_and_politics,Which of the following plays the most signific...,B
3,MMLU_test,high_school_government_and_politics,What power was granted to the states by the Ar...,A
4,MMLU_test,high_school_government_and_politics,The primary function of political action commi...,A
...,...,...,...,...
14037,MMLU_test,prehistory,Archaeological evidence for the production of ...,B
14038,MMLU_test,prehistory,The presence of caribou bones found near the H...,B
14039,MMLU_test,prehistory,Experiments with stone spear points made to re...,A
14040,MMLU_test,prehistory,What is the date of the first evidence of the ...,B


## N-gram Modules

In [11]:
import numpy as np
def preprocess_samples(df_self_check, df_mmlu, sample_size=2, random_seed=42):
    """Draw ``sample_size`` random rows from each frame and stack them.

    Args:
        df_self_check: First DataFrame to sample from.
        df_mmlu: Second DataFrame to sample from.
        sample_size: Number of rows drawn from each frame.
        random_seed: Seed applied to NumPy's global RNG before sampling;
            pass ``None`` to leave the global RNG untouched.

    Returns:
        One DataFrame holding the rows sampled from ``df_self_check``
        followed by the rows sampled from ``df_mmlu``.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    # Sampling order is kept (self-check first, then MMLU) so that the shared
    # global RNG is consumed in the same sequence for a given seed.
    picked = [frame.sample(sample_size) for frame in (df_self_check, df_mmlu)]
    return pd.concat(picked)
temp_df = preprocess_samples(df_self_check, df_mmlu, sample_size=50, random_seed=42)

In [12]:


# # Initialize lists to store results
# categories = []
# questions = []
# correct_answers = []
# consistencies = []
# cots = []
# selected_answers = []
# cot_debug = []
# answers_debug = []
# # Iterate over rows of temp_df
# for _, sample in temp_df.iterrows():
#     subject = sample['Category']
#     question = sample['Question']
#     correct_answer = sample['Correct Answer']
#     results = [] # cot results
#     answers = [] # GPT Generated answers
#     for _ in range(3):
#         result = forward_agent(model_name='gpt-4-0125-preview', subject=subject, question=question)
#         answers_debug.append(result['Final Answer'])
#         answers.append(result['Final Answer']) # Need to find a better way to solve this
#         cot_debug.append(result['Chain of Thought'])
#         results.append(result['Chain of Thought'])
#     consistency = check_consistency(answers)
    
    
#     # Update cot based on judged_cot
#     if consistency:
#         cot = None  # If consistent, put NA
#         answer = result['Final Answer']
#     else:
#         judged_cot_str = judge_agent(subject, question, results)
#         print(judged_cot_str)
#         if(judged_cot_str != 'None'):
#             cot = results[int(judged_cot_str) - 1]  # Use the selected COT index\
#             answer = answers[int(judged_cot_str) - 1]
#         else: # None selected
#             cot = 'test'
#             answer = None
#     selected_answers.append(answer)
#     categories.append(subject)
#     questions.append(question)
#     correct_answers.append(correct_answer)
#     consistencies.append(consistency)
#     cots.append(cot)

# # Construct final dataframe
# final_df = pd.DataFrame({
#     'Category': categories,
#     'Question': questions,
#     'Correct_Answer': correct_answers,
#     'Output_Answer' : selected_answers,
#     'Consistency': consistencies,
#     'Cot': cots
# })

# print(final_df)



In [13]:
# import numpy as np

# # Fixed seed for reproducibility
# fixed_seed = 42

# # Initialize lists to store results
# final_answers = []

# # Iterate over rows of temp_df to generate final answers
# for _, sample in temp_df.iterrows():
#     subject = sample['Category']
#     question = sample['Question']
    
#     # Set seed for reproducibility
#     np.random.seed(fixed_seed)
    
#     # Call forward_agent again with fixed seed
#     final_result = forward_agent(model_name='gpt-4-0125-preview', subject=subject, question=question)
#     final_answer = final_result['Final Answer']
    
#     # Append final answer to list
#     final_answers.append(final_answer)

# # Add final_answers as a new column to final_df
# final_df['Final_Answer'] = final_answers


In [14]:
# final_df = pd.DataFrame({
#     'Category': categories,
#     'Question': questions,
#     'Correct_Answer': correct_answers,
#     'Output_Answer' : selected_answers,
#     'Consistency': consistencies,
#     'Cot': cots
# })

In [15]:
final_df = pd.read_csv('result.csv')

In [19]:
# Keep only the rows where both a chain of thought (Cot) and an Output_Answer are present.
test_samples = final_df[final_df.Cot.notna() & final_df.Output_Answer.notna()]

In [20]:
test_samples

Unnamed: 0,Category,Question,Correct_Answer,Output_Answer,Consistency,Cot,Final_Answer
2,Math,miller street begins at baker street and runs ...,c,C,False,step 1: Identify the total length of Miller St...,b
5,Math,a bucket contains 2 mixtures a and b in the ra...,a,d,False,step 1: Let the initial quantities of mixtures...,A
8,Math,"81 , 64 , 27 , 16 . . . The options are: a ) 1...",c,e,False,"step 1: Observe the given sequence: 81, 64, 27...",c
14,Math,"2 ^ 46655 mod 9 = ? The options are: a ) 4 , b...",b,e,False,step 1: Recognize that the problem asks for th...,e
17,Math,a tank with a volume of 30 cubic feet has one ...,b,b,False,step 1: Convert the volume of the tank from cu...,b ) 3456
24,Math,it takes 40 identical printing presses 15 hour...,c,b,False,step 1: Calculate the total work done by 40 pr...,c
31,Math,a man sells an article at a profit of 25 % . i...,e,e,False,step 1: Let the original cost price of the art...,70
34,Math,"on rainy mornings , mo drinks exactly n cups o...",b,b,False,step 1: Let's denote the number of rainy days ...,c
37,Math,a man whose speed is 7 kmph in still water row...,a,a,False,step 1: Identify the speed of the man in still...,c
41,Math,heinz produces tomato puree by boiling tomato ...,e,e,False,step 1: Identify the amount of water and solid...,b ) 2.5 liters .


In [39]:
# # Create a DataFrame from cot_debug and answers_debug
# debug_df = pd.DataFrame({
#     'Chain of Thought': cot_debug,
#     'Generated Answers': answers_debug
# })

# # Save the DataFrame to a CSV file
# debug_df.to_csv('debug_output.csv', index=False)
# final_df.to_csv('result.csv',index = False)

In [None]:
# Get the index of all in

In [21]:
final_df.Consistency.value_counts()

Consistency
True     82
False    18
Name: count, dtype: int64

In [None]:
test_samples.iterrows()

In [48]:
def correct_answer_agent(subject, cot, question, temp=0, model_name='gpt-4-0125-preview', max_attempts=None):
    """Ask the chat model to summarize a chain of thought and state its final answer.

    Args:
        subject: Domain name substituted into the system prompt.
        cot: Chain-of-thought text to summarize.
        question: The original question text.
        temp: Temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.
        max_attempts: Maximum number of model calls before giving up.
            ``None`` (default) keeps retrying indefinitely.

    Returns:
        The dict parsed by ``output_repraser`` from the model reply; the
        response schema requests the keys "Summarization" and "Final Answer".

    Raises:
        Exception: The error of the last attempt once ``max_attempts`` is
            exhausted. Errors raised while building the worker/chain are not
            retried and propagate immediately.
    """
    system_prompt = (
        '''
        You are a professional specialized in {subject}. Your task is help me summarize my thought process towards a 
        question, then help me return the final answer to the question based on my thought process.
        \n{format_instructions}
        ''')
    human_prompt = "Here is my question :{question}. And my thought process is given as {cot}"

    response_schemas = [
        ResponseSchema(name="Summarization",
                       description="Summarize my thought process"),
        ResponseSchema(name="Final Answer",
                       description="Give me your final answer based on my thought process , if have options provided, "
                                   "just give me the option index. Follow"
                                   "The format [final_answer or correct_option_index]")
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Built once: construction is loop-invariant, and a failure here (e.g. a
    # missing API key file) cannot be cured by retrying.
    worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
    chain = worker.chain_generator(system_prompt, human_prompt)

    attempts = 0
    while True:
        attempts += 1
        try:
            raw_reply = chain.run(subject=subject, cot=cot, question=question)
            return output_repraser(raw_reply)
        except Exception:
            # Only ordinary errors (API failures, malformed JSON) are retried;
            # KeyboardInterrupt/SystemExit are allowed to stop the loop.
            if max_attempts is not None and attempts >= max_attempts:
                raise
def revise_agent(subject, question, cot, reasoning, temp=0, model_name='gpt-4-0125-preview', max_attempts=None):
    """Ask the chat model to judge a chain of thought and return a corrected version.

    Args:
        subject: Domain name substituted into the system prompt.
        question: The original question text.
        cot: Chain-of-thought text to verify.
        reasoning: The debate agent's reasoning about ``cot``.
        temp: Temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.
        max_attempts: Maximum number of model calls before giving up.
            ``None`` (default) keeps retrying indefinitely.

    Returns:
        The dict parsed by ``output_repraser`` from the model reply; the
        response schema requests the keys "CoT Hallucination" and
        "Corrected CoT".

    Raises:
        Exception: The error of the last attempt once ``max_attempts`` is
            exhausted. Errors raised while building the worker/chain are not
            retried and propagate immediately.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify the factuality and faithfulness of my complete thought process (CoT) for solving the question.
        
        Before you perform the task, I want you to keep in mind several definitions for my possible mistakes. 
        1. Factuality： This type of error emphasizes the discrepancy between generated content and verifable real-word facts, including
        factual inconsistency or fabrication. In mathematics for instance, it may represents the computational error.
        
        2. Faithfulness: This type of error refers to the divergence of my step analysis from the original question or 
        previous steps, as well as self-consistency within my steps. In mathematics for instance, it may represents that
        I understood the question wrongly or my proposed step is inconsistent with my previous step. 
        
        Based on my complete CoT, question, and my error definitions, help me verify if any of 
        the mistakes (factuality or faithfulness) occur in my analysis.
        
        The debate_agent has provided their reasoning on whether the CoT is correct or not. Your goal is to consider their reasoning
        and generate a corrected version of the CoT if needed.
    
        \n{format_instructions}
''')
    human_prompt = ("Here is my complete thought process (CoT) {cot} and this is the original question {question}. "
                    "The debate_agent's reasoning is given as {reasoning}")

    response_schemas = [
        ResponseSchema(name="CoT Hallucination",
                       description='''
                       say [YES] if the complete CoT logic and computation are NOT factual or faithful 
                       based on the question and the debate_agent's reasoning, otherwise [NO].
                        '''),
        ResponseSchema(name="Corrected CoT",
                       description='''
                               If you think CoT Hallucination is Yes, help me generate a corrected version of the CoT.
                               Notice that do not simply identify the error here, instead
                               you should directly give me the correct version with calculation (if applicable)
                               If you think CoT Hallucination is No, simply give me the original version of the CoT.
                               
                               Corrected CoT:
                               [Corrected version]                               
                               '''),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Built once: construction is loop-invariant, and a failure here (e.g. a
    # missing API key file) cannot be cured by retrying.
    worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
    chain = worker.chain_generator(system_prompt, human_prompt)

    attempts = 0
    while True:
        attempts += 1
        try:
            raw_reply = chain.run(subject=subject, cot=cot, question=question, reasoning=reasoning)
            return output_repraser(raw_reply)
        except Exception:
            # Only ordinary errors (API failures, malformed JSON) are retried;
            # KeyboardInterrupt/SystemExit are allowed to stop the loop.
            if max_attempts is not None and attempts >= max_attempts:
                raise


def debate_agent(subject, question, cot, temp=0, model_name='gpt-4-0125-preview', max_attempts=None):
    """Ask the chat model to critique a chain of thought and say whether it is correct.

    Args:
        subject: Domain name substituted into the system prompt.
        question: The original question text.
        cot: Chain-of-thought text to critique.
        temp: Temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.
        max_attempts: Maximum number of model calls before giving up.
            ``None`` (default) keeps retrying indefinitely.

    Returns:
        The dict parsed by ``output_repraser`` from the model reply; the
        response schema requests the keys "Reasoning" and "Correctness".

    Raises:
        Exception: The error of the last attempt once ``max_attempts`` is
            exhausted. Errors raised while building the worker/chain are not
            retried and propagate immediately.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify the factuality and faithfulness of my complete thought process (CoT) for solving the question.

        Before you perform the task, I want you to keep in mind several definitions for my possible mistakes. 
        1. Factuality： This type of error emphasizes the discrepancy between generated content and verifable real-word facts, including
        factual inconsistency or fabrication. In mathematics for instance, it may represents the computational error.

        2. Faithfulness: This type of error refers to the divergence of my step analysis from the original question or 
        previous steps, as well as self-consistency within my steps. In mathematics for instance, it may represents that
        I understood the question wrongly or my proposed step is inconsistent with my previous step. 

        Your goal is to provide reasoning on whether the CoT is correct based on the question. 
        Please use Critical Thinking and spot out any major mistakes that would lead to a wrong answer.
        \n{format_instructions}
''')
    human_prompt = "Here is my complete thought process (CoT) {cot} and this is the original question {question}."

    response_schemas = [

        ResponseSchema(name="Reasoning",
                       description='''
                        Provide your reasoning on whether the CoT is correct based on the question. 
                        Justify your reasoning with specific examples from the CoT,question, and the error definition.
                        '''),

        ResponseSchema(name="Correctness",
                       description='''
                       say [YES] if you think the CoT is correct based on your reasoning. Otherwise,
                       say [NO]
                        '''),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Built once: construction is loop-invariant, and a failure here (e.g. a
    # missing API key file) cannot be cured by retrying.
    worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
    chain = worker.chain_generator(system_prompt, human_prompt)

    attempts = 0
    while True:
        attempts += 1
        try:
            raw_reply = chain.run(subject=subject, cot=cot, question=question)
            return output_repraser(raw_reply)
        except Exception:
            # Only ordinary errors (API failures, malformed JSON) are retried;
            # KeyboardInterrupt/SystemExit are allowed to stop the loop.
            if max_attempts is not None and attempts >= max_attempts:
                raise

In [47]:
cot

'step 1: Identify the total length of Miller Street, which is 4.5 kilometers or 4500 meters.\nstep 2: Determine the distance between each intersecting street, which is 250 meters.\nstep 3: Calculate the number of sections created by these intersections. Since the first and last streets (Baker and Turner) are not numbered and the streets are 250 meters apart, divide the total length by the distance between intersections: 4500 meters / 250 meters = 18.\nstep 4: Realize that the calculation in step 3 gives the total number of sections, not intersections. Since each section represents a block between streets crossing Miller Street, and considering the first intersection after Baker Street is the 1st numbered street and the last intersection before Turner Street would be the last numbered street, there are actually 17 numbered streets because the calculation includes the section at the beginning before the 1st numbered street and the section at the end before Turner Street.\nstep 5: Therefo

In [49]:
correct_answer_agent(subject, fixed_cot,question)

{'Summarization': "The thought process involves calculating the total number of sections created by intersecting streets along Miller Street, which is 4.5 kilometers long, with intersections every 250 meters. By dividing the total length by the distance between intersections, it's determined there are 18 sections. However, considering the first and last sections do not have numbered streets, the highest numbered street is identified as the 17th street.",
 'Final Answer': 'c'}

In [None]:
# Renumber the rows 0..n-1; the previous index is kept as a new column.
# NOTE(review): not idempotent - the later dump of test_samples shows both
# 'level_0' and 'index' columns, which suggests this cell was run twice.
test_samples = test_samples.reset_index()

In [53]:
max_tries = 5  # Set the maximum number of tries for each sample


def _verdict_is(value, expected):
    """Return True if an agent's YES/NO field equals `expected`.

    The response schemas ask the model to "say [YES]" / "[NO]", so a reply may
    arrive as 'YES', '[YES]', 'Yes', ...; case, surrounding whitespace and
    square brackets are therefore ignored.
    """
    return str(value).strip().strip('[]').strip().upper() == expected


def _normalized_answer(answer):
    """Return the stripped, lower-cased text form of an answer for comparison."""
    return str(answer).strip().lower()


# For every sample: let the debate agent critique the CoT and the revise agent
# rewrite it, until both accept it or max_tries rounds have been used; then
# derive a final answer from the resulting CoT and record it on test_samples.
for index, sample in test_samples.iterrows():
    print(f"\nProcessing sample {index+1}/{len(test_samples)}")
    subject = sample['Category']
    question = sample['Question']
    correct_answer = sample['Correct_Answer']
    cot = sample['Cot']

    fixed_cot = None
    tries = 0

    print(f"Original CoT: {cot}")

    while tries < max_tries:
        print(f"\nTry {tries+1}/{max_tries}")

        debate_response = debate_agent(subject, question, cot)
        reasoning = debate_response['Reasoning']
        print(f"Debate Agent - Correctness: {debate_response['Correctness']}")
        print(f"Debate Agent's reasoning : {reasoning}")

        # The reviser is consulted whatever the debate verdict is; the CoT is
        # accepted only when the debater says YES and the reviser reports NO
        # hallucination.
        revise_response = revise_agent(subject, question, cot, reasoning)
        if _verdict_is(debate_response['Correctness'], 'YES'):
            print(f"Revise Agent - CoT Hallucination: {revise_response['CoT Hallucination']}")
            if _verdict_is(revise_response['CoT Hallucination'], 'NO'):
                fixed_cot = revise_response['Corrected CoT']
                print(f"Fixed CoT obtained: {fixed_cot}")
                break

        cot = revise_response['Corrected CoT']  # new cot
        print(f"Revised CoT: {cot}")
        tries += 1

    if tries == max_tries:
        fixed_cot = cot  # Fall back to the latest revised CoT
        print(f"Maximum tries reached. Latest revised CoT: {fixed_cot}")

    if fixed_cot is not None:
        result = correct_answer_agent(subject, fixed_cot, question)
        final_answer = result['Final Answer']
        print(f"Final Answer: {final_answer}")
    else:
        final_answer = None
        print("No fixed CoT obtained.")

    test_samples.at[sample.name, 'Fixed CoT'] = fixed_cot
    test_samples.at[sample.name, 'Final Answer'] = final_answer

    if final_answer is not None:
        # Compare case- and whitespace-insensitively so that e.g. 'C' and 'c'
        # count as the same option.
        answer_match = _normalized_answer(final_answer) == _normalized_answer(correct_answer)
        test_samples.at[sample.name, 'Answer Match'] = answer_match
        print(f"Answer Match: {answer_match}")
    else:
        test_samples.at[sample.name, 'Answer Match'] = None
        print("Answer Match: None")


Processing sample 1/13
Original CoT: step 1: Identify the total length of Miller Street, which is 4.5 kilometers or 4500 meters.
step 2: Determine the distance between each intersecting street, which is 250 meters.
step 3: Calculate the number of sections created by these intersections. Since the first and last streets (Baker and Turner) are not numbered and the streets are 250 meters apart, divide the total length by the distance between intersections: 4500 meters / 250 meters = 18.
step 4: Realize that the calculation in step 3 gives the total number of sections, but since Baker and Turner streets are not numbered, the highest numbered street will be one less than the total number of sections.
step 5: Subtract 1 from the total number of sections to find the highest numbered street: 18 - 1 = 17.

Try 1/5
Debate Agent - Correctness: YES
Debate Agent's reasoning : The CoT correctly identifies the total length of Miller Street as 4500 meters and the distance between each intersecting st

In [64]:
import pandas as pd

# Case- and whitespace-insensitive agreement between 'Correct_Answer' and the
# 'Final Answer' column of test_samples.
expected = test_samples['Correct_Answer'].str.strip().str.lower()
predicted = test_samples['Final Answer'].str.strip().str.lower()
comparison = expected.eq(predicted)

# Each True counts as 1, so the sum is the number of matching rows
num_matches = comparison.sum()

# Show the per-row result followed by the overall score
print("Comparison Result:")
print(comparison)
print(f"\nNumber of Matches: {num_matches}/{len(test_samples)}")

Comparison Result:
0      True
1     False
2     False
3     False
4     False
5     False
6      True
7     False
8      True
9      True
10    False
11     True
12    False
dtype: bool

Number of Matches: 5/13


In [66]:
import pandas as pd

# Case- and whitespace-insensitive agreement between 'Correct_Answer' and the
# 'Output_Answer' column of test_samples.
expected = test_samples['Correct_Answer'].str.strip().str.lower()
predicted = test_samples['Output_Answer'].str.strip().str.lower()
comparison = expected.eq(predicted)

# Each True counts as 1, so the sum is the number of matching rows
num_matches = comparison.sum()

# Show the per-row result followed by the overall score
print("Comparison Result:")
print(comparison)
print(f"\nNumber of Matches: {num_matches}/{len(test_samples)}")

Comparison Result:
0      True
1     False
2     False
3     False
4      True
5     False
6      True
7      True
8      True
9      True
10    False
11     True
12    False
dtype: bool

Number of Matches: 7/13


In [72]:
import pandas as pd

# Case- and whitespace-insensitive agreement between 'Correct_Answer' and the
# 'Final_Answer' column of test_samples (underscore spelling, distinct from
# the 'Final Answer' column).
expected = test_samples['Correct_Answer'].str.strip().str.lower()
predicted = test_samples['Final_Answer'].str.strip().str.lower()
comparison = expected.eq(predicted)

# Each True counts as 1, so the sum is the number of matching rows
num_matches = comparison.sum()

# Show the per-row result followed by the overall score
print("Comparison Result:")
print(comparison)
print(f"\nNumber of Matches: {num_matches}/{len(test_samples)}")

Comparison Result:
0     False
1      True
2      True
3     False
4     False
5      True
6     False
7     False
8     False
9     False
10    False
11     True
12    False
dtype: bool

Number of Matches: 4/13


In [73]:
import pandas as pd

# Case- and whitespace-insensitive agreement between 'Correct_Answer' and the
# 'Final Answer' column of test_samples.
expected = test_samples['Correct_Answer'].str.strip().str.lower()
predicted = test_samples['Final Answer'].str.strip().str.lower()
comparison = expected.eq(predicted)

# Each True counts as 1, so the sum is the number of matching rows
num_matches = comparison.sum()

# Show the per-row result followed by the overall score
print("Comparison Result:")
print(comparison)
print(f"\nNumber of Matches: {num_matches}/{len(test_samples)}")

Comparison Result:
0      True
1     False
2     False
3     False
4     False
5     False
6      True
7     False
8      True
9      True
10    False
11     True
12    False
dtype: bool

Number of Matches: 5/13


In [71]:
test_samples

Unnamed: 0,level_0,index,Category,Question,Correct_Answer,Output_Answer,Consistency,Cot,Final_Answer,Fixed CoT,Final Answer,Answer Match
0,0,2,Math,miller street begins at baker street and runs ...,c,C,False,step 1: Identify the total length of Miller St...,b,step 1: Identify the total length of Miller St...,c,True
1,1,5,Math,a bucket contains 2 mixtures a and b in the ra...,a,d,False,step 1: Let the initial quantities of mixtures...,A,step 1: Let the initial quantities of mixtures...,d,False
2,2,8,Math,"81 , 64 , 27 , 16 . . . The options are: a ) 1...",c,e,False,"step 1: Observe the given sequence: 81, 64, 27...",c,"step 1: Observe the given sequence: 81, 64, 27...",The correct answer is not listed among the opt...,False
3,3,14,Math,"2 ^ 46655 mod 9 = ? The options are: a ) 4 , b...",b,e,False,step 1: Recognize that the problem asks for th...,e,step 1: Recognize that the problem asks for th...,e,False
4,4,17,Math,a tank with a volume of 30 cubic feet has one ...,b,b,False,step 1: Convert the volume of the tank from cu...,b ) 3456,step 1: Convert the volume of the tank from cu...,1,False
5,5,24,Math,it takes 40 identical printing presses 15 hour...,c,b,False,step 1: Calculate the total work done by 40 pr...,c,step 1: Calculate the total work done by 40 pr...,b ) 18,False
6,6,31,Math,a man sells an article at a profit of 25 % . i...,e,e,False,step 1: Let the original cost price of the art...,70,step 1: Let the original cost price of the art...,e,True
7,7,34,Math,"on rainy mornings , mo drinks exactly n cups o...",b,b,False,step 1: Let's denote the number of rainy days ...,c,step 1: Let's denote the number of rainy days ...,b) 4,False
8,8,37,Math,a man whose speed is 7 kmph in still water row...,a,a,False,step 1: Identify the speed of the man in still...,c,step 1: Identify the speed of the man in still...,a,True
9,9,41,Math,heinz produces tomato puree by boiling tomato ...,e,e,False,step 1: Identify the amount of water and solid...,b ) 2.5 liters .,step 1: Identify the amount of water and solid...,e,True
