In [22]:
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import json
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.chains import LLMChain
from dotenv.main import load_dotenv



class ChatModelWorker:
    """Build LangChain ``LLMChain`` objects around a ``ChatOpenAI`` model.

    The OpenAI API key is read from ``api_key.txt`` (relative to the current
    working directory) every time an instance is created.
    """

    def __init__(self, output_parser, temperature=0, model='gpt-4'):
        """Create the underlying chat model.

        Args:
            output_parser: Object exposing ``get_format_instructions()``; it is
                stored and later used by ``chain_generator``.
            temperature: Sampling temperature forwarded to ``ChatOpenAI``.
            model: Model name forwarded to ``ChatOpenAI`` as ``model_name``.

        Raises:
            OSError: If ``api_key.txt`` cannot be opened.
        """
        with open('api_key.txt', 'r') as f:
            # Strip surrounding whitespace: text editors usually leave a
            # trailing newline, which must not become part of the key.
            apikey = f.read().strip()
        self.chat_model = ChatOpenAI(openai_api_key=apikey, model_name=model, temperature=temperature,
                                     model_kwargs={"seed": 666})
        self.output_parser = output_parser

    def prompt_temps(self, sys_temp, human_temp, format_instructions):
        """Combine a system and a human template into one chat prompt.

        Args:
            sys_temp: Template text for the system message.
            human_temp: Template text for the human message.
            format_instructions: Value pre-bound to the
                ``{format_instructions}`` placeholder as a partial variable.

        Returns:
            The assembled ``ChatPromptTemplate``.
        """
        sys_msg_prompt = SystemMessagePromptTemplate.from_template(sys_temp)
        human_msg_prompt = HumanMessagePromptTemplate.from_template(human_temp)
        chat_prompt = ChatPromptTemplate(partial_variables={"format_instructions": format_instructions},
                                         messages=[sys_msg_prompt, human_msg_prompt])
        return chat_prompt

    def chain_generator(self, template, human_template):
        """Return an ``LLMChain`` using this worker's model and the given templates.

        The format instructions are taken from ``self.output_parser``; the
        parser itself is not attached to the chain.

        Args:
            template: System-message template text.
            human_template: Human-message template text.
        """
        format_instructions = self.output_parser.get_format_instructions()
        chain = LLMChain(
            llm=self.chat_model,
            prompt=self.prompt_temps(template, human_template, format_instructions)
        )
        return chain


def output_repraser(input_string):
    """Parse a JSON payload that may be wrapped in a Markdown code fence.

    Accepts either bare JSON or JSON wrapped as ```` ```json ... ``` ````
    (the language tag is optional) and returns the decoded Python value.

    The previous implementation used ``str.strip('```json\\n')``, which
    strips a *set of characters* rather than a prefix and could therefore
    remove leading/trailing ``j``, ``s``, ``o`` or ``n`` characters that
    belong to the payload itself (for example a fenced ``null``).

    Args:
        input_string: Raw text, typically the output of an LLM chain.

    Returns:
        The value produced by ``json.loads`` (a ``dict`` for the structured
        outputs used in this file).

    Raises:
        json.JSONDecodeError: If the remaining text is not valid JSON.
    """
    # Backticks and whitespace can never begin or end a JSON document, so
    # removing them from both ends is always safe.
    json_str = input_string.strip().strip('`').strip()

    # Drop the fence's language tag exactly once, as a prefix. Valid JSON
    # never starts with the bare word "json".
    if json_str.startswith('json'):
        json_str = json_str[len('json'):]

    return json.loads(json_str.strip())


def root_checker_agent(subject, question, current_step, cot, temp=0, model_name='gpt-4-0125-preview'):
    """Ask the chat model to verify the current reasoning step.

    The prompt defines "factuality" and "faithfulness" errors and requests a
    structured answer with the fields ``Verification``, ``Step Hallucination``,
    ``Type of Hallucination``, ``Correction`` and ``Dependency``.

    Args:
        subject: Fills the ``{subject}`` placeholder of the system prompt.
        question: Fills the ``{question}`` placeholder of the human prompt.
        current_step: Fills the ``{current_step}`` placeholder.
        cot: Fills the ``{cot}`` placeholder (the thought process so far).
        temp: Temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.

    Returns:
        Whatever ``chain.run`` returns. The output parser is only used to
        generate the format instructions; the result is not parsed here.

    Raises:
        Exception: The error of the second attempt when both attempts fail.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify my steps when I solve the question.
        I am currently at step #{current_step}.

        Before you perform the task, I want you to keep in mind several definitions for my possible mistakes. 
        1. Factuality： This type of error emphasizes the discrepancy between generated content and verifable real-word facts, including
        factual inconsistency or fabrication. In mathematics for instance, it may represents the computational error.

        2. Faithfulness: This type of error refers to the divergence of my step analysis from the original question or 
        previous steps, as well as self-consistency within my steps. In mathematics for instance, it may represents that
        I understood the question wrongly or my proposed step is inconsistent with my previous step. 

        Based on my current step response, question, previous steps, and my error definitions, help me verify if any of 
        the mistakes (factuality or faithfulness) occur on my analysis. Notice that skipping step should not be considered
        as error as long as the calculation is correct! For instance, 2x+2 should be the same as 2+2x. Also
        2x+2+3 should be the same as 2x+5


        At step 1, since we have no step 0, instead the factuality and faithfulness check
         should reflect if I correctly understood the answer.
         
         Do not detect any minor hallucinations! In other words, only targeting the mistakes that contain calculation error
         or apparent logical flaw or contradict real-world facts! If the provided step acknowledge mistake, you need to 
         capture it and correct it.
         
         If you see any step ended up with *<verified>* it means it have been checked without any mistake, so just consider
         it as correct!!!  
        \n{format_instructions}
''')
    human_prompt = "Here is my complete thought process {cot} and this is the original question {question}"

    response_schemas = [

        ResponseSchema(name="Verification",
                       description='''
                        Help me verify the factuality and the faithfulness  of the current step, 
                       and tell me the reason. 
                       REASON is important. The reasoning step should cite the variable and formula you use!!! If at Step 1, since 
                        we have no step 0, verify if I correctly understood the answer
                        If you see any step ended up with *<verified>* it means it have been checked without any mistake, so just consider
         it as correct!!!  
                        '''),

        ResponseSchema(name="Step Hallucination",
                       description='''
                       say [YES] if the current step logic and computation are NOT factual or faithful 
                       based on the question and my previous steps, otherwise [NO] .!!! If at Step 1, since 
                        we have no step 0, check for the factuality and faithfulness of the current step only. 
                        If you see any step ended up with *<verified>* it means it have been checked without any mistake, so just consider
         it as correct!!!  
                        '''),
        ResponseSchema(name="Type of Hallucination",
                       description='''
                       Identify if the step violated factuality or faithfulness or both. Return [None] if my current step
                       was correct.
                       If you see any step ended up with *<verified>* it means it have been checked without any mistake, so just consider
         it as correct!!!  
                        '''),
        ResponseSchema(name="Correction",
                       description='''
                               If you think Step Hallucination is Yes, help me generate a corrected version of the current
                               step instead. Notice that do not simply identify the error here, instead
                               you should directly give me the correct version with calculation (if applicable)
                               Follow the format:
                               'Step n : [Corrected version]'                               
                               '''),
        ResponseSchema(name="Dependency",
                       description='''
                           Find which previous steps led to the unfactual or unfaithful . The whole idea is to 
                           discuss that if the current step is unfactual or unfaithful, where did the error chain start
                           from. What previous steps are the root cause of the error. Follow the template:
                           [[Unfactual] <- [Unfactual Previous Steps Indices]\n
                           [Unfaithful] <- [Unfaithful Previous Steps Indices]]
                           If it is caused by misunderstanding of question, then the dependency should be [Original Question]
                           If no unfactual or unfaithful, simply return [N/A]
                           '''),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # One retry on failure, as before, but the successful retry is returned
    # directly instead of triggering a further redundant request, and
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject,
                             current_step=current_step, cot=cot, question=question)
        except Exception:
            if attempt == max_attempts:
                raise


def debate_agent(subject, question, current_step, cot, response, temp=0, model_name='gpt-4-0125-preview'):
    """Ask the chat model to debate another agent's correction of the current step.

    The structured answer requested from the model has the fields
    ``Justification``, ``Agreement`` and ``Correction``.

    Args:
        subject: Fills the ``{subject}`` placeholder of the system prompt.
        question: Fills the ``{question}`` placeholder of the human prompt.
        current_step: Fills the ``{current_step}`` placeholder.
        cot: Fills the ``{cot}`` placeholder (the thought process so far).
        response: Fills the ``{response}`` placeholder (the other agents'
            full response).
        temp: Temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.

    Returns:
        Whatever ``chain.run`` returns. The output parser is only used to
        generate the format instructions; the result is not parsed here.

    Raises:
        Exception: The error of the second attempt when both attempts fail.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify my steps when I solve the question.
        I am currently at step #{current_step}.

        Before you perform the task, I want you to keep in mind several definitions for my possible mistakes. 
        1. Factuality： This type of error emphasizes the discrepancy between generated content and verifable real-word facts, including
        factual inconsistency or fabrication. In mathematics for instance, it may represents the computational error.

        2. Faithfulness: This type of error refers to the divergence of my step analysis from the original question or 
        previous steps, as well as self-consistency within my steps. In mathematics for instance, it may represents that
        I understood the question wrongly or my proposed step is inconsistent with my previous step. 

        Other agents had helped me identify the error I made in the current step. You goal is to debate with the other
        agents and justify if their corrections were correct based on my question, thought process. Please use Critical
        Thinking.
        \n{format_instructions}
''')
    human_prompt = ("Here is my complete thought process {cot} and this is the original question {question}. The full"
                    "response from the other agents were given as {response}")

    response_schemas = [

        ResponseSchema(name="Justification",
                       description='''
                        Give me the your response to the other agent and justify
                         that whether you think the other agents' correction to my step was correct.
                        '''),

        ResponseSchema(name="Agreement",
                       description='''
                       say [YES] if you agree with the other agents corrections to my current step analysis. Otherwise,
                       say [NO]
                        '''),

        ResponseSchema(name="Correction",
                       description='''
                               Help me generate a  version of the current
                               step that you think is correct
                               . Notice that do not simply identify the error here, instead
                               you should directly give me the correct version with calculation (if applicable)

                               'Step n : [Corrected version] *<verified>*'                                 
                               '''),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # One retry on failure, as before, but the successful retry is returned
    # directly instead of triggering a further redundant request, and
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject,
                             current_step=current_step, cot=cot, question=question, response=response)
        except Exception:
            if attempt == max_attempts:
                raise


def correct_answer_agent_partial_cot(subject, cot,question, temp=0, model_name='gpt-4-0125-preview'):
    """Ask the chat model to continue a partial thought process and answer.

    The structured answer requested from the model has the fields
    ``Complete Thought Process`` and ``Final Answer``.

    Args:
        subject: Fills the ``{subject}`` placeholder of the system prompt.
        cot: Fills the ``{cot}`` placeholder (the initial thought process).
        question: Fills the ``{question}`` placeholder of the human prompt.
        temp: Temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.

    Returns:
        Whatever ``chain.run`` returns. The output parser is only used to
        generate the format instructions; the result is not parsed here.

    Raises:
        Exception: The error of the second attempt when both attempts fail.
    """
    system_prompt = (
        '''
        You are a professional specialized in {subject}. Your task is help me answer the question based on my initial 
        thoughts. I will provide you several steps of my attempt. Your task is to CONTINUE my thought process and then 
        answer my question step by step. Also, maximum 12 steps allowed and you can assume my initial thoughts had been
        checked since could be trusted. Remember, your response should based on my initial thoughts!
        \n{format_instructions}
        ''')
    human_prompt = "Here is my question :{question}. And my intial thought process is given as {cot}"

    response_schemas = [
        ResponseSchema(name="Complete Thought Process",
                       description="Continue my thought process in order to answer the question,"
                                   "You must include my initial thought process as well and leave them as what they are"
                                   "Return the complete chain of thought by following the format:"
                                   "Step n: [step process]."),
        ResponseSchema(name="Final Answer",
                       description="Give me your final answer based on my thought process , if have options provided, "
                                   "just give me the option index. Follow"
                                   "The format [final_answer or correct_option_index]")
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # One retry on failure, as before, but the successful retry is returned
    # directly instead of triggering a further redundant request, and
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject, cot=cot,
                             question=question)
        except Exception:
            if attempt == max_attempts:
                raise


def debate_whole_agent(subject, question, cot, temp=0, model_name='gpt-4-0125-preview'):
    """Ask the chat model to critique and rewrite a complete thought process.

    The structured answer requested from the model has the fields
    ``Justification``, ``Step Verification``, ``Corrected COT``, ``Updates``
    and ``Final Answer``.

    Args:
        subject: Fills the ``{subject}`` placeholder of the system prompt.
        question: Fills the ``{question}`` placeholder of the human prompt.
        cot: Fills the ``{cot}`` placeholder (the other agents' thought process).
        temp: Temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.

    Returns:
        Whatever ``chain.run`` returns. The output parser is only used to
        generate the format instructions; the result is not parsed here.

    Raises:
        Exception: The error of the second attempt when both attempts fail.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify the other agents' thought process 
        when they solve the question.

        Before you perform the task, I want you to keep in mind several definitions for my possible mistakes. 
        1. Factuality： This type of error emphasizes the discrepancy between generated content and verifable real-word facts, including
        factual inconsistency or fabrication. In mathematics for instance, it may represents the computational error.

        2. Faithfulness: This type of error refers to the divergence of my step analysis from the original question or 
        previous steps, as well as self-consistency within my steps. In mathematics for instance, it may represents that
        I understood the question wrongly or my proposed step is inconsistent with my previous step. 

        Therefore, I need you to critically debate with the other agents. Your goal is to check their thought process,
        identify which step made mistakes and what type of hallucination were those. Then , generate your own version
        based on your justification and their thought process. Finally, generate a log to state your updates.
        \n{format_instructions}
''')
    human_prompt = ("Here is their complete thought process {cot} and this is the original question {question}. ")

    response_schemas = [

        ResponseSchema(name="Justification",
                       description='''
                        Give me the your response to the other agent and justify
                         that whether you think the other agents' thought process to solve the question was correct.
                        '''),

        ResponseSchema(name="Step Verification",
                       description='''
                       State which step had problem and what type by following the format:
                       [step n]: [Mistake type (factuality or faithfulness)]
                        '''),

        ResponseSchema(name="Corrected COT",
                       description='''
                               Help me generate a new version of thought process to solve the question. You should follow
                               the format: 
                               [Step n]: [Step process]                             
                               '''),
        ResponseSchema(name="Updates",
                       description='''
                                   Give me a log history regarding your updates by stating what you changed and what are
                                   the new steps you proposed by following the format:
                                   'Updated Steps': [step indices],
                                   'New Steps': [step indices]                            
                                   '''),
        ResponseSchema(name="Final Answer",
                       description="Give me your final answer based your revised thought process , if have options provided, "
                                   "just give me the option index. Follow"
                                   "The format [final_answer or correct_option_index]")
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # One retry on failure, as before, but the successful retry is returned
    # directly instead of triggering a further redundant request, and
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject,
                             cot=cot, question=question)
        except Exception:
            if attempt == max_attempts:
                raise


In [23]:
import os
import pandas as pd
import re
from collections import Counter
from llm_agents import *

# Directory that holds the preprocessed CSV datasets.
PREPROCESSED_FP = '../data/preprocessed'


def load_df(dataset_fp):
    """Read a CSV from ``PREPROCESSED_FP`` and print its category distribution.

    Args:
        dataset_fp: File name (or path) joined onto ``PREPROCESSED_FP``.

    Returns:
        The loaded ``pandas.DataFrame``; it must have a ``Category`` column.
    """
    csv_path = os.path.join(PREPROCESSED_FP, dataset_fp)
    frame = pd.read_csv(csv_path)
    print('------------------------------------------------------')
    category_counts = Counter(frame.Category)
    print(f'The distribution of category in {dataset_fp} is:\n{category_counts}')
    return frame




def generate_new_response(subject, question,cot, max_retries=5):
    """Continue a partial thought process and return the new CoT and answer.

    Calls ``correct_answer_agent_partial_cot`` and parses its output with
    ``output_repraser``. Previously the agent was called once and the *same*
    text was re-parsed in a loop, so an unparsable reply hung forever; now a
    fresh reply is requested for every attempt and the number of attempts is
    bounded.

    Args:
        subject: Forwarded to the agent.
        question: Forwarded to the agent.
        cot: Partial thought process forwarded to the agent.
        max_retries: Maximum number of request/parse attempts.

    Returns:
        Tuple ``(cot, final_answer)``: the two values of the parsed reply, in
        the order they appear in it.

    Raises:
        RuntimeError: If no attempt yields a reply that parses into exactly
            two values.
    """
    last_error = None
    for _ in range(max_retries):
        result = correct_answer_agent_partial_cot(subject=subject, question=question,cot=cot)
        try:
            forward_result = output_repraser(result)
            # Validate the shape here so a malformed reply triggers a retry
            # instead of crashing after the loop.
            new_cot, final_answer = forward_result.values()
        except Exception as err:
            last_error = err
            continue
        break
    else:
        raise RuntimeError(
            f'Could not obtain a parsable response after {max_retries} attempts') from last_error

    print('------------------------------------------------------')
    for key, value in forward_result.items():
        print(key)
        print(value)
    print('------------------------------------------------------')
    return new_cot, final_answer

def self_correct_complete(cot, steps, question, ngram=1, subject=None, max_retries=5):
    """Check a chain of thought step by step and correct the first bad step.

    For step ``i`` the checker agent sees ``cot[:i + 1]``. As soon as a step is
    flagged with ``'Step Hallucination' == 'YES'`` it is replaced by the
    ``'Correction'`` returned from ``multi_agents_debate`` and checking stops.

    Args:
        cot: List of step strings.
        steps: Number of steps to check (converted with ``int``).
        question: The original question, forwarded to the agents.
        ngram: Unused; kept for interface compatibility.
        subject: Subject forwarded to the agents. When ``None`` the
            module-level global ``subject`` is used, which is what this
            function implicitly relied on before.
        max_retries: Maximum number of checker attempts per step.

    Returns:
        Tuple ``(check_list, masked_cot)``: the ``'Step Hallucination'`` value
        of every checked step, and the steps up to and including the last
        checked one (with the flagged step replaced, if any). ``masked_cot`` is
        an empty list when ``steps`` is 0.

    Raises:
        NameError: If ``subject`` is not given and no global ``subject`` exists.
        RuntimeError: If a step cannot be checked within ``max_retries`` attempts.
    """
    if subject is None:
        # Legacy behaviour: fall back to the global set by the script's main
        # loop, but fail loudly instead of retrying forever on a NameError.
        try:
            subject = globals()['subject']
        except KeyError:
            raise NameError("name 'subject' is not defined; pass subject=...") from None

    check_list = []
    masked_cot = []  # defined even when there are no steps to check
    for i in range(int(steps)):
        current_step = i + 1

        # Slice copy: the caller's list is never mutated.
        masked_cot = cot[:i+1]

        last_error = None
        for _ in range(max_retries):
            try:
                conditional_check_result = root_checker_agent(subject=subject, current_step=current_step,
                                                              cot=masked_cot, question=question)
                response = output_repraser(conditional_check_result)
            except Exception as err:
                last_error = err
                continue
            break
        else:
            raise RuntimeError(
                f'Step {current_step} could not be checked after {max_retries} attempts') from last_error

        print(f'Step {current_step}', response, '\n\n')
        check_list.append((response['Step Hallucination']))
        if (response['Step Hallucination'] == 'YES'):
            debate_response = multi_agents_debate(subject,current_step,masked_cot,question,response)
            print('Old Version: ', masked_cot[i])
            masked_cot[i] = debate_response['Correction']
            print('Corrected Version', masked_cot[i])
            break
    print('------------------------------------------------------')
    print(masked_cot)
    print('------------------------------------------------------')

    return check_list, masked_cot


def standardize_answer(answer):
    """Normalise an answer string so answers can be compared.

    If the stripped answer starts with a letter immediately followed by a
    non-word character (e.g. ``"B) ..."`` or ``"b."``), only that letter is
    returned in lower case. Otherwise the whole answer is returned in lower
    case with surrounding whitespace removed.

    The fallback branch used to skip the stripping, so ``" B "`` became
    ``" b "`` rather than ``"b"``.

    Args:
        answer: The answer text.

    Returns:
        The normalised answer string.
    """
    cleaned = answer.strip()

    # Multiple-choice style: a single letter followed by punctuation/space.
    if re.match(r'^[a-zA-Z]\W.*$', cleaned):
        return cleaned[0].lower()

    # For other cases, return the (trimmed) answer in lower case.
    return cleaned.lower()

def multi_agents_debate(subject,current_step,masked_cot,question,response, max_retries=5):
    """Run up to three debate rounds about a flagged step.

    Each round sends the latest response to ``debate_agent`` and parses the
    reply with ``output_repraser``. The first reply whose ``'Agreement'`` is
    ``'YES'`` ends the debate and is returned. If no round agrees, the
    response originally passed in is returned unchanged.

    Args:
        subject: Forwarded to ``debate_agent``.
        current_step: Forwarded to ``debate_agent``.
        masked_cot: Thought process so far, forwarded as ``cot``.
        question: Forwarded to ``debate_agent``.
        response: The response to debate in the first round.
        max_retries: Maximum number of request/parse attempts per round.

    Returns:
        The agreed debate reply, or the initial ``response``.

    Raises:
        RuntimeError: If a round yields no usable reply within ``max_retries``
            attempts.
    """
    final_response = response
    print('Start Debating')
    attempts = 0
    counter = 0
    while (attempts < 1) and counter <=2:
        print('attempt:',attempts)

        last_error = None
        for _ in range(max_retries):
            try:
                debate = debate_agent(subject=subject, current_step=current_step, cot=masked_cot,
                                      question=question,response = response)
                parsed = output_repraser(debate)
                # Validate before adopting the reply so that a malformed one
                # is never fed into the next request.
                agreed = parsed['Agreement'] == 'YES'
            except Exception as err:
                last_error = err
                continue
            break
        else:
            raise RuntimeError(
                f'Debate round failed after {max_retries} attempts') from last_error

        response = parsed
        print('\n\n\n',response,'\n\n\n')
        if agreed:
            final_response = response
            attempts += 1
        counter += 1
    return final_response





if __name__ == '__main__':
    # Run configuration. Only 'ngram' and 'num_agents' are read below;
    # 'dataset_fp' and 'test_case_number' are not used in this cell.
    config = {
        'dataset_fp': 'Self_Check.csv',
        'test_case_number': range(0,146),
        'ngram': 'all',
        'num_agents': 1
    }

    # Column-wise accumulator for the result CSV (one entry per processed row).
    result_df_dict = {
        'CaseID': [],
        'Question': [],
        'Correct Answer':[],
        'Raw COT Answer':[],
        'Corrected COT Answer': [],
        'Hallu Seq':[],
        'raw_cot':[],
        'corrected_cot': []
    }

    df_raw = pd.read_csv('../data/3-19_data.csv')
    df = df_raw.loc[df_raw.Consistency == False]
    # df = df.loc[df.Correct_Answer == df.Output_Answer]
    # NOTE(review): `right_wrong_df` is not defined in this cell; presumably it
    # is left in the notebook namespace by another cell — confirm.
    df = df.iloc[right_wrong_df.CaseID.tolist()]

    # Process at most five rows; guard against selections with fewer rows.
    for row_idx in range(min(5, len(df))):
        row = df.iloc[row_idx]

        # `subject` stays a module-level name on purpose: self_correct_complete
        # reads it as a global.
        subject = row['Category']
        question = row['Question']
        correct_answer = row['Correct_Answer']
        cot = row['Cot']
        raw_cot_answer = row['Output_Answer']

        result_df_dict['CaseID'].append(row_idx)
        result_df_dict['Question'].append(question)
        result_df_dict['Correct Answer'].append(standardize_answer(correct_answer))
        result_df_dict['raw_cot'].append(cot)

        print('question: ',question)
        print('correct answer: ',correct_answer)
        print('COT: ',cot)
        print('raw_cot_answer: ',raw_cot_answer)

        result_df_dict['Raw COT Answer'].append(standardize_answer(raw_cot_answer))

        for _ in range(config['num_agents']):
            try:
                # With one capturing group, re.split returns
                # [prefix, marker1, text1, marker2, text2, ...].
                steps_list_with_indices = re.split(r'(?i)([Ss]tep \d+\s?:)', cot)

                # Reconstruct the steps list to include "step n:" with the actual step text.
                result_steps = [f"{steps_list_with_indices[k]} {steps_list_with_indices[k + 1].strip()}"
                                for k in range(1, len(steps_list_with_indices), 2)]
            except (TypeError, IndexError):
                # e.g. `cot` is not a string (missing value in the CSV).
                result_steps = []

            if len(result_steps) == 0:
                result_steps = ['No initial thoughts proposed, start from the scratch']

            steps = len(result_steps)

            check_list,partial_cot = self_correct_complete(result_steps, steps, question=question,
                                                     ngram=config['ngram'])
            if 'YES' in check_list:
                # A step was flagged: regenerate the remainder from the corrected prefix.
                corrected_cot, corrected_answer = generate_new_response(subject=subject,question=question,cot=partial_cot)
                new_answer = (standardize_answer(corrected_answer))
            else:
                new_answer = (standardize_answer(raw_cot_answer))
                corrected_cot = cot

            # The next agent pass (if any) starts from the corrected CoT.
            cot = corrected_cot

        result_df_dict['Corrected COT Answer'].append(new_answer)
        result_df_dict['corrected_cot'].append(corrected_cot)
        result_df_dict['Hallu Seq'].append(check_list)

    result_df = pd.DataFrame.from_dict(result_df_dict)
    result_df.to_csv('../result/gpt-3-19_right_wrong_error.csv')


question:  Identify the antecedent of the following conditional proposition: The Bees win their first game only if either the Aardvarks or the Chipmunks do not win their first games. The options are: A) The Aardvarks do not win their first game., B) The Bees win their first game., C) The Chipmunks do not win their first game., D) Either the Aardvarks or the Chipmunks do not win their first games.
correct answer:  B
COT:  step 1: Understand the structure of a conditional proposition, which is typically in the form 'If P, then Q', where P is the antecedent and Q is the consequent.
step 2: Identify the conditional proposition in the given statement: 'The Bees win their first game only if either the Aardvarks or the Chipmunks do not win their first games.'
step 3: Recognize that 'only if' indicates a conditional relationship where the part after 'only if' is necessary for the part before it. This means the part after 'only if' is the consequent, and the part before it is the antecedent.
st

In [24]:
import os
import pandas as pd
import re
from collections import Counter
from llm_agents import *

# Directory that holds the preprocessed CSV datasets.
PREPROCESSED_FP = '../data/preprocessed'


def load_df(dataset_fp):
    """Read a CSV from ``PREPROCESSED_FP`` and print its category distribution.

    Args:
        dataset_fp: File name (or path) joined onto ``PREPROCESSED_FP``.

    Returns:
        The loaded ``pandas.DataFrame``; it must have a ``Category`` column.
    """
    csv_path = os.path.join(PREPROCESSED_FP, dataset_fp)
    frame = pd.read_csv(csv_path)
    print('------------------------------------------------------')
    category_counts = Counter(frame.Category)
    print(f'The distribution of category in {dataset_fp} is:\n{category_counts}')
    return frame




def generate_new_response(subject, question,cot, max_retries=5):
    """Continue a partial thought process and return the new CoT and answer.

    Calls ``correct_answer_agent_partial_cot`` and parses its output with
    ``output_repraser``. Previously the agent was called once and the *same*
    text was re-parsed in a loop, so an unparsable reply hung forever; now a
    fresh reply is requested for every attempt and the number of attempts is
    bounded.

    Args:
        subject: Forwarded to the agent.
        question: Forwarded to the agent.
        cot: Partial thought process forwarded to the agent.
        max_retries: Maximum number of request/parse attempts.

    Returns:
        Tuple ``(cot, final_answer)``: the two values of the parsed reply, in
        the order they appear in it.

    Raises:
        RuntimeError: If no attempt yields a reply that parses into exactly
            two values.
    """
    last_error = None
    for _ in range(max_retries):
        result = correct_answer_agent_partial_cot(subject=subject, question=question,cot=cot)
        try:
            forward_result = output_repraser(result)
            # Validate the shape here so a malformed reply triggers a retry
            # instead of crashing after the loop.
            new_cot, final_answer = forward_result.values()
        except Exception as err:
            last_error = err
            continue
        break
    else:
        raise RuntimeError(
            f'Could not obtain a parsable response after {max_retries} attempts') from last_error

    print('------------------------------------------------------')
    for key, value in forward_result.items():
        print(key)
        print(value)
    print('------------------------------------------------------')
    return new_cot, final_answer

def self_correct_complete(cot, steps, question, ngram=1, subject=None, max_retries=5):
    """Check a chain of thought step by step and correct the first bad step.

    For step ``i`` the checker agent sees ``cot[:i + 1]``. As soon as a step is
    flagged with ``'Step Hallucination' == 'YES'`` it is replaced by the
    ``'Correction'`` returned from ``multi_agents_debate`` and checking stops.

    Args:
        cot: List of step strings.
        steps: Number of steps to check (converted with ``int``).
        question: The original question, forwarded to the agents.
        ngram: Unused; kept for interface compatibility.
        subject: Subject forwarded to the agents. When ``None`` the
            module-level global ``subject`` is used, which is what this
            function implicitly relied on before.
        max_retries: Maximum number of checker attempts per step.

    Returns:
        Tuple ``(check_list, masked_cot)``: the ``'Step Hallucination'`` value
        of every checked step, and the steps up to and including the last
        checked one (with the flagged step replaced, if any). ``masked_cot`` is
        an empty list when ``steps`` is 0.

    Raises:
        NameError: If ``subject`` is not given and no global ``subject`` exists.
        RuntimeError: If a step cannot be checked within ``max_retries`` attempts.
    """
    if subject is None:
        # Legacy behaviour: fall back to the global set by the script's main
        # loop, but fail loudly instead of retrying forever on a NameError.
        try:
            subject = globals()['subject']
        except KeyError:
            raise NameError("name 'subject' is not defined; pass subject=...") from None

    check_list = []
    masked_cot = []  # defined even when there are no steps to check
    for i in range(int(steps)):
        current_step = i + 1

        # Slice copy: the caller's list is never mutated.
        masked_cot = cot[:i+1]

        last_error = None
        for _ in range(max_retries):
            try:
                conditional_check_result = root_checker_agent(subject=subject, current_step=current_step,
                                                              cot=masked_cot, question=question)
                response = output_repraser(conditional_check_result)
            except Exception as err:
                last_error = err
                continue
            break
        else:
            raise RuntimeError(
                f'Step {current_step} could not be checked after {max_retries} attempts') from last_error

        print(f'Step {current_step}', response, '\n\n')
        check_list.append((response['Step Hallucination']))
        if (response['Step Hallucination'] == 'YES'):
            debate_response = multi_agents_debate(subject,current_step,masked_cot,question,response)
            print('Old Version: ', masked_cot[i])
            masked_cot[i] = debate_response['Correction']
            print('Corrected Version', masked_cot[i])
            break
    print('------------------------------------------------------')
    print(masked_cot)
    print('------------------------------------------------------')

    return check_list, masked_cot


def standardize_answer(answer):
    """Normalise an answer string so answers can be compared.

    If the stripped answer starts with a letter immediately followed by a
    non-word character (e.g. ``"B) ..."`` or ``"b."``), only that letter is
    returned in lower case. Otherwise the whole answer is returned in lower
    case with surrounding whitespace removed.

    The fallback branch used to skip the stripping, so ``" B "`` became
    ``" b "`` rather than ``"b"``.

    Args:
        answer: The answer text.

    Returns:
        The normalised answer string.
    """
    cleaned = answer.strip()

    # Multiple-choice style: a single letter followed by punctuation/space.
    if re.match(r'^[a-zA-Z]\W.*$', cleaned):
        return cleaned[0].lower()

    # For other cases, return the (trimmed) answer in lower case.
    return cleaned.lower()

def multi_agents_debate(subject,current_step,masked_cot,question,response, max_retries=5):
    """Run up to three debate rounds about a flagged step.

    Each round sends the latest response to ``debate_agent`` and parses the
    reply with ``output_repraser``. The first reply whose ``'Agreement'`` is
    ``'YES'`` ends the debate and is returned. If no round agrees, the
    response originally passed in is returned unchanged.

    Args:
        subject: Forwarded to ``debate_agent``.
        current_step: Forwarded to ``debate_agent``.
        masked_cot: Thought process so far, forwarded as ``cot``.
        question: Forwarded to ``debate_agent``.
        response: The response to debate in the first round.
        max_retries: Maximum number of request/parse attempts per round.

    Returns:
        The agreed debate reply, or the initial ``response``.

    Raises:
        RuntimeError: If a round yields no usable reply within ``max_retries``
            attempts.
    """
    final_response = response
    print('Start Debating')
    attempts = 0
    counter = 0
    while (attempts < 1) and counter <=2:
        print('attempt:',attempts)

        last_error = None
        for _ in range(max_retries):
            try:
                debate = debate_agent(subject=subject, current_step=current_step, cot=masked_cot,
                                      question=question,response = response)
                parsed = output_repraser(debate)
                # Validate before adopting the reply so that a malformed one
                # is never fed into the next request.
                agreed = parsed['Agreement'] == 'YES'
            except Exception as err:
                last_error = err
                continue
            break
        else:
            raise RuntimeError(
                f'Debate round failed after {max_retries} attempts') from last_error

        response = parsed
        print('\n\n\n',response,'\n\n\n')
        if agreed:
            final_response = response
            attempts += 1
        counter += 1
    return final_response





if __name__ == '__main__':
    # Run configuration. Only 'ngram' and 'num_agents' are read below;
    # 'dataset_fp' and 'test_case_number' are not used in this cell.
    config = {
        'dataset_fp': 'Self_Check.csv',
        'test_case_number': range(0,146),
        'ngram': 'all',
        'num_agents': 1
    }

    # Column-wise accumulator for the result CSV (one entry per processed row).
    result_df_dict = {
        'CaseID': [],
        'Question': [],
        'Correct Answer':[],
        'Raw COT Answer':[],
        'Corrected COT Answer': [],
        'Hallu Seq':[],
        'raw_cot':[],
        'corrected_cot': []
    }

    df_raw = pd.read_csv('../data/3-19_data.csv')
    df = df_raw.loc[df_raw.Consistency == False]
    # df = df.loc[df.Correct_Answer == df.Output_Answer]
    # NOTE(review): `right_wrong_df` is not defined in this cell; presumably it
    # is left in the notebook namespace by another cell — confirm.
    df = df.iloc[right_wrong_df.CaseID.tolist()]

    # Process at most five rows; guard against selections with fewer rows.
    for row_idx in range(min(5, len(df))):
        row = df.iloc[row_idx]

        # `subject` stays a module-level name on purpose: self_correct_complete
        # reads it as a global.
        subject = row['Category']
        question = row['Question']
        correct_answer = row['Correct_Answer']
        cot = row['Cot']
        raw_cot_answer = row['Output_Answer']

        result_df_dict['CaseID'].append(row_idx)
        result_df_dict['Question'].append(question)
        result_df_dict['Correct Answer'].append(standardize_answer(correct_answer))
        result_df_dict['raw_cot'].append(cot)

        print('question: ',question)
        print('correct answer: ',correct_answer)
        print('COT: ',cot)
        print('raw_cot_answer: ',raw_cot_answer)

        result_df_dict['Raw COT Answer'].append(standardize_answer(raw_cot_answer))

        for _ in range(config['num_agents']):
            try:
                # With one capturing group, re.split returns
                # [prefix, marker1, text1, marker2, text2, ...].
                steps_list_with_indices = re.split(r'(?i)([Ss]tep \d+\s?:)', cot)

                # Reconstruct the steps list to include "step n:" with the actual step text.
                result_steps = [f"{steps_list_with_indices[k]} {steps_list_with_indices[k + 1].strip()}"
                                for k in range(1, len(steps_list_with_indices), 2)]
            except (TypeError, IndexError):
                # e.g. `cot` is not a string (missing value in the CSV).
                result_steps = []

            if len(result_steps) == 0:
                result_steps = ['No initial thoughts proposed, start from the scratch']

            steps = len(result_steps)

            check_list,partial_cot = self_correct_complete(result_steps, steps, question=question,
                                                     ngram=config['ngram'])
            if 'YES' in check_list:
                # A step was flagged: regenerate the remainder from the corrected prefix.
                corrected_cot, corrected_answer = generate_new_response(subject=subject,question=question,cot=partial_cot)
                new_answer = (standardize_answer(corrected_answer))
            else:
                new_answer = (standardize_answer(raw_cot_answer))
                corrected_cot = cot

            # The next agent pass (if any) starts from the corrected CoT.
            cot = corrected_cot

        result_df_dict['Corrected COT Answer'].append(new_answer)
        result_df_dict['corrected_cot'].append(corrected_cot)
        result_df_dict['Hallu Seq'].append(check_list)

    result_df = pd.DataFrame.from_dict(result_df_dict)
    result_df.to_csv('../result/gpt-3-19_right_wrong_error_1.csv')


question:  Identify the antecedent of the following conditional proposition: The Bees win their first game only if either the Aardvarks or the Chipmunks do not win their first games. The options are: A) The Aardvarks do not win their first game., B) The Bees win their first game., C) The Chipmunks do not win their first game., D) Either the Aardvarks or the Chipmunks do not win their first games.
correct answer:  B
COT:  step 1: Understand the structure of a conditional proposition, which is typically in the form 'If P, then Q', where P is the antecedent and Q is the consequent.
step 2: Identify the conditional proposition in the given statement: 'The Bees win their first game only if either the Aardvarks or the Chipmunks do not win their first games.'
step 3: Recognize that 'only if' indicates a conditional relationship where the part after 'only if' is necessary for the part before it. This means the part after 'only if' is the consequent, and the part before it is the antecedent.
st

In [None]:
# Scratch cell: pair each odd-indexed element of `steps_list_with_indices`
# with the element that follows it, joined by a space.
# NOTE(review): `steps_list_with_indices` is not defined in this cell;
# presumably it is left over from an earlier run — confirm.
result_steps = [f"{steps_list_with_indices[i]} {steps_list_with_indices[i + 1].strip()}" for i in
                            range(1, len(steps_list_with_indices), 2)]

In [None]:
# Scratch cell: display the reconstructed step list.
result_steps

In [6]:
import pandas as pd
# Load the saved per-case results table from disk into a DataFrame.
df_result = pd.read_csv(
    filepath_or_buffer='../result/gpt-3-18_1_error.csv',
)

In [7]:
# Bare expression as the last line of the cell: renders the loaded
# results table for a quick visual check.
df_result

Unnamed: 0.1,Unnamed: 0,CaseID,Question,Correct Answer,Raw COT Answer,Corrected COT Answer,Hallu Seq,raw_cot,corrected_cot
0,0,0,Construct a complete truth table for the follo...,a,d,c,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'YES']",step 1: Identify the premises and conclusion. ...,step 1: Identify the premises and conclusion. ...
1,1,1,Use the following key to translate the given ...,b,none of the options perfectly match the correc...,a,"['NO', 'NO', 'NO', 'NO', 'YES']",step 1: Identify the logical components of the...,step 1: Identify the logical components of the...
2,2,2,Use indirect truth tables to determine whether...,c,c,c,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO']",step 1: Understand the argument structure. The...,step 1: Understand the argument structure. The...
3,3,3,Construct a complete truth table for the foll...,b,b,b,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO']",step 1: Identify the premises and conclusion. ...,step 1: Identify the premises and conclusion. ...
4,4,4,Construct a complete truth table for the foll...,c,b,b,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO']",step 1: Identify the premises and conclusion. ...,step 1: Identify the premises and conclusion. ...
...,...,...,...,...,...,...,...,...,...
514,514,514,if a man lost 8 % by selling oranges at the ra...,c,a,a,"['NO', 'NO', 'NO', 'NO', 'NO', 'YES']",step 1: Let's assume the cost price of 25 oran...,Step 1: Let's assume the cost price of 25 oran...
515,515,515,a certain experimental mathematics program was...,c,c,c,"['NO', 'NO', 'YES']",step 1: Understand the problem. We have 38 sch...,Step 1: Understand the problem. We have 38 sch...
516,516,516,a tradesman by means of his false balance defr...,c,c,c,"['NO', 'YES']",step 1: Understand that the tradesman defrauds...,Step 1: Understand that the tradesman defrauds...
517,517,517,a train crosses a bridge of length 200 m in 10...,c,c,the thought process contains several errors in...,"['NO', 'NO', 'NO', 'YES']",step 1: Let's denote the length of the train a...,step 1: Let's denote the length of the train a...


# Error Analysis

In [8]:
# Fraction of all cases whose 'Raw COT Answer' equals the 'Correct Answer'.
raw_matches = df_result['Correct Answer'].eq(df_result['Raw COT Answer'])
sum(raw_matches) / len(raw_matches)

0.6184971098265896

In [9]:
# Fraction of all cases whose 'Corrected COT Answer' equals the 'Correct Answer'.
corrected_matches = df_result['Correct Answer'].eq(df_result['Corrected COT Answer'])
sum(corrected_matches) / len(corrected_matches)

0.5433526011560693

In [10]:
# Keep only the rows where the raw answer already equals the correct answer.
raw_correct_mask = df_result['Correct Answer'].eq(df_result['Raw COT Answer'])
right_answer_df = df_result.loc[raw_correct_mask]

In [11]:
# Among rows that were right before correction, the fraction whose
# corrected answer still equals the correct answer.
still_right = right_answer_df['Correct Answer'].eq(right_answer_df['Corrected COT Answer'])
sum(still_right) / len(still_right)

0.7476635514018691

In [12]:
# Keep only the rows where the raw answer differs from the correct answer.
raw_wrong_mask = df_result['Correct Answer'].ne(df_result['Raw COT Answer'])
wrong_answer_df = df_result.loc[raw_wrong_mask]

In [13]:
# Among rows that were wrong before correction, the fraction whose
# corrected answer equals the correct answer.
now_right = wrong_answer_df['Correct Answer'].eq(wrong_answer_df['Corrected COT Answer'])
sum(now_right) / len(now_right)

0.21212121212121213

In [14]:
# Split the initially-right rows by whether the corrected answer still
# equals the correct answer (right_right_df) or no longer does (right_wrong_df).
expected = right_answer_df['Correct Answer']
after_fix = right_answer_df['Corrected COT Answer']
right_wrong_df = right_answer_df.loc[expected.ne(after_fix)]
right_right_df = right_answer_df.loc[expected.eq(after_fix)]

In [15]:
# Pull the rows of `df` for the cases that went from right to wrong.
# The CaseID values are used as positional row numbers (iloc), not labels.
flipped_case_ids = right_wrong_df['CaseID'].tolist()
df.iloc[flipped_case_ids]

Unnamed: 0,Category,Question,Correct_Answer,Output_Answer,Consistency,Cot,Confidence
54,formal_logic,Identify the antecedent of the following condi...,B,B,False,step 1: Understand the structure of a conditio...,
104,formal_logic,Construct a complete truth table for the follo...,D,D,False,step 1: Identify the premises and conclusion. ...,
149,Challenging Math,rani bought more apples than oranges . she sel...,b,b,False,step 1: Let's denote the cost price of an appl...,
158,Challenging Math,"out of 40 applicants to a law school , 15 majo...",a,a,False,step 1: Identify the total number of applicant...,
183,Challenging Math,in the standard formulation of a flavored drin...,a,a ) 45,False,step 1: Identify the ratio of flavoring to cor...,
...,...,...,...,...,...,...,...
2118,Math,on a map the distance between two mountains is...,b,b,False,step 1: Identify the scale of the map. The sca...,1.0
2119,Math,positive integer y is 50 percent of 25 percent...,b,b,False,step 1: Translate the first part of the proble...,0.9
2121,Math,"if 9 a - b = 10 b + 50 = - 12 b - 2 a , what i...",c,c,False,step 1: Given the equation 9a - b = 10b + 50 =...,1.0
2141,Math,lally ' s internet provider offers unlimited t...,c,c) 25,False,step 1: Calculate the daily cost of the intern...,1.0


In [None]:
# Preview the first ten right-to-wrong rows.
right_wrong_df.head(10)