In [1]:
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import json
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.chains import LLMChain
from dotenv.main import load_dotenv



class ChatModelWorker:
    """Build an OpenAI chat model and the LLM chains that run on top of it.

    Attributes:
        chat_model: The ``ChatOpenAI`` client created in ``__init__``.
        output_parser: Parser whose format instructions are injected into
            every prompt built by ``chain_generator``.
    """

    def __init__(self, output_parser, temperature=0, model='gpt-4'):
        """Create the chat model using the key stored in ``api_key.txt``.

        Args:
            output_parser: Object exposing ``get_format_instructions()``.
            temperature: Forwarded to ``ChatOpenAI`` as ``temperature``.
            model: Forwarded to ``ChatOpenAI`` as ``model_name``.

        Raises:
            OSError: If ``api_key.txt`` cannot be opened.
        """
        with open('api_key.txt', 'r') as f:
            # A trailing newline or stray spaces in the key file would
            # otherwise be sent as part of the key.
            apikey = f.read().strip()
        self.chat_model = ChatOpenAI(
            openai_api_key=apikey,
            model_name=model,
            temperature=temperature,
            model_kwargs={"seed": 666},
        )
        self.output_parser = output_parser

    def prompt_temps(self, sys_temp, human_temp, format_instructions):
        """Combine a system and a human template into one chat prompt.

        Args:
            sys_temp: Template text for the system message.
            human_temp: Template text for the human message.
            format_instructions: Value pre-filled for the
                ``{format_instructions}`` placeholder.

        Returns:
            A ``ChatPromptTemplate`` holding both messages.
        """
        sys_msg_prompt = SystemMessagePromptTemplate.from_template(sys_temp)
        human_msg_prompt = HumanMessagePromptTemplate.from_template(human_temp)
        chat_prompt = ChatPromptTemplate(partial_variables={"format_instructions": format_instructions},
                                         messages=[sys_msg_prompt, human_msg_prompt])
        return chat_prompt

    def chain_generator(self, template, human_template):
        """Return an ``LLMChain`` wired to this worker's model and parser.

        Args:
            template: System message template.
            human_template: Human message template.
        """
        format_instructions = self.output_parser.get_format_instructions()
        chain = LLMChain(
            llm=self.chat_model,
            prompt=self.prompt_temps(template, human_template, format_instructions)
        )
        return chain


def output_repraser(input_string):
    """Parse the JSON payload of an LLM reply into a Python object.

    The reply may be wrapped in a Markdown code fence (```json ... ```).
    The fence is removed, then the payload is parsed strictly. If strict
    parsing fails, a lenient second pass doubles backslashes that do not
    start a valid JSON escape and allows raw control characters (such as
    literal newlines) inside strings.

    Args:
        input_string: Raw text returned by the model.

    Returns:
        The decoded JSON value (a dict for the structured replies used here).

    Raises:
        json.JSONDecodeError: If the payload cannot be parsed even leniently.
    """
    # Imported locally so the function does not rely on a module-level import.
    import re

    text = input_string.strip()
    # Remove the fence as an exact prefix/suffix. `str.strip(chars)` treats its
    # argument as a character set and would also eat leading j/s/o/n
    # characters of the payload itself (e.g. `null` became `ull`).
    if text.startswith('```'):
        text = text[3:]
    if text[:4].lower() == 'json':
        text = text[4:]
    if text.endswith('```'):
        text = text[:-3]
    text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        valid_escapes = '"\\/bfnrtu'

        def _keep_or_escape(found):
            # Consume backslash pairs so an already escaped backslash is
            # never escaped a second time.
            if found.group(1) in valid_escapes:
                return found.group(0)
            return '\\' + found.group(0)

        repaired = re.sub(r'\\(.)', _keep_or_escape, text, flags=re.DOTALL)
        return json.loads(repaired, strict=False)


def root_checker_agent(subject, question, current_step, cot, temp=0, model_name='gpt-4-0125-preview'):
    """Ask the model to verify one step of a chain of thought.

    Args:
        subject: Domain filled into the ``{subject}`` prompt placeholder.
        question: The original question.
        current_step: Number of the step being verified.
        cot: The thought process to show the model.
        temp: Sampling temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.

    Returns:
        Whatever ``chain.run`` returns; callers pass it to ``output_repraser``.

    Raises:
        Exception: The error of the second attempt when both attempts fail.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify my steps when I solve the question.
        I am currently at step #{current_step}.

        Before you perform the task, I want you to keep in mind several definitions for my possible mistakes. 
        1. Factuality： This type of error emphasizes the discrepancy between generated content and verifable real-word facts, including
        factual inconsistency or fabrication. In mathematics for instance, it may represents the computational error.

        2. Faithfulness: This type of error refers to the divergence of my step analysis from the original question or 
        previous steps, as well as self-consistency within my steps. In mathematics for instance, it may represents that
        I understood the question wrongly or my proposed step is inconsistent with my previous step. 

        Based on my current step response, question, previous steps, and my error definitions, help me verify if any of 
        the mistakes (factuality or faithfulness) occur on my analysis. Notice that skipping step should not be considered
        as error as long as the calculation is correct! For instance, 2x+2 should be the same as 2+2x. Also
        2x+2+3 should be the same as 2x+5


        At step 1, since we have no step 0, instead the factuality and faithfulness check
         should reflect if I correctly understood the answer.
         
         Do not detect any minor hallucinations! In other words, only targeting the mistakes that contain calculation error
         or apparent logical flaw or contradict real-world facts! If the provided step acknowledge mistake, you need to 
         capture it and correct it.
         
         If you see any step ended up with *<verified>* it means it have been checked without any mistake, so just consider
         it as correct!!!  
        \n{format_instructions}
''')
    human_prompt = "Here is my complete thought process {cot} and this is the original question {question}"

    response_schemas = [

        ResponseSchema(name="Verification",
                       description='''
                        Help me verify the factuality and the faithfulness  of the current step, 
                       and tell me the reason. 
                       REASON is important. The reasoning step should cite the variable and formula you use!!! If at Step 1, since 
                        we have no step 0, verify if I correctly understood the answer
                        If you see any step ended up with *<verified>* it means it have been checked without any mistake, so just consider
         it as correct!!!  
                        '''),

        ResponseSchema(name="Step Hallucination",
                       description='''
                       say [YES] if the current step logic and computation are NOT factual or faithful 
                       based on the question and my previous steps, otherwise [NO] .!!! If at Step 1, since 
                        we have no step 0, check for the factuality and faithfulness of the current step only. 
                        If you see any step ended up with *<verified>* it means it have been checked without any mistake, so just consider
         it as correct!!!  
                        '''),
        ResponseSchema(name="Type of Hallucination",
                       description='''
                       Identify if the step violated factuality or faithfulness or both. Return [None] if my current step
                       was correct.
                       If you see any step ended up with *<verified>* it means it have been checked without any mistake, so just consider
         it as correct!!!  
                        '''),
        ResponseSchema(name="Correction",
                       description='''
                               If you think Step Hallucination is Yes, help me generate a corrected version of the current
                               step instead. Notice that do not simply identify the error here, instead
                               you should directly give me the correct version with calculation (if applicable)
                               Follow the format:
                               'Step n : [Corrected version]'                               
                               '''),
        ResponseSchema(name="Dependency",
                       description='''
                           Find which previous steps led to the unfactual or unfaithful . The whole idea is to 
                           discuss that if the current step is unfactual or unfaithful, where did the error chain start
                           from. What previous steps are the root cause of the error. Follow the template:
                           [[Unfactual] <- [Unfactual Previous Steps Indices]\n
                           [Unfaithful] <- [Unfaithful Previous Steps Indices]]
                           If it is caused by misunderstanding of question, then the dependency should be [Original Question]
                           If no unfactual or unfaithful, simply return [N/A]
                           '''),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Try the call at most twice. Catching `Exception` instead of using a bare
    # `except` keeps Ctrl-C working, and a successful retry is returned
    # directly rather than being followed by one more API call.
    last_error = None
    for _ in range(2):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject,
                             current_step=current_step, cot=cot, question=question)
        except Exception as err:
            last_error = err
    raise last_error


def debate_agent(subject, question, current_step, cot, response, temp=0, model_name='gpt-4-0125-preview'):
    """Ask the model to judge another agent's correction of the current step.

    Args:
        subject: Domain filled into the ``{subject}`` prompt placeholder.
        question: The original question.
        current_step: Number of the step under debate.
        cot: The thought process to show the model.
        response: The other agents' full response being debated.
        temp: Sampling temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.

    Returns:
        Whatever ``chain.run`` returns; callers pass it to ``output_repraser``.

    Raises:
        Exception: The error of the second attempt when both attempts fail.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify my steps when I solve the question.
        I am currently at step #{current_step}.

        Before you perform the task, I want you to keep in mind several definitions for my possible mistakes. 
        1. Factuality： This type of error emphasizes the discrepancy between generated content and verifable real-word facts, including
        factual inconsistency or fabrication. In mathematics for instance, it may represents the computational error.

        2. Faithfulness: This type of error refers to the divergence of my step analysis from the original question or 
        previous steps, as well as self-consistency within my steps. In mathematics for instance, it may represents that
        I understood the question wrongly or my proposed step is inconsistent with my previous step. 

        Other agents had helped me identify the error I made in the current step. You goal is to debate with the other
        agents and justify if their corrections were correct based on my question, thought process. Please use Critical
        Thinking.
        \n{format_instructions}
''')
    human_prompt = ("Here is my complete thought process {cot} and this is the original question {question}. The full"
                    "response from the other agents were given as {response}")

    response_schemas = [

        ResponseSchema(name="Justification",
                       description='''
                        Give me the your response to the other agent and justify
                         that whether you think the other agents' correction to my step was correct.
                        '''),

        ResponseSchema(name="Agreement",
                       description='''
                       say [YES] if you agree with the other agents corrections to my current step analysis. Otherwise,
                       say [NO]
                        '''),

        ResponseSchema(name="Correction",
                       description='''
                               Help me generate a  version of the current
                               step that you think is correct
                               . Notice that do not simply identify the error here, instead
                               you should directly give me the correct version with calculation (if applicable)

                               'Step n : [Corrected version] *<verified>*'                                 
                               '''),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Try the call at most twice. Catching `Exception` instead of using a bare
    # `except` keeps Ctrl-C working, and a successful retry is returned
    # directly rather than being followed by one more API call.
    last_error = None
    for _ in range(2):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject,
                             current_step=current_step, cot=cot, question=question, response=response)
        except Exception as err:
            last_error = err
    raise last_error


def correct_answer_agent_partial_cot(subject, cot,question, temp=0, model_name='gpt-4-0125-preview'):
    """Ask the model to continue a partial thought process and answer the question.

    Args:
        subject: Domain filled into the ``{subject}`` prompt placeholder.
        cot: The initial (partial) thought process to continue.
        question: The question to answer.
        temp: Sampling temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.

    Returns:
        Whatever ``chain.run`` returns; callers pass it to ``output_repraser``.

    Raises:
        Exception: The error of the second attempt when both attempts fail.
    """
    system_prompt = (
        '''
        You are a professional specialized in {subject}. Your task is help me answer the question based on my initial 
        thoughts. I will provide you several steps of my attempt. Your task is to CONTINUE my thought process and then 
        answer my question step by step. Also, maximum 12 steps allowed and you can assume my initial thoughts had been
        checked since could be trusted. Remember, your response should based on my initial thoughts!
        \n{format_instructions}
        ''')
    human_prompt = "Here is my question :{question}. And my intial thought process is given as {cot}"

    response_schemas = [
        ResponseSchema(name="Complete Thought Process",
                       description="Continue my thought process in order to answer the question,"
                                   "You must include my initial thought process as well and leave them as what they are"
                                   "Return the complete chain of thought by following the format:"
                                   "Step n: [step process]."),
        ResponseSchema(name="Final Answer",
                       description="Give me your final answer based on my thought process , if have options provided, "
                                   "just give me the option index. Follow"
                                   "The format [final_answer or correct_option_index]")
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Try the call at most twice. Catching `Exception` instead of using a bare
    # `except` keeps Ctrl-C working, and a successful retry is returned
    # directly rather than being followed by one more API call.
    last_error = None
    for _ in range(2):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject, cot=cot,
                             question=question)
        except Exception as err:
            last_error = err
    raise last_error


def debate_whole_agent(subject, question, cot, temp=0, model_name='gpt-4-0125-preview'):
    """Ask the model to critique a complete thought process and propose a revised one.

    Args:
        subject: Domain filled into the ``{subject}`` prompt placeholder.
        question: The original question.
        cot: The other agents' complete thought process.
        temp: Sampling temperature passed to ``ChatModelWorker``.
        model_name: Model name passed to ``ChatModelWorker``.

    Returns:
        Whatever ``chain.run`` returns.

    Raises:
        Exception: The error of the second attempt when both attempts fail.
    """
    system_prompt = (
        '''You are a professional specialized in {subject}. You need to help me verify the other agents' thought process 
        when they solve the question.

        Before you perform the task, I want you to keep in mind several definitions for my possible mistakes. 
        1. Factuality： This type of error emphasizes the discrepancy between generated content and verifable real-word facts, including
        factual inconsistency or fabrication. In mathematics for instance, it may represents the computational error.

        2. Faithfulness: This type of error refers to the divergence of my step analysis from the original question or 
        previous steps, as well as self-consistency within my steps. In mathematics for instance, it may represents that
        I understood the question wrongly or my proposed step is inconsistent with my previous step. 

        Therefore, I need you to critically debate with the other agents. Your goal is to check their thought process,
        identify which step made mistakes and what type of hallucination were those. Then , generate your own version
        based on your justification and their thought process. Finally, generate a log to state your updates.
        \n{format_instructions}
''')
    human_prompt = ("Here is their complete thought process {cot} and this is the original question {question}. ")

    response_schemas = [

        ResponseSchema(name="Justification",
                       description='''
                        Give me the your response to the other agent and justify
                         that whether you think the other agents' thought process to solve the question was correct.
                        '''),

        ResponseSchema(name="Step Verification",
                       description='''
                       State which step had problem and what type by following the format:
                       [step n]: [Mistake type (factuality or faithfulness)]
                        '''),

        ResponseSchema(name="Corrected COT",
                       description='''
                               Help me generate a new version of thought process to solve the question. You should follow
                               the format: 
                               [Step n]: [Step process]                             
                               '''),
        ResponseSchema(name="Updates",
                       description='''
                                   Give me a log history regarding your updates by stating what you changed and what are
                                   the new steps you proposed by following the format:
                                   'Updated Steps': [step indices],
                                   'New Steps': [step indices]                            
                                   '''),
        ResponseSchema(name="Final Answer",
                       description="Give me your final answer based your revised thought process , if have options provided, "
                                   "just give me the option index. Follow"
                                   "The format [final_answer or correct_option_index]")
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Try the call at most twice. Catching `Exception` instead of using a bare
    # `except` keeps Ctrl-C working, and a successful retry is returned
    # directly rather than being followed by one more API call.
    last_error = None
    for _ in range(2):
        try:
            worker = ChatModelWorker(output_parser=output_parser, temperature=temp, model=model_name)
            chain = worker.chain_generator(system_prompt, human_prompt)
            return chain.run(subject=subject,
                             cot=cot, question=question)
        except Exception as err:
            last_error = err
    raise last_error


In [23]:
import os
import pandas as pd
import re
from collections import Counter
from llm_agents import *

PREPROCESSED_FP = '../data/preprocessed'


def load_df(dataset_fp):
    """Read a CSV from the preprocessed-data folder and report its categories.

    Args:
        dataset_fp: File name (or path) joined onto ``PREPROCESSED_FP``.

    Returns:
        The loaded ``pandas.DataFrame``. A separator line and the counts of
        the ``Category`` column are printed as a side effect.
    """
    csv_path = os.path.join(PREPROCESSED_FP, dataset_fp)
    frame = pd.read_csv(csv_path)
    print('------------------------------------------------------')
    category_counts = Counter(frame.Category)
    print(f'The distribution of category in {dataset_fp} is:\n{category_counts}')
    return frame




def generate_new_response(subject, question,cot, max_retries=3):
    """Let the model continue a (partially corrected) thought process and answer.

    Args:
        subject: Domain passed to ``correct_answer_agent_partial_cot``.
        question: The question to answer.
        cot: The initial thought process to continue.
        max_retries: How many times to query and parse before giving up.

    Returns:
        A ``(complete_thought_process, final_answer)`` tuple taken from the
        ``'Complete Thought Process'`` and ``'Final Answer'`` reply fields.

    Raises:
        RuntimeError: If no parseable reply is obtained in ``max_retries`` tries.
    """
    last_error = None
    for _ in range(max_retries):
        # Query again on every attempt: re-parsing the same unparseable reply
        # can never succeed and would loop forever.
        result = correct_answer_agent_partial_cot(subject=subject, question=question,cot=cot)
        try:
            forward_result = output_repraser(result)
            # Read the fields by name rather than unpacking `.values()`, which
            # breaks when the reply has extra keys or a different key order.
            new_cot = forward_result['Complete Thought Process']
            final_answer = forward_result['Final Answer']
            break
        except (ValueError, KeyError, TypeError, AttributeError) as err:
            last_error = err
    else:
        raise RuntimeError(
            f'No parseable continuation received after {max_retries} attempts'
        ) from last_error

    print('------------------------------------------------------')
    for key, value in forward_result.items():
        print(key)
        print(value)
    print('------------------------------------------------------')
    return new_cot, final_answer

def self_correct_complete(cot, steps, question, ngram=1, subject=None, max_retries=5):
    """Verify the steps one by one and correct the first one flagged as wrong.

    Each step is checked with ``root_checker_agent`` given the steps up to and
    including it. At the first step flagged ``YES`` the step is replaced by
    the ``'Correction'`` from ``multi_agents_debate`` and checking stops.

    Args:
        cot: List of step strings.
        steps: Number of steps to check (converted with ``int``).
        question: The original question.
        ngram: Accepted for compatibility; not used by this function.
        subject: Domain passed to the agents. When omitted, the module-level
            ``subject`` is used, which is what existing callers rely on.
        max_retries: Attempts per step before giving up.

    Returns:
        ``(check_list, masked_cot)``: the upper-cased YES/NO verdicts of the
        checked steps, and the steps up to the last checked one (with the
        flagged step replaced). Both are empty when ``steps`` is 0.

    Raises:
        NameError: If ``subject`` is neither passed nor defined globally.
        RuntimeError: If a step cannot be verified in ``max_retries`` tries.
    """
    if subject is None:
        try:
            subject = globals()['subject']
        except KeyError:
            raise NameError(
                "name 'subject' is not defined; pass subject=... explicitly"
            ) from None

    check_list = []
    masked_cot = []  # stays empty when there is nothing to check
    for i in range(int(steps)):
        current_step = i + 1
        masked_cot = cot[:i + 1]

        last_error = None
        for _ in range(max_retries):
            try:
                conditional_check_result = root_checker_agent(subject=subject, current_step=current_step,
                                                              cot=masked_cot, question=question)
                response = output_repraser(conditional_check_result)
                # The prompt asks for "[YES]"/"[NO]", so accept the bracketed
                # and differently-cased forms as well as the bare word.
                verdict = str(response['Step Hallucination']).strip().strip('[]').strip().upper()
                break
            except Exception as err:
                # `Exception` rather than a bare except, so Ctrl-C still stops the run.
                last_error = err
        else:
            raise RuntimeError(
                f'Step {current_step} could not be verified after {max_retries} attempts'
            ) from last_error

        print(f'Step {current_step}', response, '\n\n')
        check_list.append(verdict)
        if verdict == 'YES':
            debate_response = multi_agents_debate(subject,current_step,masked_cot,question,response)
            print('Old Version: ', masked_cot[i])
            masked_cot[i] = debate_response['Correction']
            print('Corrected Version', masked_cot[i])
            break
    print('------------------------------------------------------')
    print(masked_cot)
    print('------------------------------------------------------')

    return check_list, masked_cot


def standardize_answer(answer):
    """Normalise an answer so that answers can be compared with ``==``.

    An answer that starts with a single letter followed by a non-word
    character (``"c) 25"``, ``"a ) 45"``, ``"B."``) is reduced to that letter
    in lower case. Anything else is returned stripped and lower-cased.

    Args:
        answer: The raw answer; non-string values are converted with ``str``.

    Returns:
        The normalised answer string.
    """
    text = str(answer).strip()
    # Only the first two characters decide the multiple-choice case, so later
    # lines of a multi-line answer no longer prevent the match.
    if re.match(r'[a-zA-Z]\W', text):
        return text[0].lower()

    # Strip here as well, otherwise "b " and "b" compare as different answers.
    return text.lower()

def multi_agents_debate(subject,current_step,masked_cot,question,response, max_retries=5):
    """Run up to three debate rounds about a flagged step.

    Each round passes the latest response to ``debate_agent``. Debating stops
    as soon as a round reports agreement, or after three completed rounds.

    Args:
        subject: Domain passed to ``debate_agent``.
        current_step: Number of the step under debate.
        masked_cot: The steps up to and including the current one.
        question: The original question.
        response: The checker's parsed response that starts the debate.
        max_retries: Attempts per round before giving up.

    Returns:
        The parsed response of the agreeing round, or the initial ``response``
        when no round agreed.

    Raises:
        RuntimeError: If a round cannot be completed in ``max_retries`` tries.
    """
    final_response = response
    print('Start Debating')
    attempts = 0
    counter = 0
    while attempts < 1 and counter <= 2:
        print('attempt:',attempts)
        last_error = None
        for _ in range(max_retries):
            try:
                debate = debate_agent(subject=subject, current_step=current_step, cot=masked_cot,
                                      question=question,response = response)
                parsed = output_repraser(debate)
                # The prompt asks for "[YES]"/"[NO]", so accept the bracketed
                # and differently-cased forms as well as the bare word.
                agreed = str(parsed['Agreement']).strip().strip('[]').strip().upper() == 'YES'
            except Exception as err:
                # `Exception` rather than a bare except, so Ctrl-C still stops the run.
                last_error = err
                continue
            response = parsed
            print('\n\n\n',response,'\n\n\n')
            if agreed:
                final_response = response
                attempts += 1
            counter += 1
            break
        else:
            raise RuntimeError(
                f'Debate round failed after {max_retries} attempts'
            ) from last_error
    return final_response





if __name__ == '__main__':
    # Run settings. Only 'ngram' and 'num_agents' are read in this block.
    config = {
        'dataset_fp': 'Self_Check.csv',
        'test_case_number': range(0,146),
        'ngram': 'all',
        'num_agents': 1
    }

    # Column-oriented accumulator: one entry per processed case in every list.
    result_df_dict = {
        'CaseID': [],
        'Question': [],
        'Correct Answer':[],
        'Raw COT Answer':[],
        'Corrected COT Answer': [],
        'Hallu Seq':[],
        'raw_cot':[],
        'corrected_cot': []
    }

    df_raw = pd.read_csv('../data/3-19_data.csv')
    df = df_raw.loc[df_raw.Consistency == False]
    # df = df.loc[df.Correct_Answer == df.Output_Answer]
    # NOTE(review): `right_wrong_df` is not defined in this block; it must
    # already exist in the session (an analysis cell creates it) - confirm.
    df = df.iloc[right_wrong_df.CaseID.tolist()]
    # Process at most five cases; the bound avoids indexing past a shorter frame.
    for row_idx in range(min(5, len(df))):
        row = df.iloc[row_idx]

        # `subject` is kept at module level because self_correct_complete reads it.
        subject = row['Category']
        question = row['Question']
        correct_answer = row['Correct_Answer']
        cot = row['Cot']
        raw_cot_answer = row['Output_Answer']

        result_df_dict['CaseID'].append(row_idx)
        result_df_dict['Question'].append(question)
        result_df_dict['Correct Answer'].append(standardize_answer(correct_answer))
        result_df_dict['raw_cot'].append(cot)

        print('question: ',question)
        print('correct answer: ',correct_answer)
        print('COT: ',cot)
        print('raw_cot_answer: ',raw_cot_answer)

        result_df_dict['Raw COT Answer'].append(standardize_answer(raw_cot_answer))

        for _ in range(config['num_agents']):
            try:
                # The capture group keeps every "step n:" marker in the split result.
                steps_list_with_indices = re.split(r'(?i)([Ss]tep \d+\s?:)', cot)

                # Reconstruct the steps list to include "step n:" with the actual step text.
                result_steps = [f"{steps_list_with_indices[k]} {steps_list_with_indices[k + 1].strip()}" for k in
                                range(1, len(steps_list_with_indices), 2)]
            except TypeError:
                # `cot` is not a string, so there is nothing to split.
                result_steps = []

            if len(result_steps) == 0:
                result_steps = ['No initial thoughts proposed, start from the scratch']

            steps = len(result_steps)

            check_list,partial_cot = self_correct_complete(result_steps, steps, question=question,
                                                     ngram=config['ngram'])
            if 'YES' in check_list:
                # A step was corrected: let the model continue from the fixed prefix.
                corrected_cot, corrected_answer = generate_new_response(subject=subject,question=question,cot=partial_cot)
                new_answer = (standardize_answer(corrected_answer))
            else:
                new_answer = (standardize_answer(raw_cot_answer))
                corrected_cot = cot

            # The next agent round (if any) starts from this round's result.
            cot = corrected_cot

        result_df_dict['Corrected COT Answer'].append(new_answer)
        result_df_dict['corrected_cot'].append(corrected_cot)
        result_df_dict['Hallu Seq'].append(check_list)

    result_df = pd.DataFrame.from_dict(result_df_dict)
    result_df.to_csv('../result/gpt-3-19_right_wrong_error.csv')


question:  Identify the antecedent of the following conditional proposition: The Bees win their first game only if either the Aardvarks or the Chipmunks do not win their first games. The options are: A) The Aardvarks do not win their first game., B) The Bees win their first game., C) The Chipmunks do not win their first game., D) Either the Aardvarks or the Chipmunks do not win their first games.
correct answer:  B
COT:  step 1: Understand the structure of a conditional proposition, which is typically in the form 'If P, then Q', where P is the antecedent and Q is the consequent.
step 2: Identify the conditional proposition in the given statement: 'The Bees win their first game only if either the Aardvarks or the Chipmunks do not win their first games.'
step 3: Recognize that 'only if' indicates a conditional relationship where the part after 'only if' is necessary for the part before it. This means the part after 'only if' is the consequent, and the part before it is the antecedent.
st

In [18]:
# Scratch cell: pair each odd-indexed entry of `steps_list_with_indices` with
# the entry that follows it. It needs `steps_list_with_indices` to exist in
# the session already; the recorded run below failed with NameError.
result_steps = [f"{steps_list_with_indices[i]} {steps_list_with_indices[i + 1].strip()}" for i in
                            range(1, len(steps_list_with_indices), 2)]

NameError: name 'steps_list_with_indices' is not defined

In [None]:
result_steps

In [387]:
import pandas as pd
# Load the results table that the analysis cells below work on.
df_result = pd.read_csv('../result/result_last_result.csv')

In [388]:
df_result

Unnamed: 0,CaseID,Category,Question,Correct Answer,Raw COT Answer,Corrected COT Answer,Hallu Seq,raw_cot,corrected_cot
0,0,professional_accounting,Which of the following items is not subject to...,d,b,d,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO...",Step 1: Understand what intraperiod tax alloca...,Step 1: Understand what intraperiod tax alloca...
1,1,professional_accounting,Johnson worked for ABC Co. and earned a salary...,c,c,c,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO']",step 1: Identify the total amount of group ter...,step 1: Identify the total amount of group ter...
2,2,professional_accounting,Pine Co. purchased land for $450000 as a facto...,c,a,a,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO']",Step 1: Understand that the cost to be capital...,Step 1: Understand that the cost to be capital...
3,3,professional_accounting,The following information pertains to Dash Co....,c,a,c,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO']",step 1: Identify the period for which the liab...,Step 1: Identify the period for which the liab...
4,4,professional_accounting,Spark Co. buys cordless phones for $125 each a...,d,d,d,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO']",Step 1: Calculate the profit per unit by subtr...,Step 1: Calculate the profit per unit by subtr...
...,...,...,...,...,...,...,...,...,...
128,128,date_understanding,"Jane quited her job on Mar 20, 2020. 176 days ...",a,a,a,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO']",step 1: Calculate the date 176 days after Marc...,step 1: Calculate the date 176 days after Marc...
129,129,date_understanding,2015 is coming in 36 hours. What is the date 2...,c,d,d,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO']",step 1: Understand that '2015 is coming in 36 ...,step 1: Understand that '2015 is coming in 36 ...
130,130,date_understanding,"On May 9th, 2017 Jane bought 40 eggs. She ate ...",c,c,c,"['NO', 'NO', 'NO']",step 1: Calculate the total number of days Jan...,Step 1: Calculate the total number of days Jan...
131,131,date_understanding,Jane scheduled 3 apointments with 5 poeple for...,e,none of the options match the calculated date ...,there is a mistake in the question or options ...,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO...",step 1: Understand the question. The question ...,Step 1: Understand the question. The question ...


# Error Analysis

In [389]:
# Fraction of rows whose 'Raw COT Answer' equals the 'Correct Answer'.
sum(df_result['Correct Answer'] == df_result['Raw COT Answer'])/len(df_result['Correct Answer'] )

0.6766917293233082

In [390]:
# Fraction of rows whose 'Corrected COT Answer' equals the 'Correct Answer'.
sum(df_result['Correct Answer'] == df_result['Corrected COT Answer'])/len(df_result['Correct Answer'] )

0.6766917293233082

In [391]:
# Keep only the rows whose raw answer already matched the correct answer.
right_answer_df = df_result.loc[df_result['Correct Answer'] == df_result['Raw COT Answer']]

In [392]:
# Among the initially right rows: fraction whose corrected answer is still right.
sum(right_answer_df['Correct Answer'] == right_answer_df['Corrected COT Answer'])/len(right_answer_df['Correct Answer'] )

0.9

In [393]:
# Among the initially right rows: fraction whose corrected answer became wrong.
sum(right_answer_df['Correct Answer'] != right_answer_df['Corrected COT Answer'])/len(right_answer_df['Correct Answer'] )

0.1

In [60]:
# Split the initially right rows by the outcome after correction:
# right_wrong_df = corrected answer differs from the correct answer,
# right_right_df = corrected answer still equals the correct answer.
right_wrong_df = right_answer_df.loc[right_answer_df['Correct Answer'] != right_answer_df['Corrected COT Answer']]
right_right_df = right_answer_df.loc[right_answer_df['Correct Answer'] == right_answer_df['Corrected COT Answer']]

In [15]:
# Select rows of `df` by position, using the CaseID values of right_wrong_df.
# NOTE(review): assumes CaseID values are valid positions in `df` - confirm.
df.iloc[right_wrong_df.CaseID.tolist()]

Unnamed: 0,Category,Question,Correct_Answer,Output_Answer,Consistency,Cot,Confidence
54,formal_logic,Identify the antecedent of the following condi...,B,B,False,step 1: Understand the structure of a conditio...,
104,formal_logic,Construct a complete truth table for the follo...,D,D,False,step 1: Identify the premises and conclusion. ...,
149,Challenging Math,rani bought more apples than oranges . she sel...,b,b,False,step 1: Let's denote the cost price of an appl...,
158,Challenging Math,"out of 40 applicants to a law school , 15 majo...",a,a,False,step 1: Identify the total number of applicant...,
183,Challenging Math,in the standard formulation of a flavored drin...,a,a ) 45,False,step 1: Identify the ratio of flavoring to cor...,
...,...,...,...,...,...,...,...
2118,Math,on a map the distance between two mountains is...,b,b,False,step 1: Identify the scale of the map. The sca...,1.0
2119,Math,positive integer y is 50 percent of 25 percent...,b,b,False,step 1: Translate the first part of the proble...,0.9
2121,Math,"if 9 a - b = 10 b + 50 = - 12 b - 2 a , what i...",c,c,False,step 1: Given the equation 9a - b = 10b + 50 =...,1.0
2141,Math,lally ' s internet provider offers unlimited t...,c,c) 25,False,step 1: Calculate the daily cost of the intern...,1.0


In [383]:
right_wrong_df[:10]

Unnamed: 0.1,Unnamed: 0,CaseID,Category,Question,Correct Answer,Raw COT Answer,Corrected COT Answer,Hallu Seq,raw_cot,corrected_cot
33,33,33,college_chemistry,The X-band (9.5 GHz) EPR spectrum of a matrix ...,a,a,d,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO...",Step 1: Understand the problem. We need to cal...,Step 1: Understand the problem. We need to cal...
37,37,37,college_chemistry,The magnetic moment (μI) of an unknown nuclide...,b,b,d,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO']",Step 1: Recall the formula for the magnetic mo...,Step 1: Recall the formula for the magnetic mo...
38,38,38,college_chemistry,Which of the following is a true statement abo...,d,d,none of the options accurately describe the co...,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO']",Step 1: Understand what optical isomerism mean...,Step 1: Understand what optical isomerism mean...
116,116,116,formal_logic,Which of the following propositions is not an...,d,d,c,"['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO...",step 1: Analyze the given premises in proposit...,step 1: Analyze the given premises in proposit...


In [384]:
import re
def count_debates(match):
    """Extract the question and the debate counts from one case's log text.

    Args:
        match: Log text of one case. It must contain a ``question:  `` part
            followed by a line starting with ``correct answer``.

    Returns:
        ``(question, num_of_debates, sum_of_debates)``. ``num_of_debates`` is
        a list with one count of ``attempt: 0`` / ``attempt: 1`` lines per
        ``Start Debating`` section, or the integer ``0`` when the log has no
        such section. ``sum_of_debates`` is the total of those counts.

    Raises:
        IndexError: If the question part cannot be found.
    """
    log_lines = match.split('\n')
    question_span = re.findall(r"question:.*?(?=\ncorrect answer)", match, re.DOTALL)[0]
    question = question_span.split('question:  ')[1]

    debate_starts = [pos for pos, line in enumerate(log_lines) if line == 'Start Debating']
    if not debate_starts:
        return question, 0, 0

    # Each section runs from one 'Start Debating' line to the next (or to the end).
    boundaries = debate_starts + [len(log_lines)]
    num_of_debates = []
    for begin, end in zip(boundaries, boundaries[1:]):
        section = log_lines[begin:end]
        num_of_debates.append(section.count('attempt: 0') + section.count('attempt: 1'))
    return question, num_of_debates, sum(num_of_debates)
def calculate_common_words_percentage(text1, text2):
    """Return the percentage of unique words the two texts have in common.

    Both texts are lower-cased and split on whitespace. The number of shared
    unique words is divided by the unique-word count of whichever text has
    fewer unique words, so a short text fully contained in a longer one
    scores 100.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float percentage in the range 0-100. ``0.0`` is returned when
        either text contains no words, since there is nothing to compare.
    """
    words_text1 = set(text1.lower().split())
    words_text2 = set(text2.lower().split())

    # An empty or whitespace-only text has no words; without this guard the
    # division below would raise ZeroDivisionError.
    smaller_vocab_size = min(len(words_text1), len(words_text2))
    if smaller_vocab_size == 0:
        return 0.0

    common_words = words_text1 & words_text2
    return len(common_words) / smaller_vocab_size * 100

In [394]:
# Build ../result/<sheet>_debate.csv: one row per question found in the debate
# log, with its debate counts and the CaseID/Category of the matching row in
# the result sheet.
result_sheet_name = 'result_last'

save_dict = {
    'CaseID':[],
    'Category':[],
    'question':[],
    'num_of_debates':[],
    'sum_of_debates':[],
}
# Only the read needs the file handle; parsing happens after it is closed.
with open(f'../result/{result_sheet_name}_log.txt','r', encoding='utf-8') as f:
    text = f.read()

# One entry starts at a blank-line-separated "question:" header and runs up to
# the next such header (or the end of the log); re.DOTALL lets '.' span lines.
pattern = r"\n\n\n\nquestion:.*?(?=\n\n\n\nquestion:|$)"
matches = re.findall(pattern, text, re.DOTALL)
for match in matches:
    question,num_of_debates,sum_of_debates = count_debates(match)
    save_dict['question'].append(question)
    save_dict['num_of_debates'].append(num_of_debates)
    save_dict['sum_of_debates'].append(sum_of_debates)

result_df = pd.read_csv(f'../result/{result_sheet_name}_result.csv',encoding='utf-8')
case_id = []
category = []
for question in save_dict['question']:
    # Score every result-sheet question against the logged question.
    result_df['Similarity'] = result_df['Question'].apply(lambda x: calculate_common_words_percentage(x, question))
    # Keep rows with 90% or more similarity, without the scratch column.
    similar_rows = result_df[result_df['Similarity'] >= 90]
    similar_rows = similar_rows.drop(columns=['Similarity'])
    if similar_rows.empty:
        # No sheet row is similar enough. Record a placeholder (the same
        # 'UNSAVED' marker the result_Math cell uses) instead of failing with
        # IndexError, so the columns stay aligned and the CSV is still written.
        case_id.append('UNSAVED')
        category.append('UNSAVED')
    else:
        # Several rows may qualify; the first one is used.
        case_id.append(similar_rows.CaseID.values[0])
        category.append(similar_rows.Category.values[0])
save_dict['CaseID'] = case_id
save_dict['Category'] = category
pd.DataFrame.from_dict(save_dict).to_csv(f'../result/{result_sheet_name}_debate.csv',index= False)

In [357]:
# Display the value currently bound to `question`.
question

'Statement 1 |  Suppose ∑|a_i| diverges and ∑ a_i = 2. There is a rearrangement a_i_k of the terms such that ∑ a_i_k = 4. Statement 2 | There exists metric spaces X and Y with X closed and bounded and a continuous mapping f : X → Y such that f(X) is NOT “closed and bounded”. The options are: A) True, True, B) True, False, C) False, True, D) False, False'

In [356]:
# Number of entries collected under save_dict['question'].
len(save_dict['question'])

109

In [297]:
# Spot-check: similarity between the Question at positional row 113 of
# result_df and the current `question`.
calculate_common_words_percentage(result_df.iloc[113].Question,question)

82.85714285714286

In [334]:
# Build ../result/<sheet>_debate.csv: one row per question found in the debate
# log, with its debate counts and the CaseID/Category of the matching row in
# the result sheet ('UNSAVED' when no row is similar enough).
result_sheet_name = 'result_Math'

save_dict = {
    'CaseID':[],
    'Category':[],
    'question':[],
    'num_of_debates':[],
    'sum_of_debates':[],
}
# Only the read needs the file handle; parsing happens after it is closed.
with open(f'../result/{result_sheet_name}_log.txt','r', encoding='utf-8') as f:
    text = f.read()

# One entry starts at a blank-line-separated "question:" header and runs up to
# the next such header (or the end of the log); re.DOTALL lets '.' span lines.
pattern = r"\n\n\n\nquestion:.*?(?=\n\n\n\nquestion:|$)"
matches = re.findall(pattern, text, re.DOTALL)
for match in matches:
    question,num_of_debates,sum_of_debates = count_debates(match)
    save_dict['question'].append(question)
    save_dict['num_of_debates'].append(num_of_debates)
    save_dict['sum_of_debates'].append(sum_of_debates)

result_df = pd.read_csv(f'../result/{result_sheet_name}_result.csv',encoding='utf-8')
case_id = []
category = []
unsaved_q = []  # never filled in this cell; kept so the name stays defined
for question in save_dict['question']:
    # Score every result-sheet question against the logged question.
    result_df['Similarity'] = result_df['Question'].apply(lambda x: calculate_common_words_percentage(x, question))
    # Keep rows with 90% or more similarity, without the scratch column.
    similar_rows = result_df[result_df['Similarity'] >= 90]
    similar_rows = similar_rows.drop(columns=['Similarity'])
    # Test for "no match" explicitly rather than with a bare `except:`, which
    # would also hide unrelated errors (missing column, KeyboardInterrupt) and
    # could leave case_id and category with different lengths.
    if similar_rows.empty:
        case_id.append('UNSAVED')
        category.append('UNSAVED')
    else:
        # Several rows may qualify; the first one is used.
        case_id.append(similar_rows.CaseID.values[0])
        category.append(similar_rows.Category.values[0])
save_dict['CaseID'] = case_id
save_dict['Category'] = category
pd.DataFrame.from_dict(save_dict).to_csv(f'../result/{result_sheet_name}_debate.csv',index= False)