In [161]:
import os
# Root folder, relative to the notebook, under which inputs are read and results are written.
DATA_DIR = '../data'

# Experiment Config
# DF_NAME and DIFFICULTY together form the input CSV file name: '<DF_NAME>_<DIFFICULTY>.csv'.
DF_NAME = 'MathQA'
DIFFICULTY = 'easy'
# NOTE(review): the two counts below are not referenced by any visible cell; presumably they
# are the number of questions and the number of CoT samples per question — confirm.
NUM_OF_SAMPLES = 500
NUM_OF_COT = 40
# Folder holding the generated chains of thought for the gpt-3.5-turbo-0125 run.
storage_dir = os.path.join(DATA_DIR, f'Evaluation_CoTs/gpt-3.5-turbo-0125')
# Full path of the CSV that the loading cell reads.
file_path = os.path.join(storage_dir, f'{DF_NAME}_{DIFFICULTY}.csv')

In [168]:
import re
def clean_final_answers_gsm8k(column):
    """Reduce every free-text answer in ``column`` to the integers it contains.

    Each entry is converted with ``str`` and scanned for stand-alone digit runs
    (digit sequences delimited by word boundaries).

    Args:
        column: Iterable of raw answers (any type; each is stringified).

    Returns:
        A list with one string per entry: the digit runs joined by single
        spaces, or the literal ``'error'`` when the entry has none.
    """
    def _digits_or_error(raw):
        found = re.findall(r'\b\d+\b', str(raw))
        # A decimal such as "12.5" yields two runs ("12" and "5"), hence the join.
        return ' '.join(found) if found else 'error'

    return [_digits_or_error(raw) for raw in column]
def clean_final_answers_MC(column):
    """Normalise multiple-choice answers to a single option letter where possible.

    Each entry is stringified and lower-cased. If it contains an option marker,
    i.e. one of the letters a-e followed by an optional whitespace character and
    a closing parenthesis (``b)``, ``b )``, ``(b)``), the letter of the first
    marker is returned.

    The letter must start at a word boundary, so the last letter of an ordinary
    word directly before ``)`` (for example ``(none)``) is not mistaken for
    option ``e``.

    Args:
        column: Iterable of raw answers (any type; each is stringified).

    Returns:
        A list with one string per entry: the option letter, or the lower-cased
        entry text unchanged when no option marker is present.
    """
    cleaned_answers = []
    for entry in column:
        text = str(entry).lower()
        marker = re.search(r'\b[abcde]\s?\)', text)
        if marker:
            # First character of the matched marker is the option letter.
            cleaned_answers.append(marker.group(0)[0])
        else:
            # No marker: keep the lower-cased text so downstream value matching can still use it.
            cleaned_answers.append(text)
    return cleaned_answers

In [169]:
import pandas as pd
# Read the evaluation CSV selected in the config cell.
df = pd.read_csv(file_path)

# Normalise every 'Final Answer_*' column to option letters (columns are overwritten in place).
final_answer_columns = [name for name in df.columns if name.startswith('Final Answer_')]
for name in final_answer_columns:
    df[name] = clean_final_answers_MC(df[name])

df

{}


'Rosie can run 10 miles per hour for 3 hours. After that, she runs 5 miles per hour. How many miles can she run in 7 hours? ,'

In [171]:
import os
import pandas
import numpy as np
import pandas as pd
import re
import ast

# One independent, initially empty list per output field; key order sets the column order later.
result_dict = {
    field: []
    for field in (
        'id',
        'correct answer',
        'CoT answers',
        'length',
        'instruction violation',
        'internal mistake',
    )
}
# Stack the cleaned 'Final Answer_*' columns (one stacked row per column), then transpose
# so that each row of cot_answer_li corresponds to one row of df.
tmp = [df[name].to_numpy() for name in df.columns if name.startswith('Final Answer_')]
tmp_arr = np.vstack(tmp)
cot_answer_li = tmp_arr.T
# Count Steps
# The length of a CoT is the number of "Step<digit>" / "step <digit>" mentions in its text;
# a text without any such mention counts as 0.
step_pattern = re.compile(r'[Ss]tep\s?\d')
step_count_buffer = [
    [len(step_pattern.findall(str(text))) for text in df[name]]
    for name in df
    if name.startswith('CoT_')
]

# Transpose from (CoT column, df row) to (df row, CoT column).
step_count = np.array(step_count_buffer).T
# Violate instruction
# Every cell is parsed as a Python literal and reduced to the sum of the sums of its
# inner sequences, giving one total per CoT.
# NOTE(review): assumes every cell is a string holding a nested list of numbers — confirm.
instruction_buffer = [
    [sum(sum(inner) for inner in ast.literal_eval(cell)) for cell in df[name]]
    for name in df
    if name.startswith('Instruction Violation_')
]

# Transpose from (violation column, df row) to (df row, violation column).
instruction_error = np.array(instruction_buffer).T

# Internal Mistake mentioned
# Flag (1/0) whether a CoT text contains the phrase "be a mistake" or "be an error".
mistake_pattern = re.compile(r'(be a mistake)|(be an error)')
mistake_buffer = [
    [1 if mistake_pattern.search(str(text)) else 0 for text in df[name]]
    for name in df
    if name.startswith('CoT_')
]

# Transpose from (CoT column, df row) to (df row, CoT column).
mistakes = np.array(mistake_buffer).T

# All four per-CoT feature matrices must line up element for element.
assert cot_answer_li.shape == step_count.shape == instruction_error.shape == mistakes.shape

In [143]:
# Empty the collected lists first: result_dict is created in another cell, so without this
# reset every re-run of the present cell would append a second copy of all rows.
for collected in result_dict.values():
    collected.clear()

# Gather, for every row of df, its position, the reference answer and the per-CoT features.
for row in range(len(df)):
    result_dict['id'].append(row)
    result_dict['correct answer'].append(df.iloc[row]['Correct Answer'])
    result_dict['CoT answers'].append(cot_answer_li[row])
    result_dict['length'].append(step_count[row])
    result_dict['instruction violation'].append(instruction_error[row])
    result_dict['internal mistake'].append(mistakes[row])


0        -9867630.0
1         3431580.0
2       322886700.0
3        -6887448.0
4        21459061.0
           ...     
495           100.0
496      23252172.0
497            50.0
498    -425561.9444
499     2901807.833
Name: Correct Answer, Length: 500, dtype: object

In [114]:
# One row per question: the collected per-CoT arrays plus the question text, which is
# aligned to the new frame by index label.
df_final = pd.DataFrame.from_dict(result_dict).assign(Question=df['Question'])

False

In [32]:
# Spot-check: display the question text stored in the first row of df_final.
df_final['Question'].iloc[0]

'a multiple choice test consists of 4 questions , and each question has 5 answer choices . in how many r ways can the test be completed if every question is unanswered ? The options are: a ) 24 , b ) 120 , c ) 625 , d ) 720 , e ) 1024'

In [96]:
def _parse_option_values(question):
    """Map each option letter present in ``question`` to the text that follows its marker.

    A marker is the letter followed by a space and ``)`` (``'a )'``). The value is
    the text after the last occurrence of the marker, up to the next comma, stripped.
    """
    values = {}
    for option in ['a', 'b', 'c', 'd', 'e']:
        marker = option + ' )'
        if marker in question:
            values[option] = question.split(marker)[-1].strip().split(',')[0].strip()
    return values


def check_correctness(row):
    """Score every CoT answer of one question as correct (1) or incorrect (0).

    An answer counts as correct when it equals the correct option letter, or when
    it contains the value listed for that option in the question text.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys ``'correct answer'``
            (option letter), ``'CoT answers'`` (sequence of answers) and
            ``'Question'`` (question text with its options).

    Returns:
        A list of 0/1 integers, one per entry of ``row['CoT answers']``.
        If the correct option cannot be found in the question, or its value is
        empty, only an exact letter match counts; the row is scored rather than
        raising ``KeyError``.
    """
    correct_answer = row['correct answer']
    question = row['Question']

    # The option values depend only on the question, so they are parsed once per row.
    # A non-string question (e.g. a missing value) simply yields no options.
    options = _parse_option_values(question) if isinstance(question, str) else {}
    correct_value = options.get(correct_answer)

    correctness_list = []
    for cot in row['CoT answers']:
        cot = str(cot)
        if cot == correct_answer:
            correctness_list.append(1)
        elif correct_value and correct_value in cot:
            # The emptiness check matters: '' is a substring of every string and
            # would otherwise mark every answer as correct.
            correctness_list.append(1)
        else:
            correctness_list.append(0)

    return correctness_list

In [97]:
# Score each question's CoT answers row by row; every cell holds a list of 0/1 flags.
correctness_per_question = df_final.apply(check_correctness, axis=1)
df_final['correctness'] = correctness_per_question
df_final['correctness']

0      [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
1      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2      [1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...
3      [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
4      [1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, ...
                             ...                        
495    [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, ...
496    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
497    [1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...
498    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
499    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
Name: correctness, Length: 500, dtype: object

In [103]:
# Spot-check: display every field of the record at position 1 of df_final.
df_final.iloc[1]

id                                                                       1
correct answer                                                           b
CoT answers              [e, e, c, a, e, 5 / 9, c, d, c, c, e, 5 / 9, c...
length                   [4, 3, 3, 4, 3, 4, 3, 3, 3, 6, 3, 3, 3, 3, 3, ...
instruction violation    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
internal mistake         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Question                 a 3 - digit positive integer is chosen at rand...
correctness              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: 1, dtype: object

In [127]:
import pandas as pd

def concatenate_columns(df,data_columns, outcome_column):
    """Flatten list-valued columns into a long DataFrame with one row per list element.

    For each requested column, the per-row sequences are concatenated in row
    order into a single flat list. Rows are visited by position, so the result
    does not depend on the index labels of ``df``.

    Args:
        df: DataFrame whose selected columns hold one sequence per row.
        data_columns: Sequence of feature column names to flatten.
        outcome_column: Name of the outcome column; it is flattened the same way
            and becomes the last column of the result.

    Returns:
        A new DataFrame with the columns ``data_columns + [outcome_column]``.
        All flattened columns must end up with the same length, otherwise the
        DataFrame constructor raises ``ValueError``.
    """
    concatenated_data = {}

    for column in list(data_columns) + [outcome_column]:
        flat_values = []
        # Iterating the Series yields its values in positional order; the former
        # df[column][i] was a label lookup and failed on any non-default index.
        for values in df[column]:
            flat_values.extend(values)

        concatenated_data[column] = flat_values
        # Sanity check: print the flattened length of every column so mismatches are visible.
        print(len(flat_values))

    return pd.DataFrame(concatenated_data)

# Flatten the per-question lists into one row per CoT sample.
# NOTE(review): this rebinds df_final to the flattened frame, so re-running this cell alone
# would pass the already-flattened frame back into concatenate_columns.
df_final = concatenate_columns(df_final,['length','instruction violation','internal mistake'],'correctness')

20000
20000
20000
20000


In [128]:
# Despite its name, save_dir is the full path of the output CSV file, not a directory.
save_dir = DATA_DIR+'/Data_For_Analysis/' + f'{DF_NAME}_{DIFFICULTY}_CS.csv'
# Create the target folder on first use; to_csv does not create missing directories.
os.makedirs(os.path.dirname(save_dir), exist_ok=True)
df_final.to_csv(save_dir,index=False)

{'A': 'Braveheart', 'B': 'Popeye', 'C': 'House II The Second Story', 'D': 'In China They Eat Dogs'}


"Find a movie similar to Dances with Wolves, The Shawshank Redemption, Apollo 13, Schindler's List:\r\nOptions:\r\n(A) Braveheart\r\n(B) Popeye\r\n(C) House II The Second Story\r\n(D) In China They Eat Dogs\r"