In [1]:
import os
import pandas as pd

# Path to the main directory
main_dir = "../data/Evaluation_CoTs"

# Subdirectories
subdirectories = ["claude-3-haiku-20240307", "gpt-3.5-turbo-0125","gpt-4"]

# List to store dataframes
dataframes = []

# Iterate over each subdirectory and process files
for sub in subdirectories:
    for file in os.listdir(os.path.join(main_dir, sub)):
        if file.endswith('.csv'):
            # Determine the difficulty from the file name
            difficulty = 'easy' if 'easy' in file else 'hard'
            
            # Read CSV file
            df = pd.read_csv(os.path.join(main_dir, sub, file))
            
            # Extract model from the directory name
            model = sub
            
            # Add columns for difficulty and model
            df['Difficulty'] = difficulty
            df['Model'] = model
            
            # Append the dataframe to the list
            dataframes.append(df)

# Concatenate all dataframes
final_df = pd.concat(dataframes, ignore_index=True)

# Optionally, save the concatenated dataframe to a new CSV file
final_df.to_csv('concatenated_results.csv', index=False)

# Print or inspect the final DataFrame
print(final_df.head())


         Name Category                                           Question  \
0  MathQA_dev     Math  a multiple choice test consists of 4 questions...   
1  MathQA_dev     Math  a 3 - digit positive integer is chosen at rand...   
2  MathQA_dev     Math  if x and y are positive integers and 7 + x + y...   
3  MathQA_dev     Math  the hcf and lcm of two numbers m and n are res...   
4  MathQA_dev     Math  in a kilometer race , a beats b by 48 meters o...   

  Correct Answer                                              CoT_0  \
0              c  \n    Step 1: The question is asking about the...   
1              b  Step 1: Understand the problem statement. A 3-...   
2              c  Step 1: The given equation is 7 + x + y + xy =...   
3              d  Step 1: The given information is that the high...   
4              a  Step 1: We are given that A beats B by 48 mete...   

  Final Answer_0 Instruction Violation_0  \
0              c                [(0, 0)]   
1              a      

In [2]:
# Drop columns whose names start with "Unnamed:"
unnamed_cols = ['index'] + [col for col in final_df.columns if col.startswith('Unnamed:')]
final_df.drop(columns=unnamed_cols, inplace=True)

In [3]:
final_df = final_df.dropna()

In [4]:
final_df.Category.value_counts()

Category
Math                    2184
Challenging Math        2181
disambiguation_qa       1114
movie_recommendation    1068
navigate                   2
Name: count, dtype: int64

In [5]:
final_df.Name.value_counts()

Name
BigBench_hard            1100
GSM8K_test               1097
GSM8K_hard               1091
MathQA_challenge_test    1090
MathQA_dev               1087
BigBench_easy            1084
Name: count, dtype: int64

In [6]:
final_df

Unnamed: 0,Name,Category,Question,Correct Answer,CoT_0,Final Answer_0,Instruction Violation_0,CoT_1,Final Answer_1,Instruction Violation_1,...,Final Answer_37,Instruction Violation_37,CoT_38,Final Answer_38,Instruction Violation_38,CoT_39,Final Answer_39,Instruction Violation_39,Difficulty,Model
0,MathQA_dev,Math,a multiple choice test consists of 4 questions...,c,\n Step 1: The question is asking about the...,c,"[(0, 0)]",\n Step 1: The test consists of 4 questions...,c,"[(0, 0)]",...,c,"[(0, 0)]",Step 1: Understand the problem. The problem is...,c,"[(0, 0)]",Step 1: Understand the problem statement.\nThe...,c,"[(0, 0)]",easy,claude-3-haiku-20240307
1,MathQA_dev,Math,a 3 - digit positive integer is chosen at rand...,b,Step 1: Understand the problem statement. A 3-...,a,"[(0, 0)]",Step 1: To find the probability that the produ...,b,"[(0, 0)]",...,b,"[(0, 0)]",Step 1: Identify the possible outcomes.\n ...,a,"[(0, 0)]",Step 1: The question asks for the probability ...,d,"[(0, 0)]",easy,claude-3-haiku-20240307
2,MathQA_dev,Math,if x and y are positive integers and 7 + x + y...,c,Step 1: The given equation is 7 + x + y + xy =...,b,"[(0, 0)]",Step 1: We are given the equation 7 + x + y + ...,b,"[(0, 0)]",...,c,"[(0, 0)]",Step 1: We are given the equation 7 + x + y + ...,b,"[(0, 0)]",Step 1: The given equation is 7 + x + y + xy =...,a,"[(0, 0)]",easy,claude-3-haiku-20240307
3,MathQA_dev,Math,the hcf and lcm of two numbers m and n are res...,d,Step 1: The given information is that the high...,b,"[(0, 0)]",Step 1: Given that the HCF (Highest Common Fac...,b,"[(0, 0)]",...,d,"[(0, 0)]",Step 1: The HCF (Highest Common Factor) of two...,b,"[(0, 0)]",Step 1: Given the HCF (highest common factor) ...,b,"[(0, 0)]",easy,claude-3-haiku-20240307
4,MathQA_dev,Math,"in a kilometer race , a beats b by 48 meters o...",a,Step 1: We are given that A beats B by 48 mete...,d,"[(0, 0)]","Step 1: We are given that in a kilometer race,...",b,"[(0, 0)]",...,d,"[(0, 0)]","Step 1: We are given that in a kilometer race,...",d,"[(0, 0)]","Step 1: We are given that in a kilometer race,...",d,"[(0, 0)]",easy,claude-3-haiku-20240307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6593,GSM8K_test,Math,"In a neighborhood, the number of rabbits pets ...",348,Step 1: Determine the number of cats. Since th...,348,"[(0, 0)]","Step 1: Calculate the number of cats, which is...",348,"[(0, 0)]",...,348,"[(0, 0)]",Step 1: Calculate the number of cats by multip...,348,"[(0, 0)]",Step 1: Determine the number of cats. Since th...,348,"[(0, 0)]",easy,gpt-4
6594,GSM8K_test,Math,"Out of the 200 Grade 5 students, 2/5 are boys ...",40,Step 1: Calculate the number of boys by multip...,40,"[(0, 0)]",Step 1: Calculate the number of boys by multip...,40,"[(0, 0)]",...,40,"[(0, 0)]",Step 1: Calculate the number of boys by multip...,40,"[(0, 0)]",Step 1: Calculate the number of boys by multip...,40,"[(0, 0)]",easy,gpt-4
6596,GSM8K_test,Math,"When Freda cooks canned tomatoes into sauce, t...",12,Step 1: Determine the volume of the canned tom...,12,"[(0, 0)]",Step 1: Determine the volume of tomatoes used ...,12,"[(0, 0)]",...,12,"[(0, 0)]",Step 1: Determine the original volume of the t...,12,"[(0, 0)]",Step 1: Determine the volume of tomatoes used ...,12,"[(0, 0)]",easy,gpt-4
6597,GSM8K_test,Math,Cars have lined up on the motorway. Some of th...,5,Step 1: Denote the number of cars that drove t...,5,"[(0, 0)]",Step 1: Denote the number of cars that drove t...,5,"[(3, 0)]",...,5,"[(0, 0)]",Step 1: Calculate the total number of cars tha...,5,"[(0, 0)]",Step 1: Calculate the total number of cars tha...,5,"[(1, 0)]",easy,gpt-4


In [13]:
import pandas as pd
import re

# Enhanced regex pattern to capture negative numbers, percentages, and dollar amounts
digit_re = re.compile(r"-?\$?\d+\.?\d*%?")

def clean_value(value):
    """ Helper function to clean individual values based on the enhanced regex pattern. """
    # Remove unwanted characters
    value = str(value).replace('$', '').replace('%', '')
    # Find all numbers in the string
    matches = digit_re.findall(value)
    if matches:
        # Taking the first match, remove any trailing non-numeric characters if present
        num = matches[0].rstrip('%')
        # Convert to float and back to string to normalize the number format
        cleaned_value = str(float(num))
    else:
        # If no valid numbers are found, return an invalid flag
        cleaned_value = "[invalid]"
    return cleaned_value

def clean_columns(df, columns):
    """ Cleans specified columns in the dataframe by applying the clean_value function. """
    for column in columns:
        if column in df.columns:
            df[column] = df[column].apply(clean_value)

def clean_answers(df):
    """ Cleans all specified answer columns in the dataframe including the 'Correct Answer'. """
    answer_columns = [f"Final Answer_{i}" for i in range(40)] + ["Correct Answer"]
    clean_columns(df, answer_columns)


df_gsm8k = final_df[final_df.Name.str.startswith('GSM')]

In [14]:
clean_answers(df_gsm8k)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].apply(clean_value)


In [36]:
final_df = final_df[final_df['Correct Answer'] != 'NO']

In [41]:
final_df.Question

0       a multiple choice test consists of 4 questions...
1       a 3 - digit positive integer is chosen at rand...
2       if x and y are positive integers and 7 + x + y...
3       the hcf and lcm of two numbers m and n are res...
4       in a kilometer race , a beats b by 48 meters o...
                              ...                        
6593    In a neighborhood, the number of rabbits pets ...
6594    Out of the 200 Grade 5 students, 2/5 are boys ...
6596    When Freda cooks canned tomatoes into sauce, t...
6597    Cars have lined up on the motorway. Some of th...
6598    Mary is an avid gardener. Yesterday, she recei...
Name: Question, Length: 6547, dtype: object

In [37]:
df_bb = final_df[final_df.Name.str.startswith('Big')]

In [50]:
def extract_options(question):
    """ Extracts the mapping of options from the question string. """
    options_text = question.split('Options:\n')[-1]
    options = re.findall(r"\((.)\)\s(.+)", options_text)
    return {opt[1].strip(): opt[0] for opt in options}

def clean_answer(answer, options):
    """ Converts direct answers to their corresponding choice labels and standardizes the case. """
    answer = answer.strip()
    if len(answer) == 1 and answer.upper() in options.values():
        return answer.upper()
    elif answer in options:
        return options[answer]
    else:
        return "[invalid]"  # Or any other default value for unidentified answers

def process_answers(df):
    """ Processes all answer columns based on the options extracted from the question. """
    for i in range(40): 
        answer_column = f"Final Answer_{i}"
        df[answer_column] = df.apply(lambda x: clean_answer(x[answer_column], extract_options(x['Question'])), axis=1)

In [51]:
process_answers(df_bb)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[answer_column] = df.apply(lambda x: clean_answer(x[answer_column], extract_options(x['Question'])), axis=1)


In [54]:
df_math = final_df[final_df.Name.str.startswith('Math')]

In [66]:
import pandas as pd
import re

def extract_options(question):
    """ Extracts the mapping of options from the question string based on the new format. """
    options_text = question.split('The options are:')[-1]
    # Adjusting the regex pattern to match the new option format
    options = re.findall(r"(\w)\s\)\s(\d+)", options_text)
    return {opt[1].strip(): opt[0].lower() for opt in options}  # Keeping the option letters in lowercase

def clean_answer(answer, options):
    """ Converts direct answers to their corresponding choice labels and keeps the case as is (lowercase). """
    answer = answer.strip()
    # Check if answer is a number and map it back to the corresponding option letter
    if answer in options:
        return options[answer]
    # Check if the answer is already a valid option letter
    elif len(answer) == 1 and answer.lower() in options.values():
        return answer.lower()
    else:
        return "[invalid]"  # Or any other default value for unidentified answers

def process_answers(df):
    """ Processes all answer columns based on the options extracted from the question. """
    for i in range(40):  # Assuming there are 40 answer columns as mentioned
        answer_column = f"Final Answer_{i}"
        # Applying cleaning to each answer column based on the extracted options
        df[answer_column] = df.apply(lambda x: clean_answer(x[answer_column], extract_options(x['Question'])), axis=1)

process_answers(df_math)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[answer_column] = df.apply(lambda x: clean_answer(x[answer_column], extract_options(x['Question'])), axis=1)


In [73]:
df_math['Final Answer_0'].value_counts()

Final Answer_0
[invalid]    494
c            486
b            402
a            309
d            282
e            204
Name: count, dtype: int64

In [72]:
df_math.columns

Index(['Name', 'Category', 'Question', 'Correct Answer', 'CoT_0',
       'Final Answer_0', 'Instruction Violation_0', 'CoT_1', 'Final Answer_1',
       'Instruction Violation_1',
       ...
       'Final Answer_37', 'Instruction Violation_37', 'CoT_38',
       'Final Answer_38', 'Instruction Violation_38', 'CoT_39',
       'Final Answer_39', 'Instruction Violation_39', 'Difficulty', 'Model'],
      dtype='object', length=126)