In [1]:
import os
import pandas as pd

# Path to the main directory
main_dir = "../../data/Evaluation_CoTs"

# Subdirectories
subdirectories = ["claude-3-haiku-20240307", "gpt-3.5-turbo-0125","gpt-4"]

# List to store dataframes
dataframes = []

# Iterate over each subdirectory and process files
for sub in subdirectories:
    for file in os.listdir(os.path.join(main_dir, sub)):
        if file.endswith('.csv'):
            # Determine the difficulty from the file name
            difficulty = 'easy' if 'easy' in file else 'hard'
            
            # Read CSV file
            df = pd.read_csv(os.path.join(main_dir, sub, file))
            
            # Extract model from the directory name
            model = sub
            
            # Add columns for difficulty and model
            df['Difficulty'] = difficulty
            df['Model'] = model
            
            # Append the dataframe to the list
            dataframes.append(df)

# Concatenate all dataframes
final_df = pd.concat(dataframes, ignore_index=True)



In [2]:
final_df.shape

(6599, 372)

In [3]:
# Drop columns whose names start with "Unnamed:"
unnamed_cols = ['index'] + [col for col in final_df.columns if col.startswith('Unnamed:')]
final_df.drop(columns=unnamed_cols, inplace=True)

In [4]:
final_df.shape

(6599, 126)

In [5]:
final_df = final_df.dropna()
final_df = final_df[final_df['Correct Answer'] != 'NO']

In [6]:
final_df.shape

(6554, 126)

In [7]:
final_df.Name.value_counts()

Name
GSM8K_hard               1098
BigBench_hard            1098
GSM8K_test               1097
MathQA_challenge_test    1090
MathQA_dev               1087
BigBench_easy            1084
Name: count, dtype: int64

In [8]:
import pandas as pd
import re

# Enhanced regex pattern to capture negative numbers, percentages, and dollar amounts
digit_re = re.compile(r"-?\$?\d+\.?\d*%?")

def clean_value(value):
    """ Helper function to clean individual values based on the enhanced regex pattern. """
    # Remove unwanted characters
    value = str(value).replace('$', '').replace('%', '')
    # Find all numbers in the string
    matches = digit_re.findall(value)
    if matches:
        # Taking the first match, remove any trailing non-numeric characters if present
        num = matches[0].rstrip('%')
        # Convert to float and back to string to normalize the number format
        cleaned_value = str(float(num))
    else:
        # If no valid numbers are found, return an invalid flag
        cleaned_value = "error"
    return cleaned_value

def clean_columns(df, columns):
    """ Cleans specified columns in the dataframe by applying the clean_value function. """
    for column in columns:
        if column in df.columns:
            df[column] = df[column].apply(clean_value)

def clean_answers(df):
    """ Cleans all specified answer columns in the dataframe including the 'Correct Answer'. """
    answer_columns = [f"Final Answer_{i}" for i in range(40)] + ["Correct Answer"]
    clean_columns(df, answer_columns)


df_gsm8k = final_df[final_df.Name.str.startswith('GSM')]

In [9]:
clean_answers(df_gsm8k)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].apply(clean_value)


In [10]:
df_bb = final_df[final_df.Name.str.startswith('Big')]

In [11]:
def extract_options(question):
    """ Extracts the mapping of options from the question string. """
    options_text = question.split('Options:\n')[-1]
    options = re.findall(r"\((.)\)\s(.+)", options_text)
    return {opt[1].strip(): opt[0] for opt in options}

def clean_answer(answer, options):
    """ Converts direct answers to their corresponding choice labels and standardizes the case. """
    answer = answer.strip()
    if len(answer) == 1 and answer.upper() in options.values():
        return answer.upper()
    elif answer in options:
        return options[answer]
    else:
        return "erroe"  # Or any other default value for unidentified answers

def process_answers(df):
    """ Processes all answer columns based on the options extracted from the question. """
    for i in range(40): 
        answer_column = f"Final Answer_{i}"
        df[answer_column] = df.apply(lambda x: clean_answer(x[answer_column], extract_options(x['Question'])), axis=1)

In [12]:
process_answers(df_bb)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[answer_column] = df.apply(lambda x: clean_answer(x[answer_column], extract_options(x['Question'])), axis=1)


In [13]:
df_math = final_df[final_df.Name.str.startswith('Math')]

In [14]:
import pandas as pd
import re

def extract_options(question):
    """ Extracts the mapping of options from the question string based on the new format. """
    options_text = question.split('The options are:')[-1]
    # Adjusting the regex pattern to match the new option format
    options = re.findall(r"(\w)\s\)\s(\d+)", options_text)
    return {opt[1].strip(): opt[0].lower() for opt in options}  # Keeping the option letters in lowercase

def clean_answer(answer, options):
    """ Converts direct answers to their corresponding choice labels and keeps the case as is (lowercase). """
    answer = answer.strip()
    # Check if answer is a number and map it back to the corresponding option letter
    if answer in options:
        return options[answer]
    # Check if the answer is already a valid option letter
    elif len(answer) == 1 and answer.lower() in options.values():
        return answer.lower()
    else:
        return "error"  # Or any other default value for unidentified answers

def process_answers(df):
    """ Processes all answer columns based on the options extracted from the question. """
    for i in range(40):  # Assuming there are 40 answer columns as mentioned
        answer_column = f"Final Answer_{i}"
        # Applying cleaning to each answer column based on the extracted options
        df[answer_column] = df.apply(lambda x: clean_answer(x[answer_column], extract_options(x['Question'])), axis=1)

process_answers(df_math)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[answer_column] = df.apply(lambda x: clean_answer(x[answer_column], extract_options(x['Question'])), axis=1)


In [15]:
pd.concat([df_gsm8k, df_bb, df_math]).reset_index(drop=True).to_csv('../../data/Evaluation_CoTs/final.csv', index=False)

# Test code to run on final data

In [16]:
final = pd.concat([df_gsm8k, df_bb, df_math]).reset_index(drop=True)

In [17]:
import sys
sys.path.insert(0, '../')
from CS_feature_extractor import extract_feature,extract_cot_answer
from IDV_CS_Model import trained_LR_model

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
df_with_features = pd.DataFrame(extract_feature(final))
feature_li = [
    # 'LEN',
    'QUA_IM',
    'DIF_IV',
    # 'DIF_SUB',
    # 'SIM_COT_BIGRAM',
    'SIM_COT_AGG',
    # 'SIM_COT_PW',
    'SIM_AC_BIGRAM',
    'SIM_AC_AGG',
    'SIM_AC_PW',
    # 'size_of_cot'
]
# coe = [-0.1,-5,-1,3,2,2,2]
# # intercept = -1.5
# df = trained_LR_model(df_with_features,feature_li)

jaccard with aggregation time cost: 182.22808814048767s


100%|██████████| 6554/6554 [00:01<00:00, 6405.29it/s] 


In [23]:
df = trained_LR_model(df_with_features,feature_li)

SC_correctness
1    3558
0    2996
Name: count, dtype: int64
ES_correctness
1    3560
0    2994
Name: count, dtype: int64
Index(['id', 'Name', 'Model', 'correct answer', 'CoT answers', 'Correctness',
       'LEN', 'QUA_IM', 'DIF_IV', 'SIM_COT_AGG', 'SIM_AC_BIGRAM', 'SIM_AC_AGG',
       'SIM_AC_PW', 'SC_correctness', 'ES_correctness', 'ES_steps'],
      dtype='object')
ASC Steps and Answers added.
------------DF Stats-------------
QUA_IM : Counter({0: 261368, 1: 792})
DIF_IV : Counter({0: 239262, 1: 22898})
SIM_AC_BIGRAM : Counter({1: 174613, 0: 87547})
SIM_AC_AGG : Counter({1: 192016, 0: 70144})
SIM_AC_PW : Counter({1: 183797, 0: 78363})
Correctness : Counter({0: 132730, 1: 129430})
cot_answer : Counter({'B': 25960, 'C': 25840, 'A': 23320, 'c': 22240, 'b': 18400, 'd': 18040, 'a': 17720, 'D': 11520, 'e': 10680, '5.0': 2000, '6.0': 1800, '20.0': 1520, '8.0': 1480, '10.0': 1480, '4.0': 1400, '15.0': 1280, '3.0': 1240, '2.0': 1240, '16.0': 1160, '18.0': 960, '50.0': 920, '12.0': 920, '25.0



                           Logit Regression Results                           
Dep. Variable:            Correctness   No. Observations:               209728
Model:                          Logit   Df Residuals:                   209721
Method:                           MLE   Df Model:                            6
Date:                Sat, 11 May 2024   Pseudo R-squ.:                  0.1430
Time:                        22:04:38   Log-Likelihood:            -1.2458e+05
converged:                      False   LL-Null:                   -1.4537e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.7710      0.020    -89.411      0.000      -1.810      -1.732
QUA_IM           -0.8616      0.152     -5.684      0.000      -1.159      -0.565
DIF_IV          -11.0751      2.407     

In [47]:
asnwers, bin = extract_cot_answer(final)

In [48]:
# Sample DataFrame and Correctness array (replace these with your actual data)
columns = [f'CoT_{i}' for i in range(40)]
data = final[columns]

# Convert Correctness array to DataFrame
correctness_df = pd.DataFrame(bin, columns=[f'CoT_{i}' for i in range(40)])

# Unpivot/melt the original DataFrame
melted_data = data.reset_index().melt(id_vars='index', var_name='CoT', value_name='CoT_Value')

# Unpivot/melt the Correctness DataFrame
melted_correctness = correctness_df.reset_index().melt(id_vars='index', var_name='CoT', value_name='Correctness_Value')

# Merge the two melted DataFrames on 'index' and 'CoT'
combined_df = pd.merge(melted_data, melted_correctness, on=['index', 'CoT'])

# Drop the 'index' and 'CoT' columns if they are no longer needed
combined_df = combined_df[['CoT_Value', 'Correctness_Value']]

# Show the result
print(combined_df.head())

                                           CoT_Value  Correctness_Value
0  Step 1: Calculate the total number of eggs lai...                  0
1  Step 1: The robe takes 2287720 bolts of blue f...                  1
2  Step 1: Identify the given information.\n     ...                  0
3  Step 1: Determine the total number of cups of ...                  0
4  Step 1: Determine the price of each glass.\n  ...                  1


In [49]:
combined_df.shape

(262160, 2)

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Check if CUDA is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assuming df_concate is your DataFrame and it's correctly loaded with 'CoT_Value' as text and 'Correctness' as labels
df_concate = combined_df  # Make sure combined_df is your dataframe name that includes 'CoT_Value' and 'Correctness'

# Split index
split_idx = int(len(df_concate) * 0.8)  # 80% for training

# Split the data into training and testing
X_train = df_concate['CoT_Value'].iloc[:split_idx]
y_train = df_concate['Correctness_Value'].iloc[:split_idx]
X_test = df_concate['CoT_Value'].iloc[split_idx:]
y_test = df_concate['Correctness_Value'].iloc[split_idx:]

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512)

# Convert to PyTorch datasets
class CoTDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = {key: torch.tensor(val).to(device) for key, val in encodings.items()}
        self.labels = torch.tensor(labels).to(device)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CoTDataset(train_encodings, y_train.to_list())
test_dataset = CoTDataset(test_encodings, y_test.to_list())

# Load BERT with a classification head and move model to device
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Setup training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)



Unnamed: 0,CoT_Value,Correctness_Value
0,Step 1: To determine the number of zeroes the ...,0
1,Step 1: The man bought 20 shares of Rs. 50 eac...,1
2,Step 1: We are given the equation '? % of 360 ...,1
3,Step 1: We are given that the corporation doub...,0
4,Step 1: Let's define the variables.\n Let t...,1
...,...,...
39955,Step 1: The first doughnut is priced at $1.\nS...,1
39956,Step 1: The total rainfall in Springdale durin...,0
39957,Step 1: The length of the bridge is 200 meters...,1
39958,Step 1: The given expression is 10^(600) * 10^...,0
