# Length

In [1]:
import os
import pandas
import numpy as np
import pandas as pd
import re
import ast
DATA_DIR = '../data'

# Experiment Config
DF_NAME = 'MathQA'
DIFFICULTY = 'easy'
NUM_OF_SAMPLES = 500
NUM_OF_COT = 40
storage_dir = os.path.join(DATA_DIR, f'Evaluation_CoTs/gpt-3.5-turbo-0125')
file_path = os.path.join(storage_dir, f'{DF_NAME}_{DIFFICULTY}.csv')

In [2]:
df = pd.read_csv(file_path)

In [3]:
result_dict = {
    'id':[],
    'correct answer': [],
    'CoT answers':[],
    'length':[],
    'instruction violation':[],
    'internal mistake':[]
}
tmp = []
for col in df.columns:
    if col.startswith('Final Answer_'):
        tmp.append(df[col].to_numpy())
tmp_arr = np.vstack(tmp)
cot_answer_li = tmp_arr.T
# Count Steps
step_count_buffer = []
for col in df:
    if col.startswith('CoT_'):
        cleaned_answers = []
        for entry in df[col]:
            # Extract numbers using regular expression
            steps = re.findall(r'[Ss]tep\s?\d', str(entry))
            if steps:
                # Join all numbers with space if there are multiple numbers
                cleaned_answers.append(len(steps))
            else:
                # If no number is found, replace with 'error'
                cleaned_answers.append(0)
        step_count_buffer.append(cleaned_answers)

step_count = np.array(step_count_buffer).T
# Violate instruction
instruction_buffer = []
for col in df:
    if col.startswith('Instruction Violation_'):
        cleaned_answers = []
        for entry in df[col]:
            # Extract numbers using regular expression
            x = ast.literal_eval(entry)
            cleaned_answers.append(sum([sum(idx) for idx in x]))
        instruction_buffer.append(cleaned_answers)

instruction_error = np.array(instruction_buffer).T

# Internal Mistake mentioned
mistake_buffer = []
for col in df:
    if col.startswith('CoT_'):
        cleaned_answers = []
        for entry in df[col]:
            # Extract numbers using regular expression
            misktake = re.findall(r'(be a mistake)|(be an error)', str(entry))
            if misktake:
                cleaned_answers.append(1)
                print(misktake)
            else:
                # If no number is found, replace with 'error'
                cleaned_answers.append(0)
        mistake_buffer.append(cleaned_answers)

mistakes = np.array(mistake_buffer).T
assert cot_answer_li.shape == step_count.shape == instruction_error.shape == mistakes.shape

[('be a mistake', '')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('be a mistake', '')]
[('be a mistake', '')]
[('', 'be an error')]
[('', 'be an error')]
[('be a mistake', ''), ('be a mistake', '')]
[('be a mistake', '')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('be a mistake', '')]
[('be a mistake', '')]
[('be a mistake', '')]
[('be a mistake', '')]
[('be a mistake', '')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('be a mistake', '')]
[('be a mistake', '')]
[('', 'be an error')]
[('', 'be an error')]
[('be a mistake', '')]
[('be a mistake', '')]
[('be a mistake', '')]
[('be a mistake', '')]
[('be a mistake', '')]
[('be a mistake', '')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an e

In [4]:
for row in range(len(df)):
    cot_li_sample = cot_answer_li[row]
    length_li_sample = step_count[row]
    IV_li_sample = instruction_error[row]
    mistake_sample = mistakes[row]
    result_dict['id'].append(row)
    result_dict['correct answer'].append(df.iloc[row]['Correct Answer'])
    result_dict['CoT answers'].append(cot_li_sample)
    result_dict['length'].append(length_li_sample)
    result_dict['instruction violation'].append(IV_li_sample)
    result_dict['internal mistake'].append(mistake_sample)


In [5]:
df_final = pd.DataFrame.from_dict(result_dict)

In [6]:
storage_dir = os.path.join(DATA_DIR, f'Evaluation_CoTs/Algo_Design_Data')
file_store_path = os.path.join(storage_dir, f'{DF_NAME}_{DIFFICULTY}.csv')


In [7]:
# df_final.to_csv(file_store_path,index= False)

# Visualize

In [7]:
import random
import seaborn as sns
import matplotlib.pyplot as plt
random.seed(666)

In [11]:
all_correctness = []
all_length = []
all_IV = []
all_IM = []
for row in range(len(df_final)):
    df_row = df_final.iloc[row]
    correctness = [str(df_row['correct answer']) == i for i in df_row['CoT answers'].tolist()]
    length = df_row['length']
    IV = df_row['instruction violation']
    IM = df_row['internal mistake']
    
    all_correctness+=correctness
    all_length += length.tolist()
    all_IV += IV.tolist()
    all_IM += IM.tolist()
    assert len(all_correctness) == len(all_length) == len(all_IV) == len(all_IM)
    

In [12]:
print(np.corrcoef(all_correctness,all_length))
print(np.corrcoef(all_correctness,all_IV))
print(np.corrcoef(all_correctness,all_IM))

[[ 1.         -0.07492529]
 [-0.07492529  1.        ]]
[[ 1.        -0.0535971]
 [-0.0535971  1.       ]]
[[ 1.         -0.04722619]
 [-0.04722619  1.        ]]


In [14]:
import scipy.stats as stats
contingency_table = pd.crosstab(index=all_correctness, columns=all_IV)

# Perform the Chi-square test
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print("Chi2 Stat:", chi2)
print("P value:", p)
print("Degrees of freedom:", dof)
print("Expected frequencies table:\n", expected)

Chi2 Stat: 88.99415810433538
P value: 1.092902185304231e-17
Degrees of freedom: 5
Expected frequencies table:
 [[9.34326525e+03 4.97750000e-01 4.33042500e+02 9.90522500e+01
  4.03177500e+01 3.88245000e+01]
 [9.42773475e+03 5.02250000e-01 4.36957500e+02 9.99477500e+01
  4.06822500e+01 3.91755000e+01]]


In [15]:
corr, p_value = stats.spearmanr(all_correctness, all_length)

print("Spearman's correlation coefficient:", corr)
print("P value:", p_value)
corr, p_value = stats.spearmanr(all_correctness, all_IV)

print("Spearman's correlation coefficient:", corr)
print("P value:", p_value)
corr, p_value = stats.spearmanr(all_correctness, all_IM)

print("Spearman's correlation coefficient:", corr)
print("P value:", p_value)

Spearman's correlation coefficient: -0.08053141867014806
P value: 3.872294988681403e-30
Spearman's correlation coefficient: -0.03133424010640949
P value: 9.330153980593303e-06
Spearman's correlation coefficient: -0.047226190546026094
P value: 2.355378294691026e-11


In [16]:
df_final

Unnamed: 0,id,correct answer,CoT answers,length,instruction violation,internal mistake
0,0,c,"[c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,b,"[e, e, c, a, e, 5 / 9, c, d, c, c, e, 5 / 9, c...","[4, 3, 3, 4, 3, 4, 3, 3, 3, 6, 3, 3, 3, 3, 3, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,c,"[c, a, c, c, c, d, c, c, c, c, c, c, 7, 6, c, ...","[6, 6, 9, 6, 6, 4, 5, 6, 6, 7, 6, 6, 6, 6, 7, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,d,"[d, d, a, 1 / 18, b, b, b, b, b, d, b, 1 / 18,...","[4, 3, 4, 3, 6, 6, 3, 5, 5, 3, 3, 3, 7, 7, 7, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,a,"[a, b, a, d, b, a, b, cannot be determined, b,...","[4, 6, 5, 3, 8, 3, 6, 3, 3, 3, 2, 3, 4, 6, 6, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
495,495,b,"[b, b, b, b, b, b, b, b, 17, b, 17, 17, b, b, ...","[4, 6, 5, 6, 9, 5, 6, 4, 8, 4, 8, 8, 8, 8, 8, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
496,496,e,"[e, e, e, e, e, e, e, e, e, e, e, e, e, e, e, ...","[5, 5, 4, 5, 5, 5, 5, 5, 5, 4, 4, 3, 4, 3, 5, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
497,497,c,"[c, 100 m, 750 m, a, c, e, c, c, a, a, 400, c,...","[2, 6, 2, 3, 13, 7, 4, 6, 13, 4, 4, 6, 12, 11,...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
498,498,b,"[e, 10 ^ 1180, e, error, e, error, e, e, error...","[3, 3, 1, 0, 3, 0, 4, 4, 0, 3, 4, 3, 3, 1, 3, ...","[2, 2, 0, 10, 0, 10, 2, 0, 10, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# TODO: 1: Check consistency on the CoT answers and correct answers
#       2: Try an ordinal logistic regression model to predict the correctness of the CoT answers and measure the importance of the features
#       3: Try binary logistic regression model to predict the correctness of the CoT answers and measure the importance of the features if the ordinal logistic regression model does not work(assumption violated)
#       4: Build a scoring function (LR/RF) to predict a confidence score for each CoT answer, and then use the correct binary outcome to verify the corretness of confidence score. (AUROC,calibration plot)