# Length

In [196]:
import os
import pandas
import numpy as np
import pandas as pd
import re
import ast
# Root folder for the data files, relative to the notebook's working directory.
DATA_DIR = '../data'

# Experiment Config
DF_NAME, DIFFICULTY = 'GSM8K', 'easy'
NUM_OF_SAMPLES, NUM_OF_COT = 500, 40

# The input CSV is named "<DF_NAME>_<DIFFICULTY>.csv" and lives in the
# Evaluation_CoTs/gpt-3.5-turbo-0125 sub-folder of DATA_DIR.
storage_dir = os.path.join(DATA_DIR, 'Evaluation_CoTs/gpt-3.5-turbo-0125')
input_csv_name = f'{DF_NAME}_{DIFFICULTY}.csv'
file_path = os.path.join(storage_dir, input_csv_name)

In [197]:
# Load the evaluation CSV located at file_path. Later cells read the
# 'Correct Answer' column and every column whose name starts with
# 'Final Answer_', 'CoT_' or 'Instruction Violation_'.
df = pd.read_csv(file_path)

In [198]:
# Accumulator for the per-question records; the lists start empty and are
# filled by the record-building cell further down.
result_dict = {
    'id': [],
    'correct answer': [],
    'CoT answers': [],
    'length': [],
    'instruction violation': [],
    'internal mistake': [],
}

# A "step" is every occurrence of "step"/"Step", one optional whitespace
# character, and a digit. Repeated mentions of the same step are each counted.
_STEP_PATTERN = re.compile(r'[Ss]tep\s?\d')
# Phrases whose presence marks a CoT text as mentioning a mistake.
_MISTAKE_PATTERN = re.compile(r'be a mistake|be an error')


def _columns_with_prefix(frame, prefix):
    """Return the column labels of ``frame`` that start with ``prefix``, in frame order."""
    return [col for col in frame.columns if col.startswith(prefix)]


def _score_columns(frame, prefix, score):
    """Apply ``score`` to every cell of the columns of ``frame`` named ``prefix...``.

    Parameters:
        frame: DataFrame whose column labels are strings.
        prefix: column-name prefix selecting the columns to process.
        score: callable mapping one cell value to a number.

    Returns:
        numpy array with one row per row of ``frame`` and one column per
        matching column (the per-column lists are stacked, then transposed).
    """
    per_column = [
        [score(entry) for entry in frame[col]]
        for col in _columns_with_prefix(frame, prefix)
    ]
    return np.array(per_column).T


def _count_steps(entry):
    """Number of step markers in the text form of ``entry`` (0 when there are none)."""
    return len(_STEP_PATTERN.findall(str(entry)))


def _count_instruction_violations(entry):
    """Total of all numbers in the nested-list literal stored in ``entry``.

    ``entry`` must be the text of a Python literal that is a sequence of
    sequences of numbers; anything else (e.g. a missing value) makes
    ``ast.literal_eval`` or ``sum`` raise.
    NOTE(review): presumably the inner values are 0/1 flags - confirm against
    the code that writes these columns.
    """
    return sum(sum(flags) for flags in ast.literal_eval(entry))


def _mentions_mistake(entry):
    """1 if the text form of ``entry`` contains one of the mistake phrases, else 0."""
    return 1 if _MISTAKE_PATTERN.search(str(entry)) else 0


# Final answers: one row per question, one column per 'Final Answer_*' column.
cot_answer_li = np.vstack(
    [df[col].to_numpy() for col in _columns_with_prefix(df, 'Final Answer_')]
).T

# Count Steps
step_count = _score_columns(df, 'CoT_', _count_steps)

# Violate instruction
instruction_error = _score_columns(
    df, 'Instruction Violation_', _count_instruction_violations
)

# Internal Mistake mentioned
mistakes = _score_columns(df, 'CoT_', _mentions_mistake)

# All four matrices must line up cell-for-cell before they are combined.
assert cot_answer_li.shape == step_count.shape == instruction_error.shape == mistakes.shape, (
    cot_answer_li.shape, step_count.shape, instruction_error.shape, mistakes.shape
)

[('', 'be an error')]
[('be a mistake', '')]
[('', 'be an error')]
[('', 'be an error'), ('be a mistake', '')]
[('be a mistake', '')]
[('', 'be an error')]
[('be a mistake', '')]
[('be a mistake', ''), ('', 'be an error')]
[('', 'be an error')]
[('be a mistake', '')]
[('be a mistake', '')]
[('', 'be an error')]
[('be a mistake', '')]
[('', 'be an error')]
[('be a mistake', '')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]
[('be a mistake', '')]
[('', 'be an error')]
[('be a mistake', '')]
[('be a mistake', '')]
[('', 'be an error')]
[('', 'be an error')]
[('be a mistake', '')]
[('', 'be an error')]
[('be a mistake', '')]
[('be a mistake', '')]
[('', 'be an error')]
[('be a mistake', '')]
[('', 'be an error')]
[('', 'be an error')]
[('', 'be an error')]


In [199]:
# Build the per-question records in one step. The dictionary is rebuilt from
# scratch rather than appended to, so re-running this cell no longer
# duplicates every row in the lists.
assert len(cot_answer_li) == len(step_count) == len(instruction_error) == len(mistakes) == len(df)

result_dict = {
    # Positional row number of the question in df.
    'id': list(range(len(df))),
    'correct answer': df['Correct Answer'].tolist(),
    # list(<2-D array>) yields one 1-D array per question, so every cell of
    # the resulting columns holds that question's full vector of values.
    'CoT answers': list(cot_answer_li),
    'length': list(step_count),
    'instruction violation': list(instruction_error),
    'internal mistake': list(mistakes),
}


In [200]:
# One row per question; the keys of result_dict become the columns.
df_final = pd.DataFrame(result_dict)

In [201]:
# Destination of the aggregated table: same "<DF_NAME>_<DIFFICULTY>.csv" file
# name, placed under Evaluation_CoTs/Algo_Design_Data.
# Note: this rebinds storage_dir, which until here pointed at the input folder.
storage_dir = os.path.join(DATA_DIR, 'Evaluation_CoTs/Algo_Design_Data')
output_csv_name = f'{DF_NAME}_{DIFFICULTY}.csv'
file_store_path = os.path.join(storage_dir, output_csv_name)


In [202]:
# df_final.to_csv(file_store_path,index= False)

# Visualize

In [203]:
import random
import seaborn as sns
import matplotlib.pyplot as plt
# Seed Python's `random` module so the rows drawn with random.sample in the
# following cell are the same on every run. This does not seed NumPy.
random.seed(666)

In [204]:
# Draw 10 distinct row positions from 0 .. NUM_OF_SAMPLES - 1.
test_set = random.sample(range(NUM_OF_SAMPLES), k=10)

In [205]:
# Select the rows of df_final at the positions listed in test_set
# (positional indexing, in the order the positions were drawn).
df_sample = df_final.iloc[test_set]

In [206]:
# for row in range(len(df_sample)):
#     x = range(NUM_OF_COT)
#     
#     test_sample = df_sample.iloc[row]
#     correctness = [test_sample['correct answer'] == i for i in test_sample['CoT answers'].tolist()]
#     
#     # Set the theme for the plot
#     sns.set_theme(style="darkgrid")
#     
#     # Create each lineplot with a label for the legend
#     sns.lineplot(x=x, y=correctness, label='Correctness')
#     sns.lineplot(x=x, y=test_sample['length'], label='Length')
#     sns.lineplot(x=x, y=test_sample['instruction violation'], label='Instruction Violation')
#     sns.lineplot(x=x, y=test_sample['internal mistake'], label='Internal Mistake')
#     
#     # Set the x and y axis labels
#     plt.xlabel('CoT Instance')
#     plt.ylabel('Value')
#     
#     # Place the legend on the plot
#     plt.legend(title='Metrics')
#     plt.show()
#     # Show the plot

In [207]:
# Flatten the per-question vectors of df_final into four parallel lists with
# one element per (question, CoT) pair.
all_correctness, all_length, all_IV, all_IM = [], [], [], []
for position in range(len(df_final)):
    record = df_final.iloc[position]

    # The reference answer is compared as text, so a CoT answer that is not a
    # string equal to it never counts as correct.
    expected = str(record['correct answer'])
    all_correctness.extend(
        [expected == answer for answer in record['CoT answers'].tolist()]
    )
    all_length.extend(record['length'].tolist())
    all_IV.extend(record['instruction violation'].tolist())
    all_IM.extend(record['internal mistake'].tolist())

    # The four lists must stay aligned element-for-element after every question.
    assert len(all_correctness) == len(all_length) == len(all_IV) == len(all_IM)
    

In [208]:
# 2x2 correlation matrix of correctness against each metric, printed in the
# order: length, instruction violation, internal mistake.
for metric_values in (all_length, all_IV, all_IM):
    print(np.corrcoef(all_correctness, metric_values))

[[ 1.        -0.0720094]
 [-0.0720094  1.       ]]
[[ 1.         -0.05204786]
 [-0.05204786  1.        ]]
[[ 1.         -0.06428128]
 [-0.06428128  1.        ]]


In [182]:
import scipy.stats as stats

# Cross-tabulate correctness (rows) against the instruction-violation count
# (columns), treating each distinct count as its own category.
contingency_table = pd.crosstab(index=all_correctness, columns=all_IV)

# Perform the Chi-square test
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

report = (
    ("Chi2 Stat:", chi2),
    ("P value:", p),
    ("Degrees of freedom:", dof),
    ("Expected frequencies table:\n", expected),
)
for label, value in report:
    print(label, value)

Chi2 Stat: 389.40689381681113
P value: 4.454599621725911e-80
Degrees of freedom: 7
Expected frequencies table:
 [[7.61195565e+03 2.71710000e+00 9.36946650e+02 4.52850000e-01
  2.59030200e+02 1.05966900e+02 1.39024950e+02 9.05700000e-01]
 [9.19704435e+03 3.28290000e+00 1.13205335e+03 5.47150000e-01
  3.12969800e+02 1.28033100e+02 1.67975050e+02 1.09430000e+00]]


In [181]:
# Spearman rank correlation of correctness against each metric, reported in
# the order: length, instruction violation, internal mistake.
# corr and p_value keep the values of the last metric once the loop ends.
for metric_values in (all_length, all_IV, all_IM):
    corr, p_value = stats.spearmanr(all_correctness, metric_values)

    print("Spearman's correlation coefficient:", corr)
    print("P value:", p_value)

Spearman's correlation coefficient: 0.03682847856992283
P value: 1.8906965790587102e-07
Spearman's correlation coefficient: -0.04686235433399567
P value: 3.344017709903813e-11
Spearman's correlation coefficient: nan
P value: nan




In [213]:
import numpy as np

# Assuming arr is your 500x40 numpy array
arr = np.random.randint(0, 2, (500, 40))  # Example array with random 0s and 1s

# Initialize an array of the same shape filled with False
consistency_checks = np.zeros(arr.shape, dtype=bool)

# NOTE(review): the statements defining consistency_checks were missing from
# this cell, so it raised NameError on a fresh kernel. The rule below is
# reconstructed from the saved output of the cell, where every visible entry
# equals "this element matches the element to its left" and the first column
# is always False. Confirm that this is the intended check.
consistency_checks[:, 1:] = arr[:, 1:] == arr[:, :-1]

print(consistency_checks)
arr

[[False  True  True ...  True  True  True]
 [False  True  True ... False False False]
 [False False  True ...  True  True  True]
 ...
 [False  True False ...  True False False]
 [False  True False ... False False False]
 [False False  True ... False  True  True]]


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 1, 0, ..., 1, 0, 1],
       [1, 1, 0, ..., 1, 0, 1],
       [1, 0, 0, ..., 0, 0, 0]])