In [None]:
# Training set, test set, and generalization set split
import os
import json
import random


SEED = 42
random.seed(SEED)

# Split sizes, kept as named constants so they are easy to find and tune.
TRAIN_SIZE = 35569    # number of shuffled in-distribution samples used for training; the rest become the test set
GENERAL_SIZE = 4000   # maximum number of shuffled samples kept in the generalization set


# Read the two judged input files. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
data_path = './processed_outputs/output_data_filtered_judge_with_human_label.json'
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

data_path2 = './processed_outputs/output_data_filtered2_judge_with_human_label.json'
with open(data_path2, 'r', encoding='utf-8') as f:
    data2 = json.load(f)

all_data = data + data2

# Datasets whose samples are split into train/test ...
train_test_dataset = ['MMLU-Pro_enh', 'MMLU-Pro', 'DROP', 'GPQA_enh', 'GPQA', 'C-SimpleQA', 'FRAMES', 'AIME_2024', 'AMC23', 'OlympiadBench_en', 'OlympiadBench_zh', 'MGSM', 'CMATH', 'GSM8K', 'MATH', 'AgNews', 'CHID', 'CMMLU_enh', 'CMMLU', 'MMLU_enh', 'MMLU', 'CLUEWSC']
# ... and datasets expected to make up the held-out generalization set.
general_dataset = ['MMLU-Redux_enh', 'MMLU-Redux', 'C-Eval_enh', 'C-Eval', 'SimpleQA', 'ARC', 'LiveMathBench_zh', 'LiveMathBench_en', 'Amazon', 'CMNLI']


train_test_data = []
general_data = []

# Sets give constant-time membership checks inside the loop.
train_test_names = set(train_test_dataset)
general_names = set(general_dataset)
unexpected_datasets = set()

for item in all_data:
    # Keep only samples whose consistency flag is the string "True".
    if item["GPT_4o_judgment_consistency"] != "True":
        continue
    if item["dataset"] in train_test_names:
        train_test_data.append(item)
    else:
        # Everything that is not a train/test dataset goes to the generalization
        # pool (unchanged behavior); names missing from `general_dataset` are
        # only recorded so that they can be reported below.
        if item["dataset"] not in general_names:
            unexpected_datasets.add(item["dataset"])
        general_data.append(item)

if unexpected_datasets:
    print("WARNING: datasets not listed in train_test_dataset or general_dataset "
          "were placed in the generalization pool:", sorted(map(str, unexpected_datasets)))

print(len(all_data), len(train_test_data), len(general_data))

# Seeded shuffle followed by a fixed-size cut: first TRAIN_SIZE samples train, remainder test.
random.shuffle(train_test_data)
train = train_test_data[:TRAIN_SIZE]
test = train_test_data[TRAIN_SIZE:]

random.shuffle(general_data)
general = general_data[:GENERAL_SIZE]

print(len(train), len(test), len(general))

save_dir = './processed_outputs/experiment_dataset'
# Create the output directory on a fresh checkout instead of failing on open().
os.makedirs(save_dir, exist_ok=True)
with open(os.path.join(save_dir, 'train_raw.json'), 'w', encoding='utf-8') as f:
    json.dump(train, f, ensure_ascii=False, indent=4)
with open(os.path.join(save_dir, 'test_raw.json'), 'w', encoding='utf-8') as f:
    json.dump(test, f, ensure_ascii=False, indent=4)
with open(os.path.join(save_dir, 'general_raw.json'), 'w', encoding='utf-8') as f:
    json.dump(general, f, ensure_ascii=False, indent=4)

In [None]:
# Merge manually annotated data into the test set
import json


# Raw test split, read as UTF-8 to match how it is written elsewhere in this notebook.
data_path = './processed_outputs/experiment_dataset/test_raw.json'
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Human-labelled rows; consumed positionally, in the same order as the eligible rows of `data`.
data_path2 = './processed_outputs/test_raw_human_labeled.json'
with open(data_path2, 'r', encoding='utf-8') as f:
    data2 = json.load(f)

print(len(data), len(data2))

not_consistency_num = 0  # rows where the human label differs from the GPT-4o final judgment
cnt = 0                  # index of the next unconsumed row in data2
for item in data:
    # Rows flagged inconsistent, and rows with key_answer_type "math", have no
    # counterpart in the labelled file and are left untouched.
    if item['GPT_4o_judgment_consistency'] == "False" or item['key_answer_type'] == "math":
        continue
    # Fail with a clear message instead of a bare IndexError when the labelled file is too short.
    assert cnt < len(data2), f"labelled file has only {len(data2)} rows but more are required"
    # Sanity checks that both files are aligned row by row.
    assert item['question'] == data2[cnt]['question']
    assert item['model_name'] == data2[cnt]['model_name']
    assert data2[cnt].keys() - item.keys() == {'human_judgment_result'}
    if item['GPT_4o_final_judgment_result'] != data2[cnt]['human_judgment_result']:
        not_consistency_num += 1
    item['human_judgment_result'] = data2[cnt]['human_judgment_result']
    cnt += 1

# Every labelled row must have been merged; leftovers mean the two files are out of sync.
assert cnt == len(data2), f"{len(data2) - cnt} labelled rows were not merged"

print(cnt, not_consistency_num)

output_path = './processed_outputs/experiment_dataset/test_raw2.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
print(len(data))

In [None]:
# Merge manually annotated data into the generalization set
import json


# Raw generalization split, read as UTF-8 to match how it is written elsewhere in this notebook.
data_path = './processed_outputs/experiment_dataset/general_raw.json'
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Human-labelled rows; consumed positionally, in the same order as the eligible rows of `data`.
data_path2 = './processed_outputs/general_raw_human_labeled.json'
with open(data_path2, 'r', encoding='utf-8') as f:
    data2 = json.load(f)

print(len(data), len(data2))

not_consistency_num = 0  # rows where the human label differs from the GPT-4o final judgment
cnt = 0                  # index of the next unconsumed row in data2
for item in data:
    # Rows flagged inconsistent, and rows with key_answer_type "math", have no
    # counterpart in the labelled file and are left untouched.
    if item['GPT_4o_judgment_consistency'] == "False" or item['key_answer_type'] == "math":
        continue
    # Fail with a clear message instead of a bare IndexError when the labelled file is too short.
    assert cnt < len(data2), f"labelled file has only {len(data2)} rows but more are required"
    # Sanity checks that both files are aligned row by row.
    assert item['question'] == data2[cnt]['question']
    assert item['model_name'] == data2[cnt]['model_name']
    assert data2[cnt].keys() - item.keys() == {'human_judgment_result'}
    if item['GPT_4o_final_judgment_result'] != data2[cnt]['human_judgment_result']:
        not_consistency_num += 1
    item['human_judgment_result'] = data2[cnt]['human_judgment_result']
    cnt += 1

# Every labelled row must have been merged; leftovers mean the two files are out of sync.
assert cnt == len(data2), f"{len(data2) - cnt} labelled rows were not merged"

print(cnt, not_consistency_num)

output_path = './processed_outputs/experiment_dataset/general_raw2.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# Get statistical information of each dataset
import json
from collections import Counter

# train: ./processed_outputs/experiment_dataset/train_raw.json
# test: ./processed_outputs/experiment_dataset/test_raw2.json
# general: ./processed_outputs/experiment_dataset/general_raw2.json
# Switch this path to inspect another split (see the list of paths above).
data_path = './processed_outputs/experiment_dataset/test_raw2.json'
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# One entry per sample in each list; they are tallied with Counter below.
models = []
datasets = []
prompt_types = []
key_answer_types = []
model_output_length = []   # length of each model output, in characters
judgments = []
for item in data:
    models.append(item['model_name'])
    datasets.append(item['dataset'])
    prompt_types.append(item['setting'])
    key_answer_types.append(item['key_answer_type'])
    model_output_length.append(len(item['llm_output']))
    # NOTE: this is a substring test on the whole path, so a directory name
    # containing "train" would also select the training branch.
    if 'train' in data_path:
        # Training samples may lack a human label; fall back to the GPT-4o judgment.
        if 'human_judgment_result' not in item:
            judgments.append(item['GPT_4o_final_judgment_result'])
        else:
            judgments.append(item['human_judgment_result'])
    else:
        # Other splits are required to carry a human label (KeyError otherwise).
        judgments.append(item['human_judgment_result'])

print(json.dumps(Counter(models), indent=4))
print(json.dumps(Counter(datasets), indent=4))
print(json.dumps(Counter(prompt_types), indent=4))
print(json.dumps(Counter(key_answer_types), indent=4))
print(json.dumps(Counter(judgments), indent=4))

# Guard the average so that an empty split reports 0.0 instead of raising ZeroDivisionError.
avg_length = round(sum(model_output_length) / len(model_output_length), 2) if model_output_length else 0.0

print(
    " length < 1000: ", len([i for i in model_output_length if i < 1000]), '\n',
    "1000 <= length < 2000: ", len([i for i in model_output_length if 1000 <= i < 2000]), '\n',
    "2000 <= length < 3000: ", len([i for i in model_output_length if 2000 <= i < 3000]), '\n',
    "3000 <= length < 4000: ", len([i for i in model_output_length if 3000 <= i < 4000]), '\n',
    "4000 <= length < 5000: ", len([i for i in model_output_length if 4000 <= i < 5000]), '\n',
    "5000 <= length < 6000: ", len([i for i in model_output_length if 5000 <= i < 6000]), '\n',
    "6000 <= length: ", len([i for i in model_output_length if 6000 <= i]), '\n',
    "avg length:", avg_length
)

In [None]:
# Load the dataset that needs augmentation and perform data augmentation
import re
import json
import random


random.seed(42)


# Sequentially switch to the training set, test set, and generalization set.
# The training split is stored as train_raw.json (only the test and
# generalization splits have a "*_raw2.json" variant), and the Alpaca export
# later reads "train_raw_enh.json", i.e. this file's stem plus "_enh".
data_path = "./processed_outputs/experiment_dataset/train_raw.json"
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Sequentially check the target sample sizes for each type of data augmentation

# Replace the final answer sentence pattern:
# candidates are alphabet-option samples whose setting name ends with "restrict".
sample_data = []
for item in data:
    if item['key_answer_type'] == "alphabet_option" and item['setting'].endswith('restrict'):
        sample_data.append(item)
print(len(sample_data))

# Equivalent substitution of the correct answer:
# candidates are math samples whose reference answer is 5-50 characters long
# and contains no character in the CJK range U+4E00-U+9FFF.
answer_replace_data = []
for item in data:
    if item['key_answer_type'] == 'math' and 4 < len(str(item['correct_answer'])) <= 50 and not bool(re.search(r'[\u4e00-\u9fff]', str(item['correct_answer']))):
        answer_replace_data.append(item)
print(len(answer_replace_data))

# Equivalent substitution of key response phrases:
# candidates are math samples under a "restrict" setting whose output ends
# exactly with the first "The answer is X." match, where X is 5-50 characters
# long and contains at least one digit.
pattern = r'The answer is (.+?)\.'
output_replace_data = []
for item in data:
    if item['key_answer_type'] == 'math' and item['setting'].endswith('restrict'):
        match = re.search(pattern, item["llm_output"])
        if match and match.span()[1] == len(item["llm_output"]) and 4 < len(match.group(1)) <= 50 and any(char.isdigit() for char in match.group(1)):
            output_replace_data.append(item)
print(len(output_replace_data))

In [None]:
# Replace the final answer sentence pattern
import os
import json
import re
import random


random.seed(42)


# Candidates: alphabet-option samples whose setting name ends with "restrict".
sample_data = []
for item in data:
    if item['key_answer_type'] == "alphabet_option" and item['setting'].endswith('restrict'):
        sample_data.append(item)

random.shuffle(sample_data)

sample_ratio = 0.6  # Sampling ratio
sample_num = int(len(sample_data) * sample_ratio)
sample_data2 = sample_data[:sample_num]

# Template for the final answer statement to be replaced
final_answer_prompts = [
    "the most appropriate answer is {final_answer}",
    "the most logical answer is {final_answer}",
    "the most fitting answer based on the information given is {final_answer}",
    "I would choose {final_answer}",
    "I would select {final_answer}",
    "the most suitable answer is {final_answer}",
    "the most reasonable answer is {final_answer}",
    "the correct answer would be {final_answer}",
    "the closest answer is {final_answer}",
    "the most accurate answer is {final_answer}",
    "the answer should be {final_answer}",
    "the best answer is {final_answer}",
    "I choose {final_answer}",
    "the most likely answer is {final_answer}",
    "the option that best fits this scenario is {final_answer}",
    "the answer {final_answer} is the most appropriate",
    "the answer {final_answer} is the most logical",
    "the answer {final_answer} is the most fitting",
    "the answer {final_answer} is correct",
    "{final_answer} seems to be the most promising"
]

# Ways of wrapping the option letter (balanced, half-open, bare, full-width, LaTeX box).
wrap_type = ["({final_answer})", "[{final_answer}]", "<{final_answer}>", "'{final_answer}'",
             "({final_answer}", "[{final_answer}", "<{final_answer}", "'{final_answer}",
             "{final_answer})", "{final_answer}]", "{final_answer}>", "{final_answer}'", "{final_answer}",
             "（{final_answer}）", "【{final_answer}】", "《{final_answer}》", "‘{final_answer}’", "\\boxed{{{final_answer}}}"]
# Separators placed in front of (and sometimes after) the wrapped answer.
gap_type = ["\\ {final_answer}", ": {final_answer} ", ": {final_answer}", "# {final_answer} ", "{final_answer}"]
pattern = r'The answer is (.+?)\.'

new_data = []
for item in sample_data2:
    matches = list(re.finditer(pattern, item["llm_output"]))
    if matches:
        # The answer is taken from the LAST "The answer is X." occurrence,
        # stripped of parentheses and spaces.
        final_answer = matches[-1].group(1)
        final_answer = final_answer.replace("(", "").replace(")", "").replace(" ", "")
        # random select wrap type
        wrap_type_idx = random.randint(0, len(wrap_type) - 1)
        final_answer = wrap_type[wrap_type_idx].format(final_answer=final_answer)
        # random select gap type
        gap_type_idx = random.randint(0, len(gap_type) - 1)
        final_answer = gap_type[gap_type_idx].format(final_answer=final_answer)
        # random select final answer prompt
        final_answer_prompt_idx = random.randint(0, len(final_answer_prompts) - 1)
        final_answer_prompt = final_answer_prompts[final_answer_prompt_idx].format(final_answer=final_answer)
        # A callable replacement inserts the text verbatim. A replacement
        # *string* is parsed as a template, which turned "\boxed" into a
        # backspace character followed by "oxed" and raised on other backslash
        # sequences (those samples used to be dropped silently).
        # As before, every occurrence of the pattern is replaced.
        new_text = re.sub(pattern, lambda _match: final_answer_prompt, item["llm_output"])
        # Work on a copy: `item` is the very dict stored in `data`, so writing
        # to it would overwrite the original sample and make it appear twice
        # once `data` and `new_data` are concatenated.
        new_item = item.copy()
        new_item["llm_output"] = new_text
        new_data.append(new_item)

print(len(new_data))

In [None]:
# Equivalent substitution of the correct answer
import os
import json
import re
import random

from tqdm import tqdm

from utils.llms import LLMs


import ast  # cell-local import: used for the safe literal fallback parser below

random.seed(42)


prompt_path = './prompts/generate_answer_en.txt'
with open(prompt_path, 'r', encoding='utf-8') as f:
    prompt = f.read()

# Project-local LLM client (utils.llms); only its `request(text)` method is used here.
model = LLMs('GPT_4o')

# Candidates: math samples whose reference answer is 5-50 characters long and
# contains no character in the CJK range U+4E00-U+9FFF.
answer_replace_data = []
for item in data:
    if item['key_answer_type'] == 'math' and 4 < len(str(item['correct_answer'])) <= 50 and not bool(re.search(r'[\u4e00-\u9fff]', str(item['correct_answer']))):
        answer_replace_data.append(item)

random.shuffle(answer_replace_data)

new_data1 = []
sample_ratio = 0.13
sample_num = int(len(answer_replace_data) * sample_ratio)
answer_replace_data = answer_replace_data[:sample_num]
failed_num = 0  # replies that could not be parsed into a dict
with tqdm(total=sample_num, desc="Processing", unit="task") as pbar:
    for item in answer_replace_data:
        request_text = prompt.format(
            question=item['question'],
            answer=item['correct_answer']
        )
        raw_response = model.request(request_text)

        # Extract the body of the ```json ... ``` fence when there is one;
        # otherwise try to parse the whole reply.
        payload = raw_response
        fence_start = raw_response.find('```json')
        if fence_start != -1:
            payload = raw_response[fence_start + 7:]
            fence_end = payload.rfind('```')
            if fence_end != -1:
                payload = payload[:fence_end]

        # SECURITY: the reply is untrusted model output, so it is parsed as
        # data (JSON first, then a Python literal) and never passed to eval().
        try:
            variants = json.loads(payload)
        except json.JSONDecodeError:
            try:
                variants = ast.literal_eval(payload.strip())
            except (ValueError, SyntaxError, TypeError):
                variants = None

        if not isinstance(variants, dict):
            # Skip a malformed reply instead of aborting the run and losing
            # every variant generated so far.
            failed_num += 1
            pbar.update(1)
            continue

        # One augmented sample per returned variant; only the reference answer changes.
        for k, v in variants.items():
            new_item = item.copy()
            new_item['correct_answer'] = v
            new_data1.append(new_item)
        pbar.update(1)

if failed_num:
    print(f"skipped {failed_num} unparsable responses")
print(len(new_data1))

In [None]:
# Equivalent substitution of key response phrases
import os
import json
import re
import random

from tqdm import tqdm

from utils.llms import LLMs


import ast  # cell-local import: used for the safe literal fallback parser below

random.seed(52)


prompt_path = './prompts/generate_output_en.txt'
with open(prompt_path, 'r', encoding='utf-8') as f:
    prompt = f.read()

# Project-local LLM client (utils.llms); only its `request(text)` method is used here.
model = LLMs('GPT_4o')

# Candidates: math samples under a "restrict" setting whose output ends exactly
# with the first "The answer is X." match, where X is 5-50 characters long and
# contains at least one digit.
pattern = r'The answer is (.+?)\.'
output_replace_data = []
for item in data:
    if item['key_answer_type'] == 'math' and item['setting'].endswith('restrict'):
        match = re.search(pattern, item["llm_output"])
        if match and match.span()[1] == len(item["llm_output"]) and 4 < len(match.group(1)) <= 50 and any(char.isdigit() for char in match.group(1)):
            output_replace_data.append(item)

random.shuffle(output_replace_data)

new_data2 = []
sample_ratio = 0.47
sample_num = int(len(output_replace_data) * sample_ratio)
output_replace_data = output_replace_data[:sample_num]
failed_num = 0  # replies that could not be parsed into a dict
with tqdm(total=sample_num, desc="Processing", unit="task") as pbar:
    for item in output_replace_data:
        # Guaranteed to match: the item passed the same search in the filter above.
        match = re.search(pattern, item["llm_output"])

        request_text = prompt.format(
            output=match.group(1)
        )
        raw_response = model.request(request_text)

        # Extract the body of the ```json ... ``` fence when there is one;
        # otherwise try to parse the whole reply.
        payload = raw_response
        fence_start = raw_response.find('```json')
        if fence_start != -1:
            payload = raw_response[fence_start + 7:]
            fence_end = payload.rfind('```')
            if fence_end != -1:
                payload = payload[:fence_end]

        # SECURITY: the reply is untrusted model output, so it is parsed as
        # data (JSON first, then a Python literal) and never passed to eval().
        try:
            variants = json.loads(payload)
        except json.JSONDecodeError:
            try:
                variants = ast.literal_eval(payload.strip())
            except (ValueError, SyntaxError, TypeError):
                variants = None

        if not isinstance(variants, dict):
            # Skip a malformed reply instead of aborting the run and losing
            # every variant generated so far.
            failed_num += 1
            pbar.update(1)
            continue

        # One augmented sample per variant: everything from the start of the
        # matched "The answer is ..." sentence onwards is replaced by the variant.
        for k, v in variants.items():
            new_item = item.copy()
            new_item['llm_output'] = new_item['llm_output'][:match.span()[0]] + v
            new_data2.append(new_item)
        pbar.update(1)

if failed_num:
    print(f"skipped {failed_num} unparsable responses")
print(len(new_data2))

In [None]:
# Integrate original data and augmented data
import os
import pathlib


# Report the size of the original data and of each augmentation, in that order.
print(*(len(part) for part in (data, new_data, new_data1, new_data2)))
all_data = [*data, *new_data, *new_data1, *new_data2]


# The output is written next to the input file as "<input stem>_enh.json".
data_path_ = pathlib.Path(data_path)
raw_name = data_path_.stem
save_path = os.path.join(data_path_.parent, f"{raw_name}_enh.json")
with open(save_path, 'w', encoding='utf-8') as out_file:
    json.dump(all_data, out_file, ensure_ascii=False, indent=4)

print(len(all_data))


In [None]:
# Generate a fine-tuning training set in Alpaca format
import os
import yaml
import json


# Augmented training set; read explicitly as UTF-8, the encoding used when this notebook writes JSON.
data_path = "./processed_outputs/experiment_dataset/train_raw_enh.json"
with open(data_path, 'r', encoding='utf-8') as f:
    train = json.load(f)

# YAML file whose 'prompt' entry is a format string with the
# {question}, {output} and {answer} placeholders filled in below.
prompt_path = './prompts/xverify_prompt.yaml'
with open(prompt_path, 'r', encoding='utf-8') as file:
    prompt_data = yaml.safe_load(file)

train_sft = []
for item in train:
    # Named `instruction` rather than `input` so the builtin is not shadowed.
    instruction = prompt_data['prompt'].format(
        question=item['question'],
        output=item['llm_output'],
        answer=item['correct_answer']
    )

    # The target is the human label when the sample has one, otherwise the GPT-4o final judgment.
    if 'human_judgment_result' in item:
        target = item['human_judgment_result']
    else:
        target = item['GPT_4o_final_judgment_result']

    # Alpaca record: the whole prompt goes into "instruction"; "input" stays empty.
    train_sft.append({
                "instruction": instruction,
                "input": "",
                "output": target
            })

save_dir = './processed_outputs/experiment_dataset'
with open(os.path.join(save_dir, 'train_raw_enh_formatter_as_alpaca.json'), 'w', encoding='utf-8') as f:
    json.dump(train_sft, f, ensure_ascii=False, indent=4)