In [None]:
import pandas as pd
import numpy as np
import os

# Unwrap single-element numpy arrays stored under reward_model.ground_truth
# and write the cleaned frame back to the same parquet file.
input_path = "./test/{}.parquet"
tasks = ["olympiad_bench", "minerva"]

for task in tasks:
    task_file = input_path.format(task)

    # Read the parquet file
    df = pd.read_parquet(task_file)

    ans_list = df["reward_model"].tolist()
    for ans in ans_list:
        # Inspect the ground-truth value itself (not the enclosing dict) and
        # only replace it when it is a one-element array; anything else is
        # left untouched instead of being overwritten with the dict.
        ground_truth = ans["ground_truth"]
        if isinstance(ground_truth, np.ndarray) and ground_truth.size == 1:
            ans["ground_truth"] = ground_truth.item()

    df["reward_model"] = ans_list

    # The target path is a .parquet file that later cells load with
    # pd.read_parquet, so it must be written as parquet rather than CSV.
    df.to_parquet(task_file, index=False)

    print(f"Saved {task} to {task_file}\n")


In [10]:
import json
import pandas as pd
import numpy as np

# Convert each benchmark parquet file into a flat JSON list with the fields
# id / data_source / question / ground_truth.
input_path = "./test/{}.parquet"
tasks = ["olympiad_bench", "minerva", "aime", "amc", "math", "olympiad_bench", "college", "theoremqa"]
del_strs = " Let's think step by step and output the final answer within \\boxed{}."

for task in tasks:
    input_path = f"./test/{task}.parquet"
    output_path = f"./test/{task}.json"

    # Round-trip through JSON Lines so every record comes back as plain
    # Python dicts and lists.
    df = pd.read_parquet(input_path)
    df.to_json(output_path, orient="records", lines=True)

    with open(output_path, "r", encoding="utf-8") as fr:
        data_pool = [json.loads(line) for line in fr]

    for record in data_pool:
        record["id"] = record["extra_info"]["index"]
        record["data_source"] = task

        prompt_turns = record["prompt"]
        assert len(prompt_turns) == 1
        # Strip the instruction suffix so only the bare question remains.
        record["question"] = prompt_turns[0]["content"].replace(del_strs, "")
        for stale_key in ("prompt", "ability"):
            del record[stale_key]

        reward = record["reward_model"]
        if isinstance(reward["ground_truth"], np.ndarray):
            reward["ground_truth"] = reward["ground_truth"].item()
        record["ground_truth"] = reward["ground_truth"]
        for stale_key in ("reward_model", "extra_info"):
            del record[stale_key]

    # Overwrite the intermediate JSON Lines file with the final JSON array.
    with open(output_path, "w") as fw:
        json.dump(data_pool, fw, indent=4)


In [None]:
import json

# Copy the "solution" field from the aime24 JSON Lines file into the matching
# records (same "question" text) of ./test/aime.json.
input_path = "./test/aime24.jsonl"
data_path = "./test/aime.json"

with open(input_path, "r", encoding="utf-8") as fr:
    # Skip blank lines so a trailing newline does not break json.loads.
    data_pool = [json.loads(line) for line in fr if line.strip()]

with open(data_path, "r", encoding="utf-8") as fr:
    data_list = json.load(fr)

# Index the source records by question text; when a question occurs more than
# once the last occurrence wins, matching the nested-loop behaviour.
source_by_question = {data["question"]: data for data in data_pool}
for data_item in data_list:
    source = source_by_question.get(data_item["question"])
    if source is not None:
        data_item["solution"] = source["solution"]

with open(data_path, "w", encoding="utf-8") as fw:
    json.dump(data_list, fw, indent=4)

# The source file is deliberately NOT rewritten: data_pool is unchanged, and
# dumping it as an indented JSON array into a .jsonl path would make this cell
# fail on the next run (json.loads on the line "[").

In [11]:
import json
import random

# Draw a reproducible 100-problem subset of ./test/math.json, sorted by id.
input_path = "./test/math.json"
output_path = "./test/math100.json"
SAMPLE_SIZE = 100
SAMPLE_SEED = 42  # fixed seed so re-running the cell yields the same subset

with open(input_path, "r", encoding="utf-8") as fr:
    dataset = json.load(fr)

# A dedicated Random instance keeps the draw independent of global RNG state.
# random.sample raises ValueError when fewer than SAMPLE_SIZE records exist.
new_dataset = random.Random(SAMPLE_SEED).sample(dataset, SAMPLE_SIZE)

sorted_data = sorted(new_dataset, key=lambda x: x['id'])

with open(output_path, "w", encoding="utf-8") as fw:
    json.dump(sorted_data, fw, indent=4)

In [5]:
import json

# Report, for every dataset/model-pair file, how many "remove" and
# "contradict" rewrites it contains.
input_path = "./unsol/v4-comp/{}_rewrite_{}.json"
datasets = ["aime", "amc", "math100", "minerva100"]
suffixes = ["4o_4o", "r1_r1", "4o_r1", "r1_4o"]

for dataset, suffix in ((d, s) for d in datasets for s in suffixes):
    remove_count, contradict_count = 0, 0
    input_file = input_path.format(dataset, suffix)
    try:
        with open(input_file, "r", encoding="utf-8") as fr:
            data_pool = json.load(fr)
        for data in data_pool:
            remove_count += len(data["remove"])
            contradict_count += len(data["contradict"])
        print(f"Dataset: {dataset}, Suffix: {suffix}, Remove Count: {remove_count}, Contradict Count: {contradict_count}")
    except FileNotFoundError:
        print(f"File not found: {input_file}")
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {input_file}")
    else:
        # The separator is only printed after a file was processed successfully.
        print("-" * 30)

Dataset: aime, Suffix: 4o_4o, Remove Count: 85, Contradict Count: 85
------------------------------
Dataset: aime, Suffix: r1_r1, Remove Count: 85, Contradict Count: 85
------------------------------
Dataset: aime, Suffix: 4o_r1, Remove Count: 85, Contradict Count: 85
------------------------------
Dataset: aime, Suffix: r1_4o, Remove Count: 85, Contradict Count: 85
------------------------------
Dataset: amc, Suffix: 4o_4o, Remove Count: 241, Contradict Count: 241
------------------------------
Error decoding JSON in file: ./unsol/v4-comp/amc_rewrite_r1_r1.json
Error decoding JSON in file: ./unsol/v4-comp/amc_rewrite_4o_r1.json
Dataset: amc, Suffix: r1_4o, Remove Count: 241, Contradict Count: 241
------------------------------
Dataset: math100, Suffix: 4o_4o, Remove Count: 254, Contradict Count: 254
------------------------------
Dataset: math100, Suffix: r1_r1, Remove Count: 254, Contradict Count: 254
------------------------------
Error decoding JSON in file: ./unsol/v4-comp/math100

In [1]:
def extract_sentences(text):
    """Split text on blank lines and strip a leading list number from each part.

    Args:
        text: String whose items are separated by a blank line ("\\n\\n").

    Returns:
        List of non-empty, whitespace-stripped items. A leading enumeration
        marker of the form "<digits>. " (e.g. "1. ", "12. ") is removed from
        each item; items without such a marker are returned unchanged.
    """
    import re

    cleaned_sentences = []
    for sentence in text.split('\n\n'):
        # Trim surrounding whitespace before looking for the marker.
        sentence = sentence.strip()

        # Remove the enumeration marker. Any number of digits is accepted so
        # that items numbered 10 and above are cleaned as well.
        sentence = re.sub(r'^\d+\. ', '', sentence)

        # Keep only non-empty items.
        if sentence:
            cleaned_sentences.append(sentence)

    return cleaned_sentences

# Sample inputs: numbered items, plain items, a single item, and a mix.
test_cases = [
    "1. sentence1\n\n2. sentence2\n\n3. sentence3",
    "sentence1\n\nsentence2\n\nsentence3",
    "1. only one sentence",
    "first\n\nsecond",
    "1. first\n\nplain second"
]

# Show each input next to its parsed result, followed by a separator line.
for test in test_cases:
    result = extract_sentences(test)
    print(f"原文本: {test}", f"处理结果: {result}", "-" * 50, sep="\n")



原文本: 1. sentence1

2. sentence2

3. sentence3
处理结果: ['sentence1', 'sentence2', 'sentence3']
--------------------------------------------------
原文本: sentence1

sentence2

sentence3
处理结果: ['sentence1', 'sentence2', 'sentence3']
--------------------------------------------------
原文本: 1. only one sentence
处理结果: ['only one sentence']
--------------------------------------------------
原文本: first

second
处理结果: ['first', 'second']
--------------------------------------------------
原文本: 1. first

plain second
处理结果: ['first', 'plain second']
--------------------------------------------------


In [None]:
import json

# Normalise the "extracted_condition" list of every record in aime_extract.json:
# re-split on blank lines and strip list numbering, then save in place.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/aime_extract.json"

with open(input_path, "r", encoding="utf-8") as fr:
    dataset = json.load(fr)

for data in dataset:
    # Join the stored conditions and turn literal "\n\n" escape sequences into
    # real blank lines so every condition becomes its own item.
    text = "\n\n".join(data["extracted_condition"]).replace('\\n\\n', '\n\n')

    # Reuse the notebook's extract_sentences helper (defined in an earlier
    # cell) instead of repeating its split/strip logic here.
    data["extracted_condition"] = extract_sentences(text)

# Close the output file deterministically so the dump is fully flushed.
with open(input_path, "w", encoding="utf-8") as fw:
    json.dump(dataset, fw, indent=4, ensure_ascii=False)

In [None]:
from utils import *
import json

# Re-save a file that holds one JSON object per line through write_json,
# using the same path for input and output.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/amc_rewrite_4o_r1.json"

dataset = []
with open(input_path, "r", encoding="utf-8") as f:
    for raw_line in f:
        dataset.append(json.loads(raw_line))

write_json(input_path, dataset)

In [None]:
from utils import *

# Flatten every checked record into one row per rewritten question, ready for
# export to a review spreadsheet. For math/minerva the source ids are saved too.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/{}_check.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/{}_excel.json"
id_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v5-human/{}_id.json"
datasets = ["aime", "amc", "math", "minerva"]

for dataset in datasets:
    input_file = input_path.format(dataset)
    try:
        data_pool = read_json(input_file)
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {input_file}")
        continue
    except FileNotFoundError:
        print(f"File not found: {input_file}")
        continue

    new_data_list = []
    data_id_dict = []
    for idx, data in enumerate(data_pool):
        if dataset in ["math", "minerva"]:
            data_id_dict.append({"idx": data["id"]})
        for unsolve_type in UNS_TYPE:
            count = 0
            for key in data.keys():
                if key.startswith(unsolve_type + "_question_"):
                    # Rewrites are looked up by running number (1, 2, ...),
                    # i.e. the keys are expected to be numbered consecutively.
                    rewrite = data[unsolve_type + "_question_" + str(count+1)]
                    new_data = {
                        "data_id": data["data_source"] + "_" + str(idx) + "_"  + unsolve_type + "_" + str(count+1),
                        "question": data["question"],
                        "ground_truth": data["ground_truth"],
                        "solution": data["solution"] if dataset in ["math", "aime"] else None,
                        "rewritten_question": rewrite[unsolve_type + "_question"],
                        # Key fixed: it used to carry a stray trailing colon
                        # ("rewritten_condition:"), which does not match the
                        # "rewritten_condition" column deleted downstream.
                        "rewritten_condition": rewrite["rewritten_condition"],
                        "unsolvable_reason": rewrite["unsolvable_reason"],
                        # Left empty for the human reviewers to fill in.
                        "human_check": None,
                        "difficulty_eval": None,
                    }

                    count += 1
                    new_data_list.append(new_data)

    if dataset in ["math", "minerva"]:
        id_file = id_path.format(dataset)
        with open(id_file, "w", encoding="utf-8") as fw:
            json.dump(data_id_dict, fw, indent=4)
    print(f"Dataset: {dataset}, New Data Count: {len(new_data_list)}")
    # Save the new data to a JSON file
    output_file = output_path.format(dataset)
    with open(output_file, "w", encoding="utf-8") as fw:
        json.dump(new_data_list, fw, indent=4)


Dataset: aime, New Data Count: 141
Dataset: amc, New Data Count: 377
Dataset: math, New Data Count: 408
Dataset: minerva, New Data Count: 449


In [102]:
from utils import *
import pandas as pd
import json

# Export each flattened *_excel.json file as an .xlsx workbook.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/{}_excel.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/{}_excel.xlsx"
datasets = ["aime", "amc", "math", "minerva"]


for dataset in datasets:
    input_file = input_path.format(dataset)
    output_file = output_path.format(dataset)
    try:
        data_pool = read_json(input_file)
    except FileNotFoundError:
        print(f"File not found: {input_file}")
        continue
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {input_file}")
        continue

    # One spreadsheet row per record; the index column is not written.
    df = pd.DataFrame(data_pool)
    df.to_excel(output_file, index=False)
    print(f"Saved {dataset} to {output_file}")


Saved aime to /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/aime_excel.xlsx
Saved amc to /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/amc_excel.xlsx
Saved math to /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/math_excel.xlsx
Saved minerva to /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/minerva_excel.xlsx


In [204]:
import json

# Count the remove/contradict rewrites contained in records [start, end) of
# each checked dataset file.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/{}_check.json"
datasets = ["math", "minerva"]

start, end = 81, 84
remove_total, contradict_total = 0, 0

print(f"Start id: {start}, End id: {end-1}")
for dataset in datasets:
    input_file = input_path.format(dataset)
    try:
        with open(input_file, "r", encoding="utf-8") as fr:
            # Every dataset uses the same slice.
            data_pool = json.load(fr)[start:end]
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {input_file}")
        continue
    except FileNotFoundError:
        print(f"File not found: {input_file}")
        continue

    # A rewrite is identified purely by its key prefix.
    remove_count = sum(
        key.startswith("remove_question_") for data in data_pool for key in data.keys()
    )
    contradict_count = sum(
        key.startswith("contradict_question_") for data in data_pool for key in data.keys()
    )
    remove_total += remove_count
    contradict_total += contradict_count
    print(
        f"Dataset: {dataset}, Data Size: {len(data_pool)}, "
        f"Remove Count: {remove_count}, Contradict Count: {contradict_count}"
    )
print(f"Total Remove Count: {remove_total}, Total Contradict Count: {contradict_total}, Total Count: {remove_total + contradict_total}")


Start id: 81, End id: 83
Dataset: math, Data Size: 3, Remove Count: 7, Contradict Count: 8
Dataset: minerva, Data Size: 3, Remove Count: 9, Contradict Count: 9
Total Remove Count: 16, Total Contradict Count: 17, Total Count: 33


In [29]:
from utils import *

# Attach the "solution" of each math.jsonl record to the record at the same
# position in math.json.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/test/math.jsonl"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/test/math.json"

data_pool = read_jsonl(input_path)
new_data_list = read_json(output_path)

for idx, source in enumerate(data_pool):
    target = new_data_list[idx]
    # Both files must list the problems in the same order.
    assert source["problem"] == target["question"]
    target["solution"] = source["solution"]

write_json(output_path, new_data_list)

In [32]:
from utils import *

# Walk math.json once and copy solutions into the sampled math100.json.
# p1 is the position in the full set, p2 the position in the subset.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/test/math.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/test/math100.json"

data_pool = read_json(input_path)
new_data_list = read_json(output_path)

p2 = 0
for p1, full_item in enumerate(data_pool):
    if p2 >= len(new_data_list):
        break
    sampled = new_data_list[p2]
    # A subset record is matched when its id equals the current position.
    if sampled["id"] != p1:
        continue
    assert sampled["id"] == full_item["id"]
    assert sampled["question"] == full_item["question"]
    sampled["solution"] = full_item["solution"]
    p2 += 1

write_json(output_path, new_data_list)

In [None]:
from utils import *

# Copy solutions from math100.json into math_check.json, record by record.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/test/math100.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/math_check.json"

data_pool = read_json(input_path)
new_data_list = read_json(output_path)

for idx, source in enumerate(data_pool):
    target = new_data_list[idx]
    # The two files must be aligned on both id and question text.
    assert target["id"] == source["id"]
    assert source["question"] == target["question"]
    target["solution"] = source["solution"]

write_json(output_path, new_data_list)


In [202]:
import numpy as np

# Print how many integers count.txt contains (one per line) and their sum.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v5-human/count.txt"

with open(input_path, "r", encoding="utf-8") as fr:
    data = [int(line.strip()) for line in fr]

total = np.sum(data)
print(len(data))
print(total)

36
1379


In [63]:

import pandas as pd
import json

# One review workbook per annotator; the entries double as file names.
names = ["Wang Rui", "Peng Juewen", "Song Jiale", "Li Ang", "Ran Chen", "Li Miaomiao", "Deng Rui", "Yu Erxin", "Wang Hongru", "Du Yiming", 
         "Wang Zige", "Zhang Zhiwei", "Xue Boyang", "Wang Zezhong", "Kang Jiawen", "Zhu Qi", "Xu Hongling", "Hu Shujie", "Han Dongrui", "Cui Mingyu",
         "eval_21", "eval_22", "eval_23", "eval_24", "eval_25", "eval_26", "eval_27", "eval_28", "eval_29", "eval_30",
         "eval_31", "eval_32", "eval_33", "eval_34", "eval_35", "eval_36"]

input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v5-human/v2/{}.xlsx"
remain_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v5-human/remain.json"
border_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v5-human/border.json"
failed_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v5-human/failed.json"

remain_data_list, border_data_list, failed_data_list = [], [], []

for name in names:
    input_file = input_path.format(name)
    df = pd.read_excel(input_file)
    dataset = df.to_dict(orient='records')

    checked, passed = 0, 0
    for data in dataset:
        verdict = data["human_check"]
        # Rows whose human_check is a string, or anything other than 1, are
        # treated as failed.
        if isinstance(verdict, str) or not (verdict == 1):
            failed_data_list.append(data)
            continue

        checked += 1
        # Checked rows are split by difficulty_eval: 1 -> remain, else border.
        if data["difficulty_eval"] == 1:
            passed += 1
            remain_data_list.append(data)
        else:
            border_data_list.append(data)

    print(f"Name: {name}, Good/Passed/Discarded: {passed}/{checked-passed}/{len(dataset)-checked}, Total: {len(dataset)}, Passed Rate: {checked/len(dataset)}")

print(f"Total: {len(remain_data_list) + len(border_data_list) + len(failed_data_list)}, Remain: {len(remain_data_list)}, Border: {len(border_data_list)}, Failed: {len(failed_data_list)}")

# Write the three buckets, in the order remain / border / failed.
for bucket_file, bucket in (
    (remain_file, remain_data_list),
    (border_file, border_data_list),
    (failed_file, failed_data_list),
):
    with open(bucket_file, 'w', encoding='utf-8') as f:
        json.dump(bucket, f, ensure_ascii=False, indent=4)



Name: Wang Rui, Good/Passed/Discarded: 21/8/12, Total: 41, Passed Rate: 0.7073170731707317
Name: Peng Juewen, Good/Passed/Discarded: 27/2/5, Total: 34, Passed Rate: 0.8529411764705882
Name: Song Jiale, Good/Passed/Discarded: 11/11/10, Total: 32, Passed Rate: 0.6875
Name: Li Ang, Good/Passed/Discarded: 19/8/5, Total: 32, Passed Rate: 0.84375
Name: Ran Chen, Good/Passed/Discarded: 20/9/6, Total: 35, Passed Rate: 0.8285714285714286
Name: Li Miaomiao, Good/Passed/Discarded: 28/3/4, Total: 35, Passed Rate: 0.8857142857142857
Name: Deng Rui, Good/Passed/Discarded: 9/22/7, Total: 38, Passed Rate: 0.8157894736842105
Name: Yu Erxin, Good/Passed/Discarded: 25/9/6, Total: 40, Passed Rate: 0.85
Name: Wang Hongru, Good/Passed/Discarded: 16/19/4, Total: 39, Passed Rate: 0.8974358974358975
Name: Du Yiming, Good/Passed/Discarded: 22/7/7, Total: 36, Passed Rate: 0.8055555555555556
Name: Wang Zige, Good/Passed/Discarded: 15/15/8, Total: 38, Passed Rate: 0.7894736842105263
Name: Zhang Zhiwei, Good/Passed

In [46]:
from utils import *

# Merge the "remain" and "border" review buckets, drop review-only columns,
# and write one de-duplicated file per source dataset.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v5-human/{}.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/{}.json"
remain_type = ["remain", "border"]
datasets = ["aime", "amc", "math", "minerva"]


def _first_by_data_id(items):
    """Return (unique_items, seen_ids), keeping the first item per data_id."""
    seen, unique = set(), []
    for item in items:
        if item['data_id'] not in seen:
            unique.append(item)
            seen.add(item['data_id'])
    return unique, seen


dataset_dict = {dataset: [] for dataset in datasets}

for remain in remain_type:
    input_file = input_path.format(remain)
    data_pool = read_json(input_file)

    print(len(data_pool))
    unique_data, seen_ids = _first_by_data_id(data_pool)
    print(len(unique_data))
    data_pool = unique_data

    for data in data_pool:
        # The dataset name is the first "_"-separated token of data_id.
        data_source = data["data_id"].split("_")[0]
        for dropped_key in ("solution", "rewritten_condition", "unsolvable_reason", "human_check"):
            del data[dropped_key]
        # This column is only present in some records.
        data.pop("Unnamed: 9", None)
        dataset_dict[data_source].append(data)

# The same data_id can appear in both buckets, so de-duplicate again.
for dataset in datasets:
    data_pool = dataset_dict[dataset]
    print(len(data_pool))
    unique_data, seen_ids = _first_by_data_id(data_pool)
    dataset_dict[dataset] = unique_data
    print(len(unique_data), len(seen_ids))

total_remove, total_contradict = 0, 0
for dataset in datasets:
    output_file = output_path.format(dataset)
    items = dataset_dict[dataset]
    remove = sum(1 for data in items if "remove" in data["data_id"])
    contradict = sum(
        1 for data in items
        if "remove" not in data["data_id"] and "contradict" in data["data_id"]
    )
    # Every record must be exactly one of the two rewrite kinds.
    assert remove + contradict == len(items)
    print(f"Dataset: {dataset}, Remove Count: {remove}, Contradict Count: {contradict}, Data Size: {len(dataset_dict[dataset])}")
    total_remove += remove
    total_contradict += contradict
    write_json(output_file, items)
print(f"Total Remove Count: {total_remove}, Total Contradict Count: {total_contradict}, Total Count: {total_remove + total_contradict}")
print(1102/1375)

631
627
482
482
132
132 132
295
295 295
324
318 318
358
357 357
Dataset: aime, Remove Count: 67, Contradict Count: 65, Data Size: 132
Dataset: amc, Remove Count: 131, Contradict Count: 164, Data Size: 295
Dataset: math, Remove Count: 154, Contradict Count: 164, Data Size: 318
Dataset: minerva, Remove Count: 185, Contradict Count: 172, Data Size: 357
Total Remove Count: 537, Total Contradict Count: 565, Total Count: 1102
0.8014545454545454


In [254]:
from utils import *

# Convert every experiment result that parses as a single JSON document
# with json2jsonl, writing to the same path.
datasets = ["aime", "amc", "math", "minerva"]
models = ["deepseek_r1", "deepseek_v3", "o3-mini", "gpt-4o"]
prompts = ["std", "real"]

input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/{}_T0.0_{}/unsol//{}.json"

# Same visiting order as nested loops: dataset, then model, then prompt.
input_files = [
    input_path.format(model, prompt, dataset)
    for dataset in datasets
    for model in models
    for prompt in prompts
]

for input_file in input_files:
    try:
        # read_json acts as the gate: conversion only runs when it succeeds.
        data_pool = read_json(input_file)
        json2jsonl(input_file, input_file)
    except FileNotFoundError:
        print(f"File not found: {input_file}")
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {input_file}")
            
            


Error decoding JSON in file: /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/deepseek_r1_T0.0_std/unsol//aime.json
Error decoding JSON in file: /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/deepseek_r1_T0.0_real/unsol//aime.json
Error decoding JSON in file: /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/deepseek_v3_T0.0_std/unsol//aime.json
Error decoding JSON in file: /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/deepseek_v3_T0.0_real/unsol//aime.json
Error decoding JSON in file: /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/o3-mini_T0.0_std/unsol//aime.json
Error decoding JSON in file: /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/gpt-4o_T0.0_std/unsol//aime.json
Error decoding JSON in file: /Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/gpt-4o_T0.0_real/unsol//aime.json
Error decoding JSON in file: /Users/collcertaye/WorkSpace/Research

In [277]:
from utils import *

# Build one training file from aime, amc and (difficulty-5 only) math,
# numbering the records consecutively from 1.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/train/{}.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/solve/train.json"
datasets = ["aime", "amc", "math"]

train_data_list = []
idx = 0

for dataset in datasets:
    input_file = input_path.format(dataset)
    data_pool = read_json(input_file)
    print(data_pool[0].keys())

    if dataset == "math":
        # Keep only the difficulty-5 problems of the math set.
        data_pool = [data for data in data_pool if data["difficulty"] == 5]

    print(f"Dataset: {dataset}, Data Size: {len(data_pool)}")

    # Continue the running id from where the previous dataset stopped.
    for idx, data in enumerate(data_pool, start=idx + 1):
        train_data_list.append({
            "id": idx,
            "data_source": dataset,
            "question": data["problem"],
            # Not every source record has a solution.
            "solution": data.get("solution"),
            "ground_truth": data["answer"]
        })

print(f"Total Data Size: {len(train_data_list)}")
write_json(output_path, train_data_list)



dict_keys(['problem', 'solution', 'answer', 'year', 'aime_number', 'problem_number', 'difficulty'])
Dataset: aime, Data Size: 975
dict_keys(['problem', 'solution', 'answer', 'difficulty'])
Dataset: amc, Data Size: 3264
dict_keys(['problem', 'answer', 'difficulty', 'type'])
Dataset: math, Data Size: 2298
Total Data Size: 6537


In [95]:
from utils import *
import json

# Per source dataset, count records by their first rewrite key.
input_path = "../data/unsol/v4-comp/train_unsolve.json"
total = 0
data_pool = read_json(input_path)
for dataset in ["math","amc","aime"]:
    remove, contradict = 0, 0
    for data in data_pool:
        if data["data_source"] != dataset:
            continue
        # A record is counted once; the "remove" key takes precedence.
        if "remove_question_1" in data:
            remove += 1
        elif "contradict_question_1" in data:
            contradict += 1
    print(f"Dataset: {dataset}")
    print(f"Remove Count: {remove}, Contradict Count: {contradict}, Data Size: {remove + contradict}")
    total += remove + contradict
print(f"Total Count: {total}")

Dataset: math
Remove Count: 873, Contradict Count: 66, Data Size: 939
Dataset: amc
Remove Count: 2952, Contradict Count: 233, Data Size: 3185
Dataset: aime
Remove Count: 920, Contradict Count: 41, Data Size: 961
Total Count: 5085


In [296]:
from utils import *
import json

# Replace string-encoded API responses with a compact
# {"reasoning", "answer"} dict and save the file in place.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/distill-32b_T0.0_std/solve/amc.json"

data_pool = read_jsonl(input_path)
for data in data_pool:
    generation = data["generation"][0]
    # Only entries still stored as a JSON string need converting.
    if not isinstance(generation, str):
        continue
    message = json.loads(generation)["choices"][0]["message"]
    data["generation"] = [{
        "reasoning": message["reasoning_content"],
        "answer": message["content"]
    }]

print(len(data_pool))
write_jsonl(input_path, data_pool)

83


In [294]:
# For every experiment result file, keep only the records whose "generation"
# is a list and write them back to the same file.
datasets = ["aime", "amc", "math", "minerva"]
tasks = ["solve", "unsol"]
reliability = ["std", "real"]
models = ["qwen-1.5b", "qwen-7b", "distill-1.5b", "distill-7b"]

input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/{}_T0.0_{}/{}/{}.json"


for dataset in datasets:
    for task in tasks:
        for model in models:
            for rel in reliability:
                input_file = input_path.format(model, rel, task, dataset)
                data_pool = read_jsonl(input_file)
                # Size before filtering.
                print(f"Dataset: {dataset}, Task: {task}, Model: {model}, Reliability: {rel}, Data Size: {len(data_pool)}")
                data_list = [data for data in data_pool if isinstance(data["generation"], list)]
                # Size after filtering.
                print(f"Dataset: {dataset}, Task: {task}, Model: {model}, Reliability: {rel}, Data Size: {len(data_list)}")
                write_jsonl(input_file, data_list)



Dataset: aime, Task: solve, Model: qwen-1.5b, Reliability: std, Data Size: 60
Dataset: aime, Task: solve, Model: qwen-1.5b, Reliability: std, Data Size: 30
Dataset: aime, Task: solve, Model: qwen-1.5b, Reliability: real, Data Size: 60
Dataset: aime, Task: solve, Model: qwen-1.5b, Reliability: real, Data Size: 30
Dataset: aime, Task: solve, Model: qwen-7b, Reliability: std, Data Size: 60
Dataset: aime, Task: solve, Model: qwen-7b, Reliability: std, Data Size: 30
Dataset: aime, Task: solve, Model: qwen-7b, Reliability: real, Data Size: 60
Dataset: aime, Task: solve, Model: qwen-7b, Reliability: real, Data Size: 30
Dataset: aime, Task: solve, Model: distill-1.5b, Reliability: std, Data Size: 60
Dataset: aime, Task: solve, Model: distill-1.5b, Reliability: std, Data Size: 30
Dataset: aime, Task: solve, Model: distill-1.5b, Reliability: real, Data Size: 60
Dataset: aime, Task: solve, Model: distill-1.5b, Reliability: real, Data Size: 30
Dataset: aime, Task: solve, Model: distill-7b, Reliabi

In [7]:
from utils import *

# Drop records with a repeated "id" from train_analysis.json, keeping the
# first occurrence and the original order.
input_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/train_analysis.json"

dataset = read_jsonl(input_file)
print(len(dataset))

# dict preserves insertion order, and setdefault keeps the first record per id.
first_by_id = {}
for item in dataset:
    first_by_id.setdefault(item['id'], item)
seen_ids = set(first_by_id)
unique_data = list(first_by_id.values())

print(len(unique_data))
write_jsonl(input_file, unique_data)

1563
1563


In [8]:
from utils import *

# Sort train_analysis.json by "id": read with read_jsonl, save with write_json.
input_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/train_analysis.json"

# These two names are initialised but not used further in this cell.
seen_ids, unique_data = set(), []

dataset = read_jsonl(input_file)
print(len(dataset))

# Stable sort of a copy; the list returned by read_jsonl is left untouched.
data_sorted = list(dataset)
data_sorted.sort(key=lambda record: record['id'])

print(len(data_sorted))
write_json(input_file, data_sorted)

6537
6537


In [43]:
from utils import *

# Drop records with a repeated "id" from train_unsolve.json, keeping the
# first occurrence and the original order.
input_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/train_unsolve.json"

dataset = read_jsonl(input_file)
print(len(dataset))

# dict preserves insertion order, and setdefault keeps the first record per id.
first_by_id = {}
for item in dataset:
    first_by_id.setdefault(item['id'], item)
seen_ids = set(first_by_id)
unique_data = list(first_by_id.values())

print(len(unique_data))
write_jsonl(input_file, unique_data)

5194
5194


In [46]:
from utils import *

# Sort train_unsolve.json by "id": read with read_jsonl, save with write_json.
input_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/train_unsolve.json"

# These two names are initialised but not used further in this cell.
seen_ids, unique_data = set(), []

dataset = read_jsonl(input_file)
print(len(dataset))

# Stable sort of a copy; the list returned by read_jsonl is left untouched.
data_sorted = list(dataset)
data_sorted.sort(key=lambda record: record['id'])

print(len(data_sorted))
write_json(input_file, data_sorted)

5194
5194


In [49]:
from utils import *

# Flatten train_check.json into one row per rewritten question.
input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/v4-comp/{}_check.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/train.json"
datasets = ["train"]

for dataset in datasets:
    input_file = input_path.format(dataset)
    try:
        data_pool = read_json(input_file)
    except FileNotFoundError:
        print(f"File not found: {input_file}")
        continue
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {input_file}")
        continue

    new_data_list = []
    data_id_dict = []
    for idx, data in enumerate(data_pool):
        if dataset in ["math", "minerva"]:
            data_id_dict.append({"idx": data["id"]})
        for unsolve_type in UNS_TYPE:
            prefix = unsolve_type + "_question_"
            matching_keys = [key for key in data.keys() if key.startswith(prefix)]
            # Rewrites are addressed by running number 1..n, one per matching key.
            for count in range(len(matching_keys)):
                number = str(count+1)
                new_data = {
                    "data_id": data["data_source"] + "_" + str(idx) + "_"  + unsolve_type + "_" + number,
                    "question": data["question"],
                    "ground_truth": data["ground_truth"],
                    "rewritten_question": data[prefix + number][unsolve_type + "_question"]
                }
                new_data_list.append(new_data)

    print(f"Dataset: {dataset}, New Data Count: {len(new_data_list)}")
    # output_path has no placeholder, so format() returns it unchanged.
    output_file = output_path.format(dataset)
    with open(output_file, "w", encoding="utf-8") as fw:
        json.dump(new_data_list, fw, indent=4)


Dataset: train, New Data Count: 7740


In [165]:
from utils import *

input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/distill-32b_T0.0_{}/unsol/{}.json"
data_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/{}.json"
datasets = ["aime", "amc", "math", "minerva"]
prompts = ["std", "real"]


# For every prompt/dataset pair: drop repeated `data_id` records from the
# experiment output (keeping the first occurrence), rewrite the file in place,
# and report how many reference items are still missing.
for prompt in prompts:
    all_data, processed_data = 0, 0
    for dataset in datasets:
        input_file = input_path.format(prompt, dataset)
        data_file = data_path.format(dataset)
        data_pool = read_jsonl(input_file)
        # Read the reference file once; only its size is needed below.
        reference_size = len(read_json(data_file))
        print(f"Current Input Size: {len(data_pool)}")

        seen_ids = set()
        unique_data = []
        for item in data_pool:
            if item['data_id'] not in seen_ids:
                unique_data.append(item)
                seen_ids.add(item['data_id'])

        print(f"Dataset: {dataset}, Data Size: {reference_size}, Processed Data Size: {len(unique_data)}, Remaining Data Size: {reference_size - len(unique_data)}")
        # NOTE: overwrites the experiment output with the de-duplicated records.
        write_jsonl(input_file, unique_data)
        all_data += reference_size
        processed_data += len(unique_data)
    print(f"All Data Size: {all_data}, Prompt: {prompt}, Processed Data Size: {processed_data}, Remaining Data Size: {all_data - processed_data}\n")


Current Input Size: 132
Dataset: aime, Data Size: 132, Processed Data Size: 132, Remaining Data Size: 0
Current Input Size: 295
Dataset: amc, Data Size: 295, Processed Data Size: 295, Remaining Data Size: 0
Current Input Size: 318
Dataset: math, Data Size: 318, Processed Data Size: 318, Remaining Data Size: 0
Current Input Size: 357
Dataset: minerva, Data Size: 357, Processed Data Size: 357, Remaining Data Size: 0
All Data Size: 1102, Prompt: std, Processed Data Size: 1102, Remaining Data Size: 0

Current Input Size: 132
Dataset: aime, Data Size: 132, Processed Data Size: 132, Remaining Data Size: 0
Current Input Size: 295
Dataset: amc, Data Size: 295, Processed Data Size: 295, Remaining Data Size: 0
Current Input Size: 318
Dataset: math, Data Size: 318, Processed Data Size: 318, Remaining Data Size: 0
Current Input Size: 357
Dataset: minerva, Data Size: 357, Processed Data Size: 357, Remaining Data Size: 0
All Data Size: 1102, Prompt: real, Processed Data Size: 1102, Remaining Data Si

In [None]:
from utils import *

input_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/./exp/deepseek_r1_T0.0_real/solve/train.json"

# Report the record count before and after dropping repeated `data_id`s.
# Nothing is written back in this cell.
dataset = read_jsonl(input_file)
print(len(dataset))

# dicts keep insertion order, so setdefault() retains the first record per id
# in the original file order.
first_by_id = {}
for item in dataset:
    first_by_id.setdefault(item['data_id'], item)

seen_ids = set(first_by_id)
unique_data = list(first_by_id.values())

print(len(unique_data))

In [85]:
import tiktoken
from utils import *

input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/gpt-4o-mini_T0.0_{}/{}/{}.json"
datasets = ["aime", "amc", "math", "minerva"]
prompts = ["std", "real"]
tasks = ["solve", "unsol"]

# NOTE(review): the tokenizer is looked up under "o3-mini" while the files come
# from a gpt-4o-mini run; confirm this is the intended encoding.
encoding = tiktoken.encoding_for_model(tokenizer_dict["o3-mini"])

# Print the average generation length for every dataset/prompt/task file that exists.
for dataset in datasets:
    for prompt in prompts:
        for task in tasks:
            input_file = input_path.format(prompt, task, dataset)
            if not os.path.exists(input_file):
                continue
            data_pool = read_jsonl(input_file)
            length = 0
            counted = 0
            for data in data_pool:
                # Records without any generation cannot contribute a length.
                if len(data["generation"]) == 0:
                    continue
                generation = data["generation"][0]
                # "reasoning" is added as a number here (presumably a
                # reasoning-token count; verify against the producer), while
                # the answer text is tokenized locally.
                length += generation["reasoning"] + len(encoding.encode(generation["answer"]))
                counted += 1

            # Avoid ZeroDivisionError on empty files or files with no generations.
            if counted == 0:
                print(f"Dataset: {dataset}, Prompt: {prompt}, Task: {task}, no generations to average")
                continue
            avg_length = length / counted
            print(f"Dataset: {dataset}, Prompt: {prompt}, Task: {task}, Avg Length: {avg_length}")

Dataset: aime, Prompt: std, Task: solve, Avg Length: 3456.0
Dataset: aime, Prompt: std, Task: unsol, Avg Length: 8467.8
Dataset: aime, Prompt: real, Task: solve, Avg Length: 3890.6
Dataset: aime, Prompt: real, Task: unsol, Avg Length: 5214.4
Dataset: amc, Prompt: std, Task: solve, Avg Length: 2244.4444444444443
Dataset: amc, Prompt: std, Task: unsol, Avg Length: 5599.285714285715
Dataset: amc, Prompt: real, Task: solve, Avg Length: 2908.4
Dataset: amc, Prompt: real, Task: unsol, Avg Length: 5673.7
Dataset: math, Prompt: std, Task: solve, Avg Length: 1247.3
Dataset: math, Prompt: std, Task: unsol, Avg Length: 6184.0
Dataset: math, Prompt: real, Task: solve, Avg Length: 817.5
Dataset: math, Prompt: real, Task: unsol, Avg Length: 5219.2
Dataset: minerva, Prompt: std, Task: solve, Avg Length: 591.2
Dataset: minerva, Prompt: std, Task: unsol, Avg Length: 1902.2
Dataset: minerva, Prompt: real, Task: solve, Avg Length: 530.3
Dataset: minerva, Prompt: real, Task: unsol, Avg Length: 1918.9


In [None]:
from utils import *

input_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/./exp/deepseek_r1_T0.0_std/solve/train.json"

# Drop repeated `id`s (first occurrence wins) and rewrite the file in place.
dataset = read_jsonl(input_file)
print(len(dataset))

# dicts keep insertion order, so the surviving records stay in file order.
first_by_id = {}
for item in dataset:
    first_by_id.setdefault(item['id'], item)

seen_ids = set(first_by_id)
unique_data = list(first_by_id.values())

print(len(unique_data))
write_jsonl(input_file, unique_data)

from utils import *
from metrics.rewards.math_reward import deepscaler_reward_fn, realmath_reward_fn

input_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/./exp/deepseek_r1_T0.0_std/solve/train.json"
output_file_1 = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/train/solve_success.json"
output_file_2 =  "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/solve/train_distill.json"

# Split the solve generations by reward verdict: "correct" answers go to
# output_file_1, everything else to output_file_2.
dataset = read_jsonl(input_file)
success_data = []
failed_data = []
for idx, data in enumerate(dataset):
    ground_truth = data["ground_truth"]
    # A record with no generation cannot be judged; count it as failed rather
    # than crashing on the [0] lookup (same handling as the unsol split).
    if len(data["generation"]) == 0:
        print(f"Data {idx} has no generation.")
        failed_data.append(data)
        continue
    answer = data["generation"][0]["answer"]
    # Only the second element of the returned triple (the verdict) is used.
    _, judge, _ = realmath_reward_fn(answer, ground_truth, task="solve")
    if judge == "correct":
        success_data.append(data)
    else:
        failed_data.append(data)

print(f"Total Data Size: {len(dataset)}, Success Data Size: {len(success_data)}, Failed Data Size: {len(failed_data)}")
write_jsonl(output_file_1, success_data)
write_jsonl(output_file_2, failed_data)

6537
6537
Total Data Size: 6537, Success Data Size: 5344, Failed Data Size: 1193


In [34]:
from utils import *

input_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/./exp/deepseek_r1_T0.0_real/unsol/train.json"

# Drop repeated `data_id`s (first occurrence wins) and rewrite the file in place.
dataset = read_jsonl(input_file)
print(len(dataset))

# dicts keep insertion order, so the surviving records stay in file order.
first_by_id = {}
for item in dataset:
    first_by_id.setdefault(item['data_id'], item)

seen_ids = set(first_by_id)
unique_data = list(first_by_id.values())

print(len(unique_data))
write_jsonl(input_file, unique_data)

from utils import *
from metrics.rewards.math_reward import realmath_reward_fn

input_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/./exp/deepseek_r1_T0.0_real/unsol/train.json"
output_file = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/train/unsol_success.json"
output_file_1 = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/train/unsol_refuse.json"
output_file_2 =  "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/train_distill.json"

# Split the unsol generations by reward verdict:
#   "unsolvable" -> output_file   (success)
#   "unknown"    -> output_file_1 (refuse)
#   anything else, or no generation at all -> output_file_2 (failed)
dataset = read_jsonl(input_file)
success_data, unknown_data, failed_data = [], [], []
for idx, data in enumerate(dataset):
    ground_truth = data["ground_truth"]
    if len(data["generation"]) == 0:
        print(f"Data {idx} has no generation.")
        failed_data.append(data)
        continue
    answer = data["generation"][0]["answer"]
    # Only the second element of the returned triple (the verdict) is used.
    _, judge, _ = realmath_reward_fn(answer, ground_truth, task="unsol")
    if judge == "unsolvable":
        success_data.append(data)
    elif judge == "unknown":
        unknown_data.append(data)
    else:
        failed_data.append(data)
print(f"Total Data Size: {len(dataset)}, Success Data Size: {len(success_data)}, Unknown Data Size: {len(unknown_data)}, Failed Data Size: {len(failed_data)}")
write_jsonl(output_file, success_data)
write_jsonl(output_file_1, unknown_data)
# The failed set was collected but never saved; persist it the same way the
# solve split does.
write_jsonl(output_file_2, failed_data)


7740
7740
Data 7119 has no generation.
Data 7120 has no generation.
Data 7121 has no generation.
Data 7122 has no generation.
Data 7123 has no generation.
Total Data Size: 7740, Success Data Size: 4191, Unknown Data Size: 57, Failed Data Size: 3492


In [174]:
from utils import *

datasets = ["aime", "amc", "math", "minerva"]

input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/unsol/{}.json"

# Count, per dataset, how many "remove" and "contradict" items have
# difficulty_eval 0 and 1. In the printed summary the first number of each
# pair is difficulty 0 and the second is difficulty 1.
cont, remove = 0, 0
for dataset in datasets:
    input_file = input_path.format(dataset)
    data_pool = read_json(input_file)
    print(len(data_pool))

    # counts[kind] == [items with difficulty 0, items with difficulty 1]
    counts = {"remove": [0, 0], "contradict": [0, 0]}
    for item in data_pool:
        # "remove" is tested first, as an id could contain both markers.
        if "remove" in item['data_id']:
            kind = "remove"
        elif "contradict" in item['data_id']:
            kind = "contradict"
        else:
            continue

        difficulty = item["difficulty_eval"]
        if difficulty == 0:
            counts[kind][0] += 1
        elif difficulty == 1:
            counts[kind][1] += 1
        else:
            # Unexpected value: report it and reset it in memory; it is not
            # counted, and not saved while the write below stays disabled.
            print(f"Error: {item['data_id']}")
            item["difficulty_eval"] = 1

    remove_diff0, remove_diff1 = counts["remove"]
    cont_diff0, cont_diff1 = counts["contradict"]

    print(f"Dataset: {dataset}, Remove Count: {remove_diff0}, Remove Count: {remove_diff1}, Contradict Count: {cont_diff0}, Contradict Count: {cont_diff1}")
    print(f"Total Remove: {remove_diff0 + remove_diff1}")
    print(f"Total Contradict: {cont_diff0 + cont_diff1}")
    cont += cont_diff0 + cont_diff1
    remove += remove_diff0 + remove_diff1
    # write_json(input_file, data_pool)
print(f"Total Remove: {remove}")
print(f"Total Contradict: {cont}")


132
Dataset: aime, Remove Count: 24, Remove Count: 43, Contradict Count: 22, Contradict Count: 43
Total Remove: 67
Total Contradict: 65
295
Dataset: amc, Remove Count: 47, Remove Count: 84, Contradict Count: 60, Contradict Count: 104
Total Remove: 131
Total Contradict: 164
318
Dataset: math, Remove Count: 77, Remove Count: 77, Contradict Count: 75, Contradict Count: 89
Total Remove: 154
Total Contradict: 164
357
Dataset: minerva, Remove Count: 87, Remove Count: 98, Contradict Count: 76, Contradict Count: 96
Total Remove: 185
Total Contradict: 172
Total Remove: 537
Total Contradict: 565


In [None]:
from utils import *
from transformers import AutoTokenizer

input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/train/{}_success.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/train/train_{}.json"
prompt_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/prompt/math_instruction.json"
tasks = ["solve", "unsol"]

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")

prompt = load_json(prompt_path)

# Build one training file per student type. "reason" students get
# reasoning + answer as the target, "instruct" students get the answer only.
for student in ["reason", "instruct"]:
    new_dataset = []
    length = 0
    for task in tasks:
        input_file = input_path.format(task)
        data_pool = read_jsonl(input_file)
        print(f"Task: {task}, Data Size: {len(data_pool)}")
        for data in data_pool:
            reasoning = data["generation"][0]["reasoning"]
            answer = data["generation"][0]["answer"]

            # Target text and its token count are built the same way for both tasks.
            if student == "reason":
                response = reasoning + answer
            else:
                response = answer
            tokens = tokenizer.encode(response)

            if task == "solve":
                new_data = {
                    "id": data["id"],
                    "instruction": data["question"],
                    "input": prompt["real"],
                    "output": response
                }
            else:
                # unsol records are keyed by data_id / rewritten_question;
                # records missing either key are reported and skipped.
                try:
                    new_data = {
                        "id": data["data_id"],
                        "instruction": data["rewritten_question"],
                        "input": prompt["real"],
                        "output": response
                    }
                except KeyError:
                    print(f"KeyError in data: {data}")
                    continue

            length += len(tokens)
            new_dataset.append(new_data)

    print(f"Average Length: {length / len(new_dataset)}")
    print(f"Total Data Size: {len(new_dataset)}")
    output_file = output_path.format(student)
    write_jsonl(output_file, new_dataset)


Task: solve, Data Size: 5344


Token indices sequence length is longer than the specified maximum sequence length for this model (17434 > 16384). Running this sequence through the model will result in indexing errors


Task: unsol, Data Size: 3928
Average Length: 3371.5076574633304
Total Data Size: 9272
Task: solve, Data Size: 5344
Task: unsol, Data Size: 3928
Average Length: 298.0263157894737
Total Data Size: 9272


In [109]:
from utils import *
import random

input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/exp/deepseek_r1_T0.0_ref/{}/distill.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/train/{}_refuse.json"
from metrics.rewards.math_reward import realmath_reward_fn

prompt_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/prompt/math_instruction.json"
prompt = load_json(prompt_path)

# For each task, take a random half of the distilled records and keep those
# whose answer is judged "unknown" as refusal examples.
for task in ["solve", "unsol"]:
    input_file = input_path.format(task)
    data_pool = read_jsonl(input_file)
    new_data_list = []
    # NOTE(review): the sample is unseeded, so re-running the cell selects a
    # different half and produces a different output file.
    data_pool = random.sample(data_pool, int(len(data_pool) * 0.5))
    for data in data_pool:
        if len(data["generation"]) == 0:
            # The other cells key solve records by "id" and unsol records by
            # "data_id"; look up whichever is present so this message cannot
            # raise KeyError.
            record_id = data.get("id", data.get("data_id"))
            print(f"Data {record_id} has no generation.")
            continue
        # Only the second element of the returned triple (the verdict) is used.
        _, judge, _ = realmath_reward_fn(data["generation"][0]["answer"], data["ground_truth"], task=task)
        if judge == "unknown":
            new_data_list.append(data)
    print(f"Task: {task}, Data Size: {len(new_data_list)}")
    output_file = output_path.format(task)
    write_jsonl(output_file, new_data_list)

Task: solve, Data Size: 481
Task: unsol, Data Size: 787


In [119]:
from utils import *

input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/train/{}_{}.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/data/train/train_{}.json"

# Load the instruction prompt here so the cell does not depend on `prompt`
# being left behind in the kernel by an earlier cell.
prompt_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/RealMath/RealMath/prompt/math_instruction.json"
prompt = load_json(prompt_path)


# Merge the success and refuse sets of both tasks into one training file per
# student type. "instruct" targets are the answer only; "reason" targets are
# reasoning + answer.
for student in ["reason", "instruct"]:
    output_list = []
    for task in ["solve", "unsol"]:
        # `split` rather than `type`, which would shadow the builtin for the
        # rest of the kernel session.
        for split in ["success", "refuse"]:
            input_file = input_path.format(task, split)
            data_pool = read_jsonl(input_file)
            print(f"Task: {task}, Type: {split}, Data Size: {len(data_pool)}")
            new_data_list = []
            for data in data_pool:
                if len(data["generation"]) == 0:
                    # solve records are read via "id" and unsol records via
                    # "data_id" below; use whichever is present so this
                    # message cannot raise KeyError.
                    record_id = data.get("id", data.get("data_id"))
                    print(f"Data {record_id} has no generation.")
                    continue

                generation = data["generation"][0]
                if task == "solve":
                    new_data = {
                        "id": data["id"],
                        "instruction": data["question"],
                        "input": prompt["real"],
                        "output": generation["answer"] if student == "instruct" else generation["reasoning"] + generation["answer"]
                    }
                    new_data_list.append(new_data)
                elif task == "unsol":
                    # unsol records without reasoning are dropped for both
                    # student types.
                    if generation["reasoning"] is None:
                        continue
                    new_data = {
                        "id": data["data_id"],
                        "instruction": data["rewritten_question"],
                        "input": prompt["real"],
                        "output": generation["answer"] if student == "instruct" else generation["reasoning"] + generation["answer"]
                    }
                    new_data_list.append(new_data)
            output_list.extend(new_data_list)
            print(f"Task: {task}, Type: {split}, Data Size: {len(new_data_list)}")
    output_file = output_path.format(student)
    write_jsonl(output_file, output_list)
    print(f"Student: {student}, Data Size: {len(output_list)}")

    

Task: solve, Type: success, Data Size: 5344
Task: solve, Type: success, Data Size: 5344
Task: solve, Type: refuse, Data Size: 481
Task: solve, Type: refuse, Data Size: 481
Task: unsol, Type: success, Data Size: 4191
Task: unsol, Type: success, Data Size: 4190
Task: unsol, Type: refuse, Data Size: 787
Task: unsol, Type: refuse, Data Size: 787
Student: reason, Data Size: 10802
Task: solve, Type: success, Data Size: 5344
Task: solve, Type: success, Data Size: 5344
Task: solve, Type: refuse, Data Size: 481
Task: solve, Type: refuse, Data Size: 481
Task: unsol, Type: success, Data Size: 4191
Task: unsol, Type: success, Data Size: 4190
Task: unsol, Type: refuse, Data Size: 787
Task: unsol, Type: refuse, Data Size: 787
Student: instruct, Data Size: 10802


In [179]:
from utils import *

input_path = "../data/{}/{}.json"
output_path = "../data/{}.json"

tasks = ["solve", "unsol"]
datasets = ["aime", "amc", "math", "minerva"]

# Concatenate the per-dataset files of each task, in `datasets` order, into a
# single JSON file per task.
for task in tasks:
    data_pool = [
        record
        for dataset in datasets
        for record in read_json(input_path.format(task, dataset))
    ]
    print(f"Task: {task}, Data Size: {len(data_pool)}")
    output_file = output_path.format(task)
    write_json(output_file, data_pool)
        

Task: solve, Data Size: 313
Task: unsol, Data Size: 1102


In [5]:
import pandas as pd

input_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/ReliableMath/ReliableMath/data/{}.json"
output_path = "/Users/collcertaye/WorkSpace/Research/NLP&LLM/ReliableMath/ReliableMath/data/{}.parquet"
tasks = ["solve", "unsol"]

# Convert each task's JSON file to parquet. `ground_truth` is cast to str so
# the column has a single type when written.
for task in tasks:
    input_file = input_path.format(task)
    output_file = output_path.format(task)
    data_pool = pd.read_json(input_file)
    df = pd.DataFrame(data_pool).assign(
        ground_truth=lambda frame: frame['ground_truth'].astype(str)
    )
    df.to_parquet(output_file, engine='pyarrow', index=False)

