In [12]:
import os

for d in ['GPQA/original_data', 'AIME/original_data', 'MATH500/original_data', 'AMC/original_data', 'LiveCodeBench', 'QA_Datasets', 'FlashRAG_datasets']:
    os.makedirs(d, exist_ok=True)

In [11]:
"""Data preprocess for GPQA
Data Link: https://huggingface.co/datasets/Idavidrein/gpqa
"""
import csv
import json
import random
from tqdm import tqdm

# Paths to data
split = 'main'  # diamond, main, extended

for split in ['diamond', 'main', 'extended']:
    data_path = f'./GPQA/original_data/gpqa_{split}.csv'
    output_path = f'./GPQA/{split}.json'
    
    # Define the keys we want to keep
    keys_to_keep = [
        'id',
        'Question',
        'Subdomain',
        'High-level domain',
        'Correct Answer',
        'Incorrect Answer 1',
        'Incorrect Answer 2',
        'Incorrect Answer 3'
    ]
    
    filtered_data = []
    with open(data_path, mode='r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for idx, row in enumerate(tqdm(csv_reader), 0):
            # Add id field
            row['id'] = idx
            # Create new dictionary with only desired keys
            filtered_row = {key: row[key] for key in keys_to_keep}
    
            # Extract answers and shuffle them
            answers = [
                ('Correct Answer', filtered_row['Correct Answer']),
                ('Incorrect Answer 1', filtered_row['Incorrect Answer 1']),
                ('Incorrect Answer 2', filtered_row['Incorrect Answer 2']),
                ('Incorrect Answer 3', filtered_row['Incorrect Answer 3'])
            ]
            random.shuffle(answers)
    
            # Assign new choices A, B, C, D in order and determine the correct choice
            choices = ['A', 'B', 'C', 'D']
            formatted_answers = []
            correct_choice = None
            for i, (label, answer) in enumerate(answers):
                choice = choices[i]
                formatted_answers.append((choice, answer))
                if label == 'Correct Answer':
                    correct_choice = choice
    
            # Update the Question field
            formatted_choices = "\n".join([f"({choice}) {answer}" for choice, answer in formatted_answers])
            filtered_row['Question'] = f"{filtered_row['Question']} Choices:\n{formatted_choices}\n"
    
            # Add the Correct Choice field
            filtered_row['Correct Choice'] = correct_choice
    
            # Append the updated row to filtered_data
            filtered_data.append(filtered_row)
    
            # if idx < 5:
            #     print(json.dumps(filtered_row, indent=4))
    
    # Write the updated data to JSON
    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json.dump(filtered_data, json_file, indent=4, ensure_ascii=False)
        
    print('Saved to', output_path)


198it [00:00, 7453.66it/s]


Saved to ./GPQA/diamond.json


448it [00:00, 15989.99it/s]


Saved to ./GPQA/main.json


546it [00:00, 15343.78it/s]

Saved to ./GPQA/extended.json





In [14]:
"""Data preprocess for MATH500
Data Link: https://huggingface.co/datasets/HuggingFaceH4/MATH-500
"""
import csv
import json
from tqdm import tqdm

test_path = './MATH500/original_data/test.jsonl'
output_path = './MATH500/test.json'

data_list = []
with open(test_path, 'r') as file:
    for id, line in enumerate(file.readlines()):
        line = json.loads(line)
        data_list.append({
            'id': id, 
            'Question': line['problem'],
            'solution': line['solution'],
            'answer': line['answer'],
            'subject': line['subject'],
            'level': line['level'],
            'unique_id': line['unique_id'],
        })

        if id < 5:
            print(json.dumps(data_list[-1], indent=4))

# Write the updated data to JSON
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(data_list, json_file, indent=4, ensure_ascii=False)


{
    "id": 0,
    "Question": "Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$",
    "solution": "We have that $r = \\sqrt{0^2 + 3^2} = 3.$  Also, if we draw the line connecting the origin and $(0,3),$ this line makes an angle of $\\frac{\\pi}{2}$ with the positive $x$-axis.\n\n[asy]\nunitsize(0.8 cm);\n\ndraw((-0.5,0)--(3.5,0));\ndraw((0,-0.5)--(0,3.5));\ndraw(arc((0,0),3,0,90),red,Arrow(6));\n\ndot((0,3), red);\nlabel(\"$(0,3)$\", (0,3), W);\ndot((3,0), red);\n[/asy]\n\nTherefore, the polar coordinates are $\\boxed{\\left( 3, \\frac{\\pi}{2} \\right)}.$",
    "answer": "\\left( 3, \\frac{\\pi}{2} \\right)",
    "subject": "Precalculus",
    "level": 2,
    "unique_id": "test/precalculus/807.json"
}
{
    "id": 1,
    "Question": "Define\n\\[p = \\sum_{k = 1}^\\infty \\frac{1}{k^2} \\quad \\text{and} \\quad q = \\sum_{k = 1}^\\infty \\frac{1}{k^3}.\\]Find a way to wri

In [17]:
"""Convert Parquet to JSON (AIME)
Data Link: https://huggingface.co/datasets/AI-MO/aimo-validation-aime
"""
import pandas as pd

# Specify the Parquet file path
parquet_file = "./AIME/original_data/train-00000-of-00001.parquet"

# Use pandas to read the Parquet file
df = pd.read_parquet(parquet_file)

# Filter the DataFrame to keep only rows where '2024_AIME' appears in the 'url' column
filtered_df = df[df['url'].str.contains('2024_AIME', na=False)]

# Print the first few rows of the filtered DataFrame to confirm
print(filtered_df.head())

# Export to a JSON file with indentation
json_file = "./AIME/original_data/aime_2024.json"
filtered_df.to_json(json_file, orient='records', force_ascii=False, indent=4)

print(f"Filtered data has been saved to {json_file}")

    id                                            problem  \
60  60  Every morning Aya goes for a $9$-kilometer-lon...   
61  61  Let $ABC$ be a triangle inscribed in circle $\...   
62  62  Each vertex of a regular octagon is independen...   
63  63  Define $f(x)=|| x|-\tfrac{1}{2}|$ and $g(x)=||...   
64  64  Let $p$ be the least prime number for which th...   

                                             solution answer  \
60  $\frac{9}{s} + t = 4$ in hours and $\frac{9}{s...    204   
61  From the tangency condition we have $\let\angl...    113   
62  Notice that the question's condition mandates ...    371   
63  If we graph $4g(f(x))$, we see it forms a sawt...    385   
64  If \(p=2\), then \(4\mid n^4+1\) for some inte...    110   

                                                  url  
60  https://artofproblemsolving.com/wiki/index.php...  
61  https://artofproblemsolving.com/wiki/index.php...  
62  https://artofproblemsolving.com/wiki/index.php...  
63  https://artofproblem

In [18]:
"""Data preprocess for AIME
"""
import csv
import json
from tqdm import tqdm

test_path = './AIME/original_data/aime_2024.json'
output_path = './AIME/test.json'

data_list = []
with open(test_path, 'r') as file:
    data = json.load(file)
    for id, line in enumerate(tqdm(data)):
        data_list.append({
            'id': id, 
            'Question': line['problem'],
            'Solution': line['solution'],
            'answer': str(int(line['answer'])),
        })

        if id < 5:
            print(json.dumps(data_list[-1], indent=4))

# Write the updated data to JSON
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(data_list, json_file, indent=4, ensure_ascii=False)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 49364.11it/s]

{
    "id": 0,
    "Question": "Every morning Aya goes for a $9$-kilometer-long walk and stops at a coffee shop afterwards. When she walks at a constant speed of $s$ kilometers per hour, the walk takes her 4 hours, including $t$ minutes spent in the coffee shop. When she walks $s+2$ kilometers per hour, the walk takes her 2 hours and 24 minutes, including $t$ minutes spent in the coffee shop. Suppose Aya walks at $s+\\frac{1}{2}$ kilometers per hour. Find the number of minutes the walk takes her, including the $t$ minutes spent in the coffee shop.",
    "Solution": "$\\frac{9}{s} + t = 4$ in hours and $\\frac{9}{s+2} + t = 2.4$ in hours.\nSubtracting the second equation from the first, we get, \n$\\frac{9}{s} - \\frac{9}{s+2} = 1.6$\nMultiplying by $(s)(s+2)$, we get \n$9s+18-9s=18=1.6s^{2} + 3.2s$\nMultiplying by 5/2 on both sides, we get\n$0 = 4s^{2} + 8s - 45$\nFactoring gives us \n$(2s-5)(2s+9) = 0$, of which the solution we want is $s=2.5$.\nSubstituting this back to the first equ




In [19]:
"""Convert Parquet to JSON (AIME)
Data Link: https://huggingface.co/datasets/AI-MO/aimo-validation-amc
"""
import pandas as pd

# 指定 Parquet 文件路径
parquet_file = "./AMC/original_data/train-00000-of-00001.parquet"

# 使用 pandas 读取 parquet 文件
df = pd.read_parquet(parquet_file)

# 打印 DataFrame 的前几行查看数据
print(df.head())

# 导出为 JSON 文件，并加上缩进
json_file = "./AMC/original_data/amc_2022_2023.json"
df.to_json(json_file, orient='records', force_ascii=False, indent=4)

   id                                            problem  answer  \
0   0  $\frac{m}{n}$ is the Irreducible fraction valu...   142.0   
1   1  How many ways are there to split the integers ...   144.0   
2   2  What is the product of all real numbers $x$ su...    81.0   
3   3  Let $M$ be the midpoint of $\overline{AB}$ in ...     4.0   
4   4  Let $\mathcal{R}$ be the region in the complex...    13.0   

                                                 url  
0  https://artofproblemsolving.com/wiki/index.php...  
1  https://artofproblemsolving.com/wiki/index.php...  
2  https://artofproblemsolving.com/wiki/index.php...  
3  https://artofproblemsolving.com/wiki/index.php...  
4  https://artofproblemsolving.com/wiki/index.php...  


In [20]:
"""Data preprocess for AMC
"""
import csv
import json
from tqdm import tqdm

test_path = './AMC/original_data/amc_2022_2023.json'
output_path = './AMC/test.json'

data_list = []
with open(test_path, 'r') as file:
    data = json.load(file)
    id = 0
    for line in tqdm(data):
        if '2023' not in line['url']:
            continue
        data_list.append({
            'id': id, 
            'Question': line['problem'],
            'answer': str(int(line['answer'])),
            'url': line['url'],
        })
        if id < 5:
            print(json.dumps(data_list[-1], indent=4))
        id += 1

# Write the updated data to JSON
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(data_list, json_file, indent=4, ensure_ascii=False)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 83/83 [00:00<00:00, 183708.30it/s]

{
    "id": 0,
    "Question": "Cities $A$ and $B$ are $45$ miles apart. Alicia lives in $A$ and Beth lives in $B$. Alicia bikes towards $B$ at 18 miles per hour. Leaving at the same time, Beth bikes toward $A$ at 12 miles per hour. How many miles from City $A$ will they be when they meet?",
    "answer": "27",
    "url": "https://artofproblemsolving.com/wiki/index.php/2023_AMC_12A_Problems/Problem_1"
}
{
    "id": 1,
    "Question": "Positive real numbers $x$ and $y$ satisfy $y^3=x^2$ and $(y-x)^2=4y^2$. What is $x+y$?",
    "answer": "36",
    "url": "https://artofproblemsolving.com/wiki/index.php/2023_AMC_12A_Problems/Problem_10"
}
{
    "id": 2,
    "Question": "What is the degree measure of the acute angle formed by lines with slopes $2$ and $\\frac{1}{3}$?",
    "answer": "45",
    "url": "https://artofproblemsolving.com/wiki/index.php/2023_AMC_12A_Problems/Problem_11"
}
{
    "id": 3,
    "Question": "What is the value of\n\\[2^3 - 1^3 + 4^3 - 3^3 + 6^3 - 5^3 + \\dots + 18^3 - 1




In [42]:
"""Data preprocess for LiveCodeBench
Data Link: https://huggingface.co/datasets/livecodebench/code_generation_lite
"""
import json
from tqdm import tqdm
from datetime import datetime

def is_valid_date(date_str):
    """
    Check if the given date string is within the range from August 1, 2024, to November 30, 2024.

    Args:
        date_str (str): The date string in the format "%Y-%m-%dT%H:%M:%S".

    Returns:
        bool: True if the date is within the specified range, False otherwise.
    """
    try:
        # Parse the date string into a datetime object
        date = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S")
    except ValueError:
        # If the date string is not in the expected format, consider it invalid
        return False

    # Define the start and end dates for the valid range
    start_date = datetime(2024, 8, 1)
    end_date = datetime(2024, 11, 30)

    # Check if the date falls within the valid range
    return start_date <= date <= end_date

# Define the paths to the input JSONL files
test_paths_1_to_4 = [
    './LiveCodeBench/test.jsonl',
    './LiveCodeBench/test2.jsonl',
    './LiveCodeBench/test3.jsonl',
    './LiveCodeBench/test4.jsonl'
]

# Define the path to the output JSON file
output_path = './LiveCodeBench/test_1to4.json'

data_list = []
seen_questions = set()  # To track unique questions based on 'question_content'
current_id = 0  # To assign unique IDs across all files

for test_path in test_paths_1_to_4:
    try:
        with open(test_path, 'r', encoding='utf-8') as file:
            # Use tqdm to show progress; total can be estimated if needed
            for line in tqdm(file, desc=f'Processing {test_path}'):
                try:
                    # Parse the JSON line
                    line_data = json.loads(line)
                except json.JSONDecodeError:
                    # Skip lines that are not valid JSON
                    continue

                # Check if the 'contest_date' field exists and is valid
                contest_date = line_data.get('contest_date')
                if not contest_date or not is_valid_date(contest_date):
                    continue

                # Get the question content to check for duplicates
                question_content = line_data.get('question_content')
                if not question_content:
                    continue  # Skip if 'question_content' is missing

                if question_content in seen_questions:
                    continue  # Duplicate question; skip

                # Add the question to the seen set
                seen_questions.add(question_content)

                # Append the question data to the list
                data_list.append({
                    'id': current_id,
                    'Question': question_content,
                    'question_title': line_data.get('question_title', ''),
                    'contest_date': contest_date,
                    'difficulty': line_data.get('difficulty', ''),
                    'public_test_cases': line_data.get('public_test_cases', [])
                })

                current_id += 1  # Increment the unique ID

    except FileNotFoundError:
        print(f"File not found: {test_path}")
    except Exception as e:
        print(f"An error occurred while processing {test_path}: {e}")

# Write the aggregated and deduplicated data to the output JSON file
try:
    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file, indent=4, ensure_ascii=False)
    print(f"Data successfully written to {output_path}")
except Exception as e:
    print(f"Failed to write data to {output_path}: {e}")



test_paths_1_to_6 = [
    './LiveCodeBench/test.jsonl',
    './LiveCodeBench/test2.jsonl',
    './LiveCodeBench/test3.jsonl',
    './LiveCodeBench/test4.jsonl',
    './LiveCodeBench/test5.jsonl',
    './LiveCodeBench/test6.jsonl'
]

# Define the path to the output JSON file
output_path = './LiveCodeBench/test_1to6.json'

data_list = []
seen_questions = set()  # To track unique questions based on 'question_content'
current_id = 0  # To assign unique IDs across all files

for test_path in test_paths_1_to_6:
    try:
        with open(test_path, 'r', encoding='utf-8') as file:
            # Use tqdm to show progress; total can be estimated if needed
            for line in tqdm(file, desc=f'Processing {test_path}'):
                try:
                    # Parse the JSON line
                    line_data = json.loads(line)
                except json.JSONDecodeError:
                    # Skip lines that are not valid JSON
                    continue

                # Check if the 'contest_date' field exists and is valid
                contest_date = line_data.get('contest_date')
                if not contest_date or not is_valid_date(contest_date):
                    continue

                # Get the question content to check for duplicates
                question_content = line_data.get('question_content')
                if not question_content:
                    continue  # Skip if 'question_content' is missing

                if question_content in seen_questions:
                    continue  # Duplicate question; skip

                # Add the question to the seen set
                seen_questions.add(question_content)

                # Append the question data to the list
                data_list.append({
                    'id': current_id,
                    'Question': question_content,
                    'question_title': line_data.get('question_title', ''),
                    'contest_date': contest_date,
                    'difficulty': line_data.get('difficulty', ''),
                    'public_test_cases': line_data.get('public_test_cases', [])
                })

                current_id += 1  # Increment the unique ID

    except FileNotFoundError:
        print(f"File not found: {test_path}")
    except Exception as e:
        print(f"An error occurred while processing {test_path}: {e}")

# Write the aggregated and deduplicated data to the output JSON file
try:
    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file, indent=4, ensure_ascii=False)
    print(f"Data successfully written to {output_path}")
except Exception as e:
    print(f"Failed to write data to {output_path}: {e}")

Processing ./LiveCodeBench/test.jsonl: 400it [00:03, 129.46it/s]
Processing ./LiveCodeBench/test2.jsonl: 111it [00:01, 58.29it/s]
Processing ./LiveCodeBench/test3.jsonl: 101it [00:01, 56.33it/s]
Processing ./LiveCodeBench/test4.jsonl: 101it [00:03, 27.30it/s]
Processing ./LiveCodeBench/test5.jsonl: 167it [00:06, 25.58it/s]
Processing ./LiveCodeBench/test6.jsonl: 175it [00:01, 112.65it/s]

Data successfully written to ./LiveCodeBench/test_1to6.json





In [40]:
"""Data preprocess for FlashRAG ODQA datasets
Data Link: https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets
"""
import csv
import json
from tqdm import tqdm

dataset_name = 'triviaqa'
split = 'test'
data_num = 500

test_path = f'./FlashRAG_datasets/{dataset_name}/{split}.jsonl'
output_path = f'./QA_Datasets/{dataset_name}_{split}.json'

data_list = []
with open(test_path, 'r') as file:
    for id, line in enumerate(tqdm(file.readlines())):
        line = json.loads(line)
        data_list.append({
            'id': id, 
            'Question': line['question'],
            'answer': line["golden_answers"],
        })
        if id < 5:
            print(json.dumps(data_list[-1], indent=4))
if len(data_list) >= data_num:
    data_list = data_list[:data_num]
    output_path = output_path.replace('.json', f'_first{data_num}.json')

# Write the updated data to JSON
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(data_list, json_file, indent=4, ensure_ascii=False)
print('Saved to', output_path)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11313/11313 [00:00<00:00, 128806.25it/s]

{
    "id": 0,
    "Question": "Who was the man behind The Chipmunks?",
    "answer": [
        "David Seville"
    ]
}
{
    "id": 1,
    "Question": "What star sign is Jamie Lee Curtis?",
    "answer": [
        "Scorpio",
        "Skorpio",
        "Scorpio (disambiguation)"
    ]
}
{
    "id": 2,
    "Question": "Which Lloyd Webber musical premiered in the US on 10th December 1993?",
    "answer": [
        "Sunset Blvd",
        "West Sunset Boulevard",
        "Sunset Boulevard",
        "Sunset Bulevard",
        "Sunset Blvd."
    ]
}
{
    "id": 3,
    "Question": "Who was the next British Prime Minister after Arthur Balfour?",
    "answer": [
        "Sir Henry Campbell-Bannerman",
        "Campbell-Bannerman",
        "Campbell Bannerman",
        "Sir Henry Campbell Bannerman",
        "Henry Campbell Bannerman",
        "Henry Campbell-Bannerman"
    ]
}
{
    "id": 4,
    "Question": "Who had a 70s No 1 hit with Kiss You All Over?",
    "answer": [
        "Internal exile


