In [1]:
import re
from rapidfuzz import fuzz
from datasets import concatenate_datasets
from verification.utils import load_jsonl, save_jsonl
from verification.rollout.preprocess import *

In [None]:
# Training problems that will be checked for overlap with the evaluation sets.
train_problems = load_jsonl("../data/vfm/vfm-problems.jsonl")

# Evaluation benchmarks, concatenated into one dataset and materialised as a
# plain list of records.
# NOTE(review): the load_* helpers are not imported by name in this notebook;
# presumably they come from the wildcard import of
# verification.rollout.preprocess -- confirm.
eval_problems = list(
    concatenate_datasets(
        [
            load_aime2024(),
            load_aime2025(),
            load_livebench_math(),
            load_gpqa(),
        ]
    )
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/350 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

In [4]:
def normalize(text):
    """Reduce a prompt to a canonical form for overlap comparison.

    The transformation is applied in three steps:

    1. Every occurrence of the known answer-format instruction sentences is
       removed.
    2. Each run of characters outside the allow-list (ASCII letters, digits,
       a fixed set of math/LaTeX punctuation, and whitespace) is replaced by
       a single space. Characters such as question marks, quotes, slashes,
       square brackets and non-ASCII letters are therefore dropped.
    3. Runs of whitespace are collapsed to one space and the ends are
       stripped.

    Letter case is left unchanged.

    Args:
        text: The prompt string to normalize.

    Returns:
        The normalized string.
    """
    instruction_sentences = (
        "Return your final response as 'Final Answer: \\boxed{<answer>}', where <answer> is the number or mathematical expression of the solution.",
        "Please reason step by step, and put your final answer within \\boxed{}.",
    )
    for sentence in instruction_sentences:
        # str.replace leaves the text untouched when the sentence is absent,
        # so no membership check is needed first.
        text = text.replace(sentence, "")

    # Body of a regex character class listing everything that is kept.
    allowed_chars = (
        "A-Za-z0-9"
        "\\\\\\{\\}\\$\\^\\_\\(\\)\\+\\-\\=,\\.\\~\\%\\|\\<\\>:\\;"
    )
    disallowed_run = re.compile(f"[^{allowed_chars}\\s]+")

    cleaned = disallowed_run.sub(" ", text)
    return re.sub(r"\s+", " ", cleaned).strip()

In [5]:
def exact_match(train_problems, eval_problems):
    """Find training items whose normalized prompt equals an eval prompt.

    Args:
        train_problems: Iterable of records, each with a "prompt" entry.
        eval_problems: Iterable of records, each with a "prompt" entry.

    Returns:
        A list of the items from ``train_problems`` (in their original order)
        whose normalized prompt is identical to the normalized prompt of at
        least one item in ``eval_problems``.
    """
    # Normalized eval prompts, held in a set for constant-time lookup.
    eval_keys = set()
    for eval_item in eval_problems:
        eval_keys.add(normalize(eval_item["prompt"]))

    hits = []
    for candidate in train_problems:
        if normalize(candidate['prompt']) in eval_keys:
            hits.append(candidate)
    return hits

In [6]:
def fuzzy_match(train_problems, eval_problems, threshold=80):
    """Find training items whose prompt is near-identical to an eval prompt.

    Both sides are passed through ``normalize`` before comparison. A training
    item is reported as soon as one eval prompt reaches the threshold, so each
    item appears at most once in the result.

    Args:
        train_problems: Iterable of records, each with a "prompt" entry.
        eval_problems: Iterable of records, each with a "prompt" entry.
        threshold: Minimum ``fuzz.ratio`` score (inclusive) for a pair of
            prompts to count as a match.

    Returns:
        A list of the matching items from ``train_problems``, in their
        original order.
    """
    # Normalize the eval side once; it is reused for every training item.
    references = [normalize(eval_item["prompt"]) for eval_item in eval_problems]

    def resembles_eval_prompt(item):
        candidate = normalize(item["prompt"])
        # any() stops at the first reference that reaches the threshold.
        return any(
            fuzz.ratio(candidate, reference) >= threshold
            for reference in references
        )

    return [item for item in train_problems if resembles_eval_prompt(item)]

In [7]:
# Collect the training problems that fuzzy-match any evaluation prompt, then
# display how many were found.
fuzzy_matches = fuzzy_match(
    train_problems,
    eval_problems,
    threshold=80,
)
len(fuzzy_matches)

257

In [35]:
# Drop every training problem that was flagged as a fuzzy match.
# Membership is tested against a set of the matched prompt strings rather than
# with `item not in fuzzy_matches`, which compares each training record against
# every matched record in turn. The outcome is the same because fuzzy_match
# decides purely on the prompt, so all records sharing a matched prompt are
# themselves matched.
matched_prompts = {item["prompt"] for item in fuzzy_matches}
train_problems_decontaminated = [
    item for item in train_problems if item["prompt"] not in matched_prompts
]

In [38]:
# Write the decontaminated training problems to their own JSONL file, leaving
# the original problems file untouched.
save_jsonl(
    "../data/vfm/vfm-problems-decontaminated.jsonl",
    train_problems_decontaminated,
)