In [None]:
import os
# Switch the working directory to the grandparent of the directory the kernel
# was started in (presumably the repository root, so that `data.*` imports
# resolve -- TODO confirm).
# NOTE: this is not idempotent; re-running the cell climbs two more levels.
_two_levels_up = os.path.dirname(os.path.dirname(os.getcwd()))
os.chdir(_two_levels_up)
print(f"Current working directory: {os.getcwd()}")

import glob as glob
from datasets import load_dataset
from tqdm import tqdm
import re
import numpy as np
from collections import Counter
from data.utils.io_utils import question_hash

: 

In [2]:
# Train splits of the raw question set and of its featurized counterpart.
geminiall = load_dataset("qfq/geminiall")['train']
featurized_questions = load_dataset("qfq/train_featurized")['train']

# Columns removed from every featurized row; whatever remains is stored as the
# per-question feature record.
dropped_columns = ('solution', 'question', 'cot_type', 'source_type', 'metadata', 'cot')

# Map question hash -> remaining feature fields of that question.
featurized_dict = {}
for example in tqdm(featurized_questions):
    # Hash before popping: 'question' is one of the removed columns.
    qhash = question_hash(example['question'])
    for column in dropped_columns:
        example.pop(column)
    featurized_dict[qhash] = example

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

100%|██████████| 59029/59029 [00:09<00:00, 6023.40it/s] 


# Filtering

### Drop NAs

In [3]:
# Collect the hashes of questions that have a feature record in which no
# value is None; everything else is dropped from geminiall.
no_na = set()
for example in tqdm(geminiall):
    qhash = question_hash(example['question'])
    if qhash not in featurized_dict:
        continue
    if all(value is not None for value in featurized_dict[qhash].values()):
        no_na.add(qhash)

geminiall = geminiall.filter(lambda row: question_hash(row['question']) in no_na)

100%|██████████| 58986/58986 [00:09<00:00, 6373.32it/s] 


In [4]:
# Report how many rows remain after the NA filter.
print("Length of geminiall: ", len(geminiall))

Length of geminiall:  54116


### Drop weird string patterns


In [5]:
# Patterns are compiled once instead of being rebuilt for every question.
# Matches "p1." ... "p999." (no leading zeros): exactly the substrings the
# original `any(f"p{i}." in question for i in range(1, 1000))` looked for,
# but in a single scan instead of up to 999 substring searches.
_PROBLEM_LABEL_RE = re.compile(r'p[1-9][0-9]{0,2}\.')
# Matches a `**...**` token containing no whitespace or ']' where a digit
# occurs somewhere after the opening `**` on the same line.
_BOLD_WITH_DIGIT_RE = re.compile(r'\*\*(?=.*\d)[^\s\]]*\*\*')

# Hashes of the questions that survive the string-pattern filter.
no_weird_string = set()
for example in tqdm(geminiall):
    question = example['question']
    qhash = question_hash(question)
    if "[asy]" in question:
        # Questions with an "[asy]" marker are kept only for AIME sources.
        if 'AIME' in example['source_type']:
            no_weird_string.add(qhash)
    elif (
        "![Image]" not in question
        and not _PROBLEM_LABEL_RE.search(question)
        and not _BOLD_WITH_DIGIT_RE.search(question)
    ):
        no_weird_string.add(qhash)

geminiall = geminiall.filter(lambda x: question_hash(x['question']) in no_weird_string)
print("Length of geminiall: ", len(geminiall))

100%|██████████| 54116/54116 [00:14<00:00, 3752.32it/s]


Length of geminiall:  51607


### Drop questions Qwen already answers correctly

In [6]:
# Keep only questions that neither Qwen model is marked as answering
# correctly (per the 'isqwen32bcorrect' / 'isqwen7bcorrect' features).
no_qwen_correct = set()
for example in tqdm(geminiall):
    qhash = question_hash(example['question'])
    features = featurized_dict[qhash]
    if features['isqwen32bcorrect'] or features['isqwen7bcorrect']:
        continue
    no_qwen_correct.add(qhash)

geminiall = geminiall.filter(lambda row: question_hash(row['question']) in no_qwen_correct)
print("Length of geminiall: ", len(geminiall))

100%|██████████| 51607/51607 [00:04<00:00, 11474.04it/s]


Length of geminiall:  24517


In [8]:
# Restrict the feature lookup to the questions that survived filtering, so
# both structures describe the same set of questions.
geminiall_qhashes = {question_hash(example['question']) for example in geminiall}
featurized_dict = {
    qhash: features
    for qhash, features in featurized_dict.items()
    if qhash in geminiall_qhashes
}
print("Length of featurized_dict: ", len(featurized_dict))
print("Length of geminiall: ", len(geminiall))

Length of featurized_dict:  24517
Length of geminiall:  24517


In [26]:
# Upload the filtered dataset to the Hub repo "qfq/geminiall_hardfiltered_v2".
# Side effect: performs a network upload each time the cell runs.
geminiall.push_to_hub("qfq/geminiall_hardfiltered_v2")

Uploading the dataset shards:   0%|          | 0/8 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/qfq/geminiall_hardfiltered_v2/commit/8ba92e1c668b0371c9cf4beb2df701e5ff47b021', commit_message='Upload dataset', commit_description='', oid='8ba92e1c668b0371c9cf4beb2df701e5ff47b021', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/qfq/geminiall_hardfiltered_v2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='qfq/geminiall_hardfiltered_v2'), pr_revision=None, pr_num=None)

# Selection

### Select benchmark-related data

In [9]:
# Seed the selection with questions marked 'isgeminicorrect' that come from
# the benchmark-related sources: every GPQA / AIME question, plus openaimath
# questions whose 'gemini_length' exceeds 5600.
benchmark_sources = ['Idavidrein/gpqa', 'qq8933/AIME_1983_2024']

selected_qhashes = set()
for example in geminiall:
    qhash = question_hash(example['question'])
    features = featurized_dict[qhash]
    if not features['isgeminicorrect']:
        continue
    source_type = example['source_type']
    if source_type in benchmark_sources:
        selected_qhashes.add(qhash)
    elif 'qfq/openaimath' in source_type and features['gemini_length'] > 5600:
        selected_qhashes.add(qhash)

In [10]:
# Report how many questions were pre-selected from the benchmark-related sources.
print("Length of selected_qhashes: ", len(selected_qhashes))

Length of selected_qhashes:  393


### Diversity sampling

In [11]:
def _domain_counts(source_type):
    """Count the 'domain' feature over geminiall rows whose source_type equals `source_type`."""
    rows = geminiall.filter(lambda x: x['source_type'] == source_type)
    return Counter(featurized_dict[question_hash(row['question'])]['domain'] for row in rows)


gpqa_domain = _domain_counts('Idavidrein/gpqa')
aime_domain = _domain_counts('qq8933/AIME_1983_2024')

# Rescale the GPQA counts so that their total is about half of the AIME total
# (each count is truncated to an int after scaling).
enlarge_ratio = 0.5 * sum(aime_domain.values()) / sum(gpqa_domain.values())
gpqa_domain = Counter({domain: int(count * enlarge_ratio) for domain, count in gpqa_domain.items()})

# Counter addition keeps only domains with a positive combined count.
benchmark_domain = gpqa_domain + aime_domain
benchmark_domains = list(benchmark_domain.keys())
domain_counts = list(benchmark_domain.values())
# Normalise the counts into a probability vector aligned with benchmark_domains.
benchmark_weights = np.array(domain_counts) / sum(domain_counts)
def benchmark_sample(benchmark_domains, benchmark_weights, rng=None):
    """Draw one domain according to the given probabilities.

    Args:
        benchmark_domains: Sequence of candidate domains.
        benchmark_weights: Probabilities aligned with `benchmark_domains`;
            they must sum to 1.
        rng: Optional NumPy random source exposing `.choice` (for example
            `np.random.default_rng(seed)`). When None, NumPy's global random
            state is used, which matches the original behaviour.

    Returns:
        A single element of `benchmark_domains` (as a NumPy scalar).
    """
    sampler = np.random if rng is None else rng
    return sampler.choice(benchmark_domains, size=1, p=benchmark_weights)[0]

In [12]:
# Distinct domains present in the filtered data. Sorted so the list order is
# deterministic: plain set iteration order depends on per-process string-hash
# randomization, which would change the outcome of index-based random draws
# between kernels. Assumes the domain labels are mutually comparable
# (e.g. all strings) -- TODO confirm.
all_domains = sorted({featurized_dict[question_hash(example['question'])]['domain'] for example in geminiall})
def uniform_sample(all_domains, rng=None):
    """Draw one domain uniformly at random.

    Args:
        all_domains: Non-empty sequence of candidate domains.
        rng: Optional NumPy random source exposing `.choice` (for example
            `np.random.default_rng(seed)`). When None, NumPy's global random
            state is used, which matches the original behaviour.

    Returns:
        A single element of `all_domains` (as a NumPy scalar).
    """
    sampler = np.random if rng is None else rng
    return sampler.choice(all_domains, size=1)[0]

In [13]:
# Group the feature records by domain: domain -> {question hash: features}.
# These per-domain pools are consumed (popped from) by the sampling loop.
questions_ordered_by_domain = {}
for domain in tqdm(all_domains):
    domain_pool = {}
    for key, feats in featurized_dict.items():
        if feats['domain'] == domain:
            domain_pool[key] = feats
    questions_ordered_by_domain[domain] = domain_pool

100%|██████████| 51/51 [00:00<00:00, 716.08it/s]


### Power-law length sampling

In [23]:
TARGET_SIZE = 1000        # final number of selected questions
UNIFORM_PHASE_END = 700   # below this size domains are drawn uniformly, afterwards by benchmark weights
SAMPLING_SEED = 0         # fixes NumPy's global RNG so the draws below are repeatable

np.random.seed(SAMPLING_SEED)

# The context manager closes the progress bar even if sampling raises.
with tqdm(initial=len(selected_qhashes), total=TARGET_SIZE, desc="Sampling questions") as pbar:
    while len(selected_qhashes) < TARGET_SIZE:
        # Phase 1: uniform over all remaining domains.
        # Phase 2: domains drawn according to the benchmark domain weights.
        if len(selected_qhashes) < UNIFORM_PHASE_END:
            if not all_domains:
                raise RuntimeError("No domains left to sample from before reaching the target size.")
            random_domain = uniform_sample(all_domains)
        else:
            if not benchmark_domains:
                raise RuntimeError("No benchmark domains left to sample from before reaching the target size.")
            random_domain = benchmark_sample(benchmark_domains, benchmark_weights)

        # Within the domain, favour long chains: the longest question gets
        # rank 0 and each following rank halves the sampling weight.
        domain_examples = questions_ordered_by_domain[random_domain]
        qhashes = list(domain_examples.keys())
        lengths = np.array([int(domain_examples[qhash]['gemini_length']) for qhash in qhashes])
        ranks = len(lengths) - 1 - np.argsort(np.argsort(lengths))
        length_weights = np.power(2.0, -ranks)
        length_weights = length_weights / length_weights.sum()

        # Convert the NumPy string scalar back to a plain str before storing it.
        selected_qhash = str(np.random.choice(qhashes, p=length_weights))

        size_before = len(selected_qhashes)
        selected_qhashes.add(selected_qhash)
        domain_examples.pop(selected_qhash)

        # Retire a domain once its pool is exhausted so it cannot be drawn again.
        if len(domain_examples) == 0:
            if random_domain in all_domains:
                all_domains.remove(random_domain)
            if random_domain in benchmark_domains:
                benchmark_weights = np.delete(benchmark_weights, benchmark_domains.index(random_domain))
                benchmark_domains.remove(random_domain)
                # Renormalise only when there is mass left; avoids 0/0 -> NaN weights.
                remaining_mass = benchmark_weights.sum()
                if remaining_mass > 0:
                    benchmark_weights = benchmark_weights / remaining_mass

        # The domain pools are built from every featurized question, so a draw
        # can return a hash that was already selected; advance the bar only by
        # the number of questions actually added.
        pbar.update(len(selected_qhashes) - size_before)

Sampling questions: 100%|██████████| 1000/1000 [00:00<?, ?it/s]


In [24]:
# Sanity check: the materialised subset should have as many rows as there are
# selected hashes.
print("Verify length of selected_qhashes:", len(selected_qhashes))
sampled_geminiall = geminiall.filter(
    lambda row: question_hash(row['question']) in selected_qhashes
)
print("Verify length of sampled_geminiall:", len(sampled_geminiall))

Verify length of selected_qhashes: 1000
Verify length of sampled_geminiall: 1000


# Inspect

In [25]:
# Dump every sampled question (hash, source and statement) for manual review.
separator = "-"*100
for example in sampled_geminiall:
    question = example['question']
    print(separator)
    print(f"## Question hash: {question_hash(question)}")
    print(f"## source_type: {example['source_type']}")
    print(f"## Question statement\n\n{question}\n")
    # print(f"## Thinking trajectories\n\n{example['thinking_trajectories'][0]}")
    print()

----------------------------------------------------------------------------------------------------
## Question hash: 329b4f5d56d005d3
## source_type: AI-MO/NuminaMath-CoT/aops_forum
## Question statement

Let  $a,b,A,B$  be given reals. We consider the function defined by \[ f(x) = 1 - a \cdot \cos(x) - b \cdot \sin(x) - A \cdot \cos(2x) - B \cdot \sin(2x). \] Prove that if for any real number  $x$  we have  $f(x) \geq 0$  then  $a^2 + b^2 \leq 2$  and  $A^2 + B^2 \leq 1.$ 


----------------------------------------------------------------------------------------------------
## Question hash: 67fdacc7bdb34ef2
## source_type: qq8933/AIME_1983_2024
## Question statement

Given a rational number, write it as a fraction in lowest terms and calculate the product of the resulting numerator and denominator. For how many rational numbers between 0 and 1 will $20_{}^{}!$ be the resulting product?


-----------------------------------------------------------------------------------------------

# Upload

In [None]:
# Upload the 1000-question sample to the Hub repo "qfq/s1K".
# NOTE(review): the output recorded with this cell reports a commit to
# "qfq/s1k_v1", so that output was produced by an earlier version of this
# call -- re-run the cell before relying on the displayed commit info.
sampled_geminiall.push_to_hub("qfq/s1K")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/qfq/s1k_v1/commit/dd6396be8cbba6d685065c61dc40c17e50fdcccd', commit_message='Upload dataset', commit_description='', oid='dd6396be8cbba6d685065c61dc40c17e50fdcccd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/qfq/s1k_v1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='qfq/s1k_v1'), pr_revision=None, pr_num=None)

In [None]:
# Run the project's SFT tokenization helper on the uploaded sample, reading
# from "qfq/s1K" and writing to "qfq/s1K_tokenized".
# NOTE(review): the meaning of time_limit / rollout_path / step_format is
# defined in data.tokenization.mathcot_sft and is not visible here -- confirm
# there before changing them.
from data.tokenization import mathcot_sft
mathcot_sft(download_data_path="qfq/s1K",
            upload_data_path="qfq/s1K_tokenized",
            num_proc=80,
            time_limit=False,
            model_type="qwen",
            rollout_path=None,
            step_format="nostepsnoanswer")
# Reload the tokenized dataset; as the last expression of the cell its repr
# (splits, features, row counts) is displayed.
load_dataset("qfq/s1K_tokenized")

README.md:   0%|          | 0.00/568 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/7.04M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing SFT data (num_proc=80):   0%|          | 0/1000 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/586 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/12.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['solution', 'question', 'cot_type', 'source_type', 'metadata', 'cot', 'thinking_trajectories', 'attempt', 'text'],
        num_rows: 1000
    })
})